/******************************************************************************* * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.model; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.util.Comparator; import org.apache.commons.io.IOUtils; import org.voyanttools.trombone.storage.Storage; import org.voyanttools.trombone.util.FlexibleParameters; import com.thoughtworks.xstream.annotations.XStreamAlias; import com.thoughtworks.xstream.annotations.XStreamConverter; /** * @author sgs * */ @XStreamAlias("document") @XStreamConverter(DocumentConverter.class) public class IndexedDocument implements DocumentContainer, Comparable<IndexedDocument> { private String id; private DocumentMetadata metadata = null; private Storage storage; public enum Sort { INDEXASC, INDEXDESC, TITLEASC, TITLEDESC, AUTHORASC, AUTHORDESC, TOKENSCOUNTLEXICALASC, TOKENSCOUNTLEXICALDESC, TYPESCOUNTLEXICALASC, TYPESCOUNTLEXICALDESC, TYPETOKENRATIOLEXICALASC, TYPETOKENRATIOLEXICALDESC, PUBDATEASC, PUBDATEDESC; public static Sort getForgivingly(FlexibleParameters parameters) { String sort = parameters.getParameterValue("sort", "").toUpperCase(); String sortPrefix = "INDEX"; if (sort.startsWith("TITLE")) {sortPrefix="TITLE";} if (sort.startsWith("TOKENSCOUNT")) {sortPrefix="TOKENSCOUNTLEXICAL";} // TODO: support other kinds of term counts if (sort.startsWith("TYPESCOUNT")) {sortPrefix="TYPESCOUNTLEXICAL";} // TODO: support other kinds of term counts if (sort.startsWith("TYPETOKEN")) {sortPrefix="TYPETOKENRATIOLEXICAL";} // TODO: support other kinds of term counts else if (sort.startsWith("AUTHOR")) {sortPrefix="AUTHOR";} else if (sort.startsWith("PUBDATE")) {sortPrefix="PUBDATE";} String dir = parameters.getParameterValue("dir", "").toUpperCase(); String dirSuffix = "ASC"; if (dir.endsWith("DESC")) {dirSuffix="DESC";} return valueOf(sortPrefix+dirSuffix); } } /** * */ public IndexedDocument(Storage storage, String id) { this.storage = storage; this.id = id; } public String getId() { return id; } public StoredDocumentSource asStoredDocumentSource() throws IOException { return new StoredDocumentSource(getId(), getMetadata()); } public DocumentMetadata getMetadata() throws IOException { if (metadata==null) { metadata = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceMetadata(getId()); } return metadata; } public String getDocumentString() throws IOException { InputStream is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id); String string = IOUtils.toString(is, "UTF-8"); is.close(); return string; } public static class IndexedDocumentPriorityQueue { // used when a size is given – use the Lucene implementation for better memory management (only top items are kept) private org.apache.lucene.util.PriorityQueue<IndexedDocument> limitedSizeQueue = null; // use the Java implementation to allow the queue to grow arbitrarily big private java.util.PriorityQueue<IndexedDocument> unlimitedSizeQueue = null; public IndexedDocumentPriorityQueue(IndexedDocument.Sort sort) { this(Integer.MAX_VALUE, sort); } public IndexedDocumentPriorityQueue(int size, IndexedDocument.Sort sort) { Comparator<IndexedDocument> comparator = IndexedDocument.getComparator(sort); if (size==Integer.MAX_VALUE) { unlimitedSizeQueue = new java.util.PriorityQueue<IndexedDocument>(11, comparator); } else { limitedSizeQueue = new LimitedSizeQueue<IndexedDocument>(size, comparator); } } private class LimitedSizeQueue<IndexedDocument> extends org.apache.lucene.util.PriorityQueue<IndexedDocument> { Comparator<IndexedDocument> comparator; public LimitedSizeQueue(int maxSize, Comparator<IndexedDocument> comparator) { super(maxSize); this.comparator = comparator; } @Override protected boolean lessThan(IndexedDocument a, IndexedDocument b) { return comparator.compare(a, b) < 0; } } public void offer(IndexedDocument document) { if (limitedSizeQueue!=null) {limitedSizeQueue.insertWithOverflow(document);} else if (unlimitedSizeQueue!=null) {unlimitedSizeQueue.offer(document);} } public int size() { if (limitedSizeQueue!=null) {return limitedSizeQueue.size();} else if (unlimitedSizeQueue!=null) {return unlimitedSizeQueue.size();} return 0; } public IndexedDocument poll() { if (limitedSizeQueue!=null) {return limitedSizeQueue.pop();} else if (unlimitedSizeQueue!=null) {return unlimitedSizeQueue.poll();} return null; } } public static Comparator<IndexedDocument> getComparator(Sort sort) { switch(sort) { case INDEXDESC: return IndexDescComparator; case TITLEASC: return TitleAscComparator; case TITLEDESC: return TitleDescComparator; case AUTHORASC: return AuthorAscComparator; case AUTHORDESC: return AuthorDescComparator; case TOKENSCOUNTLEXICALASC: return TermsCountLexicalAscComparator; case TOKENSCOUNTLEXICALDESC: return TermsCountLexicalDescComparator; case TYPESCOUNTLEXICALASC: return TypesCountLexicalAscComparator; case TYPESCOUNTLEXICALDESC: return TypesCountLexicalDescComparator; case TYPETOKENRATIOLEXICALASC: return TypeTokenRatioLexicalAscComparator; case TYPETOKENRATIOLEXICALDESC: return TypeTokenRatioLexicalDescComparator; case PUBDATEASC: return PubDateAscendingComparator; case PUBDATEDESC: return PubDateDescendingComparator; default: return IndexAscComparator; } } private static Comparator<IndexedDocument> IndexAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc2.getMetadata().getIndex(), doc1.getMetadata().getIndex()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> IndexDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc1.getMetadata().getIndex(), doc2.getMetadata().getIndex()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TitleAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return doc2.getMetadata().getTitle().compareTo(doc1.getMetadata().getTitle()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TitleDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return doc1.getMetadata().getTitle().compareTo(doc2.getMetadata().getTitle()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> AuthorAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return doc2.getMetadata().getAuthor().compareTo(doc1.getMetadata().getAuthor()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> AuthorDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return doc1.getMetadata().getAuthor().compareTo(doc2.getMetadata().getAuthor()); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TypesCountLexicalAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc2.getMetadata().getTypesCount(TokenType.lexical), doc1.getMetadata().getTypesCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TypesCountLexicalDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc1.getMetadata().getTypesCount(TokenType.lexical), doc2.getMetadata().getTypesCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TermsCountLexicalAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc2.getMetadata().getTokensCount(TokenType.lexical), doc1.getMetadata().getTokensCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TermsCountLexicalDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Integer.compare(doc1.getMetadata().getTokensCount(TokenType.lexical), doc2.getMetadata().getTokensCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TypeTokenRatioLexicalAscComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Float.compare((float) doc2.getMetadata().getTypesCount(TokenType.lexical)/(float) doc2.getMetadata().getTokensCount(TokenType.lexical), (float) doc1.getMetadata().getTypesCount(TokenType.lexical)/(float) doc1.getMetadata().getTokensCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> TypeTokenRatioLexicalDescComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { return Float.compare((float) doc1.getMetadata().getTypesCount(TokenType.lexical)/(float) doc1.getMetadata().getTokensCount(TokenType.lexical), (float) doc2.getMetadata().getTypesCount(TokenType.lexical)/ (float) doc2.getMetadata().getTokensCount(TokenType.lexical)); } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> PubDateDescendingComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { String d1 = doc1.getMetadata().getPubDate(); String d2 = doc2.getMetadata().getPubDate(); if (d1.equals(d2)) { return TitleAscComparator.compare(doc1, doc2); } else { return d2.compareTo(d1); } } catch (IOException e) { throw new RuntimeException(e); } } }; private static Comparator<IndexedDocument> PubDateAscendingComparator = new Comparator<IndexedDocument>() { @Override public int compare(IndexedDocument doc1, IndexedDocument doc2) { try { String d1 = doc1.getMetadata().getPubDate(); String d2 = doc2.getMetadata().getPubDate(); if (d1.equals(d2)) { return TitleAscComparator.compare(doc1, doc2); } else { return d1.compareTo(d2); } } catch (IOException e) { throw new RuntimeException(e); } } }; @Override public int compareTo(IndexedDocument o) { return IndexAscComparator.compare(this, o); } }