/**
 *
 */
package org.voyanttools.trombone.model;

import java.text.Normalizer;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;

import org.voyanttools.trombone.util.FlexibleParameters;

import com.thoughtworks.xstream.annotations.XStreamOmitField;

/**
 * An n-gram recorded for a single document: the term itself, its length, the
 * list of positions at which it was found and the resulting raw frequency
 * (the number of recorded positions).
 *
 * <p>The natural ordering ({@link #compareTo(DocumentNgram)}) puts longer
 * n-grams first. Other orderings are available through
 * {@link #getComparator(Sort)}.</p>
 *
 * @author sgs
 */
public class DocumentNgram implements Comparable<DocumentNgram> {

    /**
     * The orderings supported by {@link DocumentNgram#getComparator(Sort)}.
     */
    public enum Sort {
        RAWFREQASC, RAWFREQDESC, TERMASC, TERMDESC, LENGTHASC, LENGTHDESC;

        /**
         * Builds a {@link Sort} from the {@code sort} and {@code dir}
         * parameters without failing on unrecognised values.
         *
         * <p>The {@code sort} value selects the key by prefix
         * ({@code TERM} or {@code LENGTH}); anything else falls back to
         * {@code RAWFREQ}. The {@code dir} value selects ascending order only
         * when it ends with {@code ASC}; anything else means descending.
         * Matching is case-insensitive.</p>
         *
         * @param parameters the parameters to read {@code sort} and {@code dir} from
         * @return the matching sort, {@link #RAWFREQDESC} when nothing is recognised
         */
        public static Sort getForgivingly(FlexibleParameters parameters) {
            // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
            String sort = parameters.getParameterValue("sort", "").toUpperCase(Locale.ROOT);
            String sortPrefix = "RAWFREQ"; // default
            if (sort.startsWith("TERM")) {
                sortPrefix = "TERM";
            } else if (sort.startsWith("LENGTH")) {
                sortPrefix = "LENGTH";
            }

            String dir = parameters.getParameterValue("dir", "").toUpperCase(Locale.ROOT);
            String dirSuffix = dir.endsWith("ASC") ? "ASC" : "DESC"; // descending by default

            return valueOf(sortPrefix + dirSuffix);
        }
    }

    // NOTE(review): field names are kept unchanged on purpose; they presumably
    // drive XStream serialization of this class -- confirm before renaming.
    private int docIndex;

    private String term;

    private int rawFreq;

    private int length;

    private List<int[]> positions;

    /** Lazily computed NFD form of {@link #term}, used for term comparisons. */
    @XStreamOmitField
    private transient String normalizedString = null;

    /**
     * Creates an n-gram whose raw frequency is the number of supplied positions.
     *
     * @param corpusDocumentIndex the index of the document within the corpus
     * @param term the n-gram text
     * @param positions the positions of the n-gram; must not be {@code null}
     *        (the list is stored as-is, not copied)
     * @param length the length of the n-gram
     */
    public DocumentNgram(int corpusDocumentIndex, String term, List<int[]> positions, int length) {
        this.docIndex = corpusDocumentIndex;
        this.term = term;
        this.length = length;
        this.rawFreq = positions.size();
        this.positions = positions;
    }

    /**
     * @return the index of the document within the corpus
     */
    public int getCorpusDocumentIndex() {
        return docIndex;
    }

    /**
     * @return the n-gram text
     */
    public String getTerm() {
        return term;
    }

    /**
     * @return the length of the n-gram
     */
    public int getLength() {
        return length;
    }

    /**
     * @return the list of positions given at construction (not a copy)
     */
    public List<int[]> getPositions() {
        return positions;
    }

    /**
     * @return a string of the form {@code (docIndex) term: positionCount (length)}
     */
    @Override
    public String toString() {
        return "(" + docIndex + ") " + term + ": " + positions.size() + " (" + length + ")";
    }

    /**
     * Orders longer n-grams before shorter ones. When both n-grams have the
     * same length and each has at least one position, they are ordered by the
     * first element of their first position entry, ascending.
     *
     * @param ngram the n-gram to compare with
     * @return a negative, zero or positive value per the {@link Comparable} contract
     */
    @Override
    public int compareTo(DocumentNgram ngram) {
        if (length == ngram.length && positions.size() > 0 && ngram.positions.size() > 0) {
            // sort by first position if same length
            return Integer.compare(positions.get(0)[0], ngram.positions.get(0)[0]);
        }
        // longer n-grams first
        return Integer.compare(ngram.length, length);
    }

    /**
     * @return the NFD-normalized term, computed on first use and then cached
     */
    private String getNormalizedTerm() {
        if (normalizedString == null) {
            normalizedString = Normalizer.normalize(term, Normalizer.Form.NFD);
        }
        return normalizedString;
    }

    /**
     * Compares two n-grams by their NFD-normalized terms.
     */
    private static int compareTerms(DocumentNgram first, DocumentNgram second) {
        return first.getNormalizedTerm().compareTo(second.getNormalizedTerm());
    }

    /**
     * Returns the comparator that implements the requested ordering.
     *
     * @param sort the desired ordering
     * @return the matching comparator
     */
    public static Comparator<DocumentNgram> getComparator(Sort sort) {
        switch (sort) {
            case RAWFREQASC:
                return RAW_FREQUENCY_ASCENDING;
            case TERMASC:
                return TERM_ASCENDING;
            case TERMDESC:
                return TERM_DESCENDING;
            case LENGTHASC:
                return LENGTH_ASCENDING;
            case LENGTHDESC:
                return LENGTH_DESCENDING;
            case RAWFREQDESC:
            default: // rawFrequencyDesc
                return RAW_FREQUENCY_DESCENDING;
        }
    }

    /** Terms A to Z; equal terms by raw frequency, lowest first. */
    private static final Comparator<DocumentNgram> TERM_ASCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            // term1 against term2 so that smaller terms really do sort first
            int byTerm = compareTerms(term1, term2);
            return byTerm != 0 ? byTerm : Integer.compare(term1.rawFreq, term2.rawFreq);
        }
    };

    /** Terms Z to A; equal terms by raw frequency, lowest first. */
    private static final Comparator<DocumentNgram> TERM_DESCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            // operands swapped so that larger terms sort first
            int byTerm = compareTerms(term2, term1);
            return byTerm != 0 ? byTerm : Integer.compare(term1.rawFreq, term2.rawFreq);
        }
    };

    /** Highest raw frequency first; equal frequencies by term, A to Z. */
    private static final Comparator<DocumentNgram> RAW_FREQUENCY_DESCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            if (term1.rawFreq == term2.rawFreq) {
                return compareTerms(term1, term2);
            }
            return Integer.compare(term2.rawFreq, term1.rawFreq);
        }
    };

    /** Lowest raw frequency first; equal frequencies by term, Z to A. */
    private static final Comparator<DocumentNgram> RAW_FREQUENCY_ASCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            if (term1.rawFreq == term2.rawFreq) {
                return compareTerms(term2, term1);
            }
            return Integer.compare(term1.rawFreq, term2.rawFreq);
        }
    };

    /** Longest n-grams first; equal lengths by term, A to Z. */
    private static final Comparator<DocumentNgram> LENGTH_DESCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            if (term1.length == term2.length) {
                return compareTerms(term1, term2);
            }
            return Integer.compare(term2.length, term1.length);
        }
    };

    /** Shortest n-grams first; equal lengths by term, Z to A. */
    private static final Comparator<DocumentNgram> LENGTH_ASCENDING = new Comparator<DocumentNgram>() {
        @Override
        public int compare(DocumentNgram term1, DocumentNgram term2) {
            if (term1.length == term2.length) {
                return compareTerms(term2, term1);
            }
            return Integer.compare(term1.length, term2.length);
        }
    };
}