/******************************************************************************* * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.model; import java.text.Normalizer; import java.util.Comparator; import java.util.HashMap; import java.util.Map; import org.voyanttools.trombone.util.FlexibleParameters; import com.thoughtworks.xstream.annotations.XStreamOmitField; /** * @author sgs * */ public class DocumentTerm { public enum Sort { RAWFREQASC, RAWFREQDESC, RELATIVEFREQASC, RELATIVEFREQDESC, TERMASC, TERMDESC, TFIDFASC, TFIDFDESC, ZSCOREASC, ZSCOREDESC; public static Sort getForgivingly(FlexibleParameters parameters) { String sort = parameters.getParameterValue("sort", "").toUpperCase(); String sortPrefix = "RELATIVEFREQ"; // default if (sort.startsWith("RAWFREQ")) {sortPrefix = "RAWFREQ";} if (sort.startsWith("TERM")) {sortPrefix = "TERM";} if (sort.startsWith("TFIDF")) {sortPrefix = "TFIDF";} if (sort.startsWith("ZSCORE")) {sortPrefix = "ZSCORE";} String dir = parameters.getParameterValue("dir", "").toUpperCase(); String dirSuffix = "DESC"; if (dir.endsWith("ASC")) {dirSuffix="ASC";} return valueOf(sortPrefix+dirSuffix); } } protected int docIndex; protected String docId; protected String term; @XStreamOmitField protected String normalizedString; protected int rawFreq; protected int totalTermsCount; protected float relativeFreq; protected float zscore; protected float zscoreRatio; protected float tfidf; protected int[] positions; protected int[] offsets; protected CorpusTermMinimal corpusTermMinimal; private transient Map<Integer, float[]> relativeDistributionsMap; public DocumentTerm(int docIndex, String docId, String term, int rawFreq, int totalTokens, float zscore, int[] positions, int[] offsets, CorpusTermMinimal corpusTermMinimal) { this.docIndex = docIndex; this.docId = docId; this.term = term; this.rawFreq = rawFreq; this.totalTermsCount = totalTokens; this.relativeFreq = totalTokens > 0 ? ((float) rawFreq / totalTokens) * 1000000 : 0; this.zscore = zscore; this.positions = positions; this.offsets = offsets; this.normalizedString = null; this.tfidf = Float.NaN; this.zscoreRatio = Float.NaN; this.corpusTermMinimal = corpusTermMinimal; relativeDistributionsMap = new HashMap<Integer, float[]>(); } public int getRawFrequency() { return rawFreq; } public String getNormalizedTerm() { if (normalizedString==null) {normalizedString = Normalizer.normalize(term, Normalizer.Form.NFD);} return normalizedString; } public String getTerm() { return term; } @Override public String toString() { return "(doc "+docIndex+") "+term+": "+rawFreq+" ("+relativeFreq+")"; } public float getRelativeFrequency() { return relativeFreq; } public float getZscore() { return zscore; } public int getDocumentIndex() { return docIndex; } public int[] getOffsets() { return offsets; } public int[] getPositions() { return positions; } public int[] getRawDistributions(int bins) { if (positions==null || bins ==0) return new int[0]; int[] distributions = new int[bins]; for(int position : positions) { distributions[(int) (position*bins/totalTermsCount)]++; } return distributions; } public float[] getRelativeDistributions(int bins) { if (positions==null || bins ==0) return new float[0]; if (relativeDistributionsMap.containsKey(bins)) { return relativeDistributionsMap.get(bins); } else { int[] rawDistributions = getRawDistributions(bins); float[] distributions = new float[bins]; for (int i=0, len = rawDistributions.length; i<len; i++) { distributions[i] = (float) rawDistributions[i] / totalTermsCount; } relativeDistributionsMap.put(bins, distributions); return distributions; } } public static Comparator<DocumentTerm> getComparator(Sort sort) { switch (sort) { case RAWFREQASC: return RawFrequencyAscendingComparator; case TERMASC: return TermAscendingComparator; case TERMDESC: return TermDescendingComparator; case RAWFREQDESC: return RawFrequencyDescendingComparator; case RELATIVEFREQASC: return RelativeFrequencyAscendingComparator; case TFIDFASC: return TfIdfAscendingComparator; case TFIDFDESC: return TfIdfDescendingComparator; case ZSCOREASC: return ZscoreAscendingComparator; case ZSCOREDESC: return ZscoreDescendingComparator; default: // relativeDesc return RelativeFrequencyDescendingComparator; } } private static Comparator<DocumentTerm> TermAscendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term2.getTerm().equals(term1.getTerm())) { if (term1.relativeFreq==term2.relativeFreq) { if (term1.rawFreq==term2.rawFreq) { // return Integer.compare(term2.docIndex, term1.docIndex); } else { return Integer.compare(term1.rawFreq, term2.rawFreq); } } else { return Float.compare(term1.relativeFreq, term2.relativeFreq); } } else { return term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm()); } } }; private static Comparator<DocumentTerm> TermDescendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term2.getTerm().equals(term1.getTerm())) { if (term1.relativeFreq==term2.relativeFreq) { if (term1.rawFreq==term2.rawFreq) { // return Integer.compare(term2.docIndex, term1.docIndex); } else { return Integer.compare(term1.rawFreq, term2.rawFreq); } } else { return Float.compare(term1.relativeFreq, term2.relativeFreq); } } else { return term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm()); } } }; private static Comparator<DocumentTerm> RawFrequencyDescendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term1.rawFreq==term2.rawFreq) { if (term1.relativeFreq==term2.relativeFreq) { if (term2.getTerm().equals(term1.getTerm())) { return Integer.compare(term2.docIndex, term1.docIndex); } else { return term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm()); } } else { return Float.compare(term2.relativeFreq, term1.relativeFreq); } } else { return term1.rawFreq - term2.rawFreq; } } }; private static Comparator<DocumentTerm> RawFrequencyAscendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term1.rawFreq==term2.rawFreq) { if (term1.relativeFreq==term2.relativeFreq) { if (term2.getTerm().equals(term1.getTerm())) { return Integer.compare(term2.docIndex, term1.docIndex); } else { return term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm()); } } else { return Float.compare(term1.relativeFreq, term2.relativeFreq); } } else { return term2.rawFreq - term1.rawFreq; } } }; private static Comparator<DocumentTerm> RelativeFrequencyAscendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term1.relativeFreq==term2.relativeFreq) { if (term1.rawFreq==term2.rawFreq) { if (term2.getTerm().equals(term1.getTerm())) { return Integer.compare(term2.docIndex, term1.docIndex); } else { return term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm()); } } else { return Integer.compare(term1.rawFreq, term2.rawFreq); } } else { return Float.compare(term1.relativeFreq, term2.relativeFreq); } } }; private static Comparator<DocumentTerm> RelativeFrequencyDescendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { if (term1.relativeFreq==term2.relativeFreq) { if (term1.rawFreq==term2.rawFreq) { if (term2.getTerm().equals(term1.getTerm())) { return Integer.compare(term2.docIndex, term1.docIndex); } else { return term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm()); } } else { return Integer.compare(term1.rawFreq, term2.rawFreq); } } else { return Float.compare(term2.relativeFreq, term1.relativeFreq); } } }; private static Comparator<DocumentTerm> TfIdfDescendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { float f1 = term1.getTfIdf(); float f2 = term2.getTfIdf(); if (f1==f2) { return TermAscendingComparator.compare(term1, term2); } else { return Float.compare(f2, f1); } } }; private static Comparator<DocumentTerm> TfIdfAscendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { float f1 = term1.getTfIdf(); float f2 = term2.getTfIdf(); if (f1==f2) { return TermAscendingComparator.compare(term1, term2); } else { return Float.compare(f1, f2); } } }; private static Comparator<DocumentTerm> ZscoreDescendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { float f1 = term1.getZscore(); float f2 = term2.getZscore(); if (f1==f2) { return TermAscendingComparator.compare(term1, term2); } else { return Float.compare(f2, f1); } } }; private static Comparator<DocumentTerm> ZscoreAscendingComparator = new Comparator<DocumentTerm>() { @Override public int compare(DocumentTerm term1, DocumentTerm term2) { float f1 = term1.getZscore(); float f2 = term2.getZscore(); if (f1==f2) { return TermAscendingComparator.compare(term1, term2); } else { return Float.compare(f1, f2); } } }; public int getTotalTermsCount() { return totalTermsCount; } public int getDocIndex() { return docIndex; } public String getDocId() { return docId; } public float getZscoreRatio() { if (corpusTermMinimal!=null && Float.isNaN(zscoreRatio)) { float corpusZscore = corpusTermMinimal.getZscore(); if (zscore!=0 && corpusZscore!=0) { zscoreRatio = zscore > corpusZscore ? zscore / corpusZscore : -(corpusZscore / zscore); } } return zscoreRatio; } public float getTfIdf() { if (corpusTermMinimal!=null && Float.isNaN(tfidf)) { int inDocuments = corpusTermMinimal.getInDocumentsCount(); if (inDocuments>0) { this.tfidf = ((float) rawFreq / (float) totalTermsCount) * (float) Math.log10((float) corpusTermMinimal.getDocumentsCount() / (float) inDocuments); } } return tfidf; } }