package org.voyanttools.trombone.model;
import java.io.Serializable;
import java.text.Normalizer;
import java.util.Comparator;
import java.util.Locale;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.voyanttools.trombone.util.FlexibleParameters;
import com.thoughtworks.xstream.annotations.XStreamOmitField;
/**
 * A term and its frequency information: the raw frequency, the number of
 * documents it occurs in, and optionally raw and relative frequency
 * distributions from which skewness and peakedness (kurtosis) are derived.
 *
 * <p>Instances are {@link Serializable}. The cached normalized term and the
 * relative-frequency statistics are transient and are not part of the
 * serialized form. No explicit {@code serialVersionUID} is declared, so only
 * private members are added here to leave the default computed value alone.
 */
public class CorpusTerm implements Serializable {

    /**
     * The sort orders supported by {@link CorpusTerm#getComparator(Sort)}.
     */
    public enum Sort {
        INDOCUMENTSCOUNTASC, INDOCUMENTSCOUNTDESC, RAWFREQASC, RAWFREQDESC, TERMASC, TERMDESC, RELATIVEPEAKEDNESSASC, RELATIVEPEAKEDNESSDESC, RELATIVESKEWNESSASC, RELATIVESKEWNESSDESC, COMPARISONCORPUSRELATIVEFREQASC, COMPARISONCORPUSRELATIVEFREQDESC;

        /**
         * Resolves a sort order from the {@code sort} and {@code dir}
         * parameters without ever failing on unknown values.
         *
         * <p>The {@code sort} value is matched case-insensitively by prefix
         * and defaults to raw frequency. The direction is taken only from the
         * {@code dir} value ({@code ASC} if it ends with "ASC", otherwise
         * {@code DESC}); any direction suffix on {@code sort} itself is
         * ignored.
         *
         * @param parameters the parameters to read {@code sort} and {@code dir} from
         * @return the matching sort order, never {@code null}
         */
        public static Sort getForgivingly(FlexibleParameters parameters) {
            // Locale.ROOT keeps the upper-casing independent of the default
            // locale (e.g. Turkish maps 'i' to a dotted capital I, which would
            // break the prefix checks below).
            String sort = parameters.getParameterValue("sort", "").toUpperCase(Locale.ROOT);
            String sortPrefix = "RAWFREQ"; // default
            if (sort.startsWith("TERM")) {
                sortPrefix = "TERM";
            } else if (sort.startsWith("INDOCUMENTSCOUNT")) {
                sortPrefix = "INDOCUMENTSCOUNT";
            } else if (sort.startsWith("RELATIVEPEAK")) {
                sortPrefix = "RELATIVEPEAKEDNESS";
            } else if (sort.startsWith("RELATIVESKEW")) {
                sortPrefix = "RELATIVESKEWNESS";
            } else if (sort.startsWith("COMPARISONCORPUSRELATIVEFREQ")) {
                sortPrefix = "COMPARISONCORPUSRELATIVEFREQ";
            }
            String dir = parameters.getParameterValue("dir", "").toUpperCase(Locale.ROOT);
            String dirSuffix = "DESC"; // default
            if (dir.endsWith("ASC")) {
                dirSuffix = "ASC";
            }
            return valueOf(sortPrefix + dirSuffix);
        }

        /**
         * @return {@code true} for the peakedness and skewness sort orders,
         *         {@code false} for all others
         */
        public boolean needDistributions() {
            return this == RELATIVEPEAKEDNESSASC || this == RELATIVEPEAKEDNESSDESC
                    || this == RELATIVESKEWNESSASC || this == RELATIVESKEWNESSDESC;
        }
    }

    private String term;
    private int rawFreq;
    private int totalTokens;
    private int[] rawFreqs;
    private int inDocumentsCount;
    private int totalDocuments;
    private float relativePeakedness = Float.NaN;
    private float relativeSkewness = Float.NaN;
    private float comparisonCorpusRelativeFrequencyDifference = Float.NaN;

    /** Lazily computed NFD form of {@link #term}, used by the comparators. */
    @XStreamOmitField
    private transient String normalizedString = null;

    /** Relative frequencies (possibly re-binned); {@code null} when none were supplied. */
    @XStreamOmitField
    private transient DescriptiveStatistics relativeStats = null;

    /**
     * Creates a term without any distribution information.
     *
     * @param termString the term
     * @param rawFreq the raw frequency of the term
     * @param totalTokens the token total used as the denominator of the relative frequency
     * @param inDocumentsCount the number of documents the term occurs in
     * @param totalDocuments the total number of documents
     */
    public CorpusTerm(String termString, int rawFreq, int totalTokens, int inDocumentsCount, int totalDocuments) {
        this(termString, rawFreq, totalTokens, inDocumentsCount, totalDocuments, null, null, 0);
    }

    /**
     * Stores the difference between this term's relative frequency and the
     * given relative frequency from a comparison corpus.
     *
     * @param comparisonCorpusRelativeFrequency the relative frequency to subtract
     */
    public void setComparisonRelativeFrequency(float comparisonCorpusRelativeFrequency) {
        this.comparisonCorpusRelativeFrequencyDifference = this.getRelativeFrequency() - comparisonCorpusRelativeFrequency;
    }

    /**
     * @return this term's relative frequency minus the comparison relative
     *         frequency, or {@code NaN} if none has been set
     */
    public float getComparisonCorpusRelativeFrequencyDifference() {
        return comparisonCorpusRelativeFrequencyDifference;
    }

    /**
     * Creates a term, optionally with raw and relative frequency
     * distributions that are re-binned into {@code bins} segments.
     *
     * <p>Raw frequencies are kept as given (same array, not copied) when they
     * are {@code null} or empty, when {@code bins} is 0, or when their length
     * already equals {@code bins}; otherwise they are summed into
     * {@code bins} segments. Relative frequencies are ignored when they are
     * {@code null} or empty or when {@code bins} is 0; they are used as given
     * when their length equals {@code bins}, and otherwise averaged per
     * segment (segments that receive no value count as 0).
     *
     * @param termString the term
     * @param rawFreq the raw frequency of the term
     * @param totalTokens the token total used as the denominator of the relative frequency
     * @param inDocumentsCount the number of documents the term occurs in
     * @param totalDocuments the total number of documents
     * @param rawFreqs the raw frequency distribution, may be {@code null}
     * @param relativeFreqs the relative frequency distribution, may be {@code null}
     * @param bins the number of segments to re-bin the distributions into, or 0 for no re-binning
     */
    public CorpusTerm(String termString, int rawFreq, int totalTokens, int inDocumentsCount, int totalDocuments, int[] rawFreqs, float[] relativeFreqs, int bins) {
        this.term = termString;
        this.rawFreq = rawFreq;
        this.totalTokens = totalTokens;
        this.inDocumentsCount = inDocumentsCount;
        this.totalDocuments = totalDocuments;

        if (rawFreqs == null || rawFreqs.length == 0 || bins == 0 || rawFreqs.length == bins) {
            this.rawFreqs = rawFreqs;
        } else {
            this.rawFreqs = new int[bins];
            for (int position = 0, len = rawFreqs.length; position < len; position++) {
                this.rawFreqs[binIndex(position, bins, len)] += rawFreqs[position];
            }
        }

        if (relativeFreqs == null || relativeFreqs.length == 0 || bins == 0) {
            // do nothing, keep stats as null
            // NOTE(review): with bins==0 the raw frequencies above are kept but
            // the relative frequencies are dropped; the original second branch
            // also tested bins==0 (unreachable), so the intent is unclear.
            // Existing behaviour is preserved here - confirm which is wanted.
        } else if (bins == relativeFreqs.length) {
            this.relativeStats = new DescriptiveStatistics(relativeFreqs.length);
            for (float f : relativeFreqs) {
                relativeStats.addValue(f);
            }
        } else {
            SummaryStatistics[] stats = new SummaryStatistics[bins];
            // Iterate over relativeFreqs itself: rawFreqs may be null or have a
            // different length than the relative distribution.
            for (int position = 0, len = relativeFreqs.length; position < len; position++) {
                int pos = binIndex(position, bins, len);
                if (stats[pos] == null) {
                    stats[pos] = new SummaryStatistics();
                }
                stats[pos].addValue(relativeFreqs[position]);
            }
            this.relativeStats = new DescriptiveStatistics(bins);
            for (SummaryStatistics stat : stats) {
                relativeStats.addValue(stat == null ? 0 : stat.getMean());
            }
        }
    }

    /**
     * Creates a term without distributions from a {@link CorpusTermMinimal}.
     *
     * @param corpusTermMinimal the source of the term, raw frequency and document counts
     * @param totalTokens the token total used as the denominator of the relative frequency
     */
    public CorpusTerm(CorpusTermMinimal corpusTermMinimal, int totalTokens) {
        this(corpusTermMinimal.getTerm(), corpusTermMinimal.getRawFreq(), totalTokens, corpusTermMinimal.getInDocumentsCount(), corpusTermMinimal.getDocumentsCount(), null, null, 0);
    }

    /** Maps a position in a distribution of {@code len} values to one of {@code bins} segments. */
    private static int binIndex(int position, int bins, int len) {
        // Widen to long so position*bins cannot overflow before the division.
        return (int) ((long) position * bins / len);
    }

    /**
     * @return the raw frequency of the term
     */
    public int getRawFrequency() {
        return this.rawFreq;
    }

    /**
     * @return the raw frequency of the term
     * @deprecated use {@link #getRawFrequency()} instead
     */
    @Deprecated
    public int getRawFreq() {
        return this.getRawFrequency();
    }

    /**
     * @return the raw frequency divided by the total number of tokens, using
     *         float division (so a zero total yields {@code NaN} or infinity)
     */
    public float getRelativeFrequency() {
        return (float) rawFreq / (float) totalTokens;
    }

    /**
     * @return the relative frequency of the term
     * @deprecated use {@link #getRelativeFrequency()} instead
     */
    @Deprecated
    public float getRelativeFreq() {
        return this.getRelativeFrequency();
    }

    /** Returns the NFD-normalized term, computing and caching it on first use. */
    private String getNormalizedTerm() {
        if (normalizedString == null) {
            normalizedString = Normalizer.normalize(term, Normalizer.Form.NFD);
        }
        return normalizedString;
    }

    /**
     * @return the term as supplied to the constructor
     */
    public String getTerm() {
        return term;
    }

    /**
     * @return the kurtosis of the relative frequency distribution, computed
     *         on first use, or {@code NaN} if there is no distribution
     */
    public float getPeakedness() {
        if (Float.isNaN(relativePeakedness) && relativeStats != null) {
            relativePeakedness = (float) relativeStats.getKurtosis();
        }
        return relativePeakedness;
    }

    /**
     * @return the skewness of the relative frequency distribution, computed
     *         on first use, or {@code NaN} if there is no distribution
     */
    public float getSkewness() {
        if (Float.isNaN(relativeSkewness) && relativeStats != null) {
            relativeSkewness = (float) relativeStats.getSkewness();
        }
        return relativeSkewness;
    }

    /**
     * @return the (possibly re-binned) raw frequency distribution; this is the
     *         internal array, and may be {@code null}
     */
    public int[] getRawDistributions() {
        return rawFreqs;
    }

    /**
     * @return a new array with the (possibly re-binned) relative frequency
     *         distribution, empty if there is none
     */
    public float[] getRelativeDistributions() {
        if (relativeStats == null || relativeStats.getN() == 0) {
            return new float[0];
        }
        float[] distributions = new float[(int) relativeStats.getN()];
        for (int position = 0, len = distributions.length; position < len; position++) {
            distributions[position] = (float) relativeStats.getElement(position); // TODO: this needs to be averaged?
        }
        return distributions;
    }

    /**
     * @return the number of documents the term occurs in
     */
    public int getInDocumentsCount() {
        return inDocumentsCount;
    }

    /**
     * Returns the comparator that implements the given sort order.
     *
     * @param sort the requested sort order
     * @return the matching comparator
     */
    public static Comparator<CorpusTerm> getComparator(Sort sort) {
        switch (sort) {
        case RAWFREQASC:
            return RawFrequencyAscendingComparator;
        case TERMASC:
            return TermAscendingComparator;
        case RELATIVEPEAKEDNESSASC:
            return RelativePeakednessAscendingComparator;
        case RELATIVEPEAKEDNESSDESC:
            return RelativePeakednessDescendingComparator;
        case RELATIVESKEWNESSASC:
            return RelativeSkewnessAscendingComparator;
        case RELATIVESKEWNESSDESC:
            return RelativeSkewnessDescendingComparator;
        case TERMDESC:
            return TermDescendingComparator;
        case INDOCUMENTSCOUNTASC:
            return InDocumentsCountAscendingComparator;
        case INDOCUMENTSCOUNTDESC:
            return InDocumentsCountDescendingComparator;
        case COMPARISONCORPUSRELATIVEFREQASC:
            return ComparisonCorpusRelativeFrequencyAscendingComparator;
        case COMPARISONCORPUSRELATIVEFREQDESC:
            return ComparisonCorpusRelativeFrequencyDescendingComparator;
        default: // rawFrequencyDesc
            return RawFrequencyDescendingComparator;
        }
    }

    /**
     * Compares two float sort keys and falls back to
     * {@link #RawFrequencyDescendingComparator} when they tie.
     *
     * <p>Two {@code NaN} keys count as a tie (so that terms without a value
     * still get a deterministic order), as do {@code 0.0f} and {@code -0.0f}.
     * Callers pass the keys swapped to obtain a descending order.
     */
    private static int compareByValueThenRawFrequency(float value1, float value2, CorpusTerm term1, CorpusTerm term2) {
        int comparison = Float.compare(value1, value2);
        if (comparison == 0 || value1 == value2) {
            return RawFrequencyDescendingComparator.compare(term1, term2);
        }
        return comparison;
    }

    // NOTE(review): under standard Comparator semantics this orders normalized
    // terms in REVERSE lexicographic order (term2 is compared to term1), and
    // TermDescendingComparator below orders them lexicographically. The
    // original behaviour is kept because callers may depend on it - confirm.
    private static final Comparator<CorpusTerm> TermAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            int i = term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm());
            if (i == 0) {
                return Integer.compare(term1.rawFreq, term2.rawFreq);
            }
            return i;
        }
    };

    // See the review note on TermAscendingComparator regarding direction.
    private static final Comparator<CorpusTerm> TermDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            int i = term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm());
            if (i == 0) {
                return Integer.compare(term1.rawFreq, term2.rawFreq);
            }
            return i;
        }
    };

    /** Higher raw frequency first; ties ordered by normalized term. */
    private static final Comparator<CorpusTerm> RawFrequencyDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            if (term1.rawFreq == term2.rawFreq) {
                return term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm());
            }
            return Integer.compare(term2.rawFreq, term1.rawFreq);
        }
    };

    /** Lower raw frequency first; ties ordered by reversed normalized term comparison. */
    private static final Comparator<CorpusTerm> RawFrequencyAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            if (term1.rawFreq == term2.rawFreq) {
                return term2.getNormalizedTerm().compareTo(term1.getNormalizedTerm());
            }
            return Integer.compare(term1.rawFreq, term2.rawFreq);
        }
    };

    /** Lower in-documents count first; ties ordered by normalized term. */
    private static final Comparator<CorpusTerm> InDocumentsCountAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            if (term1.inDocumentsCount == term2.inDocumentsCount) {
                return term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm());
            }
            return Integer.compare(term1.inDocumentsCount, term2.inDocumentsCount);
        }
    };

    /** Higher in-documents count first; ties ordered by normalized term. */
    private static final Comparator<CorpusTerm> InDocumentsCountDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            if (term1.inDocumentsCount == term2.inDocumentsCount) {
                return term1.getNormalizedTerm().compareTo(term2.getNormalizedTerm());
            }
            return Integer.compare(term2.inDocumentsCount, term1.inDocumentsCount);
        }
    };

    /** Lower peakedness first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> RelativePeakednessAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(term1.getPeakedness(), term2.getPeakedness(), term1, term2);
        }
    };

    /** Higher peakedness first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> RelativePeakednessDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(term2.getPeakedness(), term1.getPeakedness(), term1, term2);
        }
    };

    /** Higher skewness first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> RelativeSkewnessDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(term2.getSkewness(), term1.getSkewness(), term1, term2);
        }
    };

    /** Lower skewness first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> RelativeSkewnessAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(term1.getSkewness(), term2.getSkewness(), term1, term2);
        }
    };

    /** Higher comparison-corpus difference first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> ComparisonCorpusRelativeFrequencyDescendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(
                    term2.getComparisonCorpusRelativeFrequencyDifference(),
                    term1.getComparisonCorpusRelativeFrequencyDifference(),
                    term1, term2);
        }
    };

    /** Lower comparison-corpus difference first; ties fall back to raw frequency descending. */
    private static final Comparator<CorpusTerm> ComparisonCorpusRelativeFrequencyAscendingComparator = new Comparator<CorpusTerm>() {
        @Override
        public int compare(CorpusTerm term1, CorpusTerm term2) {
            return compareByValueThenRawFrequency(
                    term1.getComparisonCorpusRelativeFrequencyDifference(),
                    term2.getComparisonCorpusRelativeFrequencyDifference(),
                    term1, term2);
        }
    };

    /**
     * @return the term, its raw frequency and its relative frequency in a
     *         short human-readable form
     */
    @Override
    public String toString() {
        // NOTE(review): the opening brace is never closed; the output format is
        // left untouched in case anything relies on it.
        return "{"+term+": "+rawFreq+" ("+getRelativeFreq()+")";
    }
}