/**
* Performs enrichment on a gene list.
*
* @author Edward Y. Chen
* @since 8/2/2012
*/
package edu.mssm.pharm.maayanlab.Enrichr;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import edu.mssm.pharm.maayanlab.common.bio.EnrichedTerm;
import edu.mssm.pharm.maayanlab.common.bio.GeneSetLibrary;
import edu.mssm.pharm.maayanlab.common.bio.InputGenes;
import edu.mssm.pharm.maayanlab.common.bio.Term;
import edu.mssm.pharm.maayanlab.common.core.FileUtils;
import edu.mssm.pharm.maayanlab.common.core.Settings;
import edu.mssm.pharm.maayanlab.common.core.SettingsChanger;
import edu.mssm.pharm.maayanlab.common.math.NumberUtils;
public class Enrichment implements SettingsChanger {
// Default settings
private final Settings settings = new Settings() {
{
// String: rank the Enrichment by the Fisher Exact test's p-value, rank against the background of random genes, or combined score of the two. [combined score/p-value/rank]
set(Enrichment.SORT_BY, Enrichment.COMBINED_SCORE);
}
};
// Settings variables
public final static String SORT_BY = "sort enrichment by";
public final static String COMBINED_SCORE = "combined score";
public final static String PVALUE = "p-value";
public final static String RANK = "rank";
public static final String HEADER = "Term\tOverlap\tP-value\tAdjusted P-value\tZ-score\tCombined Score\tGenes";
private Collection<String> geneList;
private boolean isFuzzy = false;
public static void main(String[] args) {
try {
Enrichment app = new Enrichment(FileUtils.readFile(args[1]), true);
FileUtils.writeFile(args[2], HEADER, app.enrich(args[0]));
} catch (ParseException e) {
System.err.println(e.getMessage());
}
}
public Enrichment(Collection<String> geneList) {
this.isFuzzy = InputGenes.isFuzzy(geneList);
this.geneList = geneList;
}
public Enrichment(Collection<String> geneList, boolean validate) throws ParseException {
this.isFuzzy = InputGenes.isFuzzy(geneList);
if (validate) // Check if input list is valid
if (isFuzzy)
InputGenes.validateFuzzyInputGenes(geneList);
else
InputGenes.validateInputGenes(geneList);
this.geneList = geneList;
}
@Override
public void setSetting(String key, String value) {
settings.set(key, value);
}
public Collection<String> getInput() {
return geneList;
}
public ArrayList<EnrichedTerm> enrich(String backgroundType) {
return enrich(ResourceLoader.getInstance().getLibrary(backgroundType));
}
public ArrayList<EnrichedTerm> enrich(GeneSetLibrary geneSetLibrary) {
if (isFuzzy)
return fuzzyEnrich(geneSetLibrary);
HashSet<String> inputGenes = filterInputGenes(geneSetLibrary);
ArrayList<EnrichedTerm> enrichedTerms = new ArrayList<EnrichedTerm>();
for (Term currentTerm : geneSetLibrary.getTerms()) {
EnrichedTerm enrichedTerm = currentTerm.getEnrichedTerm(inputGenes);
if (enrichedTerm != null)
enrichedTerms.add(enrichedTerm);
}
sortEnrichedTerms(geneSetLibrary, enrichedTerms);
return enrichedTerms;
}
public ArrayList<EnrichedTerm> fuzzyEnrich(GeneSetLibrary geneSetLibrary) {
HashMap<String, Double> inputGenes = filterFuzzyInputGenes(geneSetLibrary);
ArrayList<EnrichedTerm> enrichedTerms = new ArrayList<EnrichedTerm>();
for (Term currentTerm : geneSetLibrary.getTerms()) {
EnrichedTerm enrichedTerm = currentTerm.getEnrichedTerm(inputGenes);
if (enrichedTerm != null)
enrichedTerms.add(enrichedTerm);
}
sortEnrichedTerms(geneSetLibrary, enrichedTerms);
return enrichedTerms;
}
// Filter genes from the input list that are not in the gene-set library
private HashSet<String> filterInputGenes(GeneSetLibrary geneSetLibrary) {
HashSet<String> inputGenes = new HashSet<String>();
for (String gene : geneList) {
gene = gene.toUpperCase();
if (geneSetLibrary.contains(gene))
inputGenes.add(gene);
}
return inputGenes;
}
private HashMap<String, Double> filterFuzzyInputGenes(GeneSetLibrary geneSetLibrary) {
HashMap<String, Double> fuzzyInputGenes = new HashMap<String, Double>(NumberUtils.roundUpDivision((geneList.size())*4, 3));
for (String genePair : geneList) {
String[] splitLine = genePair.split(",");
String gene = splitLine[0].toUpperCase();
if (geneSetLibrary.contains(gene)) {
try {
fuzzyInputGenes.put(gene, Double.parseDouble(splitLine[1]));
} catch (NumberFormatException nfe) {
fuzzyInputGenes.put(gene, 0.0);
}
}
}
return fuzzyInputGenes;
}
// Sort enriched terms
private void sortEnrichedTerms(GeneSetLibrary geneSetLibrary, ArrayList<EnrichedTerm> enrichedTerms) {
// Sort by p-value
Collections.sort(enrichedTerms);
// Calculate adjusted p-value via Benjamini-Hochberg
double previousValue = 1; // Prevent adjusted p-value to be > 1
for (int rank = enrichedTerms.size(); rank > 0; rank--) {
EnrichedTerm enrichedTerm = enrichedTerms.get(rank-1);
double adjustedPValue = Math.min(previousValue, enrichedTerm.getPValue() * enrichedTerms.size() / rank); // Ensure monotonicity
previousValue = adjustedPValue;
enrichedTerm.setAdjustedPValue(adjustedPValue);
if (geneSetLibrary.isRanked()) {
enrichedTerm.computeZScore(rank);
}
}
// Check if should calculate unadjusted score instead
boolean unadjusted = false;
for (int i = 0; i < Math.min(10, enrichedTerms.size()); i++) {
if (Math.log(enrichedTerms.get(i).getAdjustedPValue()) >= 0) {
unadjusted = true;
break;
}
}
if (geneSetLibrary.isRanked()) {
for (EnrichedTerm enrichedTerm : enrichedTerms) {
if (unadjusted) {
enrichedTerm.computeUnadjustedScore();
}
else {
enrichedTerm.computeScore();
}
}
if (settings.get(SORT_BY).equals(COMBINED_SCORE)) {
Collections.sort(enrichedTerms, new Comparator<EnrichedTerm>() {
@Override
public int compare(EnrichedTerm o1, EnrichedTerm o2) {
if (o1.getCombinedScore() < o2.getCombinedScore())
return 1;
else if (o1.getCombinedScore() > o2.getCombinedScore())
return -1;
else
return 0;
}
});
}
}
}
}