package org.baderlab.csplugins.enrichmentmap.parsers; import java.util.List; import java.util.Map; import org.baderlab.csplugins.enrichmentmap.model.EMDataSet; import org.baderlab.csplugins.enrichmentmap.model.EMCreationParameters; import org.baderlab.csplugins.enrichmentmap.model.EMCreationParameters.GreatFilter; import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap; import org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult; import org.baderlab.csplugins.enrichmentmap.model.GeneSet; import org.baderlab.csplugins.enrichmentmap.model.GenericResult; import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor; import org.cytoscape.work.TaskMonitor; import com.google.common.collect.ImmutableSet; public class ParseGREATEnrichmentResults extends DatasetLineParser { public ParseGREATEnrichmentResults(EMDataSet dataset) { super(dataset); } @Override public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) { if(taskMonitor == null) taskMonitor = new NullTaskMonitor(); taskMonitor.setTitle("Parsing Enrichment Result file"); boolean hasBackground = false; EMCreationParameters params = dataset.getMap().getParams(); //Get the type of filter user specified on the GREAT results //If it is hyper use column 14 Hypergeometric p-value and 16 FDR for hyper //If it is binom use column 5 bionomial p-value and 7 FDR for binom //If they specify both use the highest p-value and q-value from the above columns GreatFilter filterType = dataset.getMap().getParams().getGreatFilter(); Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets(); EnrichmentMap map = dataset.getMap(); Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments(); int currentProgress = 0; int maxValue = lines.size(); taskMonitor.setStatusMessage("Parsing Great Results file - " + maxValue + " rows"); //for great files there is an FDR dataset.getMap().getParams().setFDR(true); //skip the first l9 which just has the field names (start i=1) //check to see how many columns the data has //go through each line until we find the header line int k = 0; String line = lines.get(k); String[] tokens = line.split("\t"); for(; k < lines.size(); k++) { line = lines.get(k); tokens = line.split("\t"); int length = tokens.length; if((length == 24) && tokens[3].equalsIgnoreCase("BinomRank")) { break; } //If GREAT is done with a background set then the table looks different //There is not binom rank and no binomial data. else if((length == 20) && tokens[3].equalsIgnoreCase("Rank")) { hasBackground = true; break; } } //go through the rest of the lines for(int i = k + 1; i < lines.size(); i++) { line = lines.get(i); tokens = line.split("\t"); //there are extra lines at the end of the file that should be ignored. if(!hasBackground && tokens.length != 24) continue; if(hasBackground && tokens.length != 20) continue; double pvalue = 1.0; double FDRqvalue = 1.0; GenericResult result; int gs_size = 0; double NES = 1.0; //details of export file //http://bejerano.stanford.edu/help/display/GREAT/Export //The second column of the file is the name of the geneset final String name = tokens[1].trim() + "-" + tokens[2].trim(); //the first column of the file is the description final String description = tokens[2].trim(); //when there are two different species it is possible that the gene set could //already exist in the set of genesets. if it does exist then add the genes //in this set to the geneset ImmutableSet.Builder<Integer> builder = ImmutableSet.builder(); if(genesets.containsKey(name)) builder = builder.addAll(genesets.get(name).getGenes()); String[] gene_tokens; if(!hasBackground) gene_tokens = tokens[23].split(","); else gene_tokens = tokens[18].split(","); //All subsequent fields in the list are the geneset associated with this geneset. for(int j = 0; j < gene_tokens.length; j++) { String gene = gene_tokens[j].toUpperCase(); //Check to see if the gene is already in the hashmap of genes //if it is already in the hash then get its associated key and put it into the set of genes if(map.containsGene(gene)) { builder.add(map.getHashFromGene(gene)); } else if(!gene.isEmpty()) { Integer hash = map.addGene(gene).get(); builder.add(hash); } } //finished parsing that geneset //add the current geneset to the hashmap of genesets GeneSet gs = new GeneSet(name, description, builder.build()); genesets.put(name, gs); //There are two tests run by GREAT, the binomial on regions and the hypergeometric based on genes //The first pass of results shows only those that are significant both //The user can then choose to use either or both together // //If it is hyper use column 14 Hypergeometric p-value and 16 FDR for hyper //If it is binom use column 5 bionomial p-value and 7 FDR for binom //If they specify both use the highest p-value and q-value from the above columns double hyper_pvalue = 1; double hyper_fdr = 1; double binom_pvalue = 1; double binom_fdr = 1; if(!hasBackground) { if(!tokens[4].equalsIgnoreCase("")) binom_pvalue = Double.parseDouble(tokens[4]); if(!tokens[6].equalsIgnoreCase("")) binom_fdr = Double.parseDouble(tokens[6]); if(!tokens[13].equalsIgnoreCase("")) hyper_pvalue = Double.parseDouble(tokens[13]); if(!tokens[15].equalsIgnoreCase("")) hyper_fdr = Double.parseDouble(tokens[15]); } else { if(!tokens[4].equalsIgnoreCase("")) hyper_pvalue = Double.parseDouble(tokens[4]); if(!tokens[6].equalsIgnoreCase("")) hyper_fdr = Double.parseDouble(tokens[6]); } if(filterType == GreatFilter.HYPER) { pvalue = hyper_pvalue; FDRqvalue = hyper_fdr; } else if(filterType == GreatFilter.BINOM) { pvalue = binom_pvalue; FDRqvalue = binom_fdr; } else if(filterType == GreatFilter.BOTH) { pvalue = Math.max(hyper_pvalue, binom_pvalue); FDRqvalue = Math.max(hyper_fdr, binom_fdr); } else if(filterType == GreatFilter.EITHER) { pvalue = Math.min(hyper_pvalue, binom_pvalue); FDRqvalue = Math.min(hyper_fdr, binom_fdr); } else { System.out.println("Invalid attribute setting for GREAT p-value specification"); } //Keep track of minimum p-value to better calculate jslider if(pvalue < params.getPvalueMin()) params.setPvalueMin(pvalue); if(FDRqvalue < params.getQvalueMin()) params.setQvalueMin(FDRqvalue); //the Count is the size of the geneset - not restricted to the genes of interest //the 20th column total genes, a.k.a "annotation count" (K) //If this is a background set then it is in the 16th column if((!hasBackground) && (!tokens[19].equalsIgnoreCase(""))) gs_size = Integer.parseInt(tokens[19]); else if((hasBackground) && (!tokens[15].equalsIgnoreCase(""))) gs_size = Integer.parseInt(tokens[15]); result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue); // Calculate Percentage. This must be a value between 0..100. int percentComplete = (int) (((double) currentProgress / maxValue) * 100); taskMonitor.setProgress(percentComplete); currentProgress++; //check to see if the gene set has already been entered in the results //it is possible that one geneset will be in both phenotypes. //if it is already exists then we want to make sure the one retained is the result with the //lower p-value. //ticket #149 GenericResult temp = (GenericResult) results.get(name); if(temp == null) results.put(name, result); else { if(result.getPvalue() < temp.getPvalue()) results.put(name, result); } } } }