package org.baderlab.csplugins.enrichmentmap.parsers;

import java.util.List;
import java.util.Map;

import org.baderlab.csplugins.enrichmentmap.model.EMDataSet;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult;
import org.baderlab.csplugins.enrichmentmap.model.GeneSet;
import org.baderlab.csplugins.enrichmentmap.model.GenericResult;
import org.baderlab.csplugins.enrichmentmap.model.SetOfEnrichmentResults;
import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor;
import org.cytoscape.work.TaskMonitor;

import com.google.common.collect.ImmutableSet;

/**
 * Parses a tab-delimited "generic" enrichment result file into
 * {@link GenericResult} objects and stores them in the data set's enrichment map.
 *
 * <p>The first line is treated as a header and skipped. Each following row is read as:
 * <ol>
 *   <li>gene set name (upper-cased and trimmed)</li>
 *   <li>description (upper-cased)</li>
 *   <li>nominal p-value (defaults to 1.0 when empty or missing)</li>
 *   <li>optional FDR q-value (defaults to 1.0 when empty)</li>
 *   <li>optional phenotype: either text matching one of the data set's two phenotypes,
 *       or a number</li>
 *   <li>optional comma-separated gene list, only used when the data set has no gene sets yet</li>
 * </ol>
 */
public class ParseGenericEnrichmentResults extends DatasetLineParser {

	public ParseGenericEnrichmentResults(EMDataSet dataset) {
		super(dataset);
	}

	/**
	 * Parses the given lines and records one result per gene set name.
	 *
	 * <p>Blank lines are skipped, and columns missing from a row are treated as empty.
	 * When the same gene set name occurs more than once, the result with the lower
	 * p-value is kept (ticket #149). If any row supplies an FDR q-value, the FDR flag is
	 * set on the map parameters.
	 *
	 * @param lines       lines of the result file, header first
	 * @param dataset     data set that supplies the gene sets and receives the results
	 * @param taskMonitor progress monitor; a {@link NullTaskMonitor} is used when {@code null}
	 * @throws NumberFormatException       if a non-empty p-value or FDR q-value is not a number
	 * @throws IllegalThreadStateException if a phenotype is neither one of the configured
	 *                                     phenotypes nor a number
	 */
	@Override
	public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
		if (taskMonitor == null)
			taskMonitor = new NullTaskMonitor();
		taskMonitor.setTitle("Parsing Generic Result file");

		// Current gene sets: used to look up each result's gene set size and,
		// when empty, populated from the sixth column of the file.
		// NOTE(review): the check below tolerates a null map, but the per-row lookup
		// dereferences it - confirm getGeneSets() never returns null.
		Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();

		int maxValue = lines.size();
		taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");

		EnrichmentMap map = dataset.getMap();
		SetOfEnrichmentResults enrichments = dataset.getEnrichments();
		Map<String, EnrichmentResult> results = enrichments.getEnrichments();
		String upPhenotype = enrichments.getPhenotype1();
		String downPhenotype = enrichments.getPhenotype2();

		// Without gene sets, build them from the file's gene column. With a GMT supplied,
		// use the description in the visual style (the gProfiler default).
		boolean populateGenesets = genesets == null || genesets.isEmpty();
		if (!populateGenesets)
			map.getParams().setEMgmt(true);

		boolean hasFdr = false;

		// Start at 1: the first line only holds the field names.
		for (int i = 1; i < maxValue; i++) {
			// TaskMonitor progress is a fraction in [0.0, 1.0], not a percentage.
			taskMonitor.setProgress((double) (i - 1) / maxValue);

			String line = lines.get(i);
			if (line == null || line.trim().isEmpty())
				continue; // blank row (e.g. trailing newline): nothing to parse

			// split() drops trailing empty cells, so the column count varies per row.
			String[] tokens = line.split("\t");
			int length = tokens.length;

			final String name = columnOrEmpty(tokens, 0).toUpperCase().trim();
			final String description = columnOrEmpty(tokens, 1).toUpperCase();

			int gsSize = 0;
			GeneSet known = genesets.get(name);
			if (known != null)
				gsSize = known.getGenes().size();

			// Third column: nominal p-value; an empty or missing cell keeps the default.
			double pvalue = 1.0;
			String pvalueText = columnOrEmpty(tokens, 2);
			if (!pvalueText.isEmpty())
				pvalue = Double.parseDouble(pvalueText);

			GenericResult result;
			if (length > 3) {
				// Fourth column: FDR q-value.
				double fdrQvalue = 1.0;
				if (!tokens[3].isEmpty()) {
					fdrQvalue = Double.parseDouble(tokens[3]);
					hasFdr = true;
				}

				if (length > 4) {
					// Fifth column: phenotype, as text or as a number.
					double nes = 1.0;
					if (!tokens[4].isEmpty())
						nes = parsePhenotypeSign(tokens[4], upPhenotype, downPhenotype);

					// ticket#57 - sixth column lists the query genes found in the gene set.
					if (length > 5 && populateGenesets) {
						GeneSet gs = buildGeneSet(name, description, tokens[5], map);
						gsSize = gs.getGenes().size();
						genesets.put(name, gs);
					}
					result = new GenericResult(name, description, pvalue, gsSize, fdrQvalue, nes);
				} else {
					result = new GenericResult(name, description, pvalue, gsSize, fdrQvalue);
				}
			} else {
				result = new GenericResult(name, description, pvalue, gsSize);
			}

			// A gene set may appear under both phenotypes; keep the result with the
			// lower p-value (ticket #149).
			GenericResult previous = (GenericResult) results.get(name);
			if (previous == null || result.getPvalue() < previous.getPvalue())
				results.put(name, result);
		}

		if (hasFdr)
			map.getParams().setFDR(true);
	}

	/** Returns the cell at {@code index}, or an empty string when the row has no such column. */
	private static String columnOrEmpty(String[] tokens, int index) {
		return index < tokens.length ? tokens[index] : "";
	}

	/**
	 * Converts the phenotype cell to a score: 1.0 for the first phenotype, -1.0 for the
	 * second, otherwise the cell parsed as a number.
	 */
	private static double parsePhenotypeSign(String phenotype, String upPhenotype, String downPhenotype) {
		if (phenotype.equalsIgnoreCase(upPhenotype))
			return 1.0;
		if (phenotype.equalsIgnoreCase(downPhenotype))
			return -1.0;
		try {
			return Double.parseDouble(phenotype);
		} catch (NumberFormatException nfe) {
			// Exception type kept for backward compatibility with existing callers;
			// the parse failure is attached as the cause.
			IllegalThreadStateException invalid = new IllegalThreadStateException(phenotype + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number.");
			invalid.initCause(nfe);
			throw invalid;
		}
	}

	/**
	 * Builds a gene set from a comma-separated gene list, registering genes the map
	 * does not know yet. Empty entries are ignored unless the map already contains them.
	 */
	private static GeneSet buildGeneSet(String name, String description, String geneList, EnrichmentMap map) {
		ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
		for (String token : geneList.split(",")) {
			String gene = token.trim().toUpperCase();
			if (map.containsGene(gene)) {
				// Already registered: reuse its existing key.
				builder.add(map.getHashFromGene(gene));
			} else if (!gene.isEmpty()) {
				Integer hash = map.addGene(gene).get();
				builder.add(hash);
			}
		}
		return new GeneSet(name, description, builder.build());
	}
}