ParseGenericEnrichmentResults.java example

Explorer
EnrichmentMapApp-master
- EnrichmentMapApp-develop
  - EnrichmentMapIntegrationTest
    - src
      - test
        java
        org
        baderlab
        csplugins
        enrichmentmap
        integration
        BaseIntegrationTest.java
        EdgeSimilarities.java
        NullTaskMonitor.java
        PaxExamConfiguration.java
        SerialTestTaskManager.java
        SessionFile.java
        SimilarityKey.java
        TestUtils.java
        task
        CreateEnrichmentMapTaskTest.java
        LegacySessionLoadTest.java
        Protocol1Test.java
  - EnrichmentMapPlugin
    - src
package org.baderlab.csplugins.enrichmentmap.parsers;

import java.util.List;
import java.util.Map;

import org.baderlab.csplugins.enrichmentmap.model.EMDataSet;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult;
import org.baderlab.csplugins.enrichmentmap.model.GeneSet;
import org.baderlab.csplugins.enrichmentmap.model.GenericResult;
import org.baderlab.csplugins.enrichmentmap.model.SetOfEnrichmentResults;
import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor;
import org.cytoscape.work.TaskMonitor;

import com.google.common.collect.ImmutableSet;

/**
 * Line parser for tab-delimited "generic" enrichment result files.
 * <p>
 * The first line is treated as a header and skipped. Every following line is
 * read as a row with these columns:
 * <ol>
 * <li>gene set name (upper-cased and trimmed)</li>
 * <li>description (upper-cased)</li>
 * <li>nominal p-value (optional, defaults to 1.0)</li>
 * <li>FDR q-value (optional, defaults to 1.0)</li>
 * <li>phenotype: text matching one of the data set's two phenotypes, or a
 * signed number (optional)</li>
 * <li>comma separated list of genes; only read when the data set has no
 * gene sets of its own</li>
 * </ol>
 */
public class ParseGenericEnrichmentResults extends DatasetLineParser {
	
	public ParseGenericEnrichmentResults(EMDataSet dataset) {
		super(dataset);
	}
	
	/**
	 * Parses the given lines and stores one {@link GenericResult} per gene set
	 * name in the enrichments of the data set. When a name occurs more than
	 * once, the result with the lower p-value is retained (ticket #149).
	 *
	 * @param lines       all lines of the file, header line first
	 * @param dataset     data set whose gene sets are consulted and whose
	 *                    enrichments are filled
	 * @param taskMonitor progress sink; a {@link NullTaskMonitor} is used when null
	 * @throws NumberFormatException       if a non-empty p-value or FDR column is not a number
	 * @throws IllegalThreadStateException if the phenotype column is neither one
	 *                                     of the two phenotypes nor a number
	 */
	@Override
	public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
		if(taskMonitor == null) {
			taskMonitor = new NullTaskMonitor();
		}
		taskMonitor.setTitle("Parsing Generic Result file");

		//Get the current genesets so we can check that all the results are in the geneset list
		//and put the size of the genesets into the visual style
		Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();

		int maxValue = lines.size();
		taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");
		boolean FDR = false;

		EnrichmentMap map = dataset.getMap();
		SetOfEnrichmentResults enrichments = dataset.getEnrichments();
		Map<String, EnrichmentResult> results = enrichments.getEnrichments();
		String upPhenotype = enrichments.getPhenotype1();
		String downPhenotype = enrichments.getPhenotype2();
		
		//check to see if there are genesets.
		//if there are no genesets then populate the genesets from the generic file
		//can only do this if the 6th column has a list of genes for that geneset.
		boolean populate_gs = genesets == null || genesets.isEmpty();
		//as this is the default for gprofiler use the Description in the visual style instead of the formatted name
		//but only if there is a gmt supplied.  If using just the generic output file there is no field for description
		if(!populate_gs) {
			map.getParams().setEMgmt(true);
		}

		//skip the first line which just has the field names (start i=1).
		//The header itself is never read, so an empty list simply yields no results.
		for(int i = 1; i < maxValue; i++) {
			//TaskMonitor expects the progress as a fraction between 0.0 and 1.0
			taskMonitor.setProgress((double) (i - 1) / maxValue);

			String line = lines.get(i);
			if(line == null || line.trim().isEmpty()) {
				//blank rows carry no result
				continue;
			}

			//split() drops trailing empty fields, so the number of tokens has to be
			//checked for every row because some lines might have missing values
			String[] tokens = line.split("\t");
			int length = tokens.length;
			if(length == 0) {
				continue;
			}

			//The first column of the file is the name of the geneset
			final String name = tokens[0].toUpperCase().trim();
			final String description = length > 1 ? tokens[1].toUpperCase() : "";

			int gs_size = 0;
			if(genesets != null && genesets.containsKey(name)) {
				gs_size = genesets.get(name).getGenes().size();
			}

			//The third column is the nominal p-value
			double pvalue = 1.0;
			if(length > 2 && !tokens[2].isEmpty()) {
				pvalue = Double.parseDouble(tokens[2]);
			}

			GenericResult result;
			if(length > 3) {
				//the fourth column is the FDR q-value
				double FDRqvalue = 1.0;
				if(!tokens[3].isEmpty()) {
					FDRqvalue = Double.parseDouble(tokens[3]);
					FDR = true;
				}

				if(length > 4) {
					//the fifth column is the phenotype.
					double NES = parsePhenotype(tokens[4], upPhenotype, downPhenotype);

					//ticket#57 - adding additional column to generic format, similar to Bingo and David
					//that outlines the genes from the query that are found in the geneset and results in
					//its enrichment
					if(length > 5 && populate_gs) {
						GeneSet gs = buildGeneSet(name, description, tokens[5], map);
						gs_size = gs.getGenes().size();
						//put the new geneset back into the set (there is nowhere to store it without a map)
						if(genesets != null) {
							genesets.put(name, gs);
						}
					}
					result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue, NES);
				} else {
					result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);
				}
			} else {
				result = new GenericResult(name, description, pvalue, gs_size);
			}

			//check to see if the gene set has already been entered in the results
			//it is possible that one geneset will be in both phenotypes.
			//if it already exists then we want to make sure the one retained is the result with the
			//lower p-value.
			//ticket #149
			GenericResult existing = (GenericResult) results.get(name);
			if(existing == null || result.getPvalue() < existing.getPvalue()) {
				results.put(name, result);
			}
		}

		if(FDR) {
			map.getParams().setFDR(true);
		}
	}

	/**
	 * Converts the phenotype column into a signed score.
	 * The column can either be text specifying the phenotype or a signed number.
	 * Text has to match one of the user specified phenotypes (case-insensitive);
	 * for a number the only important part is the sign.
	 *
	 * @return 1.0 for an empty column or the first phenotype, -1.0 for the
	 *         second phenotype, otherwise the parsed number
	 */
	private static double parsePhenotype(String token, String upPhenotype, String downPhenotype) {
		if(token.isEmpty() || token.equalsIgnoreCase(upPhenotype)) {
			return 1.0;
		}
		if(token.equalsIgnoreCase(downPhenotype)) {
			return -1.0;
		}
		//try and see if the user has specified the phenotype as a number
		try {
			return Double.parseDouble(token);
		} catch(NumberFormatException nfe) {
			//exception type and message are kept as they were for existing callers;
			//the parse failure is attached as the cause
			IllegalThreadStateException invalid = new IllegalThreadStateException(token
					+ " is not a valid phenotype.  Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number.");
			invalid.initCause(nfe);
			throw invalid;
		}
	}

	/**
	 * Builds a gene set from a comma separated gene list. Each gene is trimmed
	 * and upper-cased; genes the map does not contain yet are added to it, and
	 * empty entries are ignored.
	 */
	private static GeneSet buildGeneSet(String name, String description, String geneField, EnrichmentMap map) {
		ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();

		for(String token : geneField.split(",")) {
			String gene = token.trim().toUpperCase();

			//Check to see if the gene is already in the hashmap of genes
			//if it is already in the hash then get its associated key and put it into the set of genes
			if(map.containsGene(gene)) {
				builder.add(map.getHashFromGene(gene));
			} else if(!gene.isEmpty()) {
				Integer hash = map.addGene(gene).get();
				builder.add(hash);
			}
		}

		return new GeneSet(name, description, builder.build());
	}

}