ParseDavidEnrichmentResults.java example

Explorer

EnrichmentMapApp-master
- EnrichmentMapApp-develop
  - EnrichmentMapIntegrationTest
    - src
      - test
        java
        org
        baderlab
        csplugins
        enrichmentmap
        integration
        BaseIntegrationTest.java
        EdgeSimilarities.java
        NullTaskMonitor.java
        PaxExamConfiguration.java
        SerialTestTaskManager.java
        SessionFile.java
        SimilarityKey.java
        TestUtils.java
        task
        CreateEnrichmentMapTaskTest.java
        LegacySessionLoadTest.java
        Protocol1Test.java
  - EnrichmentMapPlugin
    - src

package org.baderlab.csplugins.enrichmentmap.parsers;

import java.util.List;
import java.util.Map;

import org.baderlab.csplugins.enrichmentmap.model.EMDataSet;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap;
import org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult;
import org.baderlab.csplugins.enrichmentmap.model.GeneSet;
import org.baderlab.csplugins.enrichmentmap.model.GenericResult;
import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor;
import org.cytoscape.work.TaskMonitor;

import com.google.common.collect.ImmutableSet;

public class ParseDavidEnrichmentResults extends DatasetLineParser {
	
	public ParseDavidEnrichmentResults(EMDataSet dataset) {
		super(dataset);
	}

	/**
	 * Parse david enrichment results file
	 */
	@Override
	public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
		if(taskMonitor == null)
			taskMonitor = new NullTaskMonitor();
		taskMonitor.setTitle("Parsing David Enrichment Result file");
		
		//with David results there are no genesets defined.  first pass through the file
		// needs to parse the genesets

		//parameters that can be extracted from David files:
		//Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
		// Count = number of genes in the geneset that came from the input list, number of genes in the genelist mapping toa specific term.
		// List Total - number of genes in the gene list mapping to the category (ie. GO Cellular component)
		// Pop Hits - number of genes in the background gene list mapping to a specific term
		// Pop total - number of gene s in the background gene list mapping to the category (i.e. Go Cellular Component)

		// Column 2 is the geneset name
		// Column 1 is the category (and can be used for the description)
		// Column 6 is the list of genes (from the loaded list) in this geneset -- therefore pre-filtered.
		Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();


		EnrichmentMap map = dataset.getMap();
		Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

		int currentProgress = 0;
		int maxValue = lines.size();
		taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");

		boolean FDR = true;

		//skip the first line which just has the field names (start i=1)
		//check to see how many columns the data has
		String line = lines.get(0);
		String[] tokens = line.split("\t");
		int length = tokens.length;
		if(length != 13)
			throw new IllegalThreadStateException("David results file is missing data.");
		//not enough data in the file!!

		for(int i = 1; i < lines.size(); i++) {
			line = lines.get(i);

			tokens = line.split("\t");

			double pvalue = 1.0;
			double FDRqvalue = 1.0;
			GenericResult result;
			int gs_size = 0;
			double NES = 1.0;

			//The second column of the file is the name of the geneset
			final String name = tokens[1].toUpperCase().trim();

			//the first column of the file is the description
			final String description = tokens[0].toUpperCase();

			//when there are two different species it is possible that the gene set could
			//already exist in the set of genesets.  if it does exist then add the genes
			//in this set to the geneset
			ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
			if(genesets.containsKey(name))
				builder = builder.addAll(genesets.get(name).getGenes());

			String[] gene_tokens = tokens[5].split(", ");

			//All subsequent fields in the list are the geneset associated with this geneset.
			for(int j = 0; j < gene_tokens.length; j++) {

				String gene = gene_tokens[j].toUpperCase();
				
				//Check to see if the gene is already in the hashmap of genes
				//if it is already in the hash then get its associated key and put it into the set of genes
				if(map.containsGene(gene)) {
					builder.add(map.getHashFromGene(gene));
				}
				else if(!gene.isEmpty()) {
					Integer hash = map.addGene(gene).get();
					builder.add(hash);
				}
			}

			//finished parsing that geneset
			//add the current geneset to the hashmap of genesets
			GeneSet gs = new GeneSet(name, description, builder.build());
			genesets.put(name, gs);

			//The 5th column is the nominal p-value
			if(tokens[4].equalsIgnoreCase("")) {
				//do nothing
			} else {
				pvalue = Double.parseDouble(tokens[4]);
			}

			//the Pop hits is the size of the geneset
			//the Count is the size of the geneset (restricted by the gene list)
			if(tokens[2].equalsIgnoreCase("")) {
				//do nothing
			} else {
				gs_size = Integer.parseInt(tokens[2]);
			}

			//Use the Benjamini value for the fdr
			if(tokens[11].equalsIgnoreCase("")) {
				//do nothing
			} else {
				FDRqvalue = Double.parseDouble(tokens[11]);
			}

			result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

			// Calculate Percentage.  This must be a value between 0..100.
			int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
			taskMonitor.setProgress(percentComplete);
			currentProgress++;

			//check to see if the gene set has already been entered in the results
			//it is possible that one geneset will be in both phenotypes.
			//if it is already exists then we want to make sure the one retained is the result with the
			//lower p-value.
			//ticket #149
			GenericResult temp = (GenericResult) results.get(name);
			if(temp == null)
				results.put(name, result);
			else {
				if(result.getPvalue() < temp.getPvalue())
					results.put(name, result);
			}

		}
		if(FDR)
			dataset.getMap().getParams().setFDR(FDR);
	}

}