/** ** EnrichmentMap Cytoscape Plugin ** ** Copyright (c) 2008-2009 Bader Lab, Donnelly Centre for Cellular and Biomolecular ** Research, University of Toronto ** ** Contact: http://www.baderlab.org ** ** Code written by: Ruth Isserlin ** Authors: Daniele Merico, Ruth Isserlin, Oliver Stueker, Gary D. Bader ** ** This library is free software; you can redistribute it and/or modify it ** under the terms of the GNU Lesser General Public License as published ** by the Free Software Foundation; either version 2.1 of the License, or ** (at your option) any later version. ** ** This library is distributed in the hope that it will be useful, but ** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF ** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. The software and ** documentation provided hereunder is on an "as is" basis, and ** University of Toronto ** has no obligations to provide maintenance, support, updates, ** enhancements or modifications. In no event shall the ** University of Toronto ** be liable to any party for direct, indirect, special, ** incidental or consequential damages, including lost profits, arising ** out of the use of this software and its documentation, even if ** University of Toronto ** has been advised of the possibility of such damage. ** See the GNU Lesser General Public License for more details. ** ** You should have received a copy of the GNU Lesser General Public License ** along with this library; if not, write to the Free Software Foundation, ** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. ** **/ // $Id$ // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // $HeadURL$ package org.baderlab.csplugins.enrichmentmap.parsers; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import org.baderlab.csplugins.enrichmentmap.model.EMDataSet; import org.baderlab.csplugins.enrichmentmap.model.EMDataSet.Method; import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap; import org.baderlab.csplugins.enrichmentmap.model.Rank; import org.baderlab.csplugins.enrichmentmap.model.Ranking; import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor; import org.cytoscape.work.AbstractTask; import org.cytoscape.work.TaskMonitor; /** * Created by User: risserlin Date: May 1, 2009 Time: 9:10:22 AM * <p> * Task to parse ranks file <br> * There are multiple potential rank file formats: <br> * GSEA input rnk file - a two column file with genes and their specified rank * represented as a double, commented lines have a # at the line start. GSEA * output rank files (xls file) - a five column file with genes and specified * rank but also have three bland columns. * */ public class RanksFileReaderTask extends AbstractTask { private String RankFileName; private EMDataSet dataset; private String ranks_name; //distinguish between load from enrichment map input panel and heatmap interface private boolean loadFromHeatmap = false; /** * Class constructor * * @param rankFileName - file name of ranks file * @param dataset - which dataset is this rank file related to (dataset 1 or dataset 2) */ public RanksFileReaderTask(String rankFileName, EMDataSet dataset, boolean loadFromHeatmap) { this.RankFileName = rankFileName; this.dataset = dataset; this.loadFromHeatmap = loadFromHeatmap; } /** * Class constructor - curent task monitor specified. * * @param rankFileName - file name of ranks file * @param dataset - which dataset is this rank file related to (dataset 1 or dataset 2) */ public RanksFileReaderTask(String rankFileName, EMDataSet dataset, String ranks_name, boolean loadFromHeatmap) { RankFileName = rankFileName; this.ranks_name = ranks_name; this.dataset = dataset; this.loadFromHeatmap = loadFromHeatmap; } /** * Class constructor - for late loaded rank file that aren't specific to a dataset. * * @param params - enrichment map parameters for current map * @param rankFileName - file name of ranks file * @param ranks_name - name of rankings to be used in heat map drop down to refer to it. */ public RanksFileReaderTask(String rankFileName, String ranks_name, boolean loadFromHeatmap) { this.RankFileName = rankFileName; this.ranks_name = ranks_name; this.loadFromHeatmap = loadFromHeatmap; } /** * parse the rank file */ public void parse(TaskMonitor taskMonitor) throws IOException { if(taskMonitor == null) taskMonitor = new NullTaskMonitor(); List<String> lines = DatasetLineParser.readLines(RankFileName); int lineNumber = 0; int currentProgress = 0; int maxValue = lines.size(); taskMonitor.setStatusMessage("Parsing Rank file - " + maxValue + " rows"); EnrichmentMap map = dataset.getMap(); // we don't know the number of scores in the rank file yet, but it can't be more than the number of lines. Double[] score_collector = new Double[lines.size()]; boolean gseaDefinedRanks = false; Map<Integer, Rank> ranks = new HashMap<>(); /* * there are two possible Rank files: If loaded through the rpt file the * file is the one generated by GSEA and will have 5 columns (name, * description, empty,empty,score) If the user loaded it through the * generic of specifying advanced options then it will 2 columns * (name,score). The score in either case should be a double and the * name a string so check for either option. */ int nScores = 0; //number of found scores for(int i = 0; i < lines.size(); i++) { String line = lines.get(i); //check to see if the line is commented out and should be ignored. if(line.startsWith("#")) { // look for ranks_name in comment line e.g.: "# Ranks Name : My Ranks" if(Pattern.matches("^# *Ranks[ _-]?Name *:.+", line)) { this.ranks_name = line.split(":", 2)[1]; while(this.ranks_name.startsWith(" ")) this.ranks_name = this.ranks_name.substring(1); } //ignore comment line continue; } String[] tokens = line.split("\t"); String name = tokens[0].toUpperCase(); double score = 0; //if there are 5 columns in the data then the rank is the last column if(tokens.length == 5) { //ignore rows where the expected rank value is not a valid double try { //gseaDefinedRanks = true; score = Double.parseDouble(tokens[4]); } catch(NumberFormatException nfe) { if(lineNumber == 0) { lineNumber++; continue; } else throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number"); } nScores++; } //if there are 2 columns in the data then the rank is the 2 column else if(tokens.length == 2) { try { score = Double.parseDouble(tokens[1]); } catch(NumberFormatException nfe) { if(lineNumber == 0) { lineNumber++; continue; } else throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number"); } nScores++; } else { System.out.println("Invalid number of tokens line of Rank File (should be 5 or 2)"); //skip invalid line continue; } if((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap)) gseaDefinedRanks = true; //add score to array of scores score_collector[nScores - 1] = score; //check to see if the gene is in the genelist Integer genekey = map.getHashFromGene(name); if(genekey != null) { Rank current_ranking; //if their were 5 tokens in the rank file then the assumption //is that this is a GSEA rank file and the order of the scores //is indicative of the rank //TODO: need a better way of defining GSEA or user defined rank files. //Ticket #189 if the user uses the rnk file from the of the edb directory insteaad of 5 column // rank file from the main directory (as entered by rpt load) the leading edge // is calculated wrong because it only has 2 columns but still needs to have the ranks defined // based on the order of the scores. // Making the assumption that all rank files loaded for GSEA results from EM input panel are leading // edge compatible files. if((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap)) { current_ranking = new Rank(name, score, nScores); } else { current_ranking = new Rank(name, score); } ranks.put(genekey, current_ranking); } // Calculate Percentage. This must be a value between 0..100. int percentComplete = (int) (((double) currentProgress / maxValue) * 100); taskMonitor.setProgress(percentComplete); currentProgress++; } //the none of the genes are in the gene list if(ranks.isEmpty()) { throw new IllegalThreadStateException("None of the genes in the rank file are found in the expression file. Make sure the identifiers of the two files match."); } //remove Null values from collector Double[] sort_scores = new Double[nScores]; double[] scores = new double[nScores]; for(int i = 0; i < nScores; i++) { sort_scores[i] = score_collector[i]; scores[i] = (double) score_collector[i]; } //after we have loaded in all the scores, sort the score to compute ranks //create hash of scores to ranks. HashMap<Double, Integer> score2ranks = new HashMap<Double, Integer>(); //sorts the array in descending order Arrays.sort(sort_scores, Collections.reverseOrder()); //check to see if they are p-values (if the values are between -1 and 1 , for a signed pvalue) //this will actually give a weird sorting behaviour if the scores are actually not p-values and //just signed statistics for instance as it will sort them in the opposite direction. if(sort_scores[0] <= 1 && sort_scores[sort_scores.length - 1] >= -1) Arrays.sort(sort_scores); for(int j = 0; j < sort_scores.length; j++) { //check to see if this score is already enter if(!score2ranks.containsKey(sort_scores[j])) score2ranks.put(sort_scores[j], j); } //update scores Hash to contain the ranks as well. //only update the ranks if we haven't already defined them using order of scores in file if(!gseaDefinedRanks) { for(Iterator<Integer> k = ranks.keySet().iterator(); k.hasNext();) { Integer gene_key = k.next(); Rank current_ranking = ranks.get(gene_key); Integer rank = score2ranks.get(current_ranking.getScore()); current_ranking.setRank(rank); // update rank2gene and gene2score as well } } //check to see if some of the dataset genes are not in this rank file Set<Integer> current_genes = dataset.getDataSetGenes(); Set<Integer> current_ranks = ranks.keySet(); //intersect the genes with the ranks. only retain the genes that have ranks. Set<Integer> intersection = new HashSet<>(current_genes); intersection.retainAll(current_ranks); //see if there more genes than there are ranks if(!(intersection.size() == current_genes.size())) { //JOptionPane.showMessageDialog(Cytoscape.getDesktop(),"Ranks for some of the genes/proteins listed in the expression file are missing. \n These genes/proteins will be excluded from ranked listing in the heat map."); } //create a new Ranking Ranking new_ranking = new Ranking(); ranks.forEach(new_ranking::addRank); //add the Ranks to the expression file ranking dataset.getExpressionSets().addRanks(ranks_name, new_ranking); } @Override public void run(TaskMonitor taskMonitor) throws Exception { taskMonitor.setTitle("Parsing Ranks file"); parse(taskMonitor); } }