/** ** EnrichmentMap Cytoscape Plugin ** ** Copyright (c) 2008-2009 Bader Lab, Donnelly Centre for Cellular and Biomolecular ** Research, University of Toronto ** ** Contact: http://www.baderlab.org ** ** Code written by: Ruth Isserlin ** Authors: Daniele Merico, Ruth Isserlin, Oliver Stueker, Gary D. Bader ** ** This library is free software; you can redistribute it and/or modify it ** under the terms of the GNU Lesser General Public License as published ** by the Free Software Foundation; either version 2.1 of the License, or ** (at your option) any later version. ** ** This library is distributed in the hope that it will be useful, but ** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF ** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. The software and ** documentation provided hereunder is on an "as is" basis, and ** University of Toronto ** has no obligations to provide maintenance, support, updates, ** enhancements or modifications. In no event shall the ** University of Toronto ** be liable to any party for direct, indirect, special, ** incidental or consequential damages, including lost profits, arising ** out of the use of this software and its documentation, even if ** University of Toronto ** has been advised of the possibility of such damage. ** See the GNU Lesser General Public License for more details. ** ** You should have received a copy of the GNU Lesser General Public License ** along with this library; if not, write to the Free Software Foundation, ** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. ** **/ // $Id: ExpressionFileReaderTask.java 371 2009-09-25 20:24:18Z risserlin $ // $LastChangedDate: 2009-09-25 16:24:18 -0400 (Fri, 25 Sep 2009) $ // $LastChangedRevision: 371 $ // $LastChangedBy: risserlin $ // $HeadURL: svn+ssh://risserlin@server1.baderlab.med.utoronto.ca/svn/EnrichmentMap/trunk/EnrichmentMapPlugin/src/org/baderlab/csplugins/enrichmentmap/ExpressionFileReaderTask.java $ package org.baderlab.csplugins.enrichmentmap.parsers; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import org.baderlab.csplugins.enrichmentmap.model.EMDataSet; import org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap; import org.baderlab.csplugins.enrichmentmap.model.GeneExpression; import org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix; import org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor; import org.cytoscape.work.AbstractTask; import org.cytoscape.work.TaskMonitor; /** * Parse expression file. The user can also use a rank file instead of an * expression file so this class also handles reading of rank files. */ public class ExpressionFileReaderTask extends AbstractTask { private final EMDataSet dataset; /** * @param dataset - dataset expression file is associated with */ public ExpressionFileReaderTask(EMDataSet dataset) { this.dataset = dataset; } /** * Parse expression/rank file */ public GeneExpressionMatrix parse() throws IOException { return parse(null); } /** * Parse expression/rank file */ public GeneExpressionMatrix parse(TaskMonitor taskMonitor) throws IOException { if(taskMonitor == null) taskMonitor = new NullTaskMonitor(); //Need to check if the file specified as an expression file is actually a rank file //If it is a rank file it can either be 5 or 2 columns but it is important that the rank //value is extracted from the right column and placed in the expression matrix as if it //was an expression value in order for other features to work. //Also a problem with old session files that imported a rank file so it also //important to check if the file only has two columns. If it only has two columns, //check to see if the second column is a double. If it is then consider that column //expression boolean twoColumns = false; Set<Integer> datasetGenes = dataset.getDataSetGenes(); // Map<Integer,String> genes = dataset.getMap().getGenes(); EnrichmentMap map = dataset.getMap(); String expressionFileName = dataset.getExpressionSets().getFilename(); List<String> lines = DatasetLineParser.readLines(expressionFileName); int currentProgress = 0; int maxValue = lines.size(); int expressionUniverse = 0; taskMonitor.setStatusMessage("Parsing GCT file - " + maxValue + " rows"); GeneExpressionMatrix expressionMatrix = dataset.getExpressionSets(); //GeneExpressionMatrix expressionMatrix = new GeneExpressionMatrix(lines[0].split("\t")); //HashMap<Integer,GeneExpression> expression = new HashMap<Integer, GeneExpression>(); Map<Integer, GeneExpression> expression = expressionMatrix.getExpressionMatrix(); for(int i = 0; i < lines.size(); i++) { String line = lines.get(i); String[] tokens = line.split("\t"); //The first column of the file is the name of the geneset String Name = tokens[0].toUpperCase().trim(); //if this is the first line and the expression matrix if still empty and the column names are empty //Added column names empty for GSEA rank files that have no heading but after going through the loop //the first time we have given them default headings if(i == 0 && (expressionMatrix == null || expressionMatrix.getExpressionMatrix().isEmpty()) && expressionMatrix.getColumnNames() == null) { //otherwise the first line is the header if(Name.equalsIgnoreCase("#1.2")) { line = lines.get(2); i = 2; } else { line = lines.get(0); //ignore all comment lines int k = 0; while(line.startsWith("#")) { k++; line = lines.get(k); } i = k; } tokens = line.split("\t"); //check to see how many columns there are //if there are only 2 columns then we could be dealing with a ranked file //check to see if the second column contains expression values. if(tokens.length == 2) { twoColumns = true; //the assumption is the first line is the column names but //if we are loading a GSEA edb rnk file then their might not be column names try { int temp = Integer.parseInt(tokens[1]); i = -1; tokens[0] = "Name"; tokens[1] = "Rank/Score"; } catch(NumberFormatException v) { try { double temp2 = Double.parseDouble(tokens[1]); i = -1; tokens[0] = "Name"; tokens[1] = "Rank/Score"; } catch(NumberFormatException v2) { //if it isn't a double or int then we have a title line. } } } //expressionMatrix = new GeneExpressionMatrix(tokens); expressionMatrix.setColumnNames(tokens); expressionMatrix.setNumConditions(expressionMatrix.getColumnNames().length); expressionMatrix.setExpressionMatrix(expression); continue; } //Check to see if this gene is in the genes list //Currently we only load gene expression data for genes that are already in the gene list (i.e. are listed in at least one geneset) //TODO:is there the possibility that we need all the expression genes? Currently this great decreases space when saving sessions Integer genekey = map.getHashFromGene(Name); if(genekey != null) { //we want the genes hashmap and dataset genes hashmap to have the same keys so it is easier to compare. datasetGenes.add(genekey); String description = ""; //check to see if the second column is parseable if(twoColumns) { try { Double.parseDouble(tokens[1]); } catch(NumberFormatException e) { description = tokens[1]; } } else { description = tokens[1]; } GeneExpression expres = new GeneExpression(Name, description); expres.setExpression(tokens); double newMax = expres.newMax(expressionMatrix.getMaxExpression()); if(newMax != -100) expressionMatrix.setMaxExpression(newMax); double newMin = expres.newMin(expressionMatrix.getMinExpression()); if(newMin != -100) expressionMatrix.setMinExpression(newMin); double newClosest = expres.newclosesttoZero(expressionMatrix.getClosesttoZero()); if(newClosest != -100) expressionMatrix.setClosesttoZero(newClosest); expression.put(genekey, expres); } expressionUniverse++; // Calculate Percentage. This must be a value between 0..100. int percentComplete = (int) (((double) currentProgress / maxValue) * 100); taskMonitor.setProgress(percentComplete); currentProgress++; } //set the number of genes expressionMatrix.setExpressionUniverse(expressionUniverse); //row Normalize expressionset expressionMatrix.rowNormalizeMatrix(); return expressionMatrix; //TODO: intialize phenotypes associated with class files from expression file load /* * if(dataset == 1){ //set up the classes definition if it is set. * //check to see if the phenotypes were already set in the params from * a session load if(params.getTemp_class1() != null) * expressionMatrix.setPhenotypes(params.getTemp_class1()); * if(params.getClassFile1() != null) * expressionMatrix.setPhenotypes(setClasses( params.getClassFile1())); * //params.getEM().addExpression(EnrichmentMap.DATASET1, * expressionMatrix); } else{ //set up the classes definition if it is * set. * * //check to see if the phenotypes were already set in the params from * a session load if(params.getTemp_class2() != null) * expressionMatrix.setPhenotypes(params.getTemp_class2()); else * if(params.getClassFile2() != null) * expressionMatrix.setPhenotypes(setClasses( params.getClassFile2())); * //params.getEM().addExpression(EnrichmentMap.DATASET2, * expressionMatrix); } */ } /** * Parse class file (The class file is a GSEA specific file that specifyies * which phenotype each column of the expression file belongs to.) The class * file can only be associated with an analysis when dataset specifications * are specified initially using an rpt file. * * @param classFile - name of class file * @return String array of the phenotypes of each column in the expression * array */ private String[] setClasses(String classFile) throws IOException { File f = new File(classFile); //deal with legacy issue, if a session file has the class file set but //it didn't actually save the classes yet. if(!f.exists()) { return null; } //check to see if the file was opened successfully if(!classFile.equalsIgnoreCase(null)) { List<String> lines = DatasetLineParser.readLines(classFile); //the class file can be split by a space or a tab String[] classes = lines.get(2).split("\\s"); //the third line of the class file defines the classes return classes; } else { String[] def_pheno = { "Na_pos", "NA_neg" }; return def_pheno; } } @Override public void run(TaskMonitor taskMonitor) throws Exception { taskMonitor.setTitle("Parsing GCT file"); parse(taskMonitor); } }