package org.seqcode.math.diff; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import org.seqcode.data.io.StreamGobbler; import org.seqcode.deepseq.events.EventsConfig; /** * EdgeRDifferentialEnrichment: Construct a script and run EdgeR from the command-line. * Obviously, this won't work if R and the EdgeR library are not installed. * Tested only on R v2.14 and above * * @author Shaun Mahony * @version %I%, %G% */ public class EdgeRDifferentialEnrichment extends DifferentialEnrichment{ protected CountsDataset data; protected EventsConfig config; protected File outDir; protected String outBase; //Constructor public EdgeRDifferentialEnrichment(EventsConfig config, File outDir, String outBase){ super(); this.config = config; this.outDir = outDir; this.outBase = outBase; } /** * execute: make & run an appropriate EdgeR script */ public CountsDataset execute(CountsDataset dat) { this.data = dat; String repCountsFilename = outDir+File.separator+outBase+".replicates.counts"; String scriptFilename = outDir+File.separator+"call_DE_GLM"+fileIDname+".R"; //Write the R script try { FileWriter fout = new FileWriter(scriptFilename); fout.write( "#!/usr/bin/Rscript --vanilla \n"+ "# Rscript call_DE_GLM.R repCounts.txt 0.15 \n" + "# Dataset: "+data.getCondName(data.getFocalCondition())+"\n" + "library(edgeR) \n" + "args <- commandArgs(TRUE) \n"+ "raw <- read.delim(args[1], row.names=\"Point\") \n"+ "y <- DGEList(raw)"+ "\n"); String factorString = "c("; int[] design = data.getDesignArray(); for(int d=0; d<design.length; d++){ factorString = factorString+"\""+design[d]+"\""; if(d<design.length-1) factorString = factorString+","; } factorString = factorString+")"; fout.write( "group <- factor("+factorString+")\n"); fout.write( "design <- model.matrix(~group) \n" + "design \n"+ "y <- calcNormFactors(y, method=\"TMM\") # TMM method \n"+ "y <- estimateGLMCommonDisp(y, design, method=\"deviance\", robust=TRUE, subset=NULL) # this only works on later versions of edgeR, which you get with R >= 2.14 \n"+ "print(\"Best-fit overdispersion parameter:\") \n"+ "print(y$common.dispersion) \n"+ "y$common.dispersion = as.numeric(args[2]) \n"+ "print(\"In tests, using overdispersion parameter:\") \n"+ "print(y$common.dispersion) \n"+ "fit <- glmFit(y, design, dispersion=y$common.dispersion) \n"+ "\n"); int ref = data.getFocalCondition(); for(int x=0; x<data.getNumConditions(); x++){ if(x!=ref){ fout.write( "# "+data.getCondName(x)+" vs "+data.getCondName(ref)+"\n"); String contrastString = "contrast=c(0,"; for(int c=1; c<data.getNumConditions(); c++){ if(c==ref){contrastString = contrastString+"-1";} else if(c==x){contrastString = contrastString+"1";} else{contrastString = contrastString+"0";} if(c<data.getNumConditions()-1){contrastString = contrastString+",";} else{contrastString = contrastString+")";} } String diffFilename = outDir+File.separator+outBase+".overdisp"+config.getEdgeROverDisp()+ "."+data.getCondName(x)+"vs"+data.getCondName(ref)+".edgeR_GLM_DE.txt"; fout.write( "calls <- glmLRT(fit, "+contrastString+") # later versions of edgeR (R >= 2.15) dropped the y argument in the call to glmLRT \n"+ "all_tags = topTags(calls, n=1000000)$table \n"+ "write.table(all_tags, file=\""+diffFilename+"\", sep=\"\\t\") \n"+ "\n"); } } fout.close(); //Run the R script String Rscriptcmd = config.getRpath()+"Rscript "; System.err.println("Running: "+Rscriptcmd+" "+scriptFilename+" "+repCountsFilename+" "+config.getEdgeROverDisp()); Process proc = Runtime.getRuntime().exec(Rscriptcmd+" "+scriptFilename+" "+repCountsFilename+" "+config.getEdgeROverDisp()); // any error message? StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "R_ERR", true); // any output? StreamGobbler outputGobbler = new StreamGobbler(proc.getInputStream(), "R_OUT", true); // kick them off errorGobbler.start(); outputGobbler.start(); int exitVal = proc.waitFor(); System.err.println("R ExitValue: " + exitVal); proc.destroy(); //Import the EdgeR results for(int x=0; x<data.getNumConditions(); x++){ if(x!=ref){ String diffFilename = outDir+File.separator+outBase+".overdisp"+config.getEdgeROverDisp()+ "."+data.getCondName(x)+"vs"+data.getCondName(ref)+".edgeR_GLM_DE.txt"; File dfFile = new File(diffFilename); if(!dfFile.isFile()){System.err.println("ERROR: Differential enrichment file not generated: "+diffFilename);} BufferedReader reader = new BufferedReader(new FileReader(dfFile)); //First line should have column labels String line= reader.readLine(); while ((line = reader.readLine()) != null) { line = line.trim(); String[] words = line.split("\\s+"); //Edit for Double correctness if(words[1].equals("Inf")){words[1]="Infinite";} if(words[1].equals("-Inf")){words[1]="-Infinite";} if(words[2].equals("Inf")){words[2]="Infinite";} if(words[2].equals("-Inf")){words[2]="-Infinite";} if(words[5].equals("Inf")){words[5]="Infinite";} if(words[5].equals("-Inf")){words[5]="-Infinite";} //Format: //Point logFC logCPM LR PValue FDR String pointStr = words[0].replaceAll("\"", ""); Double logFC = new Double(words[1]); if(logFC>config.LOG_FC_LIMIT){ logFC=config.LOG_FC_LIMIT; } if(logFC<-config.LOG_FC_LIMIT){ logFC=-config.LOG_FC_LIMIT; } Double logCPM = new Double(words[2]); Double FDR = new Double(words[5]); int u = data.getUnitID(pointStr); data.setDEpval(u, x, FDR); data.setCondFold(u, x, logFC); data.setCondMean(u,x,logCPM); }reader.close(); } } } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } return data; } }