package org.seqcode.deepseq.events;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.seqcode.deepseq.events.BindingModel;
import org.seqcode.genome.Genome;
import org.seqcode.genome.GenomeConfig;
import org.seqcode.genome.location.AnnotationLoader;
import org.seqcode.gseutils.ArgParser;
import org.seqcode.gseutils.Args;
/**
* EventsConfig:
* Maintains settings for events and binding models.
*
* @author Shaun Mahony
* @version %I%, %G%
*/
public class EventsConfig {
protected GenomeConfig gconfig;
protected Genome gen=null;
protected boolean printHelp=false;
protected BindingModel defaultModel=null;
protected boolean addAnnotations=false;
protected boolean addSequences=true;
protected List<AnnotationLoader> geneAnnotations = new ArrayList<AnnotationLoader>();
protected int maxAnnotDistance=50000;
protected boolean annotOverlapOnly=false;
protected boolean calcEventBaseCompositions=false; //Calculate base compositions around events and tags belonging to events. Useful for analyzing permanganate ChIP-seq
protected double qMinThres=0.001; //Minimum Q-value for reported binding events
protected double minEventFoldChange=1.5;
protected double differentialSignificanceP = 0.01;
protected boolean runDiffTests = true; //Run differential enrichment testing
protected String Rpath="";
protected double edger_overdispersion = 0.15; //Overdispersion used by EdgeR differential enrichment tests
protected boolean eventsFileTXTExtension=false;
protected boolean printBED=true;
//Constants
public final double LOG2 = Math.log(2);
public final double LOG_FC_LIMIT = 10; //Maximum absolute log fold-change reported
public final boolean CALC_EVENTS_LL=false; //Calculate component-wise log-likelihoods during ML
protected String[] args;
public String getArgs(){
String a="";
for(int i=0; i<args.length; i++)
a = a+" "+args[i];
return a;
}
public EventsConfig(GenomeConfig gcon, String[] arguments){
gconfig = gcon;
gen = gconfig.getGenome();
this.args=arguments;
ArgParser ap = new ArgParser(args);
if(args.length==0 || ap.hasKey("h")){
printHelp=true;
}else{
try{
//Test for a config file... if there is concatenate the contents into the args
if(ap.hasKey("config")){
ArrayList<String> confArgs = new ArrayList<String>();
String confName = ap.getKeyValue("config");
File confFile = new File(confName);
if(!confFile.isFile())
System.err.println("\nCannot find configuration file: "+confName);
BufferedReader reader = new BufferedReader(new FileReader(confFile));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
String[] words = line.split("\\s+");
if(!words[0].startsWith("--"))
words[0] = new String("--"+words[0]);
confArgs.add(words[0]);
if(words.length>1){
String rest=words[1];
for(int w=2; w<words.length; w++)
rest = rest+" "+words[w];
confArgs.add(rest);
}
}
String [] confArgsArr = confArgs.toArray(new String[confArgs.size()]);
String [] newargs =new String[args.length + confArgsArr.length];
System.arraycopy(args, 0, newargs, 0, args.length);
System.arraycopy(confArgsArr, 0, newargs, args.length, confArgsArr.length);
args = newargs;
ap = new ArgParser(args);
}
//Read distribution file
String modelFile = Args.parseString(args, "d", null); // read distribution file
if (modelFile != null){
File pFile = new File(modelFile);
if(!pFile.isFile()){
System.err.println("\nCannot find read distribution file: "+modelFile);
System.exit(1);
}
defaultModel = new BindingModel(pFile);
}
/****Gene Annotation****/
//Gene Annotations
Collection<String> tfiles = Args.parseStrings(args,"transcripts");
Collection<String> dbgenes = Args.parseStrings(args,"dbgenes");
for(String s:dbgenes)
geneAnnotations.add(new AnnotationLoader(gen, s, "refGene", maxAnnotDistance, annotOverlapOnly));
for(String s:tfiles)
geneAnnotations.add(new AnnotationLoader(gen, s, "file", maxAnnotDistance, annotOverlapOnly));
if(geneAnnotations.size()>0)
addAnnotations=true;
/****Miscellaneous arguments****/
//Record event base compositions
calcEventBaseCompositions = Args.parseFlags(args).contains("eventbasecomp") ? true : false;
//Q-value threshold
qMinThres = Args.parseDouble(args,"q",qMinThres);
//differential p-value threshold
differentialSignificanceP = Args.parseDouble(args,"diffp",differentialSignificanceP);
//Event Fold-change minimum
minEventFoldChange = Args.parseDouble(args,"minfold",minEventFoldChange);
//Turn off DE testing
runDiffTests = Args.parseFlags(args).contains("nodifftests") ? false : true;
//R path
Rpath = Args.parseString(args, "rpath", Rpath);
if(Rpath.length()>0 && !Rpath.endsWith(File.separator))
Rpath = Rpath+File.separator;
//EdgeR overdispersion parameter
edger_overdispersion = Args.parseDouble(args,"edgerod",edger_overdispersion);
//Add .txt extension to events files
eventsFileTXTExtension= Args.parseFlags(args).contains("eventsaretxt");
//No BED output
printBED = !(Args.parseFlags(args).contains("nobed"));
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* Merge a set of estimated genomes
* @param estGenomes
* @return
*/
public Genome mergeGenomes(List<Genome> estGenomes){
//Combine the chromosome information
HashMap<String, Integer> chrLenMap = new HashMap<String, Integer>();
for(Genome e : estGenomes){
Map<String, Integer> currMap = e.getChromLengthMap();
for(String s: currMap.keySet()){
if(!chrLenMap.containsKey(s) || chrLenMap.get(s)<currMap.get(s))
chrLenMap.put(s, currMap.get(s));
}
}
gen =new Genome("Genome", chrLenMap);
return gen;
}
//Accessors
public Genome getGenome(){return gen;}
public boolean helpWanted(){return printHelp;}
public boolean isAddingAnnotations(){return addAnnotations;}
public boolean isAddingSequences(){return addSequences;}
public List<AnnotationLoader> getGeneAnnotations(){return geneAnnotations;}
public int getMaxAnnotDistance(){return maxAnnotDistance;}
public boolean getAnnotOverlapOnly(){return annotOverlapOnly;}
public BindingModel getDefaultBindingModel(){return defaultModel;}
public boolean getCalcEventBaseCompositions(){return calcEventBaseCompositions;}
public double getQMinThres(){return qMinThres;}
public double getMinEventFoldChange(){return minEventFoldChange;}
public double getDiffPMinThres(){return differentialSignificanceP;}
public boolean getRunDiffTests(){return runDiffTests;}
public String getRpath(){return Rpath;}
public double getEdgeROverDisp(){return edger_overdispersion;}
public boolean getEventsFileTXTExtension(){return eventsFileTXTExtension;}
public boolean getPrintBED(){return printBED;}
/**
* returns a string describing the arguments handled by this parser.
* @return String
*/
public String getArgsList(){
return(new String("" +
"BindingModels:\n" +
"\t--d <read distribution model file>\n" +
"\t--eventbasecomp [flag to record event base compositions]\n"+
"EventTesting:\n"+
"\t--q <Q-value minimum, i.e corrected p-value(default="+qMinThres+")>\n" +
"\t--minfold <min event fold-change (default="+minEventFoldChange+")>\n" +
"\t--nodifftests [flag to turn off DE tests]\n" +
"\t--rpath <path to the R bin dir (default: R is in $PATH). Note that you need to install edgeR separately>\n" +
"\t--edgerod <EdgeR overdispersion (default="+edger_overdispersion+")>\n" +
"\t--diffp <minimum p-value for differential enrichment (default="+differentialSignificanceP+")>\n" +
"\t--eventsaretxt [add .txt to events file extension]\n"+
"\t--nobed [do not print BED files]" +
"Annotations:\n" +
"\t--transcripts <transcripts file>\n" +
"\t--dbgenes refGene\n" +
""));
}
}