package org.seqcode.gsebricks.verbs.chipseq; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.Vector; import org.seqcode.genome.Genome; import org.seqcode.genome.Species; import org.seqcode.genome.location.Region; import org.seqcode.gseutils.ArgParser; import org.seqcode.gseutils.Args; import org.seqcode.gseutils.NotFoundException; import org.seqcode.gseutils.Pair; /* * Class to parse out put from GPS Chip-Seq binding peak finder */ public class GPSParser { public static void main(String[] args) throws IOException { Genome genome=null; ArgParser ap = new ArgParser(args); try { Pair<Species, Genome> pair = Args.parseGenome(args); if(pair==null){ //Make fake genome... chr lengths provided??? if(ap.hasKey("geninfo")){ genome = new Genome("Genome", new File(ap.getKeyValue("geninfo")), true); }else{ System.err.println("No genome provided; provide a Gifford lab DB genome name or a file containing chromosome name/length pairs.");;System.exit(1); } }else{ genome = pair.cdr(); } } catch (NotFoundException e) { e.printStackTrace(); } String filename = Args.parseString(args,"GPS",null); List<GPSPeak> peaks = parseGPSOutput(filename, genome); System.out.println(peaks.size()); System.out.println(GPSPeak.toGPS_Header()); System.out.println(peaks.get(0).toGPS()); System.out.println(GPSPeak.toGPS_short_Header()); System.out.println(peaks.get(0).toGPS_short()); } /** * Parses data in the GPS output format * @param filename name of the file containing the data * @return a List of hit objects */ public static List<GPSPeak> parseGPSOutput(String filename, Genome g) throws IOException { return parseGPSOutput(new FileInputStream(filename), g); } public static List<GPSPeak> parseGPSOutput(InputStream stream, Genome g) throws IOException { ArrayList<GPSPeak> results = new ArrayList<GPSPeak>(); BufferedReader bin = null; int count = 0; bin = new BufferedReader(new InputStreamReader(stream)); String line; while((line = bin.readLine()) != null) { line = line.trim(); String[] f=line.split("\t"); if (f[0].equals("Position")||f[0].equals("chr")){ continue; } try { GPSPeak hit = GPSParser.parseLine(g, line, 0); if (hit!=null) { results.add(hit); } } catch (RuntimeException e) { System.err.println("Parsing line " + line); System.err.println(e.toString()); throw(e); } count++; } if (bin != null) { bin.close(); } return results; } /** * Parse a single line of text into a GPSPeak object * New format Position IP Control Fold Q_-lg10 P_-lg10 IPvsEMP IPvsCTR Joint NearestGene Distance Alpha EM_Position 4:117003586 496.8 7.4 67.1 129.052 135.017 -0.772 4.490 1 NONE 2147483647 7.8 4:117003586 * @param gpsLine a line of text representing a hit * @return a hit object containing the data from the specified line */ public static GPSPeak parseLine(Genome g, String gpsLine, int lineNumber) { GPSPeak peak; String[] t = gpsLine.split("\t"); if (t.length == 7) { // GPS output format 2010-07-31 // Position IP Control IP/Ctrl Q_-lg10 P_-lg10 Shape Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Math.pow(10,-1*Double.parseDouble(t[5])), Double.parseDouble(t[6])); } else if (t.length == 8 ) { // GPS output format 2010-11-10 // Position IP Control Fold Q_-lg10 P_-lg10 IPvsEMP IPvsCTR Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Math.pow(10,-1*Double.parseDouble(t[5])), Double.parseDouble(t[6]), Double.parseDouble(t[7])); } else if (t.length == 12 ||t.length == 11) { // with kmer info // GPS output format 2011-01-30 // Position IP Control Fold Q_-lg10 P_-lg10 IPvsEMP IPvsCTR Kmer KmerCount KmerStrength BoundSequence Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Math.pow(10,-1*Double.parseDouble(t[5])), Double.parseDouble(t[6]), Double.parseDouble(t[7]), t[8], (int)Double.parseDouble(t[9]), Double.parseDouble(t[10]), t.length >= 12 ? t[11] : ""); } else if (t.length == 10 ) { // not with kmer info // GPS output format 2011-07-25 // Position IP Control Fold Expectd Q_-lg10 P_-lg10 P_poiss IPvsEMP IPvsCTR Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Double.parseDouble(t[5]), Math.pow(10,-1*Double.parseDouble(t[6])), Double.parseDouble(t[7]), Double.parseDouble(t[8]), Double.parseDouble(t[9])); } else if (t.length == 14 || t.length == 15 ) { // with kmer info // GPS output format 2011-07-25 // Position IP Control Fold Expectd Q_-lg10 P_-lg10 P_poiss IPvsEMP IPvsCTR Kmer Count Strength BoundSequence EnrichedHGP Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Double.parseDouble(t[5]), Math.pow(10,-1*Double.parseDouble(t[6])), Double.parseDouble(t[7]), Double.parseDouble(t[8]), Double.parseDouble(t[9]), t[10], (int)Double.parseDouble(t[11]), t[12].charAt(0), t[13]); } else if (t.length == 13 ) { // with kmer info // GPS output format 2012-03 // Position IP Control Fold Expectd Q_-lg10 P_-lg10 P_poiss IPvsEMP IPvsCTR Kmer Count Strength BoundSequence Region r = Region.fromString(g, t[0]); peak = new GPSPeak(g, r.getChrom(), r.getStart(), Double.parseDouble(t[1]), Double.parseDouble(t[2]), Double.parseDouble(t[4]), Double.parseDouble(t[5]), Math.pow(10,-1*Double.parseDouble(t[6])), Double.parseDouble(t[7]), Double.parseDouble(t[8]), Double.parseDouble(t[9]), t[10], (int)Double.parseDouble(t[11]), t[12].charAt(0), ""); } else { throw new RuntimeException("Invalid number of fields (" + t.length + ") on line " + lineNumber + ": " + gpsLine); } return peak; } }