package org.seqcode.data.readdb.tools;

import java.io.*;
import java.util.*;

import org.apache.commons.cli.*;

/**
 * Reads Bowtie output on stdin and produces a file on stdout in the format
 * expected by ImportHits.
 *
 * <p>Input lines are tab-separated. Consecutive lines that share the same first
 * column (the read name) are treated as the hits of one read, so all hits of a
 * read must be adjacent in the input. The weight printed for a hit is
 * 1/(number of hits retained for that read).
 *
 * <p>Options:
 * <ul>
 *   <li>{@code --nosuboptimal}: only take the hits with the minimum number of
 *       mismatches for the read</li>
 *   <li>{@code --uniquehits}: only print 1:1 read to hit mappings</li>
 * </ul>
 * {@code nosuboptimal} is applied before {@code uniquehits}.
 */
public class BowtieToReadDB {

    /* Columns of a Bowtie output line, as used by this class. */
    private static final int COL_READ_NAME = 0;
    private static final int COL_STRAND = 1;
    private static final int COL_REFERENCE = 2;
    private static final int COL_OFFSET = 3;
    private static final int COL_SEQUENCE = 4;
    private static final int COL_MISMATCHES = 7;

    /**
     * Command-line entry point: parses the options, then streams stdin, grouping
     * consecutive lines by read name and handing each group to
     * {@link #printLines(ArrayList, boolean, boolean)}.
     *
     * @param args command-line arguments ({@code --uniquehits}, {@code --nosuboptimal})
     * @throws IOException if reading stdin fails
     * @throws ParseException if the command line cannot be parsed
     */
    public static void main(String args[]) throws IOException, ParseException {
        Options options = new Options();
        options.addOption("u", "uniquehits", false, "only output hits with a single mapping");
        options.addOption("s", "nosuboptimal", false, "do not include hits whose score is not equal to the best score for the read");
        CommandLineParser parser = new GnuParser();
        CommandLine cl = parser.parse(options, args, false);
        boolean uniqueOnly = cl.hasOption("uniquehits");
        boolean filterSubOpt = cl.hasOption("nosuboptimal");

        ArrayList<String[]> lines = new ArrayList<String[]>();
        String lastRead = "";
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.length() == 0) {
                // A blank line carries no hit; treating it as a record would
                // crash later when the missing columns are accessed.
                continue;
            }
            String pieces[] = line.split("\t");
            if (!pieces[COL_READ_NAME].equals(lastRead)) {
                // A new read name starts: flush the hits collected for the previous read.
                printLines(lines, uniqueOnly, filterSubOpt);
                lines.clear();
            }
            lines.add(pieces);
            lastRead = pieces[COL_READ_NAME];
        }
        // Flush the hits of the final read.
        printLines(lines, uniqueOnly, filterSubOpt);
    }

    /**
     * Prints the hits of a single read to stdout, one line per hit, as
     * {@code reference <TAB> coordinate <TAB> strand <TAB> readLength <TAB> weight}.
     *
     * <p>The coordinate is the 1-based 5' position of the hit: the Bowtie offset
     * plus one on the "+" strand, and the offset plus the read length on any
     * other strand. Per the original author's note, ImportHits averages the
     * start and stop positions it is given, so a single 5' coordinate is emitted
     * to make that value reflect the 5' end of each hit.
     *
     * <p>The length used for the coordinate is the length of each hit's own
     * sequence column, so inputs that mix read lengths are handled correctly.
     *
     * @param lines the split input lines (hits) belonging to one read; may be empty
     * @param uniqueOnly if true, nothing is printed unless exactly one hit remains
     *        after the optional sub-optimal filtering
     * @param filterSubOpt if true and there is more than one hit, only the hits
     *        with the minimum number of mismatches are kept
     */
    public static void printLines(ArrayList<String[]> lines, boolean uniqueOnly, boolean filterSubOpt) {
        ArrayList<String[]> linesToPrint = lines;
        if (filterSubOpt && lines.size() > 1) {
            linesToPrint = selectMinimumMismatchHits(lines);
        }

        if (linesToPrint.isEmpty()) {
            return;
        }
        if (uniqueOnly && linesToPrint.size() != 1) {
            return;
        }

        double weight = 1.0 / linesToPrint.size();
        for (String[] pieces : linesToPrint) {
            int length = pieces[COL_SEQUENCE].length();
            int coord = Integer.parseInt(pieces[COL_OFFSET]);
            if (!pieces[COL_STRAND].equals("+")) {
                // Bowtie reports the leftmost position; on the other strand the
                // 5' end of the read is at the rightmost aligned base.
                coord += length - 1;
            }
            coord += 1; // Bowtie default output is 0-based - we want 1-based.

            // Locale.ROOT keeps the decimal separator a '.' regardless of the
            // JVM's default locale, so the output format is stable.
            System.out.println(String.format(Locale.ROOT, "%s\t%d\t%s\t%d\t%f",
                    pieces[COL_REFERENCE], coord, pieces[COL_STRAND], length, weight));
        }
    }

    /** Returns the hits whose mismatch count equals the smallest count among {@code lines}. */
    private static ArrayList<String[]> selectMinimumMismatchHits(ArrayList<String[]> lines) {
        int[] mismatches = new int[lines.size()];
        int minMis = Integer.MAX_VALUE;
        for (int i = 0; i < mismatches.length; i++) {
            mismatches[i] = countMismatches(lines.get(i));
            if (mismatches[i] < minMis) {
                minMis = mismatches[i];
            }
        }

        ArrayList<String[]> best = new ArrayList<String[]>();
        for (int i = 0; i < mismatches.length; i++) {
            if (mismatches[i] == minMis) {
                best.add(lines.get(i));
            }
        }
        return best;
    }

    /**
     * Counts the comma-separated entries in the mismatch column of a hit; a
     * missing column or one of at most one character counts as zero mismatches.
     */
    private static int countMismatches(String[] pieces) {
        if (pieces.length > COL_MISMATCHES && pieces[COL_MISMATCHES].length() > 1) {
            return pieces[COL_MISMATCHES].split(",").length;
        }
        return 0;
    }
}