package org.seqcode.tools.sequence;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.biojava.bio.Annotation;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.seqcode.gseutils.Args;

/**
 * Converts sequence input in <tt>Genbank</tt> format into output in <tt>Fasta</tt> format.<br>
 * The hand-written converter wraps the residues in windows of 60 characters by default.
 *
 * @author gio_fou
 */
public class GenbankToFasta {

	/** Default number of residues per output line. */
	private static final int window = 60;

	/** Text printed when the command line is empty or incomplete. */
	private static final String USAGE = "Usage:\n\tGenbankToFasta --in in.genbank --out out.fasta";

	/** File formats accepted by {@link #convertGenbankToFasta(InputStream, OutputStream, String, String)}. */
	private static final String[] ALLOWED_FORMATS = {"FASTA", "EMBL", "GENBANK", "SWISSPROT", "swiss", "GENPEPT"};

	/** Alphabets accepted by {@link #convertGenbankToFasta(InputStream, OutputStream, String, String)}. */
	private static final String[] ALLOWED_ALPHABETS = {"DNA", "AA", "Protein", "RNA"};

	/** Sequence-level annotation keys appended, in this order, to the Fasta header. */
	private static final String[] HEADER_KEYS = {"LOCUS", "SIZE", "TYPE", "CIRCULAR", "DIVISION", "MDAT", "SOURCE"};

	/**
	 * Command-line entry point: reads the file given by <tt>--in</tt> as Genbank DNA and writes it
	 * as Fasta to the file given by <tt>--out</tt>. Prints the usage text when no arguments are
	 * given or when either option is missing.
	 *
	 * @param args command-line arguments
	 */
	public static void main(String[] args) {
		if (args.length == 0) {
			System.out.println(USAGE);
			return;
		}

		// null is passed as the default value, so a missing option presumably comes back as null
		// (Args is defined outside this file); without this check new File(null) would throw.
		String ipfs = Args.parseString(args, "in", null);
		String opfs = Args.parseString(args, "out", null);
		if (ipfs == null || opfs == null) {
			System.out.println(USAGE);
			return;
		}

		FileInputStream is = null;
		try {
			is = new FileInputStream(new File(ipfs));
			FileOutputStream os = new FileOutputStream(new File(opfs));
			GenbankToFasta.convertGenbankToFasta(is, os, "GENBANK", "DNA");
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			// The converter closes both streams itself; this extra close only matters when the
			// output file could not be opened and the converter was never reached.
			closeAndReport(is);
		}
	}

	/**
	 * Converts input in <tt>Genbank</tt> format to output in <tt>Fasta</tt> format.<br>
	 * In addition to the sequence name, the header carries the <tt>strain</tt> property of every
	 * feature that has one, followed by the sequence-level properties LOCUS, SIZE, TYPE, CIRCULAR,
	 * DIVISION, MDAT and SOURCE (each only when present and non-null), separated by <tt>|</tt>.
	 * Progress is reported on standard output every 1000 records.<br>
	 * Both streams are closed before this method returns. If reading or writing fails, the stack
	 * trace is printed, the streams are closed, and the JVM exits with status -1.
	 *
	 * @param is The input in <tt>Genbank</tt> format. E.g. <tt>System.in</tt>, <tt>new FileInputStream(File f)</tt>
	 * @param os The output in <tt>Fasta</tt> format. E.g. <tt>System.out</tt>, <tt>new FileOutputStream(File f)</tt>
	 */
	public static void convertGenbankToFasta(InputStream is, OutputStream os) {
		BufferedReader br = null;
		BufferedWriter bw = null;
		boolean failed = false;
		try {
			br = new BufferedReader(new InputStreamReader(is));
			bw = new BufferedWriter(new OutputStreamWriter(os));

			SequenceIterator sequenceItr = SeqIOTools.readGenbank(br);
			int count = 0;
			while (sequenceItr.hasNext()) {
				Sequence seq = sequenceItr.nextSequence();

				// present sequence in steps of window (default: 60)
				String seqStringFormatted = formatSequence(seq.seqString(), window);

				// formatSequence already ends a non-empty sequence with a newline, so the trailing
				// %n leaves an empty line after each record; kept so the output stays unchanged.
				bw.write(String.format(">%s%n%s%n", buildHeader(seq), seqStringFormatted));

				// print statistics
				count++;
				if (count % 1000 == 0) {
					System.out.printf("Record %d has been read...%n", count);
				}
			}
		} catch (IOException e) { // also covers FileNotFoundException
			e.printStackTrace();
			failed = true;
		} catch (NoSuchElementException e) {
			e.printStackTrace();
			failed = true;
		} catch (BioException e) {
			e.printStackTrace();
			failed = true;
		} finally {
			closeAndReport(br);
			closeAndReport(bw);
		}

		// System.exit never returns, so calling it inside a catch block would skip the finally
		// block above and lose whatever is still buffered in the writer. Exit only after cleanup.
		if (failed) {
			System.exit(-1);
		}
	}// end of convertGenbankToFasta method

	/**
	 * Converts input in one of the supported formats to output in <tt>Fasta</tt> format.<br>
	 * Reading and writing are delegated to <tt>SeqIOTools</tt>.<br>
	 * Both streams are closed before this method returns. If reading or writing fails, the stack
	 * trace is printed, the streams are closed, and the JVM exits with status -1.
	 *
	 * @param is The input. E.g. <tt>System.in</tt>, <tt>new FileInputStream(File f)</tt>
	 * @param os The output in <tt>Fasta</tt> format. E.g. <tt>System.out</tt>, <tt>new FileOutputStream(File f)</tt>
	 * @param format The format of the input. Allowed formats are (case insensitive): FASTA, EMBL, GENBANK, SWISSPROT (or swiss), GENPEPT
	 * @param alpha The alphabet of the input. Allowed values are (case insensitive): DNA, AA, Protein, RNA
	 * @throws IllegalArgumentException if <tt>format</tt> or <tt>alpha</tt> is null or not one of the allowed values
	 */
	public static void convertGenbankToFasta(InputStream is, OutputStream os, String format, String alpha) {
		if (!matchesAnyIgnoreCase(format, ALLOWED_FORMATS)) {
			throw new IllegalArgumentException("Illegal value for argument format");
		}
		if (!matchesAnyIgnoreCase(alpha, ALLOWED_ALPHABETS)) {
			throw new IllegalArgumentException("Illegal value for argument alpha");
		}

		BufferedReader br = null;
		boolean failed = false;
		try {
			br = new BufferedReader(new InputStreamReader(is));
			SequenceIterator iter = (SequenceIterator) SeqIOTools.fileToBiojava(format, alpha, br);
			SeqIOTools.writeFasta(os, iter);
		} catch (IOException e) { // also covers FileNotFoundException
			e.printStackTrace();
			failed = true;
		} catch (NoSuchElementException e) {
			e.printStackTrace();
			failed = true;
		} catch (BioException e) {
			e.printStackTrace();
			failed = true;
		} finally {
			closeAndReport(br);
			closeAndReport(os);
		}

		// Exit after the finally block so the streams are closed first (see the other overload).
		if (failed) {
			System.exit(-1);
		}
	}// end of convertGenbankToFasta method

	/**
	 * Formats the input string in windows of size <tt>window</tt>: every window, including the
	 * last (possibly shorter) one, is followed by a <tt>\n</tt>. An empty input gives an empty result.
	 *
	 * @param seq string to be formatted
	 * @param window size of the window; must be positive
	 * @return the formatted string
	 * @throws IllegalArgumentException if <tt>window</tt> is zero or negative
	 */
	public static String formatSequence(String seq, int window) {
		// A zero window would never advance through the string, and a negative one would walk backwards.
		if (window <= 0) {
			throw new IllegalArgumentException("window must be positive: " + window);
		}

		int length = seq.length();
		StringBuilder seqFormatted = new StringBuilder();
		int currStart = 0;
		while (currStart < length) {
			// Clamp against the remaining length rather than adding first, so a very large
			// window cannot overflow int.
			int currEnd = currStart + Math.min(window, length - currStart);
			seqFormatted.append(seq, currStart, currEnd).append('\n');
			currStart = currEnd;
		}
		return seqFormatted.toString();
	}//end of formatSequence method

	/**
	 * Formats the input string with the default window size of 60 residues.
	 *
	 * @param seq string to be formatted
	 * @return the formatted string
	 */
	public static String formatSequence(String seq) {
		return formatSequence(seq, window);
	}//end of formatSequence method

	/** Builds the Fasta header (without the leading <tt>&gt;</tt>) for one sequence. */
	private static String buildHeader(Sequence seq) {
		StringBuilder header = new StringBuilder();
		header.append(seq.getName());

		// append strain
		Iterator<Feature> featItr = seq.features();
		while (featItr.hasNext()) {
			appendProperty(header, featItr.next().getAnnotation(), "strain");
		}

		Annotation seqAnnot = seq.getAnnotation();
		for (String key : HEADER_KEYS) {
			appendProperty(header, seqAnnot, key);
		}
		return header.toString();
	}

	/** Appends <tt>|value</tt> to the header when the annotation holds a non-null value for the key. */
	private static void appendProperty(StringBuilder header, Annotation annot, String key) {
		if (annot.containsProperty(key)) {
			// The value is declared as Object, so append its string form rather than casting to String.
			Object value = annot.getProperty(key);
			if (value != null) {
				header.append('|').append(value);
			}
		}
	}

	/** Returns true when <tt>value</tt> equals one of <tt>allowed</tt>, ignoring case; false for null. */
	private static boolean matchesAnyIgnoreCase(String value, String[] allowed) {
		for (String candidate : allowed) {
			if (candidate.equalsIgnoreCase(value)) {
				return true;
			}
		}
		return false;
	}

	/** Closes the resource if it is non-null, printing (not propagating) any I/O failure. */
	private static void closeAndReport(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}//end of GenbankToFasta class