/**
*
*/
package jannovar.io;
import jannovar.exception.JannovarException;
import jannovar.reference.TranscriptModel;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.zip.GZIPInputStream;
/**
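* This is the abstract base class for FastA parsers. It reads the sequences from a
* (optionally gzip-compressed) FastA file and adds them to the matching
* {@link TranscriptModel}s. Subclasses define how the accession is extracted from
* a FastA header line.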
* @author mjaeger
* @version 0.1 (2013-07-12)
*/
public abstract class FastaParser {

    /** Path to the FastA file; a name ending in ".gz" is read as gzip-compressed. */
    protected String filename;
    /** Identifier extracted from the header of the record currently being read. */
    protected String accession;
    /** Sequence lines collected for the current record; {@code null} until a header was seen. */
    protected StringBuilder sequence;
    /** All {@link TranscriptModel}s handed to the constructor. */
    protected ArrayList<TranscriptModel> transcriptmodels;
    /** The {@link TranscriptModel}s that received a sequence during parsing. */
    protected ArrayList<TranscriptModel> transcriptmodelsProcessed;
    /** Maps the accession number of a model to its index in {@link #transcriptmodels}. */
    protected HashMap<String, Integer> transcript2index;

    /**
     * Constructs a new {@link FastaParser} and initiates the path to the FastA
     * file and the {@link TranscriptModel}s.
     *
     * @param filename path to the FastA file
     * @param models list of {@link TranscriptModel}s w/o mRNA sequence data
     */
    public FastaParser(String filename, ArrayList<TranscriptModel> models) {
        this.filename = filename;
        this.transcriptmodels = models;
        this.transcriptmodelsProcessed = new ArrayList<TranscriptModel>();
        int n = transcriptmodels.size();
        transcript2index = new HashMap<String, Integer>(n);
        for (int i = 0; i < n; i++) {
            transcript2index.put(transcriptmodels.get(i).getAccessionNumber(), i);
        }
    }

    /**
     * Parse the mRNA sequences and thereby add these to the {@link TranscriptModel}s.
     * <p>
     * Every record (header line starting with "&gt;" followed by its sequence
     * lines) is matched to a model via the identifier returned by
     * {@link #processHeader(String)}. Lines preceding the first header are
     * ignored. I/O failures are reported on stderr and end the parsing; the
     * models completed so far are still returned.
     *
     * @return list of sequence annotated {@link TranscriptModel}s
     * @throws JannovarException propagated from adding a sequence to a {@link TranscriptModel}
     */
    public ArrayList<TranscriptModel> parse() throws JannovarException {
        FileInputStream raw = null;
        BufferedReader in = null;
        String str;
        // Start from a clean state so that a repeated call does not re-use the
        // record left over from a previous run.
        accession = null;
        sequence = null;
        try {
            if (filename.endsWith(".gz")) {
                raw = new FileInputStream(filename);
                in = new BufferedReader(new InputStreamReader(new GZIPInputStream(raw)));
            } else {
                in = new BufferedReader(new FileReader(filename));
            }
            while ((str = in.readLine()) != null) {
                if (str.startsWith(">")) {
                    // A new header terminates the previous record.
                    if (sequence != null) {
                        addSequenceToModel();
                    }
                    accession = processHeader(str);
                    sequence = new StringBuilder();
                } else if (sequence != null) {
                    sequence.append(str);
                }
                // else: content before the first header belongs to no record
            }
            // The last record has no following header; flush it here. This is
            // skipped on an IOException so that no truncated sequence is stored.
            if (sequence != null) {
                addSequenceToModel();
            }
        } catch (IOException e) {
            System.err.println("[WARNING] failed to read the FastA file:\n"+e.toString());
        } finally {
            try {
                if (in != null) {
                    in.close();
                } else if (raw != null) {
                    // The gzip wrapper could not be created; close the raw stream.
                    raw.close();
                }
            } catch (IOException e) {
                System.err.println("[WARNING] failed to close the FastA file reader:\n"+e.toString());
            }
        }
        return transcriptmodelsProcessed;
    }

    /**
     * Adds the sequence to the corresponding {@link TranscriptModel}. Records
     * whose accession is not known to {@link #transcript2index} are skipped.
     *
     * @throws JannovarException propagated from the {@link TranscriptModel}
     */
    private void addSequenceToModel() throws JannovarException {
        Integer idx = transcript2index.get(accession);
        if (idx == null) {
            return;
        }
        TranscriptModel model = transcriptmodels.get(idx);
        model.setSequence(sequence.toString());
        model.initialize();
        transcriptmodelsProcessed.add(model);
    }

    /**
     * Selects the unique identifier from the header line to match the sequence to the {@link TranscriptModel}
     * definition.
     *
     * @param header The FastA header line
     * @return A unique identifier (e.g. NR_024540.1)
     */
    protected abstract String processHeader(String header);
}