package jannovar.io;
import java.io.File;
import java.io.FileInputStream;
import java.io.DataInputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import jannovar.common.Constants;
import jannovar.exception.JannovarException;
import jannovar.exception.KGParseException;
import jannovar.reference.TranscriptModel;
/**
* This class coordinates the downloading and parsing of the RefSeq transcript
* definition files from the UCSC database (hg19). It is possible to parse
* directly from the gzip file without decompressing them, or the start from the
* decompressed files. The class checks of the files exist and if they have the
* suffix "gz".
*
* @see <a href="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/">UCSC
* hg19 database downloads</a>
* @author Peter N Robinson
* @version 0.02 (10 July, 2013)
* @Deprecated
*/
public class RefSeqParser extends TranscriptDataParser implements Constants {
/**
* Number of tab-separated fields in then UCSC refFlat.txt file (build
* hg19).
*/
public static final int NFIELDS = 11;
/**
* Base URI for UCSC hg19 build annotation files
*/
private static final String hg19base = "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/";
/**
* Name of refFlat.txt file
*/
private static final String refFlat = "refFlat.txt";
public RefSeqParser(String path) {
super(path);
}
/**
* The refflat file has the following fields.
* <ol>
* <li>geneName: e.g., ANAPC13, Name of gene as it appears in genome
* browser.
* <li>name: e.g., NM_001242374, Name of gene (refseq accession number)
* <li>chrom: e.g., chr3, Reference sequence chromosome or scaffold
* <li>strand: e.g., -,+ or - for strand
* <li>txStart: e.g., 134196545, Transcription start position
* <li>txEnd: e.g., 134204865, Transcription end position
* <li>cdsStart: e.g., 134197431, Coding region start
* <li>cdsEnd: e.g., 134201746, Coding region end
* <li>exonCount: e.g.,3, Number of exons
* <li>exonStarts: e.g., 134196545,134201647,134204161, Exon start positions
* <li>exonEnds: e.g., 134197557,134201773,13420486
* </ol>
* This is the basis for the TranscriptModel
*/
public void parseRefFlatFile() {
}
public void downloadFiles() throws JannovarException {
makeDirectoryIfNotExist();
String refFlatCompressed = String.format("%s.gz", RefSeqParser.refFlat);
//String knownGeneMrna = String.format("%s.gz",Constants.knownGeneMrna);
//String kgXref = String.format("%s.gz",Constants.kgXref);
//String known2locus = String.format("%s.gz",Constants.known2locus);
download_file(RefSeqParser.hg19base, refFlatCompressed);
}
/**
* The constructor parses a single line of the refseqFlat.txt file. *
* <ol>
* <li>geneName: e.g., ANAPC13, Name of gene as it appears in genome
* browser.
* <li>name: e.g., NM_001242374, Name of gene (refseq accession number)
* <li>chrom: e.g., chr3, Reference sequence chromosome or scaffold
* <li>strand: e.g., -,+ or - for strand
* <li>txStart: e.g., 134196545, Transcription start position
* <li>txEnd: e.g., 134204865, Transcription end position
* <li>cdsStart: e.g., 134197431, Coding region start
* <li>cdsEnd: e.g., 134201746, Coding region end
* <li>exonCount: e.g.,3, Number of exons
* <li>exonStarts: e.g., 134196545,134201647,134204161, Exon start positions
* <li>exonEnds: e.g., 134197557,134201773,13420486
* </ol>
* The function additionalls parses the start and end of the exons. Note
* that in the UCSC database, positions are represented using half-open,
* zero-based coordinates. That is, if start is 2 and end is 7, then the
* first nucleotide is at position 3 (one-based) and the last nucleotide is
* at positon 7 (one-based). For now, we are switching the coordinates to
* fully-closed one based by incrementing all start positions by one. This
* is the way coordinates are typically represented in VCF files and is the
* way coordinate calculations are done in annovar. At a later date, it may
* be worthwhile to switch to the UCSC-way of half-open zero based
* coordinates.
*
* @param line A single line of the UCSC refFlat.txt file
*/
public TranscriptModel parseTranscriptModelFromLine(String line) throws KGParseException, JannovarException {
TranscriptModel model = TranscriptModel.createTranscriptModel();
String A[] = line.split("\t");
if (A.length != NFIELDS) {
String error = String.format("Malformed line in UCSC knownGene.txt file:\n%s\nExpected %d fields but there were %d",
line, NFIELDS, A.length);
throw new KGParseException(error);
}
/* Field 0 has the gene symbol, e.g., ANAPC13. */
model.setGeneSymbol(A[0]);
/* Field 1 has the accession number, e.g., NM_001242374 */
model.setAccessionNumber(A[1]);
byte chromosome;
try {
if (A[2].equals("chrX")) {
chromosome = X_CHROMOSOME; // 23
} else if (A[2].equals("chrY")) {
chromosome = Y_CHROMOSOME; // 24
} else if (A[2].equals("chrM")) {
chromosome = M_CHROMOSOME; // 25
} else {
String tmp = A[2].substring(3); // remove leading "chr"
chromosome = Byte.parseByte(tmp);
}
} catch (NumberFormatException e) { // scaffolds such as chrUn_gl000243 cause Exception to be thrown.
throw new KGParseException("Could not parse chromosome field: " + A[1]);
}
model.setChromosome(chromosome);
char strand = A[3].charAt(0);
if (strand != '+' && strand != '-') {
throw new KGParseException("Malformed strand: " + A[2]);
}
model.setStrand(strand);
int txStart, txEnd, cdsStart, cdsEnd;
try {
txStart = Integer.parseInt(A[4]) + 1; // +1 to convert to one-based fully closed numbering
} catch (NumberFormatException e) {
throw new KGParseException("Could not parse txStart:" + A[4]);
}
model.setTranscriptionStart(txStart);
try {
txEnd = Integer.parseInt(A[5]);
} catch (NumberFormatException e) {
throw new KGParseException("Could not parse txEnd:" + A[5]);
}
model.setTranscriptionEnd(txEnd);
try {
cdsStart = Integer.parseInt(A[6]) + 1;// +1 to convert to one-based fully closed numbering
} catch (NumberFormatException e) {
throw new KGParseException("Could not parse cdsStart:" + A[6]);
}
model.setCdsStart(cdsStart);
try {
cdsEnd = Integer.parseInt(A[7]);
} catch (NumberFormatException e) {
throw new KGParseException("Could not parse cdsEnd:" + A[7]);
}
model.setCdsEnd(cdsEnd);
byte exonCount;
try {
exonCount = Byte.parseByte(A[8]);
} catch (NumberFormatException e) {
throw new KGParseException("Could not parse exonCount:" + A[8]);
}
model.setExonCount(exonCount);
/* Now parse the exon ends and starts */
int[] exonStarts = new int[exonCount];
/**
* End positions of each of the exons of this transcript
*/
int[] exonEnds = new int[exonCount];
String starts = A[9];
String ends = A[10];
String B[] = starts.split(",");
if (B.length != exonCount) {
String error = String.format("[RefSeqParser] Malformed exonStarts list: found %d but I expected %d exons",
B.length, exonCount);
error = String.format("%s. This should never happen, the refFlat.txt file may be corrupted", error);
throw new KGParseException(error);
}
for (int i = 0; i < exonCount; ++i) {
try {
exonStarts[i] = Integer.parseInt(B[i]) + 1; // Change 0-based to 1-based numbering
} catch (NumberFormatException e) {
String error = String.format("[UCSCKGParser] Malformed exon start at position %d of line %s", i, starts);
error = String.format("%s. This should never happen, the knownGene.txt file may be corrupted", error);
throw new KGParseException(error);
}
}
// Now do the ends.
B = ends.split(",");
for (int i = 0; i < exonCount; ++i) {
try {
exonEnds[i] = Integer.parseInt(B[i]);
} catch (NumberFormatException e) {
String error = String.format("[UCSCKGParser] Malformed exon end at position %d of line %s", i, ends);
error = String.format("%s. This should never happen, the knownGene.txt file may be corrupted", error);
throw new KGParseException(error);
}
}
model.setExonStartsAndEnds(exonStarts, exonEnds);
model.initialize();
return model;
}
}