/**
* Copyright (c) 2014 Marc Fiume <mfiume@cs.toronto.edu>
* Unauthorized use of this file is strictly prohibited.
*
* All rights reserved. No warranty, explicit or implicit, provided.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
* SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
* FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package org.ut.biolab.medsavant.server.vcf;
import com.google.code.externalsorting.ExternalSort;
import java.io.*;
import java.nio.charset.Charset;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.samtools.util.BlockCompressedInputStream;
import org.apache.commons.lang.NumberUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ut.biolab.medsavant.server.MedSavantServerEngine;
import org.ut.biolab.medsavant.shared.format.BasicVariantColumns;
import org.ut.biolab.medsavant.shared.model.MedSavantServerJobProgress;
import org.ut.biolab.medsavant.shared.model.SessionExpiredException;
import org.ut.biolab.medsavant.shared.serverapi.LogManagerAdapter;
import org.ut.biolab.medsavant.shared.util.IOUtils;
import org.ut.biolab.medsavant.shared.util.MiscUtils;
import org.ut.biolab.medsavant.shared.vcf.VariantRecord;
import org.ut.biolab.medsavant.shared.vcf.VariantRecord.VariantType;
import org.ut.biolab.medsavant.shared.vcf.VariantRecord.Zygosity;
/**
*
* @author mfiume, rammar
*/
public class VCFParser {
private static final Log LOG = LogFactory.getLog(VCFParser.class);
private static final String HEADER_CHARS = "#";
private static final String COMMENT_CHARS = "##";
private static final Pattern VCF_FORMAT_REGEX = Pattern.compile("^##fileformat=VCFv([\\d+.]+)");
private static final int VCF_START_INDEX = 1;
private static final int VCF_ID_INDEX = 2;
private static final int VCF_REF_INDEX = 3;
private static final int VCF_ALT_INDEX = 4;
private static final int VCF_QUALITY_INDEX = 5;
private static final int VCF_FILTER_INDEX = 6;
private static final int VCF_INFO_INDEX = 7;
private static final int VCF_FORMAT_INDEX = 8;
private static final int VCF_SAMPLE_START_INDEX = 9;
private static final Pattern VCF_BADREF_REGEX = Pattern.compile("[^ACGTNacgtn]");
private static final Pattern VCF_SNP_REGEX = Pattern.compile("^[ACGTNacgtn]");
private static final Pattern VCF_BADALT_REGEX = Pattern.compile("[^ACGTNacgtn:\\d\\[\\]]");
private static final Pattern VCF_ALT_OLD_1000G_REGEX = Pattern.compile("^<.+>$");
private static final int LINES_PER_PROGRESSREPORT = 50000;
//numbers or dots delimited by pipes or slashes.
//private static final Pattern VCF_GT_REGEX = Pattern.compile("([\\d.\\.])([/|]([\\d.\\.]))*");
private static final Pattern VCF_GT_REGEX = Pattern.compile("([\\d.])(([/|])([\\d.]))*");
private int lineNumber = 0;
private int numInvalidRef = 0;
private int numInvalidAlt = 0;
private int numSnp = 0;
private int numTi = 0; //mismatched base pairs in SNPs
private int numTv = 0; //matches base pairs in SNPs
private int numIndels = 0;
private int numSnp1 = 0;
private int numTi1 = 0; //mismatched base pairs in SNPs
private int numTv1 = 0; //matches base pairs in SNPs
private int numIndels1 = 0;
private int numInvalidGT = 0;
private int numHom = 0;
private int numHet = 0;
private int numVariants = 0;
private String sessID;
private File vcfFile;
private MedSavantServerJobProgress jobProgress;
public VCFParser(String sessID, File vcfFile, MedSavantServerJobProgress jobProgress) {
this.sessID = sessID;
this.vcfFile = vcfFile;
this.jobProgress = jobProgress;
}
public int getNumInvalidRef() {
return numInvalidRef;
}
public int getNumInvalidAlt() {
return numInvalidAlt;
}
public int getNumSnp() {
return numSnp;
}
public int getNumTi() {
return numTi;
}
public int getNumTv() {
return numTv;
}
public int getNumIndels() {
return numIndels;
}
public int getNumSnp1() {
return numSnp1;
}
public int getNumTi1() {
return numTi1;
}
public int getNumTv1() {
return numTv1;
}
public int getNumIndels1() {
return numIndels1;
}
public int getNumInvalidGT() {
return numInvalidGT;
}
public int getNumHom() {
return numHom;
}
public int getNumHet() {
return numHet;
}
public int parseVariantsFromReader(BufferedReader r, File outfile, int updateId, int fileId) throws IOException {
return parseVariantsFromReader(r, outfile, updateId, fileId, false);
}
// private Map<String, BufferedWriter> chromOutOfOrderFileMap = new HashMap<String, BufferedWriter>();
/*
private void writeOutOfOrderLine(String chrom, String[] line, String prefix) throws IOException {
BufferedWriter handle = chromOutOfOrderFileMap.get(chrom);
if (handle == null) {
handle = new BufferedWriter(new FileWriter(prefix + "_" + chrom, true));
chromOutOfOrderFileMap.put(chrom, handle);
}
}
*/
/**
* Like parseVariantsFromReader, but use a pre-parsed header object (useful
* when reusing a header)
*
* @param r A reader for .vcf file being parsed
* @param header The VCF header object to use. If header == null, the first
* header line in the file will be used
* @param outputLinesLimit The number of lines to write to output before
* returning
* @param outfile The temporary output file, with variants parsed 1 per
* position per individual
* @param updateId The updateId to prepend to each line
* @param fileId The fileId to prepend to each line
* @return number of lines written to outfile
* @throws IOException
*/
public int parseVariantsFromReader(BufferedReader r, File outfile, int updateId, int fileId, boolean includeHomoRef) throws IOException {
VCFHeader header = null;
String nextLineString;
int numRecords = 0;
final String outOfOrderFilename = outfile.getAbsolutePath() + "_ooo";
BufferedWriter outOfOrderHandle = new BufferedWriter(new FileWriter(outOfOrderFilename, true));
int variantId = 0;
int numLinesWritten = 0;
while (true) {
if ((nextLineString = r.readLine()) == null) {
LOG.info("Reader returned null after " + numLinesWritten + " lines.");
break;
}
lineNumber++;
nextLineString = nextLineString.trim();
//skip blank lines.
if (nextLineString.length() == 0) {
continue;
}
String s = "Processed " + numRecords + " lines (" + numLinesWritten + " variants) so far...";
jobProgress.setMessage(vcfFile.getName() + " - " + s);
if (numRecords % 100000 == 0 && numRecords != 0) {
LOG.info(s);
}
String[] nextLine = nextLineString.split("\t");
if (nextLine[0].startsWith(COMMENT_CHARS)) {
//Check for VCF vesion 4.
Matcher vcf_format_matcher = VCF_FORMAT_REGEX.matcher(nextLineString);
if (vcf_format_matcher.find()) {
String ver = vcf_format_matcher.group(1);
if (Float.parseFloat(ver) < 4.0) {
throw new IllegalArgumentException("VCF version (" + ver + ") is older than version 4");
}
}
} else if (nextLine[0].startsWith(HEADER_CHARS)) {
// header line
header = parseHeader(nextLine);
} else {
if (header == null) {
throw new IOException("Cannot parse headless VCF file");
}
// a data line
List<VariantRecord> records = null;
try {
records = parseRecord(nextLine, header);
if (records == null) {
continue;
}
} catch (Exception ex) {
LOG.error("Erroneous line: " + nextLineString);
throw new IOException(ex);
}
//add records to tdf
for (VariantRecord v : records) {
if (includeHomoRef || v.getZygosity() != Zygosity.HomoRef) {
outOfOrderHandle.write(v.toTabString(updateId, fileId, variantId));
outOfOrderHandle.write("\r\n");
numLinesWritten++;
variantId++;
}
}
numRecords++;
}//end else
}
outOfOrderHandle.close();
jobProgress.setMessage("Sorting variants...");
LOG.info("sorting out of order handle");
sortTDF(outOfOrderFilename, outfile);
return numLinesWritten;
}
private final static int EXTERNALSORT_MAX_TMPFILES = 1024;
private final static Charset EXTERNALSORT_CHARSET = Charset.defaultCharset();
private final static int TDF_INDEX_OF_CHROM = 4;
private final static int TDF_INDEX_OF_STARTPOS = 5;
//Uses ExternalSort so that it can be used with very large files.
static void sortTDF(String unsortedTDF, File sortedTDF) throws IOException {
final boolean eliminateDuplicateRows = false;
final int numHeaderLinesToExcludeFromSort = 0;
final boolean useGzipForTmpFiles = false;
//Sorts by chromosome, then position.
Comparator comparator = new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
String[] tokens1 = o1.split("\t");
String[] tokens2 = o2.split("\t");
String chr1 = tokens1[TDF_INDEX_OF_CHROM].toLowerCase();
String chr2 = tokens2[TDF_INDEX_OF_CHROM].toLowerCase();
//Removed these checks - assume chromosomes are quoted.
/*
if(chr1.startsWith("\"") && chr1.endsWith("\"")){
chr1 = chr1.substring(1, chr1.length()-1);
}
if(chr2.startsWith("\"") && chr2.endsWith("\"")){
chr1 = chr1.substring(1, chr1.length()-1);
}*/
chr1 = chr1.substring(1, chr1.length() - 1);
chr2 = chr2.substring(1, chr2.length() - 1);
if (chr1.startsWith("chr")) {
chr1 = chr1.substring(3);
}
if (chr2.startsWith("chr")) {
chr2 = chr2.substring(3);
}
if (chr1.equals(chr2)) {
//assume positions are also quoted
long pos1 = Long.parseLong(tokens1[TDF_INDEX_OF_STARTPOS].substring(1, tokens1[TDF_INDEX_OF_STARTPOS].length() - 1));
long pos2 = Long.parseLong(tokens2[TDF_INDEX_OF_STARTPOS].substring(1, tokens2[TDF_INDEX_OF_STARTPOS].length() - 1));
if (pos1 < pos2) {
return -1;
} else if (pos1 > pos2) {
return 1;
} else {
return 0;
}
} else {
int c1 = NumberUtils.isDigits(chr1) ? Integer.parseInt(chr1) : (int) chr1.charAt(0);
int c2 = NumberUtils.isDigits(chr2) ? Integer.parseInt(chr2) : (int) chr2.charAt(0);
return (c1 < c2) ? -1 : ((c1 > c2) ? 1 : 0);
}
}
};
//Sort the file, producing up to EXTERNALSORT_MAX_TMPFILES temproary
//files.
File uf = new File(unsortedTDF);
BufferedReader fbr = new BufferedReader(new InputStreamReader(
new FileInputStream(uf), EXTERNALSORT_CHARSET));
//Use 0.3 * (1/numThreads) * total memory allocated to Java of memory for sorting.
long maxMem = (long) (0.3 * Runtime.getRuntime().maxMemory() / (double) MedSavantServerEngine.getMaxThreads());
//...unless that amount of memory would exceed 50% of the available memory, in which case cap
//memory use at 50% of the available memory.
long availMem = ExternalSort.estimateAvailableMemory();
if (0.50 * availMem < maxMem) {
maxMem = (long) 0.5 * availMem;
LOG.info("WARNING: Memory is low for sorting, sorting with reduced memory of " + (maxMem >> 20) + " M");
} else {
LOG.info("Sorting using " + (maxMem >> 20) + "M of memory");
}
List<File> batch = ExternalSort.sortInBatch(fbr, uf.length(), comparator, EXTERNALSORT_MAX_TMPFILES,
maxMem, EXTERNALSORT_CHARSET, new File(sortedTDF.getParent()),
eliminateDuplicateRows,
numHeaderLinesToExcludeFromSort,
useGzipForTmpFiles);
/*
List<File> batch = ExternalSort.sortInBatch(
new File(unsortedTDF),
comparator,
EXTERNALSORT_MAX_TMPFILES,
EXTERNALSORT_CHARSET,
new File(sortedTDF.getParent()),
eliminateDuplicateRows,
numHeaderLinesToExcludeFromSort,
useGzipForTmpFiles);
*/
//Merge the temporary files with each other
//batch.add(sortedTDF);
String finalOutputFileName = sortedTDF.getCanonicalPath();
File outputFile = new File(finalOutputFileName + "_MERGED");
ExternalSort.mergeSortedFiles(batch, outputFile, comparator, EXTERNALSORT_CHARSET,
eliminateDuplicateRows, false, useGzipForTmpFiles);
if (!IOUtils.moveFile(outputFile, sortedTDF)) {
throw new IOException("Can't rename merged file " + outputFile.getCanonicalPath() + " to " + sortedTDF.getCanonicalPath());
} else {
LOG.info("Outputted sorted TDF file to " + sortedTDF);
}
}
static BufferedReader openFile(File vcf) throws FileNotFoundException, IOException {
if (vcf.getAbsolutePath().endsWith(".gz")) {
return new BufferedReader(new InputStreamReader(new BlockCompressedInputStream(vcf)));
} else {
return new BufferedReader(new FileReader(vcf));
}
}
private VCFHeader parseHeader(String[] headerLine) {
VCFHeader result = new VCFHeader();
// has genotype information
if (headerLine.length > VCFHeader.getNumMandatoryFields()) {
// get the genotype labels
for (int i = VCFHeader.getNumMandatoryFields() + 1; i < headerLine.length; i++) {
if (headerLine[i] != null && headerLine[i].length() != 0) {
result.addGenotypeLabel(headerLine[i]);
}
}
}
return result;
}
private void messageToUser(LogManagerAdapter.LogType logtype, String msg) {
try {
org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(
sessID,
logtype,
msg);
LOG.info("sessId=" + sessID + " " + msg);
} catch (RemoteException re) {
LOG.error(re);
LOG.error("WARNING: Couldn't log warning due to RemoteException. Warning: " + msg);
} catch (SessionExpiredException see) {
LOG.error(see);
LOG.error("WARNING: Couldn't log warning due to SessionExpiredException. Warning: " + msg);
}
}
private static final long MAX_WARNINGS = 1000;
private long warningsEmitted = 0;
private void vcf_warning(String msg) {
if (warningsEmitted < MAX_WARNINGS) {
String warning = vcfFile.getName() + ": WARNING (line " + lineNumber + "): " + msg;
messageToUser(LogManagerAdapter.LogType.WARNING, warning);
} else if (warningsEmitted == MAX_WARNINGS) {
String warning = vcfFile.getName() + ": Further warnings have been truncated.";
messageToUser(LogManagerAdapter.LogType.WARNING, warning);
}
warningsEmitted++;
}
//These variables control how bad refs are reported or handled.
private static final boolean TOOLONG_REFS_GIVE_WARNING = true;
private static final boolean UNRECOGNIZED_REFS_GIVE_WARNING = true; //only matters if above is true.
private static final boolean TOOLONG_ALTS_GIVE_WARNING = true;
private static final boolean UNRECOGNIZED_ALTS_GIVE_WARNING = true;
private List<VariantRecord> parseRecord(String[] line, VCFHeader h) {
int numMandatoryFields = VCFHeader.getNumMandatoryFields();
//remove quotes from info field, if necessary.
if (line[VCF_INFO_INDEX].startsWith("\"") && line[VCF_INFO_INDEX].endsWith("\"")) {
line[VCF_INFO_INDEX] = line[VCF_INFO_INDEX].substring(1, line[VCF_INFO_INDEX].length() - 1);
}
List<String> infos = new ArrayList<String>();
List<String> ids;
for (int i = numMandatoryFields; i < line.length; i++) {
infos.add(line[i]);
}
if (infos.isEmpty()) {
infos.add(".");
ids = new ArrayList<String>();
ids.add(".");
} else {
ids = h.getGenotypeLabels();
}
List<VariantRecord> records = new ArrayList<VariantRecord>();
int triedIndex = 0;
try {
String ref = line[VCF_REF_INDEX].toUpperCase();
String altStr = line[VCF_ALT_INDEX].toUpperCase();
long start = 0;
try {
start = Long.parseLong(line[VCF_START_INDEX]);
} catch (NumberFormatException nex) {
vcf_warning("Invalid (non-numeric) start position detected in VCF4 file: " + line[VCF_START_INDEX]);
return null;
}
boolean badRef = false;
//If a ref is unrecognized or too long, we truncate it to 0 and store it anyway.
//(We never match on ref anyway)
if (ref.length() > BasicVariantColumns.REF.getColumnLength()) {
if (TOOLONG_REFS_GIVE_WARNING) {
vcf_warning("Detected reference allele with too many characters (maximum is " + BasicVariantColumns.REF.getColumnLength() + ", " + ref.length() + " detected). Setting ref=0");
}
badRef = true;
ref = "0";
} else if (VCF_BADREF_REGEX.matcher(ref).find()) { //Unrecognized ref
/*if (ref.length() < 100) { //sometimes extremely long ref contains N nucleotides
vcf_warning("Invalid reference ref allele record found in VCF4 file (ACGT expected, found " + ref + ") Setting ref as 0");
}*/
badRef = true;
if (UNRECOGNIZED_REFS_GIVE_WARNING) {
vcf_warning("Unrecognized reference allele found in VCF4 file (ACGT expected, found " + ref + ") Storing anyway.");
}
}
String[] allAlt = altStr.split(","); //there may be multiple alternative alleles
int altNumber = 0;
for (String alt : allAlt) { //process each alternative allele
altNumber++;
if (badRef) {
++numInvalidRef;
}
//handle multi-allelic variants by deleting the common portion of substring.
boolean complex = (alt.contains("[") || alt.contains("]"));
boolean snp = false;
VariantType variantType = VariantType.Unknown;
long newStart = start;
String newRef;
String newAlt;
long newEnd;
if (complex) { //complex rearrangement
newRef = ref;
newAlt = alt;
newEnd = newStart;
if (newRef.length() == 1) {
variantType = VariantType.Complex;
} else {
vcf_warning("Unrecognized complex rearrangement detected (ref length expected to be 1, found ref=" + newRef + ". Storing anyway.");
}
} else {
String prefix = StringUtils.getCommonPrefix(new String[]{ref, alt});
newRef = ref.substring(prefix.length());
newAlt = alt.substring(prefix.length());
newStart += prefix.length();
newEnd = newStart;
//If the ALT sequence is too long, we can't store it, and truncate it down to 10 characters.
if (newAlt.length() > BasicVariantColumns.ALT.getColumnLength()) {
if (TOOLONG_ALTS_GIVE_WARNING) {
vcf_warning("Skipping alternate allele with too many characters (maximum is "
+ BasicVariantColumns.ALT.getColumnLength() + ", " + newAlt.length()
+ " detected). MedSavant does not yet support sequences of this size. Storing first 10 characters");
}
newAlt = newAlt.substring(0, Math.min(10, BasicVariantColumns.ALT.getColumnLength()));
++numInvalidAlt;
// continue;
}
if (newRef.length() == newAlt.length() && VCF_SNP_REGEX.matcher(newRef).matches() && VCF_SNP_REGEX.matcher(newAlt).matches()) {
snp = true;
++numSnp;
//annovar counts base pair mismatches (e.g. AG, GA, CT, TC)
if ((newRef.equals("A") && newAlt.equals("G"))
|| (newRef.equals("G") && newAlt.equals("A"))
|| (newRef.equals("C") && newAlt.equals("T"))
|| (newRef.equals("T") && newAlt.equals("C"))) {
++numTi;
} else {
++numTv;
}
} else {
++numIndels;
}
///??
if (newAlt.equals("<DEL>")) { //old 1000G vcf files have this.
newEnd = newStart + newRef.length() - 1;
newAlt = "-";
variantType = VariantType.Deletion;
} else if (newAlt.equals(".")) {
newAlt = newRef;
newEnd = newStart + newRef.length() - 1;
variantType = VariantType.HomoRef;
} else if (snp) { //SNP
newEnd = newStart + newRef.length() - 1;
variantType = VariantType.SNP;
} else if (!badRef && newRef.length() >= newAlt.length()) { //deletion or block substitution
String head = newRef.substring(0, newAlt.length());
if (head.equals(newAlt)) {
newEnd = newStart + newRef.length() - 1;
newStart = newStart + head.length();
newRef = newRef.substring(newAlt.length());
newAlt = "-";
variantType = VariantType.Deletion;
} else {
newEnd = newStart + newRef.length() - 1;
variantType = VariantType.InDel;
}
} else if (VCF_BADALT_REGEX.matcher(newAlt).find()) {
if (UNRECOGNIZED_ALTS_GIVE_WARNING) {
vcf_warning("Unrecognized ALT allele detected (ACGTN]:[ expected, found " + alt + "). Storing anyway.");
}
++numInvalidAlt;
} else if (!badRef) {// if(ref.length() < alt.length()){ //insertion or block substitution
/* String head = newAlt.substring(0, newRef.length());
if (head.equals(newRef)) {
newEnd = newStart + newRef.length() - 1;
newStart = newStart + newRef.length() - 1;
newAlt = newAlt.substring(newRef.length());
newRef = "-";
variantType = VariantType.Insertion;
*/
if (newRef.length() == 0) { //insertion
newRef = "-";
newStart--;
newEnd--;
variantType = VariantType.Insertion;
} else {
newEnd = newStart + newRef.length() - 1;
variantType = VariantType.InDel;
}
}
}//end else.
VariantRecord variantRecordTemplate = new VariantRecord(line, newStart, newEnd, newRef, newAlt, altNumber, variantType);
int indexGT = getIndexGT(line); //index of GT in line
for (int i = 0; i < ids.size(); i++) { //for each sample.
VariantRecord sampleVariantRecord = new VariantRecord(variantRecordTemplate);
if (indexGT >= 0) {
String chunk = line[numMandatoryFields + i + 1];
String gt = chunk.split(":")[indexGT];
Matcher gtMatcher = VCF_GT_REGEX.matcher(gt);
if (!gtMatcher.find() || indexGT < 0) {
vcf_warning("SKIPPED VARIANT. Invalid GT field (" + gt.substring(0, Math.min(10, gt.length())) + (gt.length() < 10 ? "" : "...") + ") found in VCF file. cannot determine genotype.");
++numInvalidGT;
continue;
}
//Replaced by 'calculateZygosity'.
/*
String a1 = gtMatcher.group(1);
String a2 = gtMatcher.group(2);
if(a1.equals(".")){ //CG VCF files have many records as ".", probably denoting unknown alleles
a1 = "0";
}
if(a2.equals(".")){
a2 = "0";
}
if(a1.equals("0") && a2.equals("0")){
continue; //no mutation in this sample.
}else if(a1.equals(a2)){
if(a1.equals(Integer.toString(i+1))){
sampleVariantRecord.setZygosity(Zygosity.HomoAlt);
++numHom;
}else{
continue;
}
}else{
String x = Integer.toString(i+1);
if(!a1.equals(x) && !a2.equals(x)){
sampleVariantRecord.setZygosity(Zygosity.Hetero);
++numHet;
}
}
*/
sampleVariantRecord.setGenotype(gt);
try {
sampleVariantRecord.setZygosity(calculateZygosity(sampleVariantRecord));
} catch (IllegalArgumentException iex) {
vcf_warning("SKIPPED VARIANT. " + iex.getMessage());
continue;
}
if (sampleVariantRecord.getZygosity() == Zygosity.Hetero) {
++numHet;
} else if (sampleVariantRecord.getZygosity() == Zygosity.HomoAlt) {
++numHom;
}
}
if (snp) {
++numSnp1;
//annovar counts base pair mismatches (e.g. AG, GA, CT, TC)
if ((ref.equals("A") && alt.equals("G"))
|| (ref.equals("G") && alt.equals("A"))
|| (ref.equals("C") && alt.equals("T"))
|| (ref.equals("T") && alt.equals("C"))) {
++numTi1;
} else {
++numTv1;
}
} else {
++numIndels1;
}
triedIndex = 0;
String id = ids.get(i);
sampleVariantRecord.setDnaID(id);
// add sample information to the INFO field on this line
try {
String format = line[VCFHeader.getNumMandatoryFields()].trim();
String sampleInfo = line[numMandatoryFields + i + 1];
sampleInfo = sampleInfo.replace(";", ",");
sampleVariantRecord.setSampleInformation(format, sampleInfo);
} catch (Exception e) {
}
records.add(sampleVariantRecord);
}
}
} catch (IllegalArgumentException ex) { //only thrown if chromosome was invalid
vcf_warning(ex.getMessage());
return null;
} catch (Exception ex) {
String lStr = "";
for (int i = 0; i < line.length; i++) {
lStr += line[i] + "\t";
}
LOG.info("Tried index " + triedIndex + " of line with " + line.length + " entries");
String badString = lStr.length() > 300 ? lStr.substring(0, 299) + "..." : lStr;
LOG.error("Error parsing line " + badString + ": " + ex.getClass() + " " + MiscUtils.getMessage(ex));
ex.printStackTrace();
}
if ((lineNumber % LINES_PER_PROGRESSREPORT) == 0) {
messageToUser(LogManagerAdapter.LogType.INFO, vcfFile.getName() + ": Loaded " + lineNumber + " variants...");
}
return records;
}
private static int getIndexGT(String[] line) {
if (line.length >= VCFHeader.getNumMandatoryFields() + 1) {
String[] list = line[VCFHeader.getNumMandatoryFields()].trim().split(":");
for (int i = 0; i < list.length; i++) {
if (list[i].equals("GT")) {
return i;
}
}
}
return -1;
}
private static Zygosity calculateZygosity(VariantRecord vr) throws IllegalArgumentException {
boolean homoRef = (vr.getRef().equals(vr.getAlt()));
String gt = vr.getGenotype();
String[] split = gt.split("/|\\\\|\\|"); // splits on / or \ or |
if (split.length < 2 || split[0] == null || split[1] == null || split[0].length() == 0 || split[1].length() == 0) {
throw new IllegalArgumentException("Invalid genotype field: " + gt);
}
try {
if (split[0].equals(".") || split[1].equals(".")) {
if (homoRef) {
return Zygosity.HomoRef;
}
return Zygosity.Missing;
}
int a = Integer.parseInt(split[0]);
int b = Integer.parseInt(split[1]);
if (a == 0 && b == 0) {
return Zygosity.HomoRef;
} else if (!homoRef) {
if (a == b) {
return Zygosity.HomoAlt;
} else if (a == 0 || b == 0) {
return Zygosity.Hetero;
} else {
return Zygosity.HeteroTriallelic;
}
} else {
throw new IllegalArgumentException("Ref and Alt field are equal or Alt=., indicicating HomoRef variant, but genotype (" + gt + ") is invalid or indicates differently.");
}
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Invalid Genotype " + gt);
}
}
}