package medsavant.haplotype; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.zip.GZIPInputStream; import main.Main; import org.apache.commons.lang3.StringUtils; /** * Call BEAGLE to perform genotype phasing. * Multi-patient VCFs are supported. * * @author rammar */ public class BEAGLEWrapper { private static final int FORMAT_COLUMN_INDEX= 8; private static final int SAMPLE_COLUMN_INDEX= 9; private static final String GT= "GT"; private static Random r= new Random(); // use static, so it doesn't use the same pseudorandom #s repeatedly private String inputVCFName; private String tempDirectoryPath; private String tempFileNamePrefix; private List<String> beagleArgs= new LinkedList<String>(); private Map<String, List<String>> phasedVariants; private File mergedVCF; /** * Perform genotype phasing of the input VCF using BEAGLE. * @param tempDirectoryPath The temp directory where files can be processed. * @param inputVCF The input VCF name to phase and replace * @param refPanel The reference panel VCF used to create the models * @param numThreads The number of BEAGLE threads * @precondition The input VCF and the reference panel must have the same chromosomal nomenclature */ public BEAGLEWrapper(String tempDirectoryPath, String inputVCF, String refPanel, int numThreads) { inputVCFName= inputVCF; this.tempDirectoryPath= tempDirectoryPath; /* Set the arguments for BEAGLE. */ tempFileNamePrefix= getTempFileName(tempDirectoryPath); beagleArgs.add("gt=" + inputVCF); beagleArgs.add("ref=" + refPanel); beagleArgs.add("out=" + tempFileNamePrefix); //beagleArgs.add("window=24000"); // let BEAGLE use default window size, which = 24000 beagleArgs.add("impute=false"); // need to specify this beagleArgs.add("nthreads=" + numThreads); /* Call the main method for BEAGLE. */ Main.main(beagleArgs.toArray(new String[0])); /* Merge the phased VCF results with the unphased results. */ try { mergedVCF= mergePhased(); } catch (Exception e) { System.err.println("[" + this.getClass().getSimpleName() + "]: Error merging phased VCF with unphased input VCF."); e.printStackTrace(); } } /** * Perform genotype phasing of the input VCF using BEAGLE using a single thread for BEAGLE run. * @param tempDirectoryPath The temp directory where files can be processed. * @param inputVCF The input VCF name to phase and replace * @param refPanel The reference panel VCF used to create the models * @precondition The input VCF and the reference panel must have the same chromosomal nomenclature */ public BEAGLEWrapper(String tempDirectoryPath, String inputVCF, String refPanel) { this(tempDirectoryPath, inputVCF, refPanel, 1); // 1 thread } /** * Get a temporary file name. * @param tempDirectoryPath The temp directory where files can be processed. * @return the temp file name */ private String getTempFileName(String tempDirectoryPath) { String tempNamePrefix= tempDirectoryPath + "/" + "beagle_phasing_temp_"; // extra "/" is fine String tempName= tempNamePrefix + r.nextInt(1000000000); /* Check that the file doesn't already exist. */ File f= new File(tempName); while(f.exists()) { tempName= tempNamePrefix + r.nextInt(1000000000); f= new File(tempName); } return tempName; } /** * Merge the phased VCF results with the input (unphased) VCF. * Works for multi-patient/sample VCF files. * @return the new File */ private File mergePhased() throws FileNotFoundException, IOException { // Get the new file name and put it in the temp directory File outputVCF= new File(tempDirectoryPath, inputVCFName.substring(inputVCFName.lastIndexOf(File.separator) + 1, inputVCFName.lastIndexOf(".")) + "_partially_phased.vcf"); storePhasedVariants(); BufferedWriter bw= new BufferedWriter(new FileWriter(outputVCF)); BufferedReader br= new BufferedReader(new FileReader(inputVCFName)); String line= br.readLine(); while (line != null) { line= StringUtils.chomp(line); // remove terminal newline // output header lines without processing if (line.substring(0,1).equals("#")) { bw.write(line); } else { List<String> currentVariants= Arrays.asList(line.split("\\t")); /* use chrom, position, ref and alt as a key - these should * be identical between the input and output VCFs. ID (index 2) * is not always present on the input file. */ List<String> keyList= new LinkedList<String>(); keyList.addAll(currentVariants.subList(0, 2)); keyList.addAll(currentVariants.subList(3, 5)); String key= StringUtils.join(keyList, " "); /* Check if this variant was phased. This can depend on whether the * reference panel contained these markers and whether imputation * was enabled. */ if (phasedVariants.containsKey(key)) { /* Get the phased GT values. */ List<String> phasedVariantsLine= phasedVariants.get(key); List<String> phasedFormatColumn= Arrays.asList(phasedVariantsLine.get(FORMAT_COLUMN_INDEX).split(":")); List<List<String>> phasedSampleColumns= getSampleColumns(phasedVariantsLine); int phasedGTIndex= phasedFormatColumn.indexOf(GT); /* Replace the unphased GT values if they exist. Append otherwise. */ List<String> unphasedFormatColumn= Arrays.asList(currentVariants.get(FORMAT_COLUMN_INDEX).split(":")); List<List<String>> unphasedSampleColumns= getSampleColumns(currentVariants); int unphasedGTIndex= unphasedFormatColumn.indexOf(GT); /* Assumption: phased and unphased files have the same number * of patient columns. */ for (int i= 0; i != phasedSampleColumns.size(); ++i) { List<String> phasedSampleColumn= phasedSampleColumns.get(i); List<String> unphasedSampleColumn= unphasedSampleColumns.get(i); String phasedGTValue= phasedSampleColumn.get(phasedGTIndex); if (unphasedGTIndex > -1) { // GT field is found in format column unphasedSampleColumn.set(unphasedGTIndex, phasedGTValue); } else { if (unphasedGTIndex == -1) { // GT field is missing from format column unphasedGTIndex= -2; // special index to indicate GT is added to first column but not all unphasedFormatColumn.add(GT); } // append GT to the end unphasedSampleColumn.add(phasedGTValue); } /* Replace the GT values in the current columns. */ currentVariants.set(SAMPLE_COLUMN_INDEX + i, StringUtils.join(unphasedSampleColumn, ":")); } /* Write the modified line to the new merged VCF. */ currentVariants.set(FORMAT_COLUMN_INDEX, StringUtils.join(unphasedFormatColumn, ":")); String outputLine= StringUtils.join(currentVariants, "\t"); bw.write(outputLine); } else { // variant was not phased /* If the input VCF was phased ahead of time, as Complete Genomics files are, * we have to ignore that phasing and use only the phased genotypes from the * BEAGLE output based on the reference panel. Otherwise we'll end up with * potentially non-existent haplotypes. To avoid this, I'm removing all * phasing on variants that were not found in the reference panel (if they * were phased ahead of time). */ List<String> unphasedFormatColumn= Arrays.asList(currentVariants.get(FORMAT_COLUMN_INDEX).split(":")); List<List<String>> unphasedSampleColumns= getSampleColumns(currentVariants); int unphasedGTIndex= unphasedFormatColumn.indexOf(GT); /* Iterate through all the sample/patient columns of the VCF. */ for (int i= 0; i != unphasedSampleColumns.size(); ++i) { List<String> unphasedSampleColumn= unphasedSampleColumns.get(i); // If GT field is found in format column if (unphasedGTIndex != -1) { /* Replace the phased GT values with unphased ("|" becomes "/"). */ String inputGTValue= unphasedSampleColumn.get(unphasedGTIndex); inputGTValue= inputGTValue.replace("|", "/"); unphasedSampleColumn.set(unphasedGTIndex, inputGTValue); } /* Replace the GT values in the current columns. */ currentVariants.set(SAMPLE_COLUMN_INDEX + i, StringUtils.join(unphasedSampleColumn, ":")); } /* Write the modified line to the new merged VCF. */ String outputLine= StringUtils.join(currentVariants, "\t"); bw.write(outputLine); } } bw.write("\n"); // we're going to assume this is done in unix rather than using System.getProperty("line.separator") line= br.readLine(); } br.close(); bw.close(); return outputVCF; } /** * Since BEAGLE outputs a gzipped VCF, uncompress it first and then * load each line into memory keyed by the variant positions. */ private void storePhasedVariants() throws FileNotFoundException, IOException { phasedVariants= new HashMap<String, List<String>>(); GZIPInputStream gzippedInputStream= new GZIPInputStream(new FileInputStream(tempFileNamePrefix + ".vcf.gz")); BufferedReader br= new BufferedReader(new InputStreamReader(gzippedInputStream)); String line= br.readLine(); while (line != null) { if (!line.substring(0,1).equals("#")) { // ignore the header lines line= StringUtils.chomp(line); // remove terminal newline List<String> currentVariants= Arrays.asList(line.split("\\t")); /* use chrom, position, ref and alt as a key - these should * be identical between the input and output VCFs. ID (index 2) * is not always present on the input file. */ List<String> keyList= new LinkedList<String>(); keyList.addAll(currentVariants.subList(0, 2)); keyList.addAll(currentVariants.subList(3, 5)); String key= StringUtils.join(keyList, " "); phasedVariants.put(key, currentVariants); } line= br.readLine(); } br.close(); } /** * Convert the list of current variants (from the complete current line) to * a list of sample/patient info lists. * @param currentVariants A List representation of the current line's columns * @return a List of the sample/patient info columns as Lists (list of lists) */ private List<List<String>> getSampleColumns(List<String> currentVariants) { List<List<String>> output= new LinkedList<List<String>>(); // Get all the sample Columns List<String> sampleInfoColumns= currentVariants.subList(SAMPLE_COLUMN_INDEX, currentVariants.size()); // Iterate through the columns of this line for (String column : sampleInfoColumns) { output.add(Arrays.asList(column.split(":"))); } return output; } /** * Get the new partially phased merged VCF. * @return the merged VCF */ public File getMergedVCF() { return this.mergedVCF; } }