package org.ut.biolab.medsavant.server.phasing;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.zip.GZIPInputStream;
import main.Main;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ut.biolab.medsavant.shared.util.DirectorySettings;
import org.ut.biolab.medsavant.shared.util.IOUtils;
/**
* Call BEAGLE to perform genotype phasing. Multi-patient VCFs are supported.
*
* @author rammar
*/
public class BEAGLEWrapper {
private static final Log LOG = LogFactory.getLog(BEAGLEWrapper.class);
private static final int FORMAT_COLUMN_INDEX = 8;
private static final int SAMPLE_COLUMN_INDEX = 9;
private static final String GT = "GT";
private static Random r = new Random(); // use static, so it doesn't use the same pseudorandom #s repeatedly
private File inputVCF;
private File tempDirectory;
private String tempFileNamePrefix;
private List<String> beagleArgs = new LinkedList<String>();
private Map<String, List<String>> phasedVariants;
private File mergedVCF;
private static File refPanel = null;
private static final String BEAGLE_COMPRESSED_REFERENCE = "CPIC_30k_pgx_phasing_windows_CHR_PREFIX.vcf.bz2";
private static synchronized void initBeagleReference(File destDir) throws IOException {
System.out.println("Initializing Phasing reference...");
LOG.info("Initializing Phasing reference...");
File tmpDir = null;
File compressedReference = null;
try {
//Copy the compressed reference out of the jar and into a temporary folder.
int i = 0;
do {
tmpDir = new File(DirectorySettings.getTmpDirectory(), "beagle" + i++);
} while (tmpDir.exists());
tmpDir.mkdirs();
compressedReference = new File(tmpDir, BEAGLE_COMPRESSED_REFERENCE);
InputStream compressedRefStream = BEAGLEWrapper.class.getResourceAsStream("/BEAGLEWrapper/" + BEAGLE_COMPRESSED_REFERENCE);
IOUtils.copyStream(compressedRefStream, new FileOutputStream(compressedReference));
//Decompress the reference to its permanent location, then delete the compressed copy.
List<File> fileList = IOUtils.decompressAndDelete(compressedReference, destDir);
if (fileList.size() > 1) {
LOG.info("WARNING: Unexpected number of files in beagle reference -- expected a single VCF");
} else if (fileList.size() < 1) {
throw new IOException("Couldn't locate phasing reference within compressed phasing file.");
}
String extractedFileName = fileList.get(0).getCanonicalPath();
if (!refPanel.getCanonicalPath().equals(extractedFileName)) {
refPanel.delete();
throw new IOException("Unexpected filename for phasing reference: " + extractedFileName);
}
} finally {
if (tmpDir != null) {
IOUtils.deleteDirectory(tmpDir);
}
}
System.out.println("Phasing reference initialized.");
LOG.info("Phasing reference initialized.");
}
public static synchronized void install(File directory) throws IOException {
File destDir = new File(DirectorySettings.getMedSavantDirectory(), "beagle");
if (!destDir.exists()) {
if (!destDir.mkdirs()) {
throw new IOException("Couldn't create destination directory for phasing reference");
}
}
refPanel = new File(destDir, StringUtils.substringBeforeLast(BEAGLE_COMPRESSED_REFERENCE, ".bz2"));
if (!refPanel.exists()) {
LOG.info("Phasing reference doesn't exist, extracting to " + destDir.getAbsolutePath());
initBeagleReference(destDir);
}
}
/**
* Perform genotype phasing of the input VCF using BEAGLE.
*
* @param tempDirectoryPath The temp directory where files can be processed.
* @param inputVCF The input VCF name to phase and replace
* @param numThreads The number of BEAGLE threads
* @precondition The input VCF and the reference panel must have the same
* chromosomal nomenclature
*/
public BEAGLEWrapper(File tempDirectory, File inputVCF, int numThreads) {
this.inputVCF = inputVCF;
this.tempDirectory = tempDirectory;
/* Set the arguments for BEAGLE. */
tempFileNamePrefix = getPathToTempFile(tempDirectory);
beagleArgs.add("gt=" + inputVCF.getAbsolutePath());
beagleArgs.add("ref=" + refPanel.getAbsolutePath());
beagleArgs.add("out=" + tempFileNamePrefix);
//beagleArgs.add("window=24000"); // let BEAGLE use default window size, which = 24000
beagleArgs.add("impute=false"); // need to specify this
beagleArgs.add("nthreads=" + numThreads);
}
/**
* Perform genotype phasing of the input VCF using Beagle.
*
* @return The phased file, located in a temporary directory
*/
public File run() throws FileNotFoundException, IOException {
/* Call the main method for BEAGLE. */
Main.main(beagleArgs.toArray(new String[0]));
/* Merge the phased VCF results with the unphased results. */
mergedVCF = mergePhased();
return mergedVCF;
}
/**
* Perform genotype phasing of the input VCF using BEAGLE using a single
* thread for BEAGLE run.
*
* @param tempDirectoryPath The temp directory where files can be processed.
* @param inputVCF The input VCF name to phase and replace
* @precondition The input VCF and the reference panel must have the same
* chromosomal nomenclature
*/
public BEAGLEWrapper(File tempDirectory, File inputVCF) {
this(tempDirectory, inputVCF, 1); // 1 thread
}
/**
* Get a temporary file name.
*
* @param tempDirectoryPath The temp directory where files can be processed.
* @return the temp file name
*/
private String getPathToTempFile(File tempDirectoryPath) {
File tempFile = null;
do {
tempFile = new File(tempDirectoryPath, "beagle_phasing_temp_" + r.nextInt());
} while (tempFile.exists());
return tempFile.getAbsolutePath();
}
/**
* Merge the phased VCF results with the input (unphased) VCF. Works for
* multi-patient/sample VCF files.
*
* @return the new File
*/
private File mergePhased() throws FileNotFoundException, IOException {
// Get the new file name and put it in the temp directory
File outputVCF = new File(tempDirectory,
inputVCF.getName().substring(inputVCF.getName().lastIndexOf(File.separator) + 1,
inputVCF.getName().lastIndexOf(".")) + "_partially_phased.vcf");
storePhasedVariants();
BufferedWriter bw = new BufferedWriter(new FileWriter(outputVCF));
BufferedReader br = new BufferedReader(new FileReader(inputVCF));
String line = br.readLine();
while (line != null) {
line = StringUtils.chomp(line); // remove terminal newline
// output header lines without processing
if (line.substring(0, 1).equals("#")) {
bw.write(line);
} else {
List<String> currentVariants = Arrays.asList(line.split("\\t"));
/* use chrom, position, ref and alt as a key - these should
* be identical between the input and output VCFs. ID (index 2)
* is not always present on the input file. */
List<String> keyList = new LinkedList<String>();
keyList.addAll(currentVariants.subList(0, 2));
keyList.addAll(currentVariants.subList(3, 5));
String key = StringUtils.join(keyList, " ");
/* Check if this variant was phased. This can depend on whether the
* reference panel contained these markers and whether imputation
* was enabled. */
if (phasedVariants.containsKey(key)) {
/* Get the phased GT values. */
List<String> phasedVariantsLine = phasedVariants.get(key);
List<String> phasedFormatColumn = Arrays.asList(phasedVariantsLine.get(FORMAT_COLUMN_INDEX).split(":"));
List<List<String>> phasedSampleColumns = getSampleColumns(phasedVariantsLine);
int phasedGTIndex = phasedFormatColumn.indexOf(GT);
/* Replace the unphased GT values if they exist. Append otherwise. */
List<String> unphasedFormatColumn = Arrays.asList(currentVariants.get(FORMAT_COLUMN_INDEX).split(":"));
List<List<String>> unphasedSampleColumns = getSampleColumns(currentVariants);
int unphasedGTIndex = unphasedFormatColumn.indexOf(GT);
/* Assumption: phased and unphased files have the same number
* of patient columns. */
for (int i = 0; i != phasedSampleColumns.size(); ++i) {
List<String> phasedSampleColumn = phasedSampleColumns.get(i);
List<String> unphasedSampleColumn = unphasedSampleColumns.get(i);
String phasedGTValue = phasedSampleColumn.get(phasedGTIndex);
if (unphasedGTIndex > -1) { // GT field is found in format column
unphasedSampleColumn.set(unphasedGTIndex, phasedGTValue);
} else {
if (unphasedGTIndex == -1) { // GT field is missing from format column
unphasedGTIndex = -2; // special index to indicate GT is added to first column but not all
unphasedFormatColumn.add(GT);
}
// append GT to the end
unphasedSampleColumn.add(phasedGTValue);
}
/* Replace the GT values in the current columns. */
currentVariants.set(SAMPLE_COLUMN_INDEX + i, StringUtils.join(unphasedSampleColumn, ":"));
}
/* Write the modified line to the new merged VCF. */
currentVariants.set(FORMAT_COLUMN_INDEX, StringUtils.join(unphasedFormatColumn, ":"));
String outputLine = StringUtils.join(currentVariants, "\t");
bw.write(outputLine);
} else { // variant was not phased
/* If the input VCF was phased ahead of time, as Complete Genomics files are,
* we have to ignore that phasing and use only the phased genotypes from the
* BEAGLE output based on the reference panel. Otherwise we'll end up with
* potentially non-existent haplotypes. To avoid this, I'm removing all
* phasing on variants that were not found in the reference panel (if they
* were phased ahead of time). */
List<String> unphasedFormatColumn = Arrays.asList(currentVariants.get(FORMAT_COLUMN_INDEX).split(":"));
List<List<String>> unphasedSampleColumns = getSampleColumns(currentVariants);
int unphasedGTIndex = unphasedFormatColumn.indexOf(GT);
/* Iterate through all the sample/patient columns of the VCF. */
for (int i = 0; i != unphasedSampleColumns.size(); ++i) {
List<String> unphasedSampleColumn = unphasedSampleColumns.get(i);
// If GT field is found in format column
if (unphasedGTIndex != -1) {
/* Replace the phased GT values with unphased ("|" becomes "/"). */
String inputGTValue = unphasedSampleColumn.get(unphasedGTIndex);
inputGTValue = inputGTValue.replace("|", "/");
unphasedSampleColumn.set(unphasedGTIndex, inputGTValue);
}
/* Replace the GT values in the current columns. */
currentVariants.set(SAMPLE_COLUMN_INDEX + i, StringUtils.join(unphasedSampleColumn, ":"));
}
/* Write the modified line to the new merged VCF. */
String outputLine = StringUtils.join(currentVariants, "\t");
bw.write(outputLine);
}
}
bw.write("\n"); // we're going to assume this is done in unix rather than using System.getProperty("line.separator")
line = br.readLine();
}
br.close();
bw.close();
return outputVCF;
}
/**
* Since BEAGLE outputs a gzipped VCF, uncompress it first and then load
* each line into memory keyed by the variant positions.
*/
private void storePhasedVariants() throws FileNotFoundException, IOException {
phasedVariants = new HashMap<String, List<String>>();
GZIPInputStream gzippedInputStream = new GZIPInputStream(new FileInputStream(tempFileNamePrefix + ".vcf.gz"));
BufferedReader br = new BufferedReader(new InputStreamReader(gzippedInputStream));
String line = br.readLine();
while (line != null) {
if (!line.substring(0, 1).equals("#")) { // ignore the header lines
line = StringUtils.chomp(line); // remove terminal newline
List<String> currentVariants = Arrays.asList(line.split("\\t"));
/* use chrom, position, ref and alt as a key - these should
* be identical between the input and output VCFs. ID (index 2)
* is not always present on the input file. */
List<String> keyList = new LinkedList<String>();
keyList.addAll(currentVariants.subList(0, 2));
keyList.addAll(currentVariants.subList(3, 5));
String key = StringUtils.join(keyList, " ");
phasedVariants.put(key, currentVariants);
}
line = br.readLine();
}
br.close();
}
/**
* Convert the list of current variants (from the complete current line) to
* a list of sample/patient info lists.
*
* @param currentVariants A List representation of the current line's
* columns
* @return a List of the sample/patient info columns as Lists (list of
* lists)
*/
private List<List<String>> getSampleColumns(List<String> currentVariants) {
List<List<String>> output = new LinkedList<List<String>>();
// Get all the sample Columns
List<String> sampleInfoColumns = currentVariants.subList(SAMPLE_COLUMN_INDEX, currentVariants.size());
// Iterate through the columns of this line
for (String column : sampleInfoColumns) {
output.add(Arrays.asList(column.split(":")));
}
return output;
}
/**
* Get the new partially phased merged VCF.
*
* @return the merged VCF
*/
public File getMergedVCF() {
return this.mergedVCF;
}
}