package org.ut.biolab.medsavant.shared.appdevapi;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ut.biolab.medsavant.shared.format.BasicVariantColumns;

/**
 * Variant representation of individual variant rows from VariantManagerAdapter.getVariants(...).
 *
 * <p>Fixed columns (chromosome, position, alleles, zygosity) are read eagerly in the
 * constructor. Values that have to be parsed out of the custom info, FORMAT or SAMPLE_INFO
 * text (gene symbol, GT, coverage, mutation type) are extracted lazily on first access and
 * then cached.
 *
 * @author rammar
 */
public class Variant {

    private static final String JANNOVAR_EFFECT = BasicVariantColumns.JANNOVAR_EFFECT.getColumnName();
    private static final String JANNOVAR_SYMBOL = BasicVariantColumns.JANNOVAR_SYMBOL.getColumnName();
    private static final String DP4 = "DP4";
    private static final String AD = "AD";
    private static final String AO = "AO";
    private static final String DP = "DP";
    private static final String GT = "GT";

    /** Captures everything up to (not including) the first ':' of the symbol field. */
    private static final Pattern geneSymbolPattern = Pattern.compile("^([^:]+)");

    /** Placeholder the VCF format uses for a missing value. */
    private static final String VCF_MISSING_VALUE = ".";

    /** Sentinel stored in the depth fields until coverage has been extracted. */
    private static final int DEPTH_NOT_SET = -1;

    private final Object[] row;
    private final String chromosome;
    private final String reference;
    private final String alternate;
    private final int alternateNumber;
    private final long start;
    private final long end;
    private final String zygosity;
    private final String infoColumn;
    private final String formatColumn;
    private final String sampleInfoColumn;

    // Lazily extracted values; null / DEPTH_NOT_SET means "not extracted yet or unavailable".
    private int alternateDepth = DEPTH_NOT_SET;
    private int referenceDepth = DEPTH_NOT_SET;
    private String geneSymbol;
    private String gtField; // Corresponds to "GT" in FORMAT column
    private String variantEffect;
    private String variantType;

    /**
     * Variant object based on a row from the table.
     *
     * @param currentRow a single row from VariantManagerAdapter.getVariants(...); the array
     *     is stored as-is (not copied)
     */
    public Variant(Object[] currentRow) {
        row = currentRow;
        chromosome = (String) row[BasicVariantColumns.INDEX_OF_CHROM];
        reference = (String) row[BasicVariantColumns.INDEX_OF_REF];
        alternate = (String) row[BasicVariantColumns.INDEX_OF_ALT];
        alternateNumber = ((Integer) row[BasicVariantColumns.INDEX_OF_ALT_NUMBER]).intValue();
        start = ((Integer) row[BasicVariantColumns.INDEX_OF_START_POSITION]).longValue();
        end = ((Integer) row[BasicVariantColumns.INDEX_OF_END_POSITION]).longValue();
        zygosity = (String) row[BasicVariantColumns.INDEX_OF_ZYGOSITY];
        infoColumn = (String) row[BasicVariantColumns.INDEX_OF_CUSTOM_INFO];

        // FORMAT and SAMPLE_INFO are themselves stored as key=value entries of the info column.
        formatColumn = extractFromInfoColumn(BasicVariantColumns.FORMAT.getColumnName());
        sampleInfoColumn = extractFromInfoColumn(BasicVariantColumns.SAMPLE_INFO.getColumnName());
    }

    /**
     * Thrown when fields are not found within custom info, sample_info or
     * format columns.
     *
     * <p>Declared public and static so that callers of
     * {@link #extractFromFormatColumn(String)} can name it in a catch clause and so that it
     * does not retain a reference to the enclosing {@code Variant}.
     */
    public static class FieldNotFoundException extends Exception {

        private static final long serialVersionUID = 1L;

        public FieldNotFoundException(String message) {
            super(message);
        }
    }

    /**
     * Return the alternate allelic depth of coverage.
     *
     * <p>Coverage is extracted on first use. If extraction fails with a
     * {@link FieldNotFoundException} the error is printed to standard error and the current
     * field value is returned.
     *
     * @return the alternate allelic depth of coverage, or -1 if it could not be determined
     */
    public int getAlternateDepth() {
        if (alternateDepth == DEPTH_NOT_SET) { // not initialized yet
            try {
                extractCoverage();
            } catch (FieldNotFoundException fnfe) {
                reportFieldNotFound(fnfe);
            }
        }
        return alternateDepth;
    }

    /**
     * Return the reference allelic depth of coverage.
     *
     * <p>Coverage is extracted on first use. If extraction fails with a
     * {@link FieldNotFoundException} the error is printed to standard error and the current
     * field value is returned.
     *
     * @return the reference allelic depth of coverage, or -1 if it could not be determined
     */
    public int getReferenceDepth() {
        if (referenceDepth == DEPTH_NOT_SET) { // not initialized yet
            try {
                extractCoverage();
            } catch (FieldNotFoundException fnfe) {
                reportFieldNotFound(fnfe);
            }
        }
        return referenceDepth;
    }

    /**
     * Return the reference allele.
     *
     * @return the reference allele.
     */
    public String getReference() {
        return reference;
    }

    /**
     * Return the alternate allele.
     *
     * @return the alternate allele.
     */
    public String getAlternate() {
        return alternate;
    }

    /**
     * Return the alternate allele number. The first alternate allele is 1, the
     * second is 2, etc.
     *
     * @return the alternate allele number
     */
    public int getAlternateNumber() {
        return alternateNumber;
    }

    /**
     * Return the chromosome for this variant.
     *
     * @return the chromosome
     */
    public String getChromosome() {
        return chromosome;
    }

    /**
     * Return the start position for this variant.
     *
     * @return the start position
     */
    public long getStart() {
        return start;
    }

    /**
     * Return the end position for this variant.
     *
     * @return the end position
     */
    public long getEnd() {
        return end;
    }

    /**
     * Return the first gene symbol for this variant.
     *
     * @return the gene symbol, or null if the info column carries no symbol field
     */
    public String getGene() {
        if (geneSymbol == null) {
            extractGene();
        }
        return geneSymbol;
    }

    /**
     * Get the GT field corresponding to the genotype and haplotype for this variant.
     *
     * <p>If the lookup fails with a {@link FieldNotFoundException} the error is printed to
     * standard error and null is returned.
     *
     * @return the GT field value, or null if it is unavailable
     */
    public String getGT() {
        if (gtField == null) {
            try {
                gtField = extractFromFormatColumn(GT);
            } catch (FieldNotFoundException fnfe) {
                reportFieldNotFound(fnfe);
            }
        }
        return gtField;
    }

    /**
     * Get the raw value of the Jannovar symbol field from the info column.
     *
     * @return the symbol field value, or null if the field is absent
     */
    public String getMutationSymbols() {
        if (variantEffect == null) {
            variantEffect = extractFromInfoColumn(JANNOVAR_SYMBOL);
        }
        return variantEffect;
    }

    /**
     * Get the mutation category of this variant, i.e. the raw value of the Jannovar effect
     * field from the info column.
     *
     * @return the mutation type/category, or null if the field is absent
     */
    public String getMutationType() {
        if (variantType == null) {
            variantType = extractFromInfoColumn(JANNOVAR_EFFECT);
        }
        return variantType;
    }

    /**
     * Get the zygosity of this variant.
     *
     * @return the zygosity String
     */
    public String getZygosity() {
        return zygosity;
    }

    /**
     * Get value for a specific column.
     *
     * @param columnName the String name of this column (in the header)
     * @return the value corresponding to this column
     * @throws ArrayIndexOutOfBoundsException if {@code columnName} is not in the header
     *     (the header lookup yields -1, which is then used as the row index)
     */
    public Object getColumn(String columnName) {
        int index = VariantIterator.header.indexOf(columnName);
        return row[index];
    }

    /**
     * Get the full table row for this variant.
     *
     * @return row for this variant (the internal array, not a copy)
     */
    public Object[] getRow() {
        return row;
    }

    /**
     * Get the table header.
     *
     * @return table header
     */
    public List<String> getHeader() {
        return VariantIterator.header;
    }

    /**
     * Extract field from the VCF CUSTOM_INFO column.
     *
     * <p>The key is matched case-insensitively and literally (regex metacharacters in the
     * key are not interpreted). It must begin an entry, i.e. sit at the start of the column
     * or directly after a ';' (optionally followed by whitespace), so that looking up
     * {@code RO} cannot accidentally match inside {@code PRO=...}.
     *
     * @param key the key for the field within the info column
     * @return the value corresponding to the key, null if key is absent or if there is no
     *     info column
     */
    public String extractFromInfoColumn(String key) {
        if (infoColumn == null || key == null) {
            return null;
        }

        String regex = "(?:^|;)\\s*" + Pattern.quote(key) + "=([^;]+)";
        Matcher keyMatcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(infoColumn);

        // NOTE: need to run find() before group() can be read
        return keyMatcher.find() ? keyMatcher.group(1) : null;
    }

    /**
     * Extract field from FORMAT and SAMPLE_INFO columns.
     *
     * <p>The FORMAT column is split on ':' to locate the position of {@code key}; the value
     * at the same position of the ':'-split SAMPLE_INFO column is returned.
     *
     * @param key the key for the field within the FORMAT column
     * @return the value corresponding to the key from the SAMPLE_INFO column, null if key is
     *     absent or if the FORMAT or SAMPLE_INFO column is missing altogether
     * @throws org.ut.biolab.medsavant.shared.appdevapi.Variant.FieldNotFoundException
     *     if the number of FORMAT fields differs from the number of SAMPLE_INFO fields.
     *     NOTE(review): the original author remarks that such a difference is acceptable
     *     under the VCF 4.1 spec (only GT has to be present; all other fields can be
     *     missing), so this check is stricter than the spec - confirm whether that is
     *     intended.
     */
    public String extractFromFormatColumn(String key) throws FieldNotFoundException {
        if (formatColumn == null || sampleInfoColumn == null) {
            return null; // nothing to look the key up in
        }

        List<String> formatKeys = Arrays.asList(formatColumn.split(":"));
        List<String> sampleInfoValues = Arrays.asList(sampleInfoColumn.split(":"));

        // Check that format and sample info columns have same number of fields
        if (formatKeys.size() != sampleInfoValues.size()) {
            throw new FieldNotFoundException("FORMAT and SAMPLE_INFO columns "
                + "have a different number of fields; should be identical size.");
        }

        int index = formatKeys.indexOf(key);
        return index == -1 ? null : sampleInfoValues.get(index);
    }

    /**
     * Print a field lookup failure to standard error (message first, then the stack trace).
     */
    private static void reportFieldNotFound(FieldNotFoundException fnfe) {
        System.err.println(fnfe.getMessage());
        fnfe.printStackTrace();
    }

    /**
     * Tell whether a FORMAT/SAMPLE_INFO value is usable, i.e. neither absent nor the VCF
     * missing-value placeholder. Compares by content, not by reference.
     */
    private static boolean hasValue(String text) {
        return text != null && !VCF_MISSING_VALUE.equals(text);
    }

    /**
     * Extract the gene symbol for this variant: the part of the symbol field that precedes
     * the first ':'. Leaves {@code geneSymbol} untouched when the field is absent.
     */
    private void extractGene() {
        String geneString = extractFromInfoColumn(JANNOVAR_SYMBOL);
        if (geneString == null) {
            return; // no symbol annotated for this variant
        }

        Matcher geneMatcher = geneSymbolPattern.matcher(geneString);
        if (geneMatcher.find()) {
            geneSymbol = geneMatcher.group(1);
        }
    }

    /**
     * Extracts the reference and alternate allelic coverage for this variant.
     *
     * <p>Sources are tried in this order: DP4 (info column), AD (sample info), then AO
     * together with DP (sample info). If none is usable both depths keep their current
     * values.
     *
     * @throws FieldNotFoundException if the FORMAT and SAMPLE_INFO columns disagree in size
     * @throws NumberFormatException if a selected field holds a non-integer count
     */
    private void extractCoverage() throws FieldNotFoundException {
        String dp4Text = extractFromInfoColumn(DP4);
        String adText = extractFromFormatColumn(AD);
        String aoText = extractFromFormatColumn(AO);
        String dpText = extractFromFormatColumn(DP);

        /* Process DP4 or AD or AO and DP text (from VCF INFO or Format columns) if present. */
        if (dp4Text != null) {
            /* From the samtools definition of the DP4 field:
             * Number of:
             * 1) forward ref alleles;
             * 2) reverse ref;
             * 3) forward non-ref;
             * 4) reverse non-ref alleles, used in variant calling.
             * Sum can be smaller than DP because low-quality bases are not counted.
             *
             * URL: http://samtools.sourceforge.net/mpileup.shtml
             */
            String[] delimited = dp4Text.split(",");
            referenceDepth = Integer.parseInt(delimited[0]) + Integer.parseInt(delimited[1]);
            alternateDepth = Integer.parseInt(delimited[2]) + Integer.parseInt(delimited[3]);
        } else if (hasValue(adText)) {
            // First entry is used as the reference depth, second as the alternate depth.
            String[] adCoverageDelimited = adText.split(",");
            referenceDepth = Integer.parseInt(adCoverageDelimited[0]);
            alternateDepth = Integer.parseInt(adCoverageDelimited[1]);
        } else if (hasValue(aoText) && hasValue(dpText)) {
            int totalCount = Integer.parseInt(dpText);
            String[] aoCoverageDelimited = aoText.split(",");

            // UPDATE WHEN DB IS UPDATED TO NEW FORMAT - deals with multiple alleles/vcf line
            System.err.println("Fix covereage BUG here: " + this.getClass().getSimpleName());

            /* Sometimes the AO count can be comma separated for multiple alternate
             * alleles. In this case, due to the way we import these in MedSavant,
             * I don't know which allele corresponds to which depth, so take the
             * minimum. This may have to be modified later.
             */
            int minimumAlternateCount = Integer.parseInt(aoCoverageDelimited[0]);
            for (int i = 1; i < aoCoverageDelimited.length; ++i) {
                minimumAlternateCount =
                    Math.min(minimumAlternateCount, Integer.parseInt(aoCoverageDelimited[i]));
            }

            alternateDepth = minimumAlternateCount;
            referenceDepth = totalCount - alternateDepth;
        }
    }
}