/* * AncestralSequenceAnnotator.java * * Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard * * This file is part of BEAST. * See the NOTICE file distributed with this work for additional * information regarding copyright ownership and licensing. * * BEAST is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * BEAST is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with BEAST; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301 USA */ package dr.app.tools; import dr.evomodel.branchmodel.HomogeneousBranchModel; import dr.evomodelxml.siteratemodel.GammaSiteModelParser; import dr.evomodel.substmodel.FrequencyModel; import dr.evomodel.substmodel.GeneralSubstitutionModel; import dr.evomodel.substmodel.aminoacid.EmpiricalAminoAcidModel; import dr.evomodel.substmodel.aminoacid.JTT; import dr.evomodel.substmodel.aminoacid.LG; import dr.evomodel.substmodel.aminoacid.WAG; import dr.evomodel.substmodel.codon.GY94CodonModel; import dr.evomodel.substmodel.nucleotide.GTR; import dr.evomodel.substmodel.nucleotide.HKY; import dr.app.beast.BeastVersion; import dr.app.util.Arguments; import dr.app.util.Utils; import dr.evolution.alignment.Alignment; import dr.evolution.alignment.ConvertAlignment; import dr.evolution.alignment.PatternList; import dr.evolution.alignment.SimpleAlignment; //import dr.evolution.datatype.AminoAcids; //import dr.evolution.datatype.GeneralDataType; import dr.evolution.datatype.*; import dr.evolution.io.*; import dr.evolution.sequence.Sequence; import dr.evolution.tree.*; import dr.evolution.util.Taxon; import dr.evolution.util.TaxonList; import dr.evomodel.branchratemodel.BranchRateModel; import dr.evomodel.branchratemodel.StrictClockBranchRates; import dr.evomodel.substmodel.SubstitutionModel; import dr.evomodel.tree.TreeModel; import dr.oldevomodelxml.substmodel.GeneralSubstitutionModelParser; import dr.inference.model.Parameter; import dr.stats.DiscreteStatistics; import dr.util.HeapSort; import dr.util.Version; import dr.evomodel.siteratemodel.GammaSiteRateModel; import dr.evomodel.treelikelihood.PartialsRescalingScheme; import dr.evomodel.treelikelihood.AncestralStateBeagleTreeLikelihood; import java.io.*; import java.util.*; import java.util.logging.Logger; /* * @author Marc A. Suchard * @author Wai Lok Sibon Li */ public class AncestralSequenceAnnotator { private final static Version version = new BeastVersion(); public final static int MAX_CLADE_CREDIBILITY = 0; public final static int MAX_SUM_CLADE_CREDIBILITY = 1; public final static int USER_TARGET_TREE = 2; public final static int KEEP_HEIGHTS = 0; public final static int MEAN_HEIGHTS = 1; public final static int MEDIAN_HEIGHTS = 2; public final String[] GENERAL_MODELS_LIST = {"EQU"}; public final String[] NUCLEOTIDE_MODELS_LIST = {"HKY", "TN", "GTR"}; public final String[] AMINO_ACID_MODELS_LIST = {"JTT1992", "WAG2001", "LG2008", "Empirical\\(.+\\)"}; public final String[] TRIPLET_MODELS_LIST = {"HKYx3", "TNx3", "GTRx3"}; public final String[] CODON_MODELS_LIST = {"M0HKY", "M0TN", "M0GTR"};//"M0", "M0\\[.+\\]", ""}; public AncestralSequenceAnnotator(int burnin, int heightsOption, double posteriorLimit, int targetOption, String targetTreeFileName, String inputFileName, String outputFileName, String kalignExecutable ) throws IOException { this.posteriorLimit = posteriorLimit; this.kalignExecutable = kalignExecutable; attributeNames.add("height"); attributeNames.add("length"); System.out.println("Reading trees and simulating internal node states..."); CladeSystem cladeSystem = new CladeSystem(); boolean firstTree = true; FileReader fileReader = new FileReader(inputFileName); TreeImporter importer = new NexusImporter(fileReader); TreeExporter exporter; if (outputFileName != null) exporter = new NexusExporter(new PrintStream(new FileOutputStream(outputFileName))); else exporter = new NexusExporter(System.out); TreeExporter simulationResults = new NexusExporter(new PrintStream(new FileOutputStream(inputFileName + ".out"))); List<Tree> simulatedTree = new ArrayList<Tree>(); // burnin = 0; // java.util.logging.Logger.getLogger("dr.evomodel"). try { while (importer.hasTree()) { Tree tree = importer.importNextTree(); if (firstTree) { Tree unprocessedTree = tree; tree = processTree(tree); setupTreeAttributes(tree); setupAttributes(tree); tree = unprocessedTree; //This actually does nothing since unprocessedTree was a reference to processedTree in the first place firstTree = false; } if (totalTrees >= burnin) { addTreeAttributes(tree); // Tree savedTree = tree; tree = processTree(tree); // System.err.println(Tree.Utils.newick(tree)); exporter.exportTree(tree); // simulationResults.exportTree(tree); // System.exit(-1); simulatedTree.add(tree); cladeSystem.add(tree); totalTreesUsed += 1; } totalTrees += 1; } } catch (Importer.ImportException e) { System.err.println("Error Parsing Input Tree: " + e.getMessage()); return; } fileReader.close(); cladeSystem.calculateCladeCredibilities(totalTreesUsed); System.out.println("\tTotal trees read: " + totalTrees); if (burnin > 0) { System.out.println("\tIgnoring first " + burnin + " trees."); } simulationResults.exportTrees(simulatedTree.toArray(new Tree[simulatedTree.size()])); MutableTree targetTree; if (targetOption == USER_TARGET_TREE) { if (targetTreeFileName != null) { System.out.println("Reading user specified target tree, " + targetTreeFileName); importer = new NexusImporter(new FileReader(targetTreeFileName)); try { targetTree = new FlexibleTree(importer.importNextTree()); } catch (Importer.ImportException e) { System.err.println("Error Parsing Target Tree: " + e.getMessage()); return; } } else { System.err.println("No user target tree specified."); return; } } else if (targetOption == MAX_CLADE_CREDIBILITY) { System.out.println("Finding maximum credibility tree..."); targetTree = new FlexibleTree(summarizeTrees(burnin, cladeSystem, inputFileName, false)); } else if (targetOption == MAX_SUM_CLADE_CREDIBILITY) { System.out.println("Finding maximum sum clade credibility tree..."); targetTree = new FlexibleTree(summarizeTrees(burnin, cladeSystem, inputFileName, true)); } else { throw new RuntimeException("Unknown target tree option"); } System.out.println("Annotating target tree... (this may take a long time)"); // System.out.println("Starting processing...."); // System.out.println("calling annotateTree"); // System.out.println("targetTree has "+targetTree.getNodeCount()+" nodes."); cladeSystem.annotateTree(targetTree, targetTree.getRoot(), null, heightsOption); System.out.println("Processing ended."); System.out.println("Writing annotated tree...."); if (outputFileName != null) { exporter = new NexusExporter(new PrintStream(new FileOutputStream(outputFileName))); exporter.exportTree(targetTree); } else { exporter = new NexusExporter(System.out); exporter.exportTree(targetTree); } } private abstract class SubstitutionModelLoader { public final char[] AA_ORDER = AminoAcids.AMINOACID_CHARS; public final char[] NUCLEOTIDE_ORDER = Nucleotides.NUCLEOTIDE_CHARS; public final String[] CODON_ORDER = Codons.CODON_TRIPLETS; protected SubstitutionModel substModel; protected FrequencyModel freqModel; private String modelType; protected String substModelName; protected String[] charList; protected DataType dataType; SubstitutionModelLoader(Tree tree, String modelType, DataType dataType) { this.dataType = dataType; this.modelType = modelType; load(tree, modelType); } /* An artifact of old code? */ SubstitutionModelLoader(String name) { this.substModelName = name; } public SubstitutionModel getSubstitutionModel() { return substModel; } public FrequencyModel getFrequencyModel() { return freqModel; } public String getModelType() { return modelType; } public String getSubstModel() { return substModelName; } public String[] getCharList() { return charList; } public void setCharList(String[] cl) { System.arraycopy(cl, 0, charList, 0, cl.length); //charList = cl.clone(); } public abstract DataType getDataType(); protected abstract void modelSpecifics(Tree tree, String modelType); public void load(Tree tree, String modelType) { substModelName = modelType.replaceFirst("\\*.+","").replaceFirst("\\+.+","").trim(); loadFrequencyModel(tree); modelSpecifics(tree, modelType); printLogLikelihood(tree); } private void loadFrequencyModel(Tree tree) { final String[] AA_ORDER = {"A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R", "S","T","V","W","Y"}; //"ACDEFGHIKLMNPQRSTVWY".split("");//AminoAcids.AMINOACID_CHARS; final String[] DNA_NUCLEOTIDE_ORDER = {"A", "C", "G", "T"};//"ACGT".split(""); //Nucleotides.NUCLEOTIDE_CHARS; final String[] RNA_NUCLEOTIDE_ORDER = {"A", "C", "G", "U"};//"ACGU".split(""); //Nucleotides.NUCLEOTIDE_CHARS; // final String[] CODON_ORDER = {"AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", // "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", // "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", // "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", // "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", // "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", // "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", // "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT"};// Codons.CODON_TRIPLETS; final String[] DNA_CODON_ORDER = {"AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", /*"TAA",*/ "TAC", /*"TAG",*/ "TAT", "TCA", "TCC", "TCG", "TCT", /* Minus the stop and start codons */ /*"TGA",*/ "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT"};// Codons.CODON_TRIPLETS; final String[] RNA_CODON_ORDER = {"AAA", "AAC", "AAG", "AAU", "ACA", "ACC", "ACG", "ACU", "AGA", "AGC", "AGG", "AGU", "AUA", "AUC", "AUG", "AUU", "CAA", "CAC", "CAG", "CAU", "CCA", "CCC", "CCG", "CCU", "CGA", "CGC", "CGG", "CGU", "CUA", "CUC", "CUG", "CUU", "GAA", "GAC", "GAG", "GAU", "GCA", "GCC", "GCG", "GCU", "GGA", "GGC", "GGG", "GGU", "GUA", "GUC", "GUG", "GUU", /*"UAA",*/ "UAC", /*"UAG",*/ "UAU", "UCA", "UCC", "UCG", "UCU", /* Minus the stop and start codons */ /*"UGA",*/ "UGC", "UGG", "UGU", "UUA", "UUC", "UUG", "UUU"};// Codons.CODON_TRIPLETS; /* For BAli-Phy, even if F=constant, you can still extract the frequencies this way. */ /* Obtain the equilibrium base frequencies for the model */ double[] freq = new double[0]; String[] charOrder = new String[0]; if(getDataType().getClass().equals(GeneralDataType.class)) { ArrayList<String> tempCharOrder = new ArrayList<String>(freq.length); for (Iterator<String> i = tree.getAttributeNames(); i.hasNext();) { String name = i.next(); if (name.startsWith("pi")) { /* the pi in the output files contains the frequencies */ String character = name.substring(2, name.length()); tempCharOrder.add(character); } } charOrder = tempCharOrder.toArray(new String[tempCharOrder.size()]); //this.charList = tempCharOrder.toArray(new String[tempCharOrder.size()]); freq = new double[charOrder.length]; } else if(getDataType().getClass().equals(Nucleotides.class)) { if(tree.getAttribute("piT") != null) { charOrder = DNA_NUCLEOTIDE_ORDER; freq = new double[charOrder.length]; } else if(tree.getAttribute("piU") != null) { charOrder = RNA_NUCLEOTIDE_ORDER; freq = new double[charOrder.length]; } else { throw new RuntimeException("Not proper nucleotide data"); } } else if(getDataType().getClass().equals(AminoAcids.class)) { charOrder = AA_ORDER; freq = new double[charOrder.length]; } else if(getDataType().getClass().equals(Codons.class)) { if(tree.getAttribute("piAAT") != null) { charOrder = DNA_CODON_ORDER; freq = new double[charOrder.length]; } else if(tree.getAttribute("piAAU") != null) { charOrder = RNA_CODON_ORDER; freq = new double[charOrder.length]; } else{ throw new RuntimeException("Base frequencies do not fit those for a codon model or not proper nucleotide data for codons\n" + "If you are using F=nucleotides models for codon models, they are currently not supported in BEAST"); } } else { throw new RuntimeException("Datatype unknown! (This error message should never be seen, contact Sibon)"); } // // This if statement is reserved for triplet data // else if(getDataType().getClass().equals(Nucleotides.class)) { // // This is wrong because for triplets // freq = new double[Nucleotides.NUCLEOTIDE_CHARS.length]; // // } int cnt = 0; double sum = 0; //charList = ""; ArrayList<String> tempCharList = new ArrayList<String>(freq.length); for (Iterator<String> i = tree.getAttributeNames(); i.hasNext();) { String name = i.next(); if (name.startsWith("pi")) { /* the pi in the output files contains the frequencies */ String character = name.substring(2, name.length()); tempCharList.add(character); //charList = charList.concat(character); Double value = (Double) tree.getAttribute(name); freq[cnt++] = value; sum += value; } } charList = tempCharList.toArray(new String[tempCharList.size()]); // for(int j=0; j<charList.length; j++) { // System.out.println("charizard lists " + charList[j]); // } /* Order the frequencies correctly */ double[] freqOrdered = new double[freq.length]; for (int i = 0; i < freqOrdered.length; i++) { int index = -5; search: for (int j = 0; j < charList.length; j++) { if(charList[j].equals(charOrder[i])) { index = j; break search; } } freqOrdered[i] = freq[index] / sum; //System.out.println(" no fried " + freqOrdered.length + "\t" + freq.length + "\t" + charOrder[i] + "\t" + freqOrdered[i]); //ddd } this.freqModel = new FrequencyModel(getDataType(), new Parameter.Default(freqOrdered)); } protected boolean doPrint = false; private void printLogLikelihood(Tree tree) { if (doPrint) { Double logLikelihood = Double.parseDouble(tree.getAttribute(LIKELIHOOD).toString()); if (logLikelihood != null) System.err.printf("%5.1f", logLikelihood); } } } private class GeneralSubstitutionModelLoader extends SubstitutionModelLoader { private final String EQU_TEXT = "EQU"; private GeneralSubstitutionModelLoader(Tree tree, String modelType) { super(tree, modelType, new GeneralDataType(new String[0])); setGeneralDataType(); throw new RuntimeException("General substitution model is currently not stable and should not be used"); } protected void modelSpecifics(Tree tree, String modelType) { if(substModelName.equals(EQU_TEXT)) { if(freqModel.getFrequencyCount() != charList.length) { System.err.println("Frequency model length does not match character list length, " + "GeneralSubstitutionModelLoader"); System.exit(-1); } /* Equivalent to a JC model but for all states */ //TODO CHECK IF THIS IS CORRECT double[] rates = new double[(charList.length * (charList.length - 1)) / 2]; for(int i=0; i<rates.length; i++) { rates[i] = 1.0; } System.out.println("Number of site transition rate categories (debuggin): " + rates.length); //substModel = new GeneralSubstitutionModel(freqModel.getDataType(), freqModel, new Parameter.Default(rates), 1); substModel = new GeneralSubstitutionModel(GeneralSubstitutionModelParser.GENERAL_SUBSTITUTION_MODEL, freqModel.getDataType(), freqModel, new Parameter.Default(rates), 1, null); } } public DataType getDataType() { setGeneralDataType(); return dataType; } public void setGeneralDataType() { if(charList!=null && dataType.getStateCount()!=charList.length) { GeneralDataType gdt = new GeneralDataType(charList); gdt.addAmbiguity("-", charList); gdt.addAmbiguity("X", charList); gdt.addAmbiguity("?", charList); dataType = gdt; } } } private class NucleotideSubstitutionModelLoader extends SubstitutionModelLoader { protected static final String HKY_TEXT = "HKY"; protected static final String TN_TEXT = "TN"; protected static final String GTR_TEXT = "GTR"; private NucleotideSubstitutionModelLoader(Tree tree, String modelType) { super(tree, modelType, Nucleotides.INSTANCE); } protected void modelSpecifics(Tree tree, String modelType) { if(substModelName.equals(HKY_TEXT)) { double kappa = Double.parseDouble(tree.getAttribute("HKY_kappa").toString()); //double kappa = (Double) tree.getAttribute("HKY_kappa"); //double kappa = (Double) tree.getAttribute("HKY\\:\\:kappa"); //double kappa = (Double) tree.getAttribute("HKY::kappa"); //double kappa = (Double) tree.getAttribute("kappa"); substModel = new HKY(new Parameter.Default(kappa), freqModel); } if(substModelName.equals(TN_TEXT)) { double kappa1 = Double.parseDouble(tree.getAttribute("TN_kappa(pur)").toString()); double kappa2 = Double.parseDouble(tree.getAttribute("TN_kappa(pyr)").toString()); //double kappa1 = (Double) tree.getAttribute("TN_kappa(pur)"); //double kappa2 = (Double) tree.getAttribute("TN_kappa(pyr)"); System.err.println("Sorry, TN substitution model is not yet implemented in BEAST-Beagle"); System.exit(0); //TODO Tamura-Nei model //substModel = new TN93(new Parameter.Default(kappa1), new Parameter.Default(kappa2), freqModel); } if(substModelName.equals(GTR_TEXT)) { /* It should be noted that BAli-Phy uses TC instead of CT and GC instead of CG */ //double rateACValue = (Double) tree.getAttribute("GTR_AC"); //double rateAGValue = (Double) tree.getAttribute("GTR_AG"); //double rateATValue = (Double) tree.getAttribute("GTR_AT"); //double rateCGValue = (Double) tree.getAttribute("GTR_GC"); //double rateCTValue = (Double) tree.getAttribute("GTR_TC"); //double rateGTValue = (Double) tree.getAttribute("GTR_GT"); double rateACValue = Double.parseDouble(tree.getAttribute("GTR_AC").toString()); double rateAGValue = Double.parseDouble(tree.getAttribute("GTR_AG").toString()); double rateATValue = Double.parseDouble(tree.getAttribute("GTR_AT").toString()); double rateCGValue = Double.parseDouble(tree.getAttribute("GTR_GC").toString()); double rateCTValue = Double.parseDouble(tree.getAttribute("GTR_TC").toString()); double rateGTValue = Double.parseDouble(tree.getAttribute("GTR_GT").toString()); substModel = new GTR(new Parameter.Default(rateACValue), new Parameter.Default(rateAGValue), new Parameter.Default(rateATValue), new Parameter.Default(rateCGValue), new Parameter.Default(rateCTValue), new Parameter.Default(rateGTValue), freqModel); } } public DataType getDataType() { return dataType; //Potential } } private class AminoAcidSubstitutionModelLoader extends SubstitutionModelLoader { protected final String JTT_TEXT = "JTT1992"; protected final String WAG_TEXT = "WAG2001"; protected final String LG_TEXT = "LG2008"; protected final String Empirical_TEXT = "Empirical(.+).+"; private AminoAcidSubstitutionModelLoader(Tree tree, String modelType) { super(tree, modelType, AminoAcids.INSTANCE); } protected void modelSpecifics(Tree tree, String modelType) { if(substModelName.equals(JTT_TEXT)) { substModel = new EmpiricalAminoAcidModel(JTT.INSTANCE, freqModel); } if(substModelName.equals(WAG_TEXT)) { substModel = new EmpiricalAminoAcidModel(WAG.INSTANCE, freqModel); } if(substModelName.equals(LG_TEXT)) { substModel = new EmpiricalAminoAcidModel(LG.INSTANCE, freqModel); } //todo Allow proper file input of Empirical amino-acid models if(substModelName.matches(Empirical_TEXT)) { String empiricalModelFileName = substModelName.replaceFirst("Empirical\\(", "").replaceFirst("\\).*",""); if (empiricalModelFileName.equals("wag.dat")) { substModel = new EmpiricalAminoAcidModel(WAG.INSTANCE, freqModel); } else if(empiricalModelFileName.equals("jtt.dat")) { substModel = new EmpiricalAminoAcidModel(JTT.INSTANCE, freqModel); } else if(empiricalModelFileName.equals("lg.dat")) { substModel = new EmpiricalAminoAcidModel(LG.INSTANCE, freqModel); } else { System.err.println("Sorry, AncestralSequenceAnnotator does not currently support other files"); System.err.println("Soon, we will allow users to enter a file"); System.exit(0); } } } public DataType getDataType() { return dataType; //Potential } } private class TripletSubstitutionModelLoader extends SubstitutionModelLoader { protected final String HKYx3_TEXT = "HKYx3"; protected final String TNx3_TEXT = "TNx3"; protected final String GTRx3_TEXT = "GTRx3"; private TripletSubstitutionModelLoader(Tree tree, String modelType) { super(tree, modelType, Nucleotides.INSTANCE); } protected void modelSpecifics(Tree tree, String modelType) { if(substModelName.equals(HKYx3_TEXT)) { System.err.println("Sorry, HKYx3 substitution model is not yet implemented in BEAST"); System.exit(0); //substModel = new HKY(); } if(substModelName.equals(TNx3_TEXT)) { System.err.println("Sorry, TNx3 substitution model is not yet implemented in BEAST"); System.exit(0); //substModel = new TN93(); } if(substModelName.equals(GTRx3_TEXT)) { System.err.println("Sorry, GTRx3 substitution model is not yet implemented in BEAST"); System.exit(0); //substModel = new GTR(); } } public DataType getDataType() { return dataType; // Is this right? } } private class CodonSubstitutionModelLoader extends SubstitutionModelLoader { // private final String M0_TEXT = "M0"; // Not necessary since this never actually shows up in BAli-Phy output protected final String M0_NUC_TEXT = "M0\\w+"; //private final String HKY_TEXT = "HKY"; //private final String TN_TEXT = "TN"; //private final String GTR_TEXT = "GTR"; private CodonSubstitutionModelLoader(Tree tree, String modelType) { super(tree, modelType, Codons.UNIVERSAL); //Potential } protected void modelSpecifics(Tree tree, String modelType) { //String codonNucleotideModel = substModelName.substring(substModelName.indexOf("\\\\[")+1, substModelName.indexOf("\\\\]")); String codonNucleotideModel = substModelName.substring(substModelName.indexOf("M0")+2, substModelName.length()); // if(substModelName.equals(M0_TEXT)) { // /* HKY is default */ // codonNucleotideModel = NucleotideSubstitutionModelLoader.HKY_TEXT; // // // } if(substModelName.matches(M0_NUC_TEXT)) { /* M0_omega may be *M0_omega, depending on whether M2 etc. are used */ double omega = Double.parseDouble(tree.getAttribute("M0_omega").toString()); //omega = (Double) tree.getAttribute("\\M0_omega"); if(codonNucleotideModel.equals(NucleotideSubstitutionModelLoader.HKY_TEXT)) { //double kappa = (Double) tree.getAttribute("HKY_kappa"); double kappa = Double.parseDouble(tree.getAttribute("HKY_kappa").toString()); //substModel = new YangCodonModel(Codons.UNIVERSAL, new Parameter.Default(omega), //new Parameter.Default(kappa), freqModel); substModel = new GY94CodonModel(Codons.UNIVERSAL, new Parameter.Default(omega), new Parameter.Default(kappa), freqModel); } if(codonNucleotideModel.equals(NucleotideSubstitutionModelLoader.TN_TEXT)) { //double kappa1 = (Double) tree.getAttribute("TN_kappa(pur)"); //double kappa2 = (Double) tree.getAttribute("TN_kappa(pyr)"); double kappa1 = Double.parseDouble(tree.getAttribute("TN_kappa(pur)").toString()); double kappa2 = Double.parseDouble(tree.getAttribute("TN_kappa(pyr)").toString()); System.err.println("Sorry, M0[TN] substitution model is not yet implemented in BEAST"); System.exit(0); } if(codonNucleotideModel.equals(NucleotideSubstitutionModelLoader.GTR_TEXT)) { double rateACValue = Double.parseDouble(tree.getAttribute("GTR_AC").toString()); double rateAGValue = Double.parseDouble(tree.getAttribute("GTR_AG").toString()); double rateATValue = Double.parseDouble(tree.getAttribute("GTR_AT").toString()); double rateCGValue = Double.parseDouble(tree.getAttribute("GTR_GC").toString()); double rateCTValue = Double.parseDouble(tree.getAttribute("GTR_TC").toString()); double rateGTValue = Double.parseDouble(tree.getAttribute("GTR_GT").toString()); //double rateACValue = (Double) tree.getAttribute("GTR_AC"); //double rateAGValue = (Double) tree.getAttribute("GTR_AG"); //double rateATValue = (Double) tree.getAttribute("GTR_AT"); //double rateCGValue = (Double) tree.getAttribute("GTR_GC"); //double rateCTValue = (Double) tree.getAttribute("GTR_TC"); //double rateGTValue = (Double) tree.getAttribute("GTR_GT"); System.err.println("Sorry, M0[GTR] substitution model is not yet implemented in BEAST"); System.exit(0); } // If +m2 then *M0 } } public DataType getDataType() { return dataType; // Is this right too? Just use the universal codon table? } } /* * This method is equivalent to the SubstitutionModelLoader without having to * be object orientated and can be much more flexible. */ private GammaSiteRateModel loadSiteModel(Tree tree) { String modelType = (String) tree.getAttribute(SUBST_MODEL); /* Identify the datatype and substitution model. Load the model */ SubstitutionModelLoader sml = null; String substModelName = modelType.replaceFirst("\\*.+","").replaceFirst("\\+.+","").trim(); System.out.println("Basic Substitution Model is " + substModelName); for(int i = 0; i<GENERAL_MODELS_LIST.length; i++) { if(substModelName.matches(GENERAL_MODELS_LIST[i])) { sml = new GeneralSubstitutionModelLoader(tree, modelType); } } for(int i = 0; i<NUCLEOTIDE_MODELS_LIST.length; i++) { if(substModelName.matches(NUCLEOTIDE_MODELS_LIST[i])) { sml = new NucleotideSubstitutionModelLoader(tree, modelType); } } for(int i = 0; i<AMINO_ACID_MODELS_LIST.length; i++) { if(substModelName.matches(AMINO_ACID_MODELS_LIST[i])) { sml = new AminoAcidSubstitutionModelLoader(tree, modelType); } } for(int i = 0; i<TRIPLET_MODELS_LIST.length; i++) { if(substModelName.matches(TRIPLET_MODELS_LIST[i])) { sml = new TripletSubstitutionModelLoader(tree, modelType); } } for(int i = 0; i<CODON_MODELS_LIST.length; i++) { if(substModelName.matches(CODON_MODELS_LIST[i])) { sml = new CodonSubstitutionModelLoader(tree, modelType); } } if (sml.getSubstitutionModel() == null) { System.err.println("Substitution model type '" + modelType + "' not implemented"); System.exit(-1); } //SiteModel siteModel = new GammaSiteModel(sml.getSubstitutionModel(), new Parameter.Default(1.0), null, 0, null); //SiteModel siteModel = new GammaSiteModel(sml.getSubstitutionModel(), null, null, 0, null); String siteRatesModels = modelType.substring(modelType.indexOf("+")+1, modelType.length()); //String[] siteRatesModels = siteRatesParameters.split(" + "); System.out.println("Site rate models: " + siteRatesModels); if(sml.getSubstitutionModel().getDataType().getClass().equals(Codons.class) && siteRatesModels.length() > 0) { /* For codon site models */ if(siteRatesModels.indexOf("+M2") >= 0) { /* M2 */ System.out.println("Site model - M2 Codon site model used"); Parameter m2FrequencyAAInv = new Parameter.Default(Double.parseDouble(tree.getAttribute("M2_f[AA INV]").toString())); Parameter m2FrequencyNeutral = new Parameter.Default(Double.parseDouble(tree.getAttribute("M2_f[Neutral]").toString())); Parameter m2FrequencySelected = new Parameter.Default(Double.parseDouble(tree.getAttribute("M2_f[Selected]").toString())); Parameter m2Omega = new Parameter.Default(Double.parseDouble(tree.getAttribute("M2_omega").toString())); //Parameter m2FrequencyAAInv = new Parameter.Default((Double) tree.getAttribute("M2_f[AA INV]")); //Parameter m2FrequencyNeutral = new Parameter.Default((Double) tree.getAttribute("M2_f[Neutral]")); //Parameter m2FrequencySelected = new Parameter.Default((Double) tree.getAttribute("M2_f[Selected]")); //Parameter m2Omega = new Parameter.Default((Double) tree.getAttribute("M2_omega")); System.err.println("Sorry, M2 substitution model is not yet implemented in BEAST"); System.exit(0); } else if(siteRatesModels.indexOf("+M3") >= 0) { /* M3 */ System.out.println("Site model - M3 Codon site model used"); int numberOfBins = Integer.parseInt(siteRatesModels.replaceFirst(".+M3\\[","").replaceFirst("\\].+", "")); System.out.println(" + M3 n value: " + numberOfBins); Parameter[] m3Frequencies = new Parameter[numberOfBins]; Parameter[] m3Omegas = new Parameter[numberOfBins]; for(int i=1; i<=numberOfBins; i++) { m3Frequencies[i-1] = new Parameter.Default(Double.parseDouble(tree.getAttribute("M3_f"+i).toString())); m3Omegas[i-1] = new Parameter.Default(Double.parseDouble(tree.getAttribute("M3_omega"+i).toString())); //m3Frequencies[i-1] = new Parameter.Default((Double) tree.getAttribute("M3_f"+i)); //m3Omegas[i-1] = new Parameter.Default((Double) tree.getAttribute("M3_omega"+i)); } System.err.println("Sorry, M3 substitution model is not yet implemented in BEAST"); System.exit(0); } else if(siteRatesModels.indexOf("+M0_omega~Beta(") >= 0) { /* M7 */ System.out.println("Site model - M7 Codon site model used"); int numberOfBins = Integer.parseInt(siteRatesModels.replaceFirst("M0_omega~Beta\\(","").replaceFirst("\\)", "")); System.out.println(" + M7 n value: " + numberOfBins); Parameter m7BetaMu = new Parameter.Default(Double.parseDouble(tree.getAttribute("beta_mu").toString())); Parameter m7BetaVarMu = new Parameter.Default(Double.parseDouble(tree.getAttribute("beta_Var/mu").toString())); //Parameter m7BetaMu = new Parameter.Default((Double) tree.getAttribute("beta_mu")); //Parameter m7BetaVarMu = new Parameter.Default((Double) tree.getAttribute("beta_Var/mu")); System.err.println("Sorry, M7 substitution model is not yet implemented in BEAST"); System.exit(0); } } else if(siteRatesModels.length() > 0) { /* i.e. for other data types. */ /* Do gamma/lognormal + pinv */ Parameter pInvParameter = null; int categories = -1; Parameter alphaParameter = null; //System.out.println("Greatest story ever told! " + siteRatesModels); if(siteRatesModels.indexOf("+INV") >= 0) { System.out.println("Site model - proportion of invariable sites used"); //pInvParameter = new Parameter.Default(((Double) tree.getAttribute("INV_p")).doubleValue()); pInvParameter = new Parameter.Default(Double.parseDouble(tree.getAttribute("INV_p").toString())); } if(siteRatesModels.indexOf("+rate~Gamma(") >= 0) { System.out.println("Site model - gamma site rate heterogeneity used"); categories = Integer.parseInt(siteRatesModels.replaceFirst(".+rate~Gamma\\(", "").replaceFirst("\\).*","")); //double sigmaMu = (Double) tree.getAttribute("gamma_sigma/mu"); double sigmaMu = Double.parseDouble(tree.getAttribute("gamma_sigma/mu").toString()); sigmaMu = (1.0/sigmaMu) * (1.0/sigmaMu); /* BAli-Phy is parameterised by sigma/mu instead of alpha */ alphaParameter = new Parameter.Default(sigmaMu); } else if(siteRatesModels.indexOf("+rate~LogNormal(") >= 0) { // TODO implement lognormal site model System.out.println("Site model - lognormal site rate heterogeneity used"); System.err.println("Sorry, lognormal site rates are not yet implemented in BEAST"); System.exit(0); categories = Integer.parseInt(siteRatesModels.replaceFirst(".+rate~LogNormal\\(", "").replaceFirst("\\).*","")); //double sigmaMu = (Double) tree.getAttribute("log-normal_sigma/mu"); double sigmaMu = Double.parseDouble(tree.getAttribute("log-normal_sigma/mu").toString()); sigmaMu = (1.0/sigmaMu) * (1.0/sigmaMu); /* BAli-Phy is parameterised by sigma/mu instead of alpha */ alphaParameter = new Parameter.Default(sigmaMu); } else if(siteRatesModels.indexOf("+GAMMA(") >= 0) { /* For BEAST output */ System.out.println("Site model - gamma site rate heterogeneity used"); categories = Integer.parseInt(siteRatesModels.replaceFirst(".+GAMMA\\(", "").replaceFirst("\\).*","")); //double sigmaMu = (Double) tree.getAttribute("gamma_sigma/mu"); double alpha = Double.parseDouble(tree.getAttribute("alpha").toString()); alphaParameter = new Parameter.Default(alpha); } //System.out.println("alpha and pinv parameters: " + alphaParameter.getParameterValue(0) + "\t" + pInvParameter.getParameterValue(0)); //GammaSiteRateModel siteModel = new GammaSiteRateModel(sml.getSubstitutionModel(), new Parameter.Default(1.0), alphaParameter, categories, pInvParameter); GammaSiteRateModel siteModel = new GammaSiteRateModel(GammaSiteModelParser.SITE_MODEL, new Parameter.Default(1.0), alphaParameter, categories, pInvParameter); siteModel.setSubstitutionModel(sml.getSubstitutionModel()); //SiteModel siteModel = new GammaSiteModel(sml.getSubstitutionModel(), new Parameter.Default(1.0), new Parameter.Default(1.0), 1, new Parameter.Default(0.5)); //SiteModel siteModel = new GammaSiteModel(sml.getSubstitutionModel(), null, null, 0, null); return siteModel; } /* Default with no gamma or pinv */ //SiteRateModel siteModel = new GammaSiteRateModel(sml.getSubstitutionModel()); GammaSiteRateModel siteModel = new GammaSiteRateModel(GammaSiteModelParser.SITE_MODEL); siteModel.setSubstitutionModel(sml.getSubstitutionModel()); return siteModel; } public static final String KAPPA_STRING = "kappa"; //public static final String SEQ_STRING = "states"; // For BEAST input files public static String SEQ_STRING = "seq"; //public static final String SEQ_STRING_2 = "states"; // For BEAST input files public static final String NEW_SEQ = "newSeq"; public static final String TAG = "tag"; public static final String LIKELIHOOD = "lnL"; public static final String SUBST_MODEL = "subst"; // public static final String WAG_STRING = "Empirical(Data/wag.dat)*pi"; private final int die = 0; private Tree processTree(Tree tree) { // Remake tree to fix node ordering - Marc GammaSiteRateModel siteModel = loadSiteModel(tree); SimpleAlignment alignment = new SimpleAlignment(); alignment.setDataType(siteModel.getSubstitutionModel().getDataType()); if(siteModel.getSubstitutionModel().getDataType().getClass().equals(Codons.class)) { //System.out.println("trololo"); alignment.setDataType(Nucleotides.INSTANCE); } //System.out.println("BOO BOO " + siteModel.getSubstitutionModel().getDataType().getClass().getName()+"\t" + Codons.UNIVERSAL.getClass().getName() + "\t" + alignment.getDataType().getClass().getName()); // Get sequences String[] sequence = new String[tree.getNodeCount()]; for (int i = 0; i < tree.getNodeCount(); i++) { NodeRef node = tree.getNode(i); sequence[i] = (String) tree.getNodeAttribute(node, SEQ_STRING); if (tree.isExternal(node)) { Taxon taxon = tree.getNodeTaxon(node); alignment.addSequence(new Sequence(taxon, sequence[i])); //System.out.println("seq " + sequence[i]); } } // Make evolutionary model BranchRateModel rateModel = new StrictClockBranchRates(new Parameter.Default(1.0)); FlexibleTree flexTree; if(siteModel.getSubstitutionModel().getDataType().getClass().equals(Codons.class)) { ConvertAlignment convertAlignment = new ConvertAlignment(siteModel.getSubstitutionModel().getDataType(), ((Codons) siteModel.getSubstitutionModel().getDataType()).getGeneticCode(), alignment); flexTree = sampleTree(tree, convertAlignment, siteModel, rateModel); //flexTree = sampleTree(tree, alignment, siteModel, rateModel); } else { flexTree = sampleTree(tree, alignment, siteModel, rateModel); } introduceGaps(flexTree, tree); return flexTree; } private void introduceGaps(FlexibleTree flexTree, Tree gapTree) { // I forget what this function was supposed to do. - Marc } public static final char GAP = '-'; boolean[] bit = null; private FlexibleTree sampleTree(Tree tree, PatternList alignment, GammaSiteRateModel siteModel, BranchRateModel rateModel) { FlexibleTree flexTree = new FlexibleTree(tree, true); flexTree.adoptTreeModelOrdering(); FlexibleTree finalTree = new FlexibleTree(tree); finalTree.adoptTreeModelOrdering(); TreeModel treeModel = new TreeModel(tree); // Turn off noisy logging by TreeLikelihood constructor Logger logger = Logger.getLogger("dr.evomodel"); boolean useParentHandlers = logger.getUseParentHandlers(); logger.setUseParentHandlers(false); // AncestralStateTreeLikelihood likelihood = new AncestralStateTreeLikelihood( // alignment, // treeModel, // siteModel, // rateModel, // false, true, // alignment.getDataType(), // TAG, // false); AncestralStateBeagleTreeLikelihood likelihood = new AncestralStateBeagleTreeLikelihood( alignment, treeModel, new HomogeneousBranchModel(siteModel.getSubstitutionModel()), siteModel, rateModel, null, false, PartialsRescalingScheme.DEFAULT, true, null, alignment.getDataType(), TAG, false, true ); // PatternList patternList, TreeModel treeModel, // BranchSiteModel branchSiteModel, // SiteRateModel siteRateModel, // BranchRateModel branchRateModel, // boolean useAmbiguities, // PartialsRescalingScheme scalingScheme, // Map<Set<String>, Parameter> partialsRestrictions, // final DataType dataType, // final String tag, // SubstitutionModel substModel, // boolean useMAP, // boolean returnML) { // PatternList patternList, TreeModel treeModel, // SiteModel siteModel, BranchRateModel branchRateModel, // boolean useAmbiguities, boolean storePartials, // final DataType dataType, // final String tag, // boolean forceRescaling, // boolean useMAP, // boolean returnML) { logger.setUseParentHandlers(useParentHandlers); // Sample internal nodes likelihood.makeDirty(); double logLikelihood = likelihood.getLogLikelihood(); System.out.println("The new and old Likelihood (this value should be roughly the same, debug?): " + logLikelihood + ", " + Double.parseDouble(tree.getAttribute(LIKELIHOOD).toString())); if(Double.parseDouble(tree.getAttribute(LIKELIHOOD).toString()) != logLikelihood) { /* Newly written check, not sure if this is correct. May need to round values at least */ //throw new RuntimeException("The values of likelihood are not identical"); } // System.err.printf("New logLikelihood = %4.1f\n", logLikelihood); flexTree.setAttribute(LIKELIHOOD, logLikelihood); TreeTrait ancestralStates = likelihood.getTreeTrait(TAG); for (int i = 0; i < treeModel.getNodeCount(); i++) { NodeRef node = treeModel.getNode(i); String sample = ancestralStates.getTraitString(treeModel, node); String oldSeq = (String) flexTree.getNodeAttribute(flexTree.getNode(i), SEQ_STRING); if (oldSeq != null) { char[] seq = (sample.substring(1, sample.length() - 1)).toCharArray(); int length = oldSeq.length(); //System.out.println("length " + length + "\t" + sample.length() + "\t" + sample + "\t" + seq.length + "\t" + oldSeq); for (int j = 0; j < length; j++) { if (oldSeq.charAt(j) == GAP) seq[j] = GAP; } String newSeq = new String(seq); // if( newSeq.contains("MMMMMMM") ) { // System.err.println("bad = "+newSeq); // System.exit(-1); // } finalTree.setNodeAttribute(finalTree.getNode(i), NEW_SEQ, newSeq); } // Taxon taxon = finalTree.getNodeTaxon(finalTree.getNode(i)); // System.err.println("node: "+(taxon == null ? "null" : taxon.getId())+" "+ // finalTree.getNodeAttribute(finalTree.getNode(i),NEW_SEQ)); } return finalTree; } private void setupAttributes(Tree tree) { for (int i = 0; i < tree.getNodeCount(); i++) { NodeRef node = tree.getNode(i); Iterator iter = tree.getNodeAttributeNames(node); if (iter != null) { while (iter.hasNext()) { String name = (String) iter.next(); if (!attributeNames.contains(name)) attributeNames.add(name); } } } } private void setupTreeAttributes(Tree tree) { Iterator<String> iter = tree.getAttributeNames(); if (iter != null) { while (iter.hasNext()) { String name = iter.next(); treeAttributeNames.add(name); } } } private void addTreeAttributes(Tree tree) { if (treeAttributeNames != null) { if (treeAttributeLists == null) { treeAttributeLists = new List[treeAttributeNames.size()]; for (int i = 0; i < treeAttributeNames.size(); i++) { treeAttributeLists[i] = new ArrayList(); } } for (int i = 0; i < treeAttributeNames.size(); i++) { String attributeName = treeAttributeNames.get(i); Object value = tree.getAttribute(attributeName); if (value != null) { treeAttributeLists[i].add(value); } } } } private Tree summarizeTrees(int burnin, CladeSystem cladeSystem, String inputFileName, boolean useSumCladeCredibility) throws IOException { Tree bestTree = null; double bestScore = Double.NEGATIVE_INFINITY; // System.out.println("Analyzing " + totalTreesUsed + " trees..."); // System.out.println("0 25 50 75 100"); // System.out.println("|--------------|--------------|--------------|--------------|"); int stepSize = totalTrees / 60; if (stepSize < 1) stepSize = 1; int counter = 0; TreeImporter importer = new NexusImporter(new FileReader(inputFileName)); try { while (importer.hasTree()) { Tree tree = importer.importNextTree(); if (counter >= burnin) { double score = scoreTree(tree, cladeSystem, useSumCladeCredibility); // System.out.println(score); if (score > bestScore) { bestTree = tree; bestScore = score; } } if (counter > 0 && counter % stepSize == 0) { // System.out.print("*"); // System.out.flush(); } counter++; } } catch (Importer.ImportException e) { System.err.println("Error Parsing Input Tree: " + e.getMessage()); return null; } // System.out.println(); // System.out.println(); if (useSumCladeCredibility) { System.out.println("\tHighest Sum Clade Credibility: " + bestScore); } else { System.out.println("\tHighest Log Clade Credibility: " + bestScore); } return bestTree; } private double scoreTree(Tree tree, CladeSystem cladeSystem, boolean useSumCladeCredibility) { if (useSumCladeCredibility) { return cladeSystem.getSumCladeCredibility(tree, tree.getRoot(), null); } else { return cladeSystem.getLogCladeCredibility(tree, tree.getRoot(), null); } } private class CladeSystem { // // Public stuff // /** */ public CladeSystem() { } /** * adds all the clades in the tree */ public void add(Tree tree) { if (taxonList == null) { taxonList = tree; } // Recurse over the tree and add all the clades (or increment their // frequency if already present). The root clade is added too (for // annotation purposes). addClades(tree, tree.getRoot(), null); addTreeAttributes(tree); } public Map<BitSet, Clade> getCladeMap() { return cladeMap; } public Clade getClade(NodeRef node) { return null; } private void addClades(Tree tree, NodeRef node, BitSet bits) { BitSet bits2 = new BitSet(); if (tree.isExternal(node)) { int index = taxonList.getTaxonIndex(tree.getNodeTaxon(node).getId()); bits2.set(index); } else { for (int i = 0; i < tree.getChildCount(node); i++) { NodeRef node1 = tree.getChild(node, i); addClades(tree, node1, bits2); } } addClade(bits2, tree, node); if (bits != null) { bits.or(bits2); } } private void addClade(BitSet bits, Tree tree, NodeRef node) { Clade clade = cladeMap.get(bits); if (clade == null) { clade = new Clade(bits); cladeMap.put(bits, clade); } clade.setCount(clade.getCount() + 1); if (attributeNames != null) { if (clade.attributeLists == null) { clade.attributeLists = new List[attributeNames.size()]; for (int i = 0; i < attributeNames.size(); i++) { clade.attributeLists[i] = new ArrayList(); } } for (int i = 0; i < attributeNames.size(); i++) { String attributeName = attributeNames.get(i); Object value; if (attributeName.equals("height")) { value = tree.getNodeHeight(node); } else if (attributeName.equals("length")) { value = tree.getBranchLength(node); } else { value = tree.getNodeAttribute(node, attributeName); } if (value != null) { clade.attributeLists[i].add(value); } } } } public void calculateCladeCredibilities(int totalTreesUsed) { for (Clade clade : cladeMap.values()) { clade.setCredibility(((double) clade.getCount()) / totalTreesUsed); } } public double getSumCladeCredibility(Tree tree, NodeRef node, BitSet bits) { double sum = 0.0; if (tree.isExternal(node)) { int index = taxonList.getTaxonIndex(tree.getNodeTaxon(node).getId()); bits.set(index); } else { BitSet bits2 = new BitSet(); for (int i = 0; i < tree.getChildCount(node); i++) { NodeRef node1 = tree.getChild(node, i); sum += getSumCladeCredibility(tree, node1, bits2); } sum += getCladeCredibility(bits2); if (bits != null) { bits.or(bits2); } } return sum; } public double getLogCladeCredibility(Tree tree, NodeRef node, BitSet bits) { double logCladeCredibility = 0.0; if (tree.isExternal(node)) { int index = taxonList.getTaxonIndex(tree.getNodeTaxon(node).getId()); bits.set(index); } else { BitSet bits2 = new BitSet(); for (int i = 0; i < tree.getChildCount(node); i++) { NodeRef node1 = tree.getChild(node, i); logCladeCredibility += getLogCladeCredibility(tree, node1, bits2); } logCladeCredibility += Math.log(getCladeCredibility(bits2)); if (bits != null) { bits.or(bits2); } } return logCladeCredibility; } private double getCladeCredibility(BitSet bits) { Clade clade = cladeMap.get(bits); if (clade == null) { return 0.0; } return clade.getCredibility(); } public void annotateTree(MutableTree tree, NodeRef node, BitSet bits, int heightsOption) { BitSet bits2 = new BitSet(); if (tree.isExternal(node)) { int index = taxonList.getTaxonIndex(tree.getNodeTaxon(node).getId()); bits2.set(index); annotateNode(tree, node, bits2, true, heightsOption); } else { for (int i = 0; i < tree.getChildCount(node); i++) { NodeRef node1 = tree.getChild(node, i); annotateTree(tree, node1, bits2, heightsOption); } annotateNode(tree, node, bits2, false, heightsOption); } if (bits != null) { bits.or(bits2); } } private void annotateNode(MutableTree tree, NodeRef node, BitSet bits, boolean isTip, int heightsOption) { Clade clade = cladeMap.get(bits); if (clade == null) { throw new RuntimeException("Clade missing"); } // System.err.println("annotateNode new: "+bits.toString()+" size = "+attributeNames.size()); // for(String string : attributeNames) { // System.err.println(string); // } // System.exit(-1); boolean filter = false; if (!isTip) { double posterior = clade.getCredibility(); tree.setNodeAttribute(node, "posterior", posterior); if (posterior < posteriorLimit) { filter = true; } } for (int i = 0; i < attributeNames.size(); i++) { String attributeName = attributeNames.get(i); double[] values = new double[clade.attributeLists[i].size()]; HashMap<String, Integer> hashMap = new HashMap<String, Integer>(clade.attributeLists[i].size()); if (values.length > 0) { Object v = clade.attributeLists[i].get(0); boolean isHeight = attributeName.equals("height"); boolean isBoolean = v instanceof Boolean; boolean isDiscrete = v instanceof String; double minValue = Double.MAX_VALUE; double maxValue = -Double.MAX_VALUE; for (int j = 0; j < clade.attributeLists[i].size(); j++) { if (isDiscrete) { String value = (String) clade.attributeLists[i].get(j); if (value.startsWith("\"")) { value = value.replaceAll("\"", ""); } if (attributeName.equals(NEW_SEQ)) { // Strip out gaps before storing value = value.replaceAll("-", ""); } if (hashMap.containsKey(value)) { int count = hashMap.get(value); hashMap.put(value, count + 1); } else { hashMap.put(value, 1); } } else if (isBoolean) { values[j] = (((Boolean) clade.attributeLists[i].get(j)) ? 1.0 : 0.0); } else { values[j] = ((Number) clade.attributeLists[i].get(j)).doubleValue(); if (values[j] < minValue) minValue = values[j]; if (values[j] > maxValue) maxValue = values[j]; } } if (isHeight) { if (heightsOption == MEAN_HEIGHTS) { double mean = DiscreteStatistics.mean(values); tree.setNodeHeight(node, mean); } else if (heightsOption == MEDIAN_HEIGHTS) { double median = DiscreteStatistics.median(values); tree.setNodeHeight(node, median); } else { // keep the existing height } } if (!filter) { if (!isDiscrete) annotateMeanAttribute(tree, node, attributeName, values); else annotateModeAttribute(tree, node, attributeName, hashMap); // if( tree.getNodeTaxon(node) != null && // tree.getNodeTaxon(node).getId().compareTo("Calanus") == 0) { // System.err.println("size = "+hashMap.keySet().size()); // System.err.println("count = "+hashMap.get(hashMap.keySet().toArray()[0])); // } // System.err.println(); // ; if (!isBoolean && minValue < maxValue && !isDiscrete) { // Basically, if it is a boolean (0, 1) then we don't need the distribution information // Likewise if it doesn't vary. annotateMedianAttribute(tree, node, attributeName + "_median", values); annotateHPDAttribute(tree, node, attributeName + "_95%_HPD", 0.95, values); annotateRangeAttribute(tree, node, attributeName + "_range", values); } } } } } private void annotateMeanAttribute(MutableTree tree, NodeRef node, String label, double[] values) { double mean = DiscreteStatistics.mean(values); tree.setNodeAttribute(node, label, mean); } private void annotateMedianAttribute(MutableTree tree, NodeRef node, String label, double[] values) { double median = DiscreteStatistics.median(values); tree.setNodeAttribute(node, label, median); } public static final String fileName = "junk"; // public static final String execName = "/sw/bin/clustalw"; private int externalCalls = 0; private void annotateModeAttribute(MutableTree tree, NodeRef node, String label, HashMap<String, Integer> values) { if (label.equals(NEW_SEQ)) { String consensusSeq = null; double[] support = null; int numElements = values.keySet().size(); // System.err.println("size = "+numElements); if (numElements > 1) { // Essentially just finding the modal character try { PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(fileName))); int i = 0; int[] weight = new int[numElements]; // Iterator iter = values.keySet().iterator(); for (String key : values.keySet()) { int thisCount = values.get(key); weight[i] = thisCount; //TODO I THINK I FIXED IT?!?! pw.write(">" + i+"\n");// + " " + thisCount + "\n"); pw.write(key + "\n"); i++; } pw.close(); //Process p = Runtime.getRuntime().exec(kalignExecutable + " " + fileName + " -OUTPUT=NEXUS"); //Process p = Runtime.getRuntime().exec(kalignExecutable + " " + fileName + " -fmsf -o"+fileName + ".fasta"); // System.out.println("Command: " + kalignExecutable + " " + fileName + " -ffasta -o"+fileName + ".fasta"); // Process p = Runtime.getRuntime().exec(kalignExecutable + " " + fileName + " -ffasta -o"+fileName + ".fasta"); Process p = Runtime.getRuntime().exec(kalignExecutable + " " + fileName + " -ffasta -q -o"+fileName + ".fasta"); // ByteArrayOutputStream kalignOutput = new ByteArrayOutputStream(); // StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), "ERR"); // StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), "OUT", kalignOutput); // // errorGobbler.start(); // outputGobbler.start(); // // int exitVal = p.waitFor(); BufferedReader input = new BufferedReader (new InputStreamReader(p.getInputStream())); { // String line; while ((/*line = */input.readLine()) != null) { // System.out.println(line); } } input.close(); externalCalls++; // System.err.println("clustal call #" + externalCalls); //NexusImporter importer = new NexusImporter(new FileReader(fileName + ".nxs")); // FastaImporter importer = new FastaImporter(new FileReader(new File(fileName + ".fasta")), Nucleotides.INSTANCE); FastaImporter importer = new FastaImporter(new FileReader(new File(fileName + ".fasta")), new GeneralDataType(new String[0]));//AminoAcids.INSTANCE); //new FastaImporter(new FileReader(), new AminoAcids.INSTANCE); Alignment alignment = importer.importAlignment(); // build index int[] index = new int[numElements]; for (int j = 0; j < numElements; j++) index[j] = alignment.getTaxonIndex("" + j); StringBuffer sb = new StringBuffer(); support = new double[alignment.getPatternCount()]; // System.err.println(new dr.math.matrixAlgebra.Vector(weight)); for (int j = 0; j < alignment.getPatternCount(); j++) { int[] pattern = alignment.getPattern(j); // support[j] = appendNextConsensusCharacter(alignment,j,sb); // System.err.println(new dr.math.matrixAlgebra.Vector(pattern)); int[] siteWeight = new int[30]; // + 2 to handle ambiguous and gap int maxWeight = -1; int totalWeight = 0; int maxChar = -1; for (int k = 0; k < pattern.length; k++) { int whichChar = pattern[k]; if (whichChar < 30) { int addWeight = weight[index[k]]; //System.out.println(k + "\t" + addWeight + "\t" + whichChar + "\t" + siteWeight[whichChar]); siteWeight[whichChar] += addWeight; totalWeight += addWeight; // if( k >= alignment.getDataType().getStateCount()+2 ) { // System.err.println("k = "+k); // System.err.println("pattern = "+new dr.math.matrixAlgebra.Vector(pattern)); // } if (siteWeight[whichChar] > maxWeight) { maxWeight = siteWeight[whichChar]; maxChar = whichChar; } } else { System.err.println("BUG"); System.err.println("k = " + k); System.err.println("whichChar = " + whichChar); System.err.println("pattern = " + new dr.math.matrixAlgebra.Vector(pattern)); } } sb.append(alignment.getDataType().getChar(maxChar)); support[j] = (double) maxWeight / (double) totalWeight; } // System.err.println("cSeq = " + sb.toString()); consensusSeq = sb.toString(); // System.exit(-1); } catch (FileNotFoundException e) { System.err.println(e.getMessage()); System.exit(-1); } catch (Importer.ImportException e) { System.err.println(e.getMessage()); System.exit(-1); } catch (IOException e) { System.err.println(e.getMessage()); System.exit(-1); } } else { consensusSeq = (String) values.keySet().toArray()[0]; support = new double[consensusSeq.length()]; for (int i = 0; i < support.length; i++) support[i] = 1.0; } // Trim out gaps from consensus and support // ArrayList<Double> newSupport = new ArrayList<Double>(support.length); boolean noComma = true; StringBuffer newSupport = new StringBuffer("{"); StringBuffer newSeq = new StringBuffer(); if (consensusSeq.length() != support.length) { System.err.println("What happened here?"); System.exit(-1); } // Restripping all the sequences of gap characters since they may have been added in during clustal step for (int i = 0; i < support.length; i++) { if (consensusSeq.charAt(i) != GAP) { newSeq.append(consensusSeq.charAt(i)); if (noComma) noComma = false; else newSupport.append(","); newSupport.append(String.format("%1.3f", support[i])); } } newSupport.append("}"); tree.setNodeAttribute(node, label, newSeq); // String num = Str tree.setNodeAttribute(node, label + ".prob", newSupport); } else { String mode = null; int maxCount = 0; int totalCount = 0; for (String key : (String[]) values.keySet().toArray()) { int thisCount = values.get(key); if (thisCount == maxCount) mode = mode.concat("+" + key); else if (thisCount > maxCount) { mode = key; maxCount = thisCount; } totalCount += thisCount; } double freq = (double) maxCount / (double) totalCount; tree.setNodeAttribute(node, label, mode); tree.setNodeAttribute(node, label + ".prob", freq); } } // private double appendNextConsensusCharacter(Alignment alignment, int j, StringBuffer sb, int[] weight) { // int[] pattern = alignment.getPattern(j); // int[] siteWeight = new int[alignment.getDataType().getStateCount()]; // int maxWeight = -1; // int totalWeight = 0; // int maxChar = -1; // for (int k = 0; k < pattern.length; k++) { // int whichChar = pattern[k]; // if (whichChar < alignment.getDataType().getStateCount()) { // // int addWeight = weight[index[k]]; // siteWeight[whichChar] += addWeight; // totalWeight += addWeight; // if (siteWeight[k] > maxWeight) { // maxWeight = siteWeight[k]; // maxChar = k; // } // } // } // sb.append(alignment.getDataType().getChar(maxChar)); // return (double) maxWeight / (double) totalWeight; // // } private void annotateRangeAttribute(MutableTree tree, NodeRef node, String label, double[] values) { double min = DiscreteStatistics.min(values); double max = DiscreteStatistics.max(values); tree.setNodeAttribute(node, label, new Object[]{min, max}); } private void annotateHPDAttribute(MutableTree tree, NodeRef node, String label, double hpd, double[] values) { int[] indices = new int[values.length]; HeapSort.sort(values, indices); double minRange = Double.MAX_VALUE; int hpdIndex = 0; int diff = (int) Math.round(hpd * (double) values.length); for (int i = 0; i <= (values.length - diff); i++) { double minValue = values[indices[i]]; double maxValue = values[indices[i + diff - 1]]; double range = Math.abs(maxValue - minValue); if (range < minRange) { minRange = range; hpdIndex = i; } } double lower = values[indices[hpdIndex]]; double upper = values[indices[hpdIndex + diff - 1]]; tree.setNodeAttribute(node, label, new Object[]{lower, upper}); } class Clade { public Clade(BitSet bits) { this.bits = bits; count = 0; credibility = 0.0; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public double getCredibility() { return credibility; } public void setCredibility(double credibility) { this.credibility = credibility; } public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final Clade clade = (Clade) o; return !(bits != null ? !bits.equals(clade.bits) : clade.bits != null); } public int hashCode() { return (bits != null ? bits.hashCode() : 0); } int count; double credibility; BitSet bits; List[] attributeLists = null; } // // Private stuff // TaxonList taxonList = null; Map<BitSet, Clade> cladeMap = new HashMap<BitSet, Clade>(); } int totalTrees = 0; int totalTreesUsed = 0; double posteriorLimit = 0.0; String kalignExecutable = null; List<String> attributeNames = new ArrayList<String>(); List<String> treeAttributeNames = new ArrayList<String>(); List[] treeAttributeLists = null; TaxonList taxa = null; public static void printTitle() { System.out.println(); centreLine("Ancestral Sequence Annotator " + "v0.1" + ", " + "2008", 60); // version.getVersionString() + ", " + version.getDateString(), 60); System.out.println(); centreLine("by", 60); System.out.println(); centreLine("Marc A. Suchard, Wai Lok Sibon Li", 60); System.out.println(); centreLine("Departments of Biomathematics,", 60); centreLine("Biostatistics and Human Genetics", 60); centreLine("UCLA", 60); centreLine("msuchard@ucla.edu", 60); System.out.println(); System.out.println(); System.out.println("NB: I stole a substantial portion of this code from Andrew Rambaut."); System.out.println(" Please also give him due credit."); System.out.println(); } public static void centreLine(String line, int pageWidth) { int n = pageWidth - line.length(); int n1 = n / 2; for (int i = 0; i < n1; i++) { System.out.print(" "); } System.out.println(line); } public static void printUsage(Arguments arguments) { arguments.printUsage("ancestralsequenceannotator", "<input-file-name> <output-file-name>"); System.out.println(); System.out.println(" Example: ancestralsequenceannotator test.trees out.txt"); System.out.println(); } //Main method public static void main(String[] args) throws IOException { String targetTreeFileName = null; String inputFileName = null; String outputFileName = null; printTitle(); Arguments arguments = new Arguments( new Arguments.Option[]{ //new Arguments.StringOption("target", new String[] { "maxclade", "maxtree" }, false, "an option of 'maxclade' or 'maxtree'"), new Arguments.StringOption("heights", new String[]{"keep", "median", "mean"}, false, "an option of 'keep', 'median' or 'mean'"), new Arguments.IntegerOption("burnin", "the number of states to be considered as 'burn-in'"), new Arguments.StringOption("beastInput", new String[]{"true", "false"}, false, "If the input is taken from BEAST rather than BAli-Phy"), new Arguments.RealOption("limit", "the minimum posterior probability for a node to be annotated"), new Arguments.StringOption("target", "target_file_name", "specifies a user target tree to be annotated"), new Arguments.Option("help", "option to print this message"), new Arguments.StringOption("kalign", "full_path_to_kalign", "specifies full path to the kalign executable file") }); try { arguments.parseArguments(args); } catch (Arguments.ArgumentException ae) { System.out.println(ae); printUsage(arguments); System.exit(1); } if (arguments.hasOption("help")) { printUsage(arguments); System.exit(0); } int heights = KEEP_HEIGHTS; if (arguments.hasOption("heights")) { String value = arguments.getStringOption("heights"); if (value.equalsIgnoreCase("mean")) { heights = MEAN_HEIGHTS; } else if (value.equalsIgnoreCase("median")) { heights = MEDIAN_HEIGHTS; } } int burnin = -1; if (arguments.hasOption("burnin")) { burnin = arguments.getIntegerOption("burnin"); } double posteriorLimit = 0.0; if (arguments.hasOption("limit")) { posteriorLimit = arguments.getRealOption("limit"); } boolean beastInput = false; if(arguments.hasOption("beastInput") && arguments.getStringOption("beastInput").equals("true")) { SEQ_STRING = "states"; } int target = MAX_CLADE_CREDIBILITY; if (arguments.hasOption("target")) { target = USER_TARGET_TREE; targetTreeFileName = arguments.getStringOption("target"); } // String kalignExecutable = "/usr/local/bin/clustalw"; String kalignExecutable = "/usr/local/bin/kalign"; if (arguments.hasOption("kalign")) { kalignExecutable = arguments.getStringOption("kalign"); } String[] args2 = arguments.getLeftoverArguments(); if (args2.length > 2) { System.err.println("Unknown option: " + args2[2]); System.err.println(); printUsage(arguments); System.exit(1); } if (args2.length == 2) { targetTreeFileName = null; inputFileName = args2[0]; outputFileName = args2[1]; } else { if (inputFileName == null) { // No input file name was given so throw up a dialog box... inputFileName = Utils.getLoadFileName("AncestralSequenceAnnotator " + version.getVersionString() + " - Select input file file to analyse"); } if (outputFileName == null) { outputFileName = Utils.getSaveFileName("AncestralSequenceAnnotator " + version.getVersionString() + " - Select output file"); } } if(inputFileName == null || outputFileName == null) { System.err.println("Missing input or output file name"); printUsage(arguments); System.exit(1); } new AncestralSequenceAnnotator(burnin, heights, posteriorLimit, target, targetTreeFileName, inputFileName, outputFileName, kalignExecutable); System.exit(0); } }