/*
* NexusImporter.java
*
* Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
*
* This file is part of BEAST.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* BEAST is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package dr.evolution.io;
import dr.evolution.alignment.Alignment;
import dr.evolution.alignment.SimpleAlignment;
import dr.evolution.datatype.AminoAcids;
import dr.evolution.datatype.DataType;
import dr.evolution.datatype.Nucleotides;
import dr.evolution.datatype.TwoStates;
import dr.evolution.sequence.Sequence;
import dr.evolution.sequence.SequenceList;
import dr.evolution.sequence.Sequences;
import dr.evolution.tree.FlexibleNode;
import dr.evolution.tree.FlexibleTree;
import dr.evolution.tree.Tree;
import dr.evolution.tree.TreeUtils;
import dr.evolution.util.*;
import dr.util.Attributable;
import java.awt.*;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Class for importing NEXUS file format
*
* @author Andrew Rambaut
* @author Alexei Drummond
* @version $Id: NexusImporter.java,v 1.30 2006/04/25 14:39:37 rambaut Exp $
*/
public class NexusImporter extends Importer implements SequenceImporter, TreeImporter {
// Identifiers for the NEXUS block types this importer distinguishes. Within this
// class they are compared by reference (==), so these shared instances are the
// only values that should be tested against.
public static final NexusBlock UNKNOWN_BLOCK = new NexusBlock("unknown");
public static final NexusBlock TAXA_BLOCK = new NexusBlock("TAXA");
public static final NexusBlock CHARACTERS_BLOCK = new NexusBlock("CHARACTERS");
public static final NexusBlock DATA_BLOCK = new NexusBlock("DATA");
public static final NexusBlock UNALIGNED_BLOCK = new NexusBlock("UNALIGNED");
public static final NexusBlock DISTANCES_BLOCK = new NexusBlock("DISTANCES");
public static final NexusBlock TREES_BLOCK = new NexusBlock("TREES");
public static final NexusBlock CALIBRATION_BLOCK = new NexusBlock("CALIBRATION");
// Global (static, mutable) switch: when true, readInternalNode() does not log the
// "single child" warning.
public static boolean suppressWarnings = false;
// When true, readBranch(), readInternalNode() and readExternalNode() discard
// meta-comments instead of parsing them into node attributes.
private final boolean ignoreMetaComments;
/**
 * Sets the class-wide {@code suppressWarnings} flag, which is consulted before
 * the importer logs a warning about an internal node with a single child.
 *
 * @param sw {@code true} to silence the warning, {@code false} to log it
 */
public static void setSuppressWarnings(boolean sw) {
    suppressWarnings = sw;
}
// NEXUS specific ImportException classes
/**
 * Thrown when a NEXUS block that an import operation depends on (for example
 * TAXA, TREES, or DATA/CHARACTERS) was not found in the input.
 */
public static class MissingBlockException extends ImportException {
    private static final long serialVersionUID = -6287423449717453999L;

    /**
     * Creates the exception with a message naming the missing block.
     *
     * @param message description of what is missing
     */
    public MissingBlockException(String message) {
        super(message);
    }

    /**
     * Creates the exception without a detail message.
     */
    public MissingBlockException() {
        super();
    }
}
/**
 * A named NEXUS block type. Instances carry only the block's name, which is
 * what {@link #toString()} returns.
 */
public static class NexusBlock {
    private final String name;

    /**
     * @param name the block name as it should be reported by {@link #toString()}
     */
    public NexusBlock(String name) {
        this.name = name;
    }

    /**
     * @return the name this block type was created with
     */
    @Override
    public String toString() {
        return this.name;
    }
}
/**
 * Creates an importer that reads NEXUS content from the given reader and
 * parses meta-comments found in trees ({@code ignoreMetaComments} is false).
 * <p>
 * The comment delimiters installed here are the same as those installed by
 * the other two constructors of this class.
 *
 * @param reader the source of NEXUS text
 */
public NexusImporter(Reader reader) {
    super(reader);
    this.ignoreMetaComments = false;
    // The fourth argument used to be '\0' here while both sibling constructors pass '!'.
    // NOTE(review): the fourth argument is presumably the "write comment" marker
    // (NEXUS "[!...]" comments) - confirm against Importer.setCommentDelimiters.
    setCommentDelimiters('[', ']', '\0', '!', '&');
}
/**
 * Creates an importer that reads NEXUS content from the given reader and lets
 * the caller choose whether meta-comments on tree nodes and branches are parsed.
 *
 * @param reader             the source of NEXUS text
 * @param ignoreMetaComments {@code true} to discard node/branch meta-comments
 *                           rather than turning them into attributes
 */
public NexusImporter(Reader reader, boolean ignoreMetaComments) {
    super(reader);
    setCommentDelimiters('[', ']', '\0', '!', '&');
    this.ignoreMetaComments = ignoreMetaComments;
}
/**
 * Creates an importer that reads NEXUS content from the given reader and hands
 * {@code commentWriter} to the superclass constructor. Meta-comments in trees
 * are parsed ({@code ignoreMetaComments} is false).
 *
 * @param reader        the source of NEXUS text
 * @param commentWriter writer passed through to {@code Importer}
 *                      (presumably receives comments found in the input - confirm in Importer)
 */
public NexusImporter(Reader reader, Writer commentWriter) {
    super(reader, commentWriter);
    setCommentDelimiters('[', ']', '\0', '!', '&');
    this.ignoreMetaComments = false;
}
/**
 * Skips forward to the next {@code BEGIN} keyword (matched case-insensitively),
 * reads the block name that follows it, and classifies that name with
 * {@link #findBlockName(String)}.
 * <p>
 * Subclasses that understand additional blocks can override this method or
 * {@link #findBlockName(String)}.
 *
 * @return the block type reported by {@link #findBlockName(String)}
 * @throws IOException if reading from the underlying input fails
 */
public NexusBlock findNextBlock() throws IOException {
    findToken("BEGIN", true);
    // the block name runs up to the ';' that terminates the BEGIN command
    return findBlockName(readToken(";"));
}
/**
 * Maps a block name to one of the block constants of this class, stores the
 * result in {@code nextBlock}, and returns it. Names are matched
 * case-insensitively against TAXA, CHARACTERS, DATA, UNALIGNED, DISTANCES,
 * TREES and CALIBRATION.
 * <p>
 * A name that matches none of these yields {@link #UNKNOWN_BLOCK}. Previously
 * {@code nextBlock} was left untouched in that case, so an unrecognised block
 * was reported as whatever block type had been seen last (for example, an
 * unsupported block following an unfinished TREES block was reported as TREES).
 *
 * @param blockName the name read after a {@code BEGIN} keyword
 * @return the matching block constant, or {@link #UNKNOWN_BLOCK}
 */
public NexusBlock findBlockName(String blockName) {
    final NexusBlock[] knownBlocks = {
            TAXA_BLOCK, CHARACTERS_BLOCK, DATA_BLOCK, UNALIGNED_BLOCK,
            DISTANCES_BLOCK, TREES_BLOCK, CALIBRATION_BLOCK
    };

    // start from "unknown" so a stale value from an earlier block can never leak out
    nextBlock = UNKNOWN_BLOCK;
    for (NexusBlock candidate : knownBlocks) {
        if (blockName.equalsIgnoreCase(candidate.toString())) {
            nextBlock = candidate;
            break;
        }
    }
    return nextBlock;
}
/**
 * Public entry point for reading a 'TAXA' block; delegates to {@code readTaxaBlock()}.
 *
 * @return the taxa listed in the block
 * @throws ImportException if the block is malformed
 * @throws IOException     if reading from the underlying input fails
 */
public TaxonList parseTaxaBlock() throws ImportException, IOException {
    final TaxonList taxa = readTaxaBlock();
    return taxa;
}
/**
 * Public entry point for reading a 'CHARACTERS' block; delegates to
 * {@code readCharactersBlock(TaxonList)}.
 *
 * @param taxonList the taxa against which sequence labels are resolved
 * @return the alignment read from the block
 * @throws ImportException if the block is malformed
 * @throws IOException     if reading from the underlying input fails
 */
public Alignment parseCharactersBlock(TaxonList taxonList) throws ImportException, IOException {
    final Alignment alignment = readCharactersBlock(taxonList);
    return alignment;
}
/**
 * Public entry point for reading a 'DATA' block; delegates to {@code readDataBlock()}.
 *
 * @param taxonList accepted for symmetry with the other parse methods but not
 *                  used: the DATA block is read without a taxon list
 * @return the alignment read from the block
 * @throws ImportException if the block is malformed
 * @throws IOException     if reading from the underlying input fails
 */
public Alignment parseDataBlock(TaxonList taxonList) throws ImportException, IOException {
    final Alignment alignment = readDataBlock();
    return alignment;
}
/**
 * Public entry point for reading a 'TREES' block; delegates to
 * {@code readTreesBlock(taxonList, false)}, i.e. without taxon-list numbering.
 *
 * @param taxonList the taxa against which tree labels are resolved (may be null)
 * @return all trees defined in the block
 * @throws ImportException if the block is malformed or contains no trees
 * @throws IOException     if reading from the underlying input fails
 */
public Tree[] parseTreesBlock(TaxonList taxonList) throws ImportException, IOException {
    final boolean useTaxonListNumbering = false;
    return readTreesBlock(taxonList, useTaxonListNumbering);
}
/**
 * Public entry point for reading a 'CALIBRATION' block; delegates to
 * {@code readCalibrationBlock(TaxonList)}, which attaches the dates it reads
 * to the matching taxa.
 *
 * @param taxonList the taxa that receive the dates
 * @return the dates defined in the block
 * @throws ImportException if the block is malformed or names an unknown taxon
 * @throws IOException     if reading from the underlying input fails
 */
public dr.evolution.util.Date[] parseCalibrationBlock(TaxonList taxonList) throws ImportException, IOException {
    final dr.evolution.util.Date[] dates = readCalibrationBlock(taxonList);
    return dates;
}
// **************************************************************
// SequenceImporter IMPLEMENTATION
// **************************************************************
/**
 * Scans the input block by block and returns the first alignment found, taken
 * from either a CHARACTERS or a DATA block.
 * <ul>
 * <li>A TAXA block is read and remembered.</li>
 * <li>A CALIBRATION or CHARACTERS block requires a TAXA block to have been
 * read first.</li>
 * <li>A DATA block is read without reference to any TAXA block.</li>
 * <li>Any other block is skipped.</li>
 * </ul>
 * Reaching the end of the input simply ends the scan.
 *
 * @return the alignment that was read
 * @throws MissingBlockException if a required TAXA block is absent, or if the
 *                               input ends without a DATA or CHARACTERS block
 * @throws IOException           if reading from the underlying input fails
 */
public Alignment importAlignment() throws IOException, Importer.ImportException {
    Alignment result = null;
    TaxonList taxa = null;

    try {
        while (true) {
            final NexusImporter.NexusBlock block = findNextBlock();

            if (block == NexusImporter.TAXA_BLOCK) {
                taxa = readTaxaBlock();
                continue;
            }
            if (block == NexusImporter.DATA_BLOCK) {
                result = readDataBlock(/*taxa*/);
                break;
            }

            final boolean isCalibration = (block == NexusImporter.CALIBRATION_BLOCK);
            if (!isCalibration && block != NexusImporter.CHARACTERS_BLOCK) {
                // not a block we handle: move on to the next BEGIN
                continue;
            }

            // both remaining block types depend on a previously read TAXA block
            if (taxa == null) {
                throw new MissingBlockException("TAXA block is missing");
            }
            if (isCalibration) {
                readCalibrationBlock(taxa);
            } else {
                result = readCharactersBlock(taxa);
                break;
            }
        }
    } catch (EOFException endOfInput) {
        // ran out of input; whether that is an error is decided below
    }

    if (result == null) {
        throw new MissingBlockException("DATA or CHARACTERS block is missing");
    }
    return result;
}
/**
 * Imports the sequences as an alignment; this is exactly {@link #importAlignment()}.
 *
 * @return the alignment read from the input
 * @throws ImportException if the input is malformed or lacks a required block
 * @throws IOException     if reading from the underlying input fails
 */
public SequenceList importSequences() throws IOException, ImportException {
    final Alignment alignment = importAlignment();
    return alignment;
}
// **************************************************************
// TreeImporter IMPLEMENTATION
// **************************************************************
// True while hasTree()/importNextTree() are part-way through a TREES block.
// Reset to false by importTree(), importTrees() and by importNextTree() once
// no further tree is available.
private boolean isReadingTreesBlock = false;
// Label-to-taxon table produced by readTranslationList() for the TREES block
// currently being read through importTree()/hasTree().
private HashMap<String, Taxon> translationList = null;
// Tree read ahead by hasTree() that importNextTree() has not handed out yet.
private Tree nextTree = null;
// One-element holder for the most recently read command token; it is written
// by readTranslationList()/readNextTree() and read back by readNextTree().
private final String[] lastToken = new String[1];
/**
 * Imports the first tree of the next TREES block without taxon-list numbering;
 * shorthand for {@code importTree(taxonList, false)}.
 *
 * @param taxonList taxa to resolve tree labels against, or null
 * @return the tree that was read
 * @throws ImportException if the TREES block is missing or malformed
 * @throws IOException     if reading from the underlying input fails
 */
public Tree importTree(TaxonList taxonList) throws IOException, ImportException {
    final boolean useTaxonListNumbering = false;
    return importTree(taxonList, useTaxonListNumbering);
}
/**
 * Locates the next TREES block, reads its translation list, and returns the
 * first tree in it.
 *
 * @param taxonList             taxa to resolve tree labels against; when null, a TAXA
 *                              block met before the TREES block is used instead
 * @param useTaxonListNumbering when true, {@code taxonList} (the argument as given)
 *                              is passed on to {@code readNextTree} so that tip numbers
 *                              follow its ordering
 * @return the tree that was read, as returned by {@code readNextTree}
 * @throws MissingBlockException if no TREES block is found
 * @throws ImportException       if the block is malformed
 * @throws IOException           if reading from the underlying input fails
 */
public Tree importTree(TaxonList taxonList, boolean useTaxonListNumbering) throws IOException, ImportException {
    isReadingTreesBlock = false;

    // single-element holder so startReadingTrees can fill in a taxon list it finds
    final TaxonList[] taxaHolder = {taxonList};
    final boolean foundTrees = startReadingTrees(taxaHolder);
    if (!foundTrees) {
        throw new MissingBlockException("TREES block is missing");
    }

    translationList = readTranslationList(taxaHolder[0], lastToken);

    final TaxonList numberingSource = useTaxonListNumbering ? taxonList : null;
    return readNextTree(translationList, lastToken, numberingSource);
}
/**
 * Imports every tree of the next TREES block without taxon-list numbering;
 * shorthand for {@code importTrees(taxonList, false)}.
 *
 * @param taxonList taxa to resolve tree labels against, or null
 * @return all trees in the block
 * @throws ImportException if the TREES block is missing or malformed
 * @throws IOException     if reading from the underlying input fails
 */
public Tree[] importTrees(TaxonList taxonList) throws IOException, ImportException {
    final boolean useTaxonListNumbering = false;
    return importTrees(taxonList, useTaxonListNumbering);
}
/**
 * Locates the next TREES block and returns all trees defined in it.
 *
 * @param taxonList             taxa to resolve tree labels against; when null, a TAXA
 *                              block met before the TREES block is used instead
 * @param useTaxonListNumbering passed through to {@code readTreesBlock} together with
 *                              the taxon list in effect after the scan
 * @return all trees in the block
 * @throws MissingBlockException if no TREES block is found
 * @throws ImportException       if the block is malformed or contains no trees
 * @throws IOException           if reading from the underlying input fails
 */
public Tree[] importTrees(TaxonList taxonList, boolean useTaxonListNumbering) throws IOException, ImportException {
    isReadingTreesBlock = false;

    // single-element holder so startReadingTrees can fill in a taxon list it finds
    final TaxonList[] taxaHolder = {taxonList};
    final boolean foundTrees = startReadingTrees(taxaHolder);
    if (!foundTrees) {
        throw new MissingBlockException("TREES block is missing");
    }

    return readTreesBlock(taxaHolder[0], useTaxonListNumbering);
}
/**
 * Reports whether another tree can be obtained from {@link #importNextTree()}.
 * <p>
 * On the first call (or after the previous TREES block was exhausted) this
 * searches for a TREES block and reads its translation list. It then reads one
 * tree ahead and keeps it in {@code nextTree} until it is collected.
 *
 * @return {@code true} if a tree is waiting to be collected
 * @throws ImportException if the input is malformed
 * @throws IOException     if reading from the underlying input fails
 */
public boolean hasTree() throws IOException, ImportException {
    if (!isReadingTreesBlock) {
        // element 0 starts out null, so a TAXA block found on the way is picked up
        final TaxonList[] taxaHolder = new TaxonList[]{null};

        isReadingTreesBlock = startReadingTrees(taxaHolder);
        if (!isReadingTreesBlock) {
            return false;
        }
        translationList = readTranslationList(taxaHolder[0], lastToken);
    }

    if (nextTree == null) {
        nextTree = readNextTree(translationList, lastToken, null);
    }
    return nextTree != null;
}
/**
 * Returns the next tree from the input, relying on {@link #hasTree()} to do
 * the reading.
 *
 * @return the next tree, or {@code null} when no more trees are available (in
 *         which case the importer also stops treating itself as being inside a
 *         TREES block)
 * @throws ImportException if the input is malformed
 * @throws IOException     if reading from the underlying input fails
 */
public Tree importNextTree() throws IOException, ImportException {
    if (hasTree()) {
        // hand over the tree that hasTree() read ahead and clear the slot
        final Tree result = nextTree;
        nextTree = null;
        return result;
    }

    isReadingTreesBlock = false;
    return null;
}
/**
 * Advances through the input until a TREES block header has been read.
 * <p>
 * If a TAXA block is met on the way and {@code taxonList[0]} is still null,
 * the block is read and stored there; every other block is skipped.
 *
 * @param taxonList one-element in/out holder for the taxon list
 * @return {@code true} when positioned just after a TREES block header,
 *         {@code false} if the input ended first
 * @throws ImportException if a TAXA block that is read turns out malformed
 * @throws IOException     if reading from the underlying input fails
 */
public boolean startReadingTrees(TaxonList[] taxonList) throws IOException, ImportException {
    try {
        while (true) {
            final NexusImporter.NexusBlock block = findNextBlock();

            if (block == NexusImporter.TREES_BLOCK) {
                return true;
            }
            // only read the taxon list if one hasn't been set already...
            if (block == NexusImporter.TAXA_BLOCK && taxonList[0] == null) {
                taxonList[0] = readTaxaBlock();
            }
            // anything else is ignored
        }
    } catch (EOFException endOfInput) {
        return false;
    }
}
/**
 * Reads tokens until one equals {@code query}, leaving the input positioned
 * just after that token. The only ways out of the loop are a match or an
 * exception from {@code readToken()}.
 *
 * @param query      the token to look for
 * @param ignoreCase when true the comparison ignores case; an exact match is
 *                   always accepted
 * @throws IOException if reading from the underlying input fails
 */
private void findToken(String query, boolean ignoreCase) throws IOException {
    while (true) {
        final String candidate = readToken();
        final boolean matches = candidate.equals(query)
                || (ignoreCase && candidate.equalsIgnoreCase(query));
        if (matches) {
            return;
        }
    }
}
/**
 * Skips the rest of the current block by reading tokens until {@code END} or
 * {@code ENDBLOCK} (case-insensitive) is seen, then marks {@code nextBlock} as
 * {@link #UNKNOWN_BLOCK}. A missing terminator at the end of the input is
 * tolerated.
 *
 * @throws IOException if reading fails for a reason other than end of input
 */
public void findEndBlock() throws IOException {
    try {
        while (true) {
            final String word = readToken(";");
            if (word.equalsIgnoreCase("END") || word.equalsIgnoreCase("ENDBLOCK")) {
                break;
            }
        }
    } catch (EOFException endOfInput) {
        // Doesn't matter if the End is missing
    }

    nextBlock = UNKNOWN_BLOCK;
}
/**
 * Reads the header information for a 'DATA', 'CHARACTERS' or 'TAXA' block.
 * <p>
 * Tokens are consumed until one equals {@code tokenToLookFor} (ignoring case).
 * Along the way the TITLE, DIMENSIONS and FORMAT commands are recognised; any
 * other token is skipped. The parsed values are stored in the importer's
 * {@code taxonCount}, {@code siteCount}, {@code dataType}, {@code gapCharacters},
 * {@code missingCharacters}, {@code matchCharacters} and {@code isInterleaved}
 * variables.
 *
 * @param tokenToLookFor the command that ends the header (the callers in this
 *                       class pass "TAXLABELS" or "MATRIX")
 * @param block          the block type being read; determines which DIMENSIONS
 *                       subcommands are required or forbidden
 * @throws ImportException if a command is duplicated, malformed, or a required
 *                         field (DIMENSIONS, or DATATYPE for non-TAXA blocks) is missing
 * @throws IOException     if reading from the underlying input fails
 */
private void readDataBlockHeader(String tokenToLookFor, NexusBlock block) throws ImportException, IOException {
// flags recording which of DIMENSIONS / TITLE / FORMAT have been seen
boolean dim = false, ttl = false, fmt = false;
String token;
do {
token = readToken();
if (token.equalsIgnoreCase("TITLE")) {
// only the keyword is handled here; the tokens that follow it simply
// pass through the loop unmatched
if (ttl) {
throw new DuplicateFieldException("TITLE");
}
ttl = true;
} else if (token.equalsIgnoreCase("DIMENSIONS")) {
if (dim) {
throw new DuplicateFieldException("DIMENSIONS");
}
// a subcommand that is not applicable to this block type counts as
// already satisfied: NCHAR for TAXA, NTAX for CHARACTERS
boolean nchar = (block == TAXA_BLOCK);
boolean ntax = (block == CHARACTERS_BLOCK);
do {
String token2 = readToken("=;");
// every DIMENSIONS subcommand must be followed by '='
if (getLastDelimiter() != '=') {
throw new BadFormatException("Unknown subcommand, '" + token2 + "', or missing '=' in DIMENSIONS command");
}
if (token2.equalsIgnoreCase("NTAX")) {
if (block == CHARACTERS_BLOCK) {
throw new BadFormatException("NTAX subcommand in CHARACTERS block");
}
taxonCount = readInteger(";");
ntax = true;
} else if (token2.equalsIgnoreCase("NCHAR")) {
if (block == TAXA_BLOCK) {
throw new BadFormatException("NCHAR subcommand in TAXA block");
}
siteCount = readInteger(";");
nchar = true;
} else {
throw new BadFormatException("Unknown subcommand, '" + token2 + "', in DIMENSIONS command");
}
} while (getLastDelimiter() != ';');
if (!ntax) {
throw new BadFormatException("NTAX subcommand missing from DIMENSIONS command");
}
if (!nchar) {
throw new BadFormatException("NCHAR subcommand missing from DIMENSIONS command");
}
dim = true;
} else if (token.equalsIgnoreCase("FORMAT")) {
if (fmt) {
throw new DuplicateFieldException("FORMAT");
}
// reset so that a FORMAT command without a recognised DATATYPE is
// detected by the check after the loop
dataType = null;
do {
String token2 = readToken("=;");
if (token2.equalsIgnoreCase("GAP")) {
if (getLastDelimiter() != '=') {
throw new BadFormatException("Expecting '=' after GAP subcommand in FORMAT command");
}
gapCharacters = readToken(";");
} else if (token2.equalsIgnoreCase("MISSING")) {
if (getLastDelimiter() != '=') {
throw new BadFormatException("Expecting '=' after MISSING subcommand in FORMAT command");
}
missingCharacters = readToken(";");
} else if (token2.equalsIgnoreCase("MATCHCHAR")) {
if (getLastDelimiter() != '=') {
throw new BadFormatException("Expecting '=' after MATCHCHAR subcommand in FORMAT command");
}
matchCharacters = readToken(";");
} else if (token2.equalsIgnoreCase("DATATYPE")) {
if (getLastDelimiter() != '=') {
throw new BadFormatException("Expecting '=' after DATATYPE subcommand in FORMAT command");
}
String token3 = readToken(";");
// a DATATYPE value not listed below leaves dataType null
if (token3.equalsIgnoreCase("NUCLEOTIDE") ||
token3.equalsIgnoreCase("DNA") ||
token3.equalsIgnoreCase("RNA")) {
dataType = Nucleotides.INSTANCE;
} else if (token3.equalsIgnoreCase("STANDARD") || token3.equalsIgnoreCase("BINARY")) {
dataType = TwoStates.INSTANCE;
} else if (token3.equalsIgnoreCase("PROTEIN")) {
dataType = AminoAcids.INSTANCE;
} else if (token3.equalsIgnoreCase("CONTINUOUS")) {
throw new UnparsableDataException("Continuous data cannot be parsed at present");
}
} else if (token2.equalsIgnoreCase("INTERLEAVE")) {
isInterleaved = true;
}
// any other FORMAT subcommand is silently skipped
} while (getLastDelimiter() != ';');
fmt = true;
}
} while (!token.equalsIgnoreCase(tokenToLookFor));
// DIMENSIONS is mandatory for every block type handled here
if (!dim) {
throw new MissingFieldException("DIMENSIONS");
}
// a data type is required for everything except a TAXA block
if (block != TAXA_BLOCK && dataType == null) {
throw new MissingFieldException("DATATYPE");
}
}
/**
 * Reads sequences in a 'DATA' or 'CHARACTERS' block.
 * <p>
 * Relies on {@code taxonCount}, {@code siteCount}, {@code dataType},
 * {@code isInterleaved} and the gap/missing/match character settings having
 * been set beforehand (in this class by {@code readDataBlockHeader}).
 * <p>
 * In interleaved mode the matrix is read in rounds of {@code taxonCount} lines
 * until {@code siteCount} sites have been read; otherwise one complete sequence
 * is read per taxon.
 *
 * @param sequences the container that receives one {@code Sequence} per taxon
 * @param taxonList taxa to resolve sequence labels against; when null a new
 *                  {@code Taxon} is created for each label
 * @throws ImportException if a label is unknown, a sequence has the wrong
 *                         length, the matrix ends early, or the closing ';' is missing
 * @throws IOException     if reading from the underlying input fails
 */
private void readSequenceData(Sequences sequences, TaxonList taxonList) throws ImportException, IOException {
int n, i;
// first sequence string read; passed to the sequence readers together with
// matchCharacters (presumably to resolve match characters - confirm in Importer)
String firstSequence = null;
if (isInterleaved) {
boolean firstLoop = true;
int readCount = 0;
while (readCount < siteCount) {
// n is the line length for this round; -1 until the first line is read
n = -1;
for (i = 0; i < taxonCount; i++) {
String token = readToken().trim();
Sequence sequence;
if (firstLoop) {
// first round: create the sequence and attach its taxon
sequence = new Sequence();
sequence.setDataType(dataType);
sequences.addSequence(sequence);
Taxon taxon;
if (taxonList != null) {
int index = taxonList.getTaxonIndex(token.trim());
if (index == -1) {
// taxon not found in taxon list...
// ...perhaps it is a numerical taxon reference?
throw new UnknownTaxonException(token);
} else {
taxon = taxonList.getTaxon(index);
}
} else {
taxon = new Taxon(token.trim());
}
sequence.setTaxon(taxon);
} else {
// later rounds: labels must repeat in the same order as the first round
sequence = sequences.getSequence(i);
Taxon taxon = sequence.getTaxon();
if (!taxon.getId().equals(token)) {
throw new UnknownTaxonException("Unknown taxon label: expecting '" +
taxon.getId() + "', found '" + token + "'");
}
}
StringBuffer buffer = new StringBuffer();
readSequenceLine(buffer, dataType, ";", gapCharacters, missingCharacters,
matchCharacters, firstSequence);
String seqString = buffer.toString();
sequence.appendSequenceString(seqString);
if (i == 0) {
firstSequence = seqString;
}
// a ';' ends the matrix: it may only follow the last taxon of the
// round that completes siteCount sites
if (getLastDelimiter() == ';') {
if (i < taxonCount - 1) {
throw new TooFewTaxaException();
}
// NOTE(review): n is still -1 here when this is the first line of the
// round (i == 0, i.e. taxonCount == 1) - confirm that case is intended
if (readCount + n < siteCount) {
throw new ShortSequenceException(sequence.getTaxon().getId());
}
}
if (n == -1) {
n = seqString.length();
}
// every line within one round must have the same length
if (n != seqString.length()) {
throw new ShortSequenceException(sequence.getTaxon().getId());
}
}
firstLoop = false;
readCount += n;
}
if (getLastDelimiter() != ';') {
throw new BadFormatException("Expecting ';' after sequences data");
}
} else {
// non-interleaved: one label followed by one full-length sequence per taxon
for (i = 0; i < taxonCount; i++) {
String token = readToken().trim();
Sequence sequence = new Sequence();
sequence.setDataType(dataType);
sequences.addSequence(sequence);
Taxon taxon;
if (taxonList != null) {
int index = taxonList.getTaxonIndex(token);
if (index == -1) {
// taxon not found in taxon list...
// ...perhaps it is a numerical taxon reference?
throw new UnknownTaxonException(token);
} else {
taxon = taxonList.getTaxon(index);
}
} else {
taxon = new Taxon(token);
}
sequence.setTaxon(taxon);
StringBuffer buffer = new StringBuffer();
readSequence(buffer, dataType, ";", siteCount, gapCharacters,
missingCharacters, matchCharacters, firstSequence);
String seqString = buffer.toString();
// the sequence must contain exactly siteCount characters
if (seqString.length() != siteCount) {
throw new ShortSequenceException(sequence.getTaxon().getId());
}
sequence.appendSequenceString(seqString);
if (i == 0) {
firstSequence = seqString;
}
// the terminating ';' must not appear before the last taxon
if (getLastDelimiter() == ';' && i < taxonCount - 1) {
throw new TooFewTaxaException();
}
}
if (getLastDelimiter() != ';') {
throw new BadFormatException("Expecting ';' after sequences data, has '"
+ (char) getLastDelimiter() + "' in line " + getLineNumber());
}
}
}
/**
 * Reads a 'TAXA' block: the header up to TAXLABELS, the labels up to the
 * terminating ';', and then the remainder of the block.
 *
 * @return the taxa in the order in which they are listed
 * @throws ImportException          if the taxon count is missing or zero, or does
 *                                  not match the number of labels
 * @throws IOException              if reading from the underlying input fails
 * @throws IllegalArgumentException if two taxa share the same name
 */
private TaxonList readTaxaBlock() throws ImportException, IOException, IllegalArgumentException {
    taxonCount = 0;
    readDataBlockHeader("TAXLABELS", TAXA_BLOCK);
    if (taxonCount == 0) {
        throw new MissingFieldException("NTAXA");
    }

    final Taxa taxa = new Taxa();
    do {
        final String label = readToken(";").trim();
        if (!label.isEmpty()) {
            taxa.addTaxon(new Taxon(label));
        }
    } while (getLastDelimiter() != ';');

    if (taxonCount != taxa.getTaxonCount()) {
        throw new BadFormatException("Number of taxa doesn't match NTAXA field");
    }

    findEndBlock();

    // the duplicate check runs only after the whole block has been consumed
    final int duplicateIndex = TaxonList.Utils.findDuplicateTaxon(taxa);
    if (duplicateIndex >= 0) {
        final String duplicateId = taxa.getTaxon(duplicateIndex).getId();
        throw new IllegalArgumentException("Tree contains duplicate taxon name: " + duplicateId +
                "!\nAll taxon names should be unique.");
    }
    return taxa;
}
/**
 * Reads a 'CHARACTERS' block into a new alignment.
 * <p>
 * {@code siteCount} and {@code dataType} are reset before the header is read;
 * {@code taxonCount} is not, so the value left by an earlier block is used.
 *
 * @param taxonList taxa to resolve sequence labels against
 * @return the alignment that was read
 * @throws ImportException if the header or the matrix is malformed
 * @throws IOException     if reading from the underlying input fails
 */
private Alignment readCharactersBlock(TaxonList taxonList) throws ImportException, IOException {
    dataType = null;
    siteCount = 0;
    readDataBlockHeader("MATRIX", CHARACTERS_BLOCK);

    final SimpleAlignment result = new SimpleAlignment();
    readSequenceData(result, taxonList);
    result.updateSiteCount();

    findEndBlock();
    return result;
}
/**
 * Reads a 'DATA' block into a new alignment. The block carries its own taxon
 * and site counts, and taxa are created from the sequence labels rather than
 * looked up in a taxon list.
 *
 * @return the alignment that was read
 * @throws ImportException if the header or the matrix is malformed
 * @throws IOException     if reading from the underlying input fails
 */
private Alignment readDataBlock(/*TaxonList taxonList*/) throws ImportException, IOException {
    dataType = null;
    siteCount = 0;
    taxonCount = 0;
    readDataBlockHeader("MATRIX", DATA_BLOCK);

    final SimpleAlignment result = new SimpleAlignment();
    // null taxon list: a fresh Taxon is made for each label
    readSequenceData(result, null);
    result.updateSiteCount();

    findEndBlock();
    return result;
}
/**
 * Reads a 'TREES' block: the optional translation list followed by every tree
 * up to the end of the block, then marks {@code nextBlock} as unknown.
 *
 * @param taxonList             taxa to resolve labels against, or null
 * @param useTaxonListNumbering when true, {@code taxonList} is also handed to
 *                              {@code readNextTree} to number the tips
 * @return the trees in file order (never empty)
 * @throws ImportException if the block is malformed or defines no trees
 * @throws IOException     if reading from the underlying input fails
 */
private Tree[] readTreesBlock(TaxonList taxonList, boolean useTaxonListNumbering) throws ImportException, IOException {
    // local token holder and translation table, independent of the importer's fields
    final String[] pendingToken = new String[1];
    final HashMap<String, Taxon> translations = readTranslationList(taxonList, pendingToken);
    final TaxonList numberingSource = useTaxonListNumbering ? taxonList : null;

    final ArrayList<Tree> collected = new ArrayList<Tree>();
    for (Tree tree = readNextTree(translations, pendingToken, numberingSource);
         tree != null;
         tree = readNextTree(translations, pendingToken, numberingSource)) {
        collected.add(tree);
    }

    if (collected.isEmpty()) {
        throw new BadFormatException("No trees defined in TREES block");
    }

    nextBlock = UNKNOWN_BLOCK;
    return collected.toArray(new Tree[collected.size()]);
}
/**
 * Reads the first command of a TREES block and builds the label-to-taxon table
 * used to resolve tip labels.
 * <ul>
 * <li>If the command is TRANSLATE, each "key label" pair is added to the table
 * and the command after it is read.</li>
 * <li>Otherwise, if a taxon list is given, every taxon is mapped from its own id.</li>
 * <li>Otherwise the table stays empty.</li>
 * </ul>
 * The last command token read (not yet processed) is stored in {@code lastToken[0]}.
 *
 * @param taxonList taxa to resolve labels against, or null to create new taxa
 * @param lastToken one-element holder that receives the pending command token
 * @return the translation table (possibly empty, never null)
 * @throws ImportException if the TRANSLATE command is malformed, names an
 *                         unknown taxon, or uses a key twice
 * @throws IOException     if reading from the underlying input fails
 */
private HashMap<String, Taxon> readTranslationList(TaxonList taxonList, String[] lastToken) throws ImportException, IOException {
    final HashMap<String, Taxon> table = new HashMap<String, Taxon>();

    String command = readToken(";");

    if (command.equalsIgnoreCase("TRANSLATE")) {
        do {
            final String key = readToken(",;");
            // a ',' or ';' directly after the key means its label is absent
            if (getLastDelimiter() == ',' || getLastDelimiter() == ';') {
                throw new BadFormatException("Missing taxon label in TRANSLATE command of TREES block");
            }

            final String taxonLabel = readToken(",;");
            if (!(getLastDelimiter() == ',' || getLastDelimiter() == ';')) {
                throw new BadFormatException("Expecting ',' or ';' after taxon label in TRANSLATE command of TREES block");
            }

            final Taxon taxon;
            if (taxonList == null) {
                taxon = new Taxon(taxonLabel);
            } else {
                final int position = taxonList.getTaxonIndex(taxonLabel);
                if (position == -1) {
                    // taxon not found in taxon list...
                    // ...perhaps it is a numerical taxon reference?
                    throw new UnknownTaxonException(taxonLabel);
                }
                taxon = taxonList.getTaxon(position);
            }

            if (table.containsKey(key)) {
                throw new BadFormatException("Translation list uses the key, " + key + ", more than once.");
            }
            table.put(key, taxon);
        } while (getLastDelimiter() != ';');

        // move on to the command that follows TRANSLATE
        command = readToken(";");
    } else if (taxonList != null) {
        // no TRANSLATE command: let every taxon translate from its own id
        for (int i = 0; i < taxonList.getTaxonCount(); i++) {
            final Taxon taxon = taxonList.getTaxon(i);
            table.put(taxon.getId(), taxon);
        }
    }

    lastToken[0] = command;
    return table;
}
/**
 * Reads the next tree of a TREES block.
 * <p>
 * The command keyword to act on is taken from {@code lastToken[0]} (it was read
 * by the previous call or by {@code readTranslationList}). For TREE/UTREE the
 * tree is parsed and the following command keyword is stored back into
 * {@code lastToken[0]}; for END/ENDBLOCK null is returned.
 *
 * @param translationList label-to-taxon table used to resolve tip labels
 * @param lastToken       one-element holder for the pending command keyword
 * @param taxonList       when non-null, tip numbers are taken from the taxon's
 *                        index in this list; when null they are derived from the
 *                        translation keys
 * @return the tree, or null at the end of the block or the end of the input
 * @throws ImportException if the tree command is malformed or an unknown command is found
 * @throws IOException     if reading fails for a reason other than end of input
 */
private Tree readNextTree(HashMap<String, Taxon> translationList, String[] lastToken, TaxonList taxonList) throws ImportException, IOException {
try {
Tree tree = null;
String token = lastToken[0];
if (token.equalsIgnoreCase("UTREE") || token.equalsIgnoreCase("TREE")) {
if (nextCharacter() == '*') {
// Star is used to specify a default tree - ignore it
readCharacter();
}
// token2 is the tree's label, read up to the '='
String token2 = readToken("=;");
// Save tree comment and attach it later
final String comment = getLastMetaComment();
clearLastMetaComment();
if (getLastDelimiter() != '=') {
throw new BadFormatException("Missing label for tree'" + token2 + "' or missing '=' in TREE command of TREES block");
}
try {
if (nextCharacter() != '(') {
throw new BadFormatException("Missing tree definition in TREE command of TREES block");
}
// tree special comments
// (a meta-comment pending at this point, before the tree definition is read;
// handled further down for the R and W markers)
final String scomment = getLastMetaComment();
clearLastMetaComment();
FlexibleNode root = readInternalNode(translationList);
if (translationList != null) {
// this ensures that if a translation list is used, the external node numbers
// of the trees correspond as well.
Map<Taxon, Integer> taxonNumberMap = new HashMap<Taxon, Integer>();
int count = 0;
for (String label : translationList.keySet()) {
Taxon taxon = translationList.get(label);
int number;
if (taxonList != null) { // Map back to original numbering from TaxonList
number = taxonList.getTaxonIndex(taxon);
} else { // Old functionality
// numeric keys are 1-based in the file; other keys fall back to the
// position in the key-set iteration
try {
number = Integer.parseInt(label) - 1;
} catch (NumberFormatException nfe) {
number = count;
}
}
taxonNumberMap.put(taxon, number);
count++;
}
tree = new FlexibleTree(root, false, true, taxonNumberMap);
} else {
tree = new FlexibleTree(root, false, true, null);
}
tree.setId(token2);
if (getLastDelimiter() == ':') {
// in case the root has a branch length, skip it
readToken(";");
// NOTE(review): unlike readBranch/readInternalNode/readExternalNode, this does
// not consult ignoreMetaComments - confirm whether that is intended
if (getLastMetaComment() != null) {
// There was a meta-comment which should be in the form:
// \[&label[=value][,label[=value]>[,/..]]\]
try {
parseMetaCommentPairs(getLastMetaComment(), root);
} catch (BadFormatException bfe) {
// ignore it
}
clearLastMetaComment();
}
}
if (getLastDelimiter() != ';') {
throw new BadFormatException("Expecting ';' after tree, '" + token2 + "', TREE command of TREES block");
}
if (scomment != null) {
// below is correct only if [&W] appears on it own
// Scan the special comment one marker at a time: ';' separates markers,
// 'R' is skipped, 'W' introduces a weight, anything else is dropped.
String c = scomment;
while (c.length() > 0) {
final char ch = c.charAt(0);
if (ch == ';') {
c = c.substring(1);
continue;
}
if (ch == 'R') {
// we only have rooted trees anyway
c = c.substring(1);
} else if (ch == 'W') {
int e = c.indexOf(';');
if (e < 0) e = c.length();
try {
// the value starts two characters in: after 'W' and the character that
// follows it (presumably a space - confirm against the file format)
final Float value = new Float(c.substring(2, e));
tree.setAttribute("weight", value);
} catch (NumberFormatException ex) {
// don't fail, ignore
}
c = c.substring(e);
} else {
c = c.substring(1);
}
}
}
if (comment != null) {
// try to interpret the tree comment as attribute pairs; if that fails, keep
// the raw text instead
try {
parseMetaCommentPairs(comment, tree);
} catch (Importer.BadFormatException e) {
// set generic comment attribute
tree.setAttribute("comment", comment);
}
}
} catch (EOFException e) {
// If we reach EOF we may as well return what we have?
// (tree is still null if the input ended before the tree was built;
// lastToken[0] is not updated on this path)
return tree;
}
// read the keyword of the next command for the following call
token = readToken(";");
} else if (token.equalsIgnoreCase("ENDBLOCK") || token.equalsIgnoreCase("END")) {
return null;
} else {
throw new BadFormatException("Unknown command '" + token + "' in TREES block");
}
//added this to escape readNextTree loop correctly -- AJD
lastToken[0] = token;
return tree;
} catch (EOFException e) {
// end of input outside a tree definition: report that there are no more trees
return null;
}
}
/**
 * Reads one branch of a tree: the node it leads to (an internal node when the
 * next character is '(', otherwise a tip), an optional node label, and an
 * optional branch length introduced by ':'.
 * <p>
 * A meta-comment found after the branch length is parsed into attributes of
 * the node unless {@code ignoreMetaComments} is set. The branch length
 * defaults to 0.0 when none is given.
 *
 * @param translationList label-to-taxon table used to resolve tip labels
 * @return the node at the end of the branch, with its length set
 * @throws ImportException if the tree definition is malformed
 * @throws IOException     if reading from the underlying input fails
 */
FlexibleNode readBranch(HashMap<String, Taxon> translationList) throws IOException, ImportException {
    clearLastMetaComment();

    final FlexibleNode branch;
    if (nextCharacter() != '(') {
        // is an external node
        branch = readExternalNode(translationList);
    } else {
        // is an internal node
        branch = readInternalNode(translationList);
    }

    // anything other than ':', ',' or ')' means a label follows the node
    final boolean atStructuralDelimiter = getLastDelimiter() == ':'
            || getLastDelimiter() == ','
            || getLastDelimiter() == ')';
    if (!atStructuralDelimiter) {
        final String nodeLabel = readToken(",():;");
        if (nodeLabel.length() > 0) {
            branch.setAttribute("label", nodeLabel);
        }
    }

    double branchLength = 0.0;
    if (getLastDelimiter() == ':') {
        branchLength = readDouble(",():;");

        final String meta = getLastMetaComment();
        if (meta != null) {
            if (!ignoreMetaComments) {
                // There was a meta-comment which should be in the form:
                // \[&label[=value][,label[=value]>[,/..]]\]
                try {
                    parseMetaCommentPairs(meta, branch);
                } catch (BadFormatException bfe) {
                    // ignore it
                }
            }
            clearLastMetaComment();
        }
    }

    branch.setLength(branchLength);
    return branch;
}
/**
 * Reads an internal node: the opening '(', one or more comma-separated child
 * branches (so polytomies are accepted), and the closing ')', followed by
 * whatever token comes before the next delimiter.
 * <p>
 * A node with only one child is accepted but logged as a warning unless
 * {@code suppressWarnings} is set. A meta-comment found after the closing ')'
 * is parsed into attributes of the node unless {@code ignoreMetaComments} is set.
 *
 * @param translationList label-to-taxon table used to resolve tip labels
 * @return the internal node with its children attached
 * @throws ImportException if the closing ')' is missing or a child is malformed
 * @throws IOException     if reading from the underlying input fails
 */
FlexibleNode readInternalNode(HashMap<String, Taxon> translationList) throws IOException, ImportException {
    final FlexibleNode node = new FlexibleNode();

    // read the opening '('
    readCharacter();

    // read the first child
    node.addChild(readBranch(translationList));

    final boolean onlyChild = getLastDelimiter() != ',';
    if (onlyChild && !suppressWarnings) {
        java.util.logging.Logger.getLogger("dr.evolution.io").warning("Internal node only has a single child.");
    }

    // each ',' announces a further child
    while (getLastDelimiter() == ',') {
        node.addChild(readBranch(translationList));
    }

    // should have had a closing ')'
    if (getLastDelimiter() != ')') {
        throw new BadFormatException("Missing closing ')' in tree in TREES block");
    }

    // find the next delimiter; the token itself is not used here
    readToken(":(),;");

    final String meta = getLastMetaComment();
    if (meta != null) {
        if (!ignoreMetaComments) {
            // There was a meta-comment which should be in the form:
            // \[&label[=value][,label[=value]>[,/..]]\]
            try {
                parseMetaCommentPairs(meta, node);
            } catch (BadFormatException bfe) {
                // ignore it
            }
        }
        clearLastMetaComment();
    }

    return node;
}
// private void labelNode(FlexibleNode node, String label, String value) {
// // Attempt to format the value as a number
// Number number = null;
// try {
// number = Integer.valueOf(value);
// } catch (NumberFormatException nfe1) {
// try {
// number = Double.valueOf(value);
// } catch (NumberFormatException nfe2) {
// //
// }
// }
// if (number != null) {
// node.setAttribute(label, number);
// } else {
// node.setAttribute(label, value);
// }
// }
/**
 * Reads an external node (a tip): its label up to the next delimiter, resolved
 * to a taxon.
 * <p>
 * If the translation table is non-empty the label must be one of its keys;
 * if it is empty a new {@code Taxon} is created from the label. A meta-comment
 * found while reading the label is parsed into attributes of the node unless
 * {@code ignoreMetaComments} is set.
 *
 * @param translationList label-to-taxon table (must not be null)
 * @return a node carrying the resolved taxon
 * @throws UnknownTaxonException if the label is absent from a non-empty table
 * @throws IOException           if reading from the underlying input fails
 */
FlexibleNode readExternalNode(HashMap<String, Taxon> translationList) throws ImportException, IOException {
    final FlexibleNode tip = new FlexibleNode();
    final String tipLabel = readToken(":(),;");

    final Taxon taxon;
    if (translationList.isEmpty()) {
        taxon = new Taxon(tipLabel);
    } else {
        taxon = translationList.get(tipLabel);
        if (taxon == null) {
            // taxon not found in taxon list...
            throw new UnknownTaxonException("Taxon in tree, '" + tipLabel + "' is unknown");
        }
    }

    final String meta = getLastMetaComment();
    if (meta != null) {
        if (!ignoreMetaComments) {
            // There was a meta-comment which should be in the form:
            // \[&label[=value][,label[=value]>[,/..]]\]
            try {
                parseMetaCommentPairs(meta, tip);
            } catch (BadFormatException bfe) {
                // ignore it
            }
        }
        clearLastMetaComment();
    }

    tip.setTaxon(taxon);
    return tip;
}
/**
 * Reads a 'CALIBRATION' block.
 * <p>
 * Recognised commands are OPTIONS (subcommands SCALE, ORIGIN, DIRECTION) and
 * TIPCALIBRATION; NODECALIBRATION is rejected. Each TIPCALIBRATION entry
 * creates one date from its value using the units, origin and direction in
 * force at that point, and stores it as the "date" attribute of every taxon
 * named in the entry. Defaults are YEARS, origin 0.0 and forwards.
 *
 * @param taxonList the taxa that receive the dates
 * @return the dates created, in the order in which they were read
 * @throws ImportException if a command or subcommand is unknown or malformed,
 *                         or a named taxon is not in {@code taxonList}
 * @throws IOException     if reading from the underlying input fails
 */
private dr.evolution.util.Date[] readCalibrationBlock(TaxonList taxonList) throws ImportException, IOException {
// defaults, in effect until an OPTIONS command overrides them
double origin = 0.0;
boolean isBackwards = false;
Units.Type units = Units.Type.YEARS;
ArrayList<Date> dates = new ArrayList<Date>();
String token;
boolean done = false;
do {
token = readToken(";");
if (token.equalsIgnoreCase("OPTIONS")) {
do {
String token2 = readToken("=;");
// every OPTIONS subcommand takes the form name=value
if (getLastDelimiter() != '=') {
throw new BadFormatException("Unknown subcommand, '" + token2 + "', or missing '=' in OPTIONS command of CALIBRATION block");
}
if (token2.equalsIgnoreCase("SCALE")) {
String token3 = readToken(";");
if (token3.equalsIgnoreCase("DAYS")) {
units = Units.Type.DAYS;
} else if (token3.equalsIgnoreCase("MONTHS")) {
units = Units.Type.MONTHS;
} else if (token3.equalsIgnoreCase("YEARS")) {
units = Units.Type.YEARS;
} else {
throw new BadFormatException("SCALE in OPTIONS command of CALIBRATION block must be one of DAYS, MONTHS or YEARS");
}
} else if (token2.equalsIgnoreCase("ORIGIN")) {
origin = readDouble(";");
} else if (token2.equalsIgnoreCase("DIRECTION")) {
String token3 = readToken(";");
if (token3.equalsIgnoreCase("FORWARDS")) {
isBackwards = false;
} else if (token3.equalsIgnoreCase("BACKWARDS")) {
isBackwards = true;
} else {
throw new BadFormatException("DIRECTION in OPTIONS command of CALIBRATION block must be either FORWARDS or BACKWARDS");
}
} else {
throw new BadFormatException("Unknown subcommand, '" + token2 + "', in OPTIONS command of CALIBRATION block");
}
} while (getLastDelimiter() != ';');
} else if (token.equalsIgnoreCase("TIPCALIBRATION")) {
// entries have the form  label = value : taxon taxon ...  and are separated by ','
do {
String token2 = readToken("=;");
if (getLastDelimiter() != '=') {
throw new BadFormatException("Missing date for label '" + token2 + "' or missing '=' in TIPCALIBRATION command of CALIBRATION block");
}
double value = readDouble(":;");
if (getLastDelimiter() != ':') {
throw new BadFormatException("Missing taxon list for label '" + token2 + "' or missing ':' in TIPCALIBRATION command of CALIBRATION block");
}
dr.evolution.util.Date date;
if (isBackwards) {
date = dr.evolution.util.Date.createTimeAgoFromOrigin(value, units, origin);
} else {
date = dr.evolution.util.Date.createTimeSinceOrigin(value, units, origin);
}
dates.add(date);
// read taxon names until a ',' (next entry) or ';' (end of command) is hit;
// presumably the names themselves are separated by whitespace - confirm
// against readToken in Importer
do {
String token3 = readToken(",;");
Taxon taxon;
int index = taxonList.getTaxonIndex(token3);
if (index == -1) {
// taxon not found in taxon list...
throw new UnknownTaxonException("Unknown taxon '" + token3 + "' for label '" + token2 + "' in TIPCALIBRATION command of CALIBRATION block");
} else {
taxon = taxonList.getTaxon(index);
}
// the same Date instance is shared by all taxa of this entry
taxon.setAttribute("date", date);
} while (getLastDelimiter() != ',' && getLastDelimiter() != ';');
} while (getLastDelimiter() == ',');
} else if (token.equalsIgnoreCase("NODECALIBRATION")) {
throw new BadFormatException("NODECALIBRATION not suppored in CALIBRATION block");
} else if (token.equalsIgnoreCase("ENDBLOCK") || token.equalsIgnoreCase("END")) {
done = true;
} else {
throw new BadFormatException("Unknown command '" + token + "' in CALIBRATION block");
}
} while (!done);
dr.evolution.util.Date[] dateArray = new dr.evolution.util.Date[dates.size()];
dates.toArray(dateArray);
// the END token has been consumed, so the block is finished
nextBlock = UNKNOWN_BLOCK;
return dateArray;
}
// Matches key=value pairs, separated by commas. This can match the following
// types of meta comment pairs:
// value=number, value="string", value={item1, item2, item3}
// (label must be quoted if it contains spaces (i.e. "my label"=label)
// TODO MAS Minor change for nested arrays may cause other unforeseen bugs
// Compiled once because the expression never changes between calls.
private static final Pattern META_COMMENT_PAIR_PATTERN =
        Pattern.compile("(\"[^\"]*\"+|[^,=\\s]+)\\s*(=\\s*(\\{[^=]*\\}|\"[^\"]*\"+|[^,]+))?");

/**
 * Parses the text of a meta-comment and stores what it finds as attributes of
 * {@code item}.
 * <p>
 * Two forms are understood:
 * <ul>
 * <li>A MrBayes annotation, recognised by the prefix {@code "B "}: either
 * {@code "B name value"} or {@code "B name"} (stored as {@code Boolean.TRUE}).</li>
 * <li>A comma-separated list of {@code label} or {@code label=value} entries.
 * Values are decoded with {@code parseValue}; a label without a value is
 * stored as {@code Boolean.TRUE}. Surrounding quotes are removed from a
 * quoted label.</li>
 * </ul>
 * Attributes set before a malformed entry is met remain set.
 *
 * @param meta the meta-comment text, without its delimiters
 * @param item the object that receives the attributes
 * @throws Importer.BadFormatException if a MrBayes annotation does not have two
 *         or three non-empty parts, or a label is empty or consists of a lone
 *         quote character
 */
static void parseMetaCommentPairs(String meta, Attributable item) throws Importer.BadFormatException {
    if (meta.startsWith("B ")) {
        // a MrBayes annotation
        String[] parts = meta.split(" ");
        if (parts.length == 3 && parts[1].length() > 0 && parts[2].length() > 0) {
            item.setAttribute(parts[1], parseValue(parts[2]));
        } else if (parts.length == 2 && parts[1].length() > 0) {
            item.setAttribute(parts[1], Boolean.TRUE);
        } else {
            throw new Importer.BadFormatException("Badly formatted attribute: '" + meta + "'");
        }
        return;
    }

    Matcher matcher = META_COMMENT_PAIR_PATTERN.matcher(meta);
    while (matcher.find()) {
        // group 1 is mandatory in the pattern, so it is never null or empty here
        String label = matcher.group(1);
        if (label.charAt(0) == '\"') {
            // A lone '"' can be matched by the unquoted alternative; stripping quotes
            // from it used to fail with StringIndexOutOfBoundsException.
            if (label.length() < 2) {
                throw new Importer.BadFormatException("Badly formatted attribute: '" + matcher.group() + "'");
            }
            label = label.substring(1, label.length() - 1);
        }
        if (label.trim().length() == 0) {
            throw new Importer.BadFormatException("Badly formatted attribute: '" + matcher.group() + "'");
        }

        // group 2 is the optional "=value" part, including the leading '='
        final String value = matcher.group(2);
        if (value != null && value.trim().length() > 0) {
            // there is a specified value so try to parse it
            item.setAttribute(label, parseValue(value.substring(1)));
        } else {
            item.setAttribute(label, Boolean.TRUE);
        }
    }
}
/**
 * Tests whether a string has the shape of a decimal integer: an optional
 * leading '-' followed by one or more of the digits 0-9 and nothing else.
 * <p>
 * This is a purely syntactic check. A leading '+', surrounding white space
 * or any other character makes it fail, and the magnitude is not examined,
 * so a digit string of any length is accepted.
 *
 * @param str the string to test, may be null
 * @return true if the string consists only of an optional minus sign and digits
 */
public static boolean isInt(String str) {
    if (str == null || str.isEmpty()) {
        return false;
    }

    final int end = str.length();
    final int firstDigit = (str.charAt(0) == '-') ? 1 : 0;

    // a minus sign on its own is not a number
    if (firstDigit == end) {
        return false;
    }

    int pos = firstDigit;
    while (pos < end) {
        final char ch = str.charAt(pos);
        if (ch < '0' || ch > '9') {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * This method takes a string and tries to decode it returning the object
 * that best fits the data. The string is trimmed and then tried, in this
 * order, as:
 * <ul>
 * <li>a comma delimited list enclosed in {..}: parseValue() is called on each
 * element and an Object[] is returned (an empty list, {}, gives null). Lists
 * may be nested; when the list starts with n opening braces it is only split
 * at commas that sit between n closing and n opening braces;</li>
 * <li>a colour: if the value starts with a # then the following integer is
 * decoded as an RGB colour - see Color.decode();</li>
 * <li>a Boolean (TRUE or FALSE, ignoring case);</li>
 * <li>an Integer;</li>
 * <li>a Double (this also covers whole numbers too large for an Integer).</li>
 * </ul>
 * If nothing else fits then the value will be returned as a string but trimmed
 * of leading and trailing white space.
 *
 * @param value the string
 * @return the object
 */
public static Serializable parseValue(String value) {

    value = value.trim();

    // A lone "{" has no list body to extract; it is left to fall through and
    // ends up being returned as a string.
    if (value.startsWith("{") && value.length() > 1) {
        // the value is a list so recursively parse the elements
        // and return an array
        String inside = value.substring(1, value.length() - 1);
        if (inside.length() == 0) {
            return null;
        }

        // Determine depth of further nesting. The bounds check matters when
        // the list body consists of opening braces only (e.g. "{{}").
        int depth = 0;
        while (depth < inside.length() && inside.charAt(depth) == '{') {
            depth++;
        }

        StringBuilder split;
        if (depth == 0) {
            split = new StringBuilder(",");
        } else {
            // Split only at a comma preceded by 'depth' closing braces and
            // followed by 'depth' opening braces. Look-behind / look-ahead
            // are used so that the braces stay attached to the elements.
            StringBuilder rightBookEnd = new StringBuilder("(?<=");
            StringBuilder leftBookEnd = new StringBuilder("(?=");
            for (int i = 0; i < depth; ++i) {
                rightBookEnd.append("\\}");
                leftBookEnd.append("\\{");
            }
            leftBookEnd.append(")");
            rightBookEnd.append(")");
            split = rightBookEnd.append(",").append(leftBookEnd);
        }

        // Non-destructive split
        String[] elements = inside.split(split.toString());
        Object[] values = new Object[elements.length];
        for (int i = 0; i < elements.length; i++) {
            values[i] = parseValue(elements[i]);
        }
        return values;
    }

    if (value.startsWith("#")) {
        // I am not sure whether this is a good idea but
        // I am going to assume that a # denotes an RGB colour
        try {
            return Color.decode(value.substring(1));
        } catch (NumberFormatException ignored) {
            // not a colour
        }
    }

    if (value.equalsIgnoreCase("TRUE") || value.equalsIgnoreCase("FALSE")) {
        return Boolean.valueOf(value);
    }

    // Attempt to format the value as an integer.
    // Throwing an exception to test for an integer is slow, so the cheap
    // syntactic check is done first. isInt() does not look at the magnitude
    // though, so Integer.valueOf() can still reject a long run of digits.
    if (isInt(value)) {
        try {
            return Integer.valueOf(value);
        } catch (NumberFormatException ignored) {
            // out of the int range - fall through and represent it as a double
        }
    }

    // Attempt to format the value as a double
    try {
        return Double.valueOf(value);
    } catch (NumberFormatException ignored) {
        // not a double
    }

    // return the trimmed string
    return value;
}
// private stuff

// The block the importer considers to be next. It starts out as null and is
// set to UNKNOWN_BLOCK once the CALIBRATION block has been read; other
// writers are outside this part of the file.
private NexusBlock nextBlock = null;
// Both start at 0. NOTE(review): presumably the dimensions read from the
// NEXUS file - confirm against the code that assigns them.
private int taxonCount = 0, siteCount = 0;
// null until a data type has been assigned.
private DataType dataType = null;
// Default symbol sets: "-" for gaps, "." for matches and "?" for missing data.
// NOTE(review): presumably overridden by the FORMAT command - confirm.
private String gapCharacters = "-";
private String matchCharacters = ".";
private String missingCharacters = "?";
// Defaults to false (not interleaved).
private boolean isInterleaved = false;
/**
 * Command line tool that reads trees from a file and prints every n'th one
 * to standard out in Newick format, followed by the number of trees printed.
 * <p>
 * Arguments:
 * <ol>
 * <li>the name of the tree file;</li>
 * <li>the sample frequency n - the trees at positions 0, n, 2n, ... are printed;</li>
 * <li>"true" to print branch lengths, anything else to leave them out;</li>
 * <li>"true" if the file is a NEXUS file, anything else if it holds one
 * Newick tree per line (text in front of the first '(' of a line is dropped
 * and lines without a '(' are skipped).</li>
 * </ol>
 * If fewer than four arguments are given a usage message is written to standard error.
 *
 * @param args the command line arguments described above
 * @throws IOException     if the file cannot be opened, read or closed
 * @throws ImportException if a tree cannot be parsed
 */
public static void main(String[] args) throws IOException, ImportException {

    if (args.length > 3) {

        int sampleFrequency = Integer.parseInt(args[1]);
        // Boolean.getBoolean() would look up a system property with that name;
        // the text of the argument itself has to be parsed.
        boolean includeBranchLengths = Boolean.parseBoolean(args[2]);
        boolean isNexus = Boolean.parseBoolean(args[3]);

        FileReader fileReader = new FileReader(args[0]);
        try {
            NexusImporter nexusImporter = null;
            BufferedReader reader = null;

            if (isNexus) {
                nexusImporter = new NexusImporter(fileReader);
            } else {
                reader = new BufferedReader(fileReader);
            }

            int index = 0;
            int count = 0;

            String line = null;
            if (!isNexus) {
                line = reader.readLine();
            }

            while (line != null || (isNexus && nexusImporter.hasTree())) {

                Tree tree;
                if (isNexus) {
                    tree = nexusImporter.importNextTree();
                } else {
                    int treeStart = line.indexOf('(');
                    if (treeStart < 0) {
                        // no tree on this line (e.g. a blank line) - skip it
                        line = reader.readLine();
                        continue;
                    }
                    String treeString = line.substring(treeStart).trim();
                    java.io.Reader stringReader = new java.io.StringReader(treeString);
                    NewickImporter importer = new NewickImporter(stringReader);
                    tree = importer.importNextTree();
                }

                if (index % sampleFrequency == 0) {
                    if (includeBranchLengths) {
                        System.out.println(TreeUtils.newick(tree));
                    } else {
                        System.out.println(TreeUtils.newickNoLengths(tree));
                    }
                    // every printed tree is counted, with or without branch lengths
                    count += 1;
                }
                index += 1;

                if (!isNexus) {
                    line = reader.readLine();
                }
            }

            System.out.println(count + " trees");
        } finally {
            // both the importer and the buffered reader sit on top of this reader
            fileReader.close();
        }

    } else {
        System.err.println("usage: filterTrees <tree-file-name> <sample-frequency> <include-branch-lengths> <is-nexus>");
    }
}
}