NexusImporter.java example

Explorer
beast-mcmc-master
/*
 * NexusImporter.java
 *
 * Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
 *
 * This file is part of BEAST.
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership and licensing.
 *
 * BEAST is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 *  BEAST is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAST; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA  02110-1301  USA
 */

package dr.evolution.io;

import dr.evolution.alignment.Alignment;
import dr.evolution.alignment.SimpleAlignment;
import dr.evolution.datatype.AminoAcids;
import dr.evolution.datatype.DataType;
import dr.evolution.datatype.Nucleotides;
import dr.evolution.datatype.TwoStates;
import dr.evolution.sequence.Sequence;
import dr.evolution.sequence.SequenceList;
import dr.evolution.sequence.Sequences;
import dr.evolution.tree.FlexibleNode;
import dr.evolution.tree.FlexibleTree;
import dr.evolution.tree.Tree;
import dr.evolution.tree.TreeUtils;
import dr.evolution.util.*;
import dr.util.Attributable;

import java.awt.*;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class for importing NEXUS file format
 *
 * @author Andrew Rambaut
 * @author Alexei Drummond
 * @version $Id: NexusImporter.java,v 1.30 2006/04/25 14:39:37 rambaut Exp $
 */
public class NexusImporter extends Importer implements SequenceImporter, TreeImporter {

    // Identity tokens for the NEXUS block types this importer distinguishes.
    // The constructor argument is the block keyword; findBlockName() matches it
    // against the file's block name ignoring case, and callers compare the
    // returned instance with ==.
    public static final NexusBlock UNKNOWN_BLOCK = new NexusBlock("unknown");
    public static final NexusBlock TAXA_BLOCK = new NexusBlock("TAXA");
    public static final NexusBlock CHARACTERS_BLOCK = new NexusBlock("CHARACTERS");
    public static final NexusBlock DATA_BLOCK = new NexusBlock("DATA");
    public static final NexusBlock UNALIGNED_BLOCK = new NexusBlock("UNALIGNED");
    public static final NexusBlock DISTANCES_BLOCK = new NexusBlock("DISTANCES");
    public static final NexusBlock TREES_BLOCK = new NexusBlock("TREES");
    public static final NexusBlock CALIBRATION_BLOCK = new NexusBlock("CALIBRATION");

    // Class-wide switch written by setSuppressWarnings().
    public static boolean suppressWarnings = false;

    // Assigned once by every constructor.
    // NOTE(review): presumably consulted when meta-comments are parsed; no read is visible in this part of the file - confirm.
    private final boolean ignoreMetaComments;

    /**
     * Sets the class-wide {@code suppressWarnings} flag.
     *
     * @param sw the new value of the flag
     */
    public static void setSuppressWarnings(boolean sw) {
        suppressWarnings = sw;
    }

    // NEXUS specific ImportException classes
    /**
     * Thrown by this importer when a block it needs (TAXA, DATA/CHARACTERS or
     * TREES) has not been found in the input.
     */
    public static class MissingBlockException extends ImportException {
        /**
         * Serialization version identifier.
         */
        private static final long serialVersionUID = -6287423449717453999L;

        /** Creates the exception without a detail message. */
        public MissingBlockException() {
            super();
        }

        /**
         * Creates the exception with a detail message.
         *
         * @param message description of the block that is missing
         */
        public MissingBlockException(String message) {
            super(message);
        }
    }

    /**
     * A named token identifying one kind of NEXUS block.
     * <p>
     * The class defines neither {@code equals} nor {@code hashCode}, so two
     * blocks are the same only if they are the same instance.
     */
    public static class NexusBlock {
        /** The block keyword this token stands for. */
        private final String name;

        /**
         * Creates a block token.
         *
         * @param name the block keyword, e.g. "TAXA"
         */
        public NexusBlock(String name) {
            this.name = name;
        }

        /**
         * Returns the block keyword exactly as it was given to the constructor.
         *
         * @return the block keyword
         */
        @Override
        public String toString() {
            return name;
        }
    }

    /**
     * Creates an importer that reads NEXUS text from {@code reader}, with
     * {@code ignoreMetaComments} set to {@code false}.
     * <p>
     * NOTE(review): the fourth argument to setCommentDelimiters is '\0' here but
     * '!' in the other two constructors - confirm that the difference is intended.
     *
     * @param reader source of the NEXUS text
     */
    public NexusImporter(Reader reader) {
        super(reader);
        this.ignoreMetaComments = false;
        setCommentDelimiters('[', ']', '\0', '\0', '&');
    }

    /**
     * Creates an importer that reads NEXUS text from {@code reader} and records
     * the caller's choice for {@code ignoreMetaComments}.
     *
     * @param reader             source of the NEXUS text
     * @param ignoreMetaComments value stored in the field of the same name
     */
    public NexusImporter(Reader reader, boolean ignoreMetaComments) {
        super(reader);
        this.ignoreMetaComments = ignoreMetaComments;
        setCommentDelimiters('[', ']', '\0', '!', '&');
    }

    /**
     * Creates an importer that reads NEXUS text from {@code reader} and hands
     * {@code commentWriter} to the superclass constructor;
     * {@code ignoreMetaComments} is set to {@code false}.
     *
     * @param reader        source of the NEXUS text
     * @param commentWriter passed through to the superclass (presumably the
     *                      destination for comments found in the input - confirm)
     */
    public NexusImporter(Reader reader, Writer commentWriter) {
        super(reader, commentWriter);
        this.ignoreMetaComments = false;
        setCommentDelimiters('[', ']', '\0', '!', '&');
    }

    /**
     * Skips forward to the next BEGIN keyword (matched ignoring case), reads the
     * block name that follows it and returns the block constant that
     * {@code findBlockName} reports for that name.
     * <p>
     * Subclasses supporting further block types can override
     * {@code findBlockName}.
     *
     * @return the block constant for the block that has just been entered
     * @throws IOException if reading from the input fails
     */
    public NexusBlock findNextBlock() throws IOException {
        findToken("BEGIN", true);
        return findBlockName(readToken(";"));
    }

    /**
     * Maps a block name, as read after a BEGIN keyword, onto one of the block
     * constants of this class and stores the result in {@code nextBlock}.
     * <p>
     * The comparison ignores case. A name that matches none of the known
     * blocks yields {@link #UNKNOWN_BLOCK}.
     *
     * @param blockName the block name read from the input
     * @return the matching block constant, or {@code UNKNOWN_BLOCK} if there is none
     */
    public NexusBlock findBlockName(String blockName) {
        if (blockName.equalsIgnoreCase(TAXA_BLOCK.toString())) {
            nextBlock = TAXA_BLOCK;
        } else if (blockName.equalsIgnoreCase(CHARACTERS_BLOCK.toString())) {
            nextBlock = CHARACTERS_BLOCK;
        } else if (blockName.equalsIgnoreCase(DATA_BLOCK.toString())) {
            nextBlock = DATA_BLOCK;
        } else if (blockName.equalsIgnoreCase(UNALIGNED_BLOCK.toString())) {
            nextBlock = UNALIGNED_BLOCK;
        } else if (blockName.equalsIgnoreCase(DISTANCES_BLOCK.toString())) {
            nextBlock = DISTANCES_BLOCK;
        } else if (blockName.equalsIgnoreCase(TREES_BLOCK.toString())) {
            nextBlock = TREES_BLOCK;
        } else if (blockName.equalsIgnoreCase(CALIBRATION_BLOCK.toString())) {
            nextBlock = CALIBRATION_BLOCK;
        } else {
            // Without this reset an unrecognised block would be reported as
            // whichever block was identified last (e.g. a FIGTREE block that
            // follows a TREES block would be taken for another TREES block).
            nextBlock = UNKNOWN_BLOCK;
        }

        return nextBlock;
    }

    /**
     * Parses a 'TAXA' block by delegating to {@code readTaxaBlock()}.
     *
     * @return the taxa listed in the block
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    public TaxonList parseTaxaBlock() throws ImportException, IOException {
        return readTaxaBlock();
    }

    /**
     * Parses a 'CHARACTERS' block by delegating to
     * {@code readCharactersBlock(taxonList)}.
     *
     * @param taxonList the taxa against which the sequence labels are resolved
     * @return the alignment read from the block
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    public Alignment parseCharactersBlock(TaxonList taxonList) throws ImportException, IOException {
        return readCharactersBlock(taxonList);
    }

    /**
     * Parses a 'DATA' block by delegating to {@code readDataBlock()}.
     *
     * @param taxonList not used: the argument is not passed on to
     *                  {@code readDataBlock()}
     * @return the alignment read from the block
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    public Alignment parseDataBlock(TaxonList taxonList) throws ImportException, IOException {
        return readDataBlock(/*taxonList*/);
    }

    /**
     * Parses a 'TREES' block by delegating to {@code readTreesBlock}, with
     * taxon-list numbering switched off.
     *
     * @param taxonList the taxa against which tree labels are resolved; may be
     *                  {@code null}
     * @return all trees defined in the block
     * @throws ImportException if the block is malformed or defines no tree
     * @throws IOException     if reading from the input fails
     */
    public Tree[] parseTreesBlock(TaxonList taxonList) throws ImportException, IOException {
        return readTreesBlock(taxonList, false);
    }

    /**
     * Parses a 'CALIBRATION' block putting the dates into the appropriate taxa.
     * Delegates to {@code readCalibrationBlock(taxonList)}.
     *
     * @param taxonList the taxa handed to {@code readCalibrationBlock}
     * @return whatever {@code readCalibrationBlock} returns (presumably the
     *         dates defined in the block - confirm against that method)
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    public dr.evolution.util.Date[] parseCalibrationBlock(TaxonList taxonList) throws ImportException, IOException {
        return readCalibrationBlock(taxonList);
    }

    // **************************************************************
    // SequenceImporter IMPLEMENTATION
    // **************************************************************

    /**
     * Scans the input block by block and returns the first alignment found.
     * <p>
     * A TAXA block is remembered for the blocks that follow. A CALIBRATION
     * block is read and its result discarded. The scan stops at the first
     * CHARACTERS or DATA block. CALIBRATION and CHARACTERS blocks need a
     * preceding TAXA block; a DATA block is read without reference to one.
     * Every other block is ignored, and reaching the end of the input simply
     * ends the scan.
     *
     * @return the alignment read from the first CHARACTERS or DATA block
     * @throws IOException              if reading fails for a reason other than end of input
     * @throws Importer.ImportException if a required block is missing or a block is malformed
     */
    public Alignment importAlignment() throws IOException, Importer.ImportException {
        TaxonList knownTaxa = null;
        Alignment result = null;

        try {
            for (; ; ) {
                final NexusImporter.NexusBlock current = findNextBlock();

                if (current == NexusImporter.DATA_BLOCK) {
                    // a DATA block carries its own taxon labels
                    result = readDataBlock(/*taxonList*/);
                    break;
                }

                if (current == NexusImporter.CHARACTERS_BLOCK) {
                    if (knownTaxa == null) {
                        throw new MissingBlockException("TAXA block is missing");
                    }
                    result = readCharactersBlock(knownTaxa);
                    break;
                }

                if (current == NexusImporter.CALIBRATION_BLOCK) {
                    if (knownTaxa == null) {
                        throw new MissingBlockException("TAXA block is missing");
                    }
                    readCalibrationBlock(knownTaxa);
                } else if (current == NexusImporter.TAXA_BLOCK) {
                    knownTaxa = readTaxaBlock();
                }
                // anything else is not of interest here
            }
        } catch (EOFException endOfInput) {
            // running out of input ends the scan; the check below reports
            // the case where no alignment was read
        }

        if (result == null) {
            throw new MissingBlockException("DATA or CHARACTERS block is missing");
        }

        return result;
    }

    /**
     * Imports the sequences of the input; this is the alignment returned by
     * {@link #importAlignment()}.
     *
     * @return the imported alignment, as a sequence list
     * @throws IOException     if reading from the input fails
     * @throws ImportException if no alignment can be read
     */
    public SequenceList importSequences() throws IOException, ImportException {
        return importAlignment();
    }

    // **************************************************************
    // TreeImporter IMPLEMENTATION
    // **************************************************************

    // State used by the tree-reading methods below.
    // true once hasTree() has found a TREES block; cleared again by
    // importTree/importTrees and by importNextTree() when no tree is left
    private boolean isReadingTreesBlock = false;
    // label -> taxon table returned by readTranslationList()
    private HashMap<String, Taxon> translationList = null;
    // tree read ahead by hasTree() and not yet handed out by importNextTree()
    private Tree nextTree = null;
    // one-element holder for the token that has been read but not yet
    // interpreted by readNextTree()
    private final String[] lastToken = new String[1];

    /**
     * Imports a single tree; equivalent to
     * {@code importTree(taxonList, false)}.
     *
     * @param taxonList the taxa against which tree labels are resolved; may be
     *                  {@code null}
     * @return the tree read, as returned by the two-argument overload
     * @throws IOException     if reading from the input fails
     * @throws ImportException if no TREES block is found or it is malformed
     */
    public Tree importTree(TaxonList taxonList) throws IOException, ImportException {
        return importTree(taxonList, false);
    }

    /**
     * Finds the next TREES block, reads its translation list and returns the
     * first tree of the block.
     *
     * @param taxonList             the taxa against which labels are resolved; if
     *                              {@code null}, a TAXA block met on the way to the
     *                              TREES block supplies them
     * @param useTaxonListNumbering if {@code true}, {@code taxonList} is passed on
     *                              to the tree reader; otherwise {@code null} is
     * @return whatever {@code readNextTree} returns for the first tree
     * @throws IOException     if reading from the input fails
     * @throws ImportException if no TREES block is found or it is malformed
     */
    public Tree importTree(TaxonList taxonList, boolean useTaxonListNumbering) throws IOException, ImportException {
        isReadingTreesBlock = false;

        // single-element array so that startReadingTrees can hand back a taxon list
        final TaxonList[] taxaHolder = {taxonList};
        if (!startReadingTrees(taxaHolder)) {
            throw new MissingBlockException("TREES block is missing");
        }

        translationList = readTranslationList(taxaHolder[0], lastToken);

        final TaxonList numbering = useTaxonListNumbering ? taxonList : null;
        return readNextTree(translationList, lastToken, numbering);
    }

    /**
     * Imports an array of all trees; equivalent to
     * {@code importTrees(taxonList, false)}.
     *
     * @param taxonList the taxa against which tree labels are resolved; may be
     *                  {@code null}
     * @return all trees of the next TREES block
     * @throws IOException     if reading from the input fails
     * @throws ImportException if no TREES block is found or it is malformed
     */
    public Tree[] importTrees(TaxonList taxonList) throws IOException, ImportException {
        return importTrees(taxonList, false);
    }

    /**
     * Finds the next TREES block and returns every tree defined in it.
     *
     * @param taxonList             the taxa against which labels are resolved; if
     *                              {@code null}, a TAXA block met on the way to the
     *                              TREES block supplies them
     * @param useTaxonListNumbering passed through to {@code readTreesBlock}
     * @return all trees of the block
     * @throws IOException     if reading from the input fails
     * @throws ImportException if no TREES block is found or it is malformed
     */
    public Tree[] importTrees(TaxonList taxonList, boolean useTaxonListNumbering) throws IOException, ImportException {
        isReadingTreesBlock = false;

        // single-element array so that startReadingTrees can hand back a taxon list
        final TaxonList[] taxaHolder = {taxonList};
        if (startReadingTrees(taxaHolder)) {
            return readTreesBlock(taxaHolder[0], useTaxonListNumbering);
        }

        throw new MissingBlockException("TREES block is missing");
    }

    /**
     * Reports whether another tree is available, reading ahead if necessary.
     * <p>
     * On the first call (or after the previous TREES block was exhausted) the
     * input is advanced to the next TREES block and its translation list is
     * read. A tree read ahead here is kept in {@code nextTree} until
     * {@link #importNextTree()} collects it.
     *
     * @return {@code true} if a tree is waiting to be collected
     * @throws IOException     if reading from the input fails
     * @throws ImportException if the TREES block is malformed
     */
    public boolean hasTree() throws IOException, ImportException {
        if (!isReadingTreesBlock) {
            final TaxonList[] taxaHolder = new TaxonList[]{null};

            isReadingTreesBlock = startReadingTrees(taxaHolder);
            if (!isReadingTreesBlock) {
                return false;
            }

            translationList = readTranslationList(taxaHolder[0], lastToken);
        }

        if (nextTree != null) {
            // a tree has already been read ahead
            return true;
        }

        nextTree = readNextTree(translationList, lastToken, null);
        return nextTree != null;
    }


    /**
     * Returns the next tree of the input.
     * <p>
     * The reading itself is done by {@link #hasTree()}. When no tree is left,
     * the importer is marked as no longer being inside a TREES block.
     *
     * @return the next tree, or {@code null} if no more trees are available
     * @throws IOException     if reading from the input fails
     * @throws ImportException if the TREES block is malformed
     */
    public Tree importNextTree() throws IOException, ImportException {
        if (hasTree()) {
            // hand out the tree that hasTree() read ahead and forget it
            final Tree pending = nextTree;
            nextTree = null;
            return pending;
        }

        isReadingTreesBlock = false;
        return null;
    }

    /**
     * Advances the input to the start of the next TREES block.
     * <p>
     * If {@code taxonList[0]} is {@code null}, a TAXA block met on the way is
     * read and stored there. All other blocks are ignored.
     *
     * @param taxonList one-element array holding the taxon list to use, or
     *                  {@code null} in slot 0 to have it filled from a TAXA block
     * @return {@code true} if a TREES block was entered, {@code false} if the
     *         end of the input was reached first
     * @throws IOException     if reading fails for a reason other than end of input
     * @throws ImportException if a TAXA block read on the way is malformed
     */
    public boolean startReadingTrees(TaxonList[] taxonList) throws IOException, ImportException {
        try {
            for (; ; ) {
                final NexusImporter.NexusBlock current = findNextBlock();

                if (current == NexusImporter.TREES_BLOCK) {
                    return true;
                }

                if (current == NexusImporter.TAXA_BLOCK && taxonList[0] == null) {
                    // a taxon list supplied by the caller takes precedence
                    taxonList[0] = readTaxaBlock();
                }
                // any other block is of no interest
            }
        } catch (EOFException endOfInput) {
            return false;
        }
    }

    /**
     * Reads tokens from the input until one equal to {@code query} has been
     * consumed.
     *
     * @param query      the token to look for
     * @param ignoreCase whether the comparison ignores case
     * @throws IOException if reading from the input fails before the token is found
     */
    private void findToken(String query, boolean ignoreCase) throws IOException {
        while (true) {
            final String candidate = readToken();

            final boolean matches = ignoreCase
                    ? candidate.equalsIgnoreCase(query)
                    : candidate.equals(query);

            if (matches) {
                return;
            }
        }
    }

    /**
     * Skips to the end of the current block, i.e. past the next END or
     * ENDBLOCK token (matched ignoring case), and resets {@code nextBlock} to
     * {@code UNKNOWN_BLOCK}.
     * <p>
     * Running out of input before the end token is found is tolerated.
     *
     * @throws IOException if reading fails for a reason other than end of input
     */
    public void findEndBlock() throws IOException {
        try {
            String word = readToken(";");
            while (!(word.equalsIgnoreCase("END") || word.equalsIgnoreCase("ENDBLOCK"))) {
                word = readToken(";");
            }
        } catch (EOFException ignored) {
            // Doesn't matter if the End is missing
        }

        nextBlock = UNKNOWN_BLOCK;
    }

    /**
     * Reads the header commands of a 'DATA', 'CHARACTERS' or 'TAXA' block, up
     * to and including the keyword that introduces the block's content.
     * <p>
     * TITLE, DIMENSIONS and FORMAT commands are interpreted; every other token
     * met before {@code tokenToLookFor} is skipped. DIMENSIONS sets
     * {@code taxonCount} and/or {@code siteCount}. FORMAT sets
     * {@code dataType}, the gap, missing and match characters and
     * {@code isInterleaved}.
     *
     * @param tokenToLookFor keyword that ends the header, compared ignoring case
     *                       (the callers in this class pass "MATRIX" or "TAXLABELS")
     * @param block          the kind of block being read; it decides which
     *                       DIMENSIONS subcommands are allowed and which are required
     * @throws ImportException if a command is duplicated, malformed or missing
     * @throws IOException     if reading from the input fails
     */
    private void readDataBlockHeader(String tokenToLookFor, NexusBlock block) throws ImportException, IOException {

        boolean dim = false, ttl = false, fmt = false;
        String token;

        do {
            token = readToken();

            if (token.equalsIgnoreCase("TITLE")) {
                if (ttl) {
                    throw new DuplicateFieldException("TITLE");
                }

                // the title text itself is skipped by this loop like any
                // other token that is not a recognised command
                ttl = true;
            } else if (token.equalsIgnoreCase("DIMENSIONS")) {

                if (dim) {
                    throw new DuplicateFieldException("DIMENSIONS");
                }

                // a subcommand that is not allowed in this kind of block
                // counts as already satisfied
                boolean nchar = (block == TAXA_BLOCK);
                boolean ntax = (block == CHARACTERS_BLOCK);

                do {
                    String token2 = readToken("=;");

                    if (getLastDelimiter() != '=') {
                        throw new BadFormatException("Unknown subcommand, '" + token2 + "', or missing '=' in DIMENSIONS command");
                    }

                    if (token2.equalsIgnoreCase("NTAX")) {

                        if (block == CHARACTERS_BLOCK) {
                            throw new BadFormatException("NTAX subcommand in CHARACTERS block");
                        }

                        taxonCount = readInteger(";");
                        ntax = true;

                    } else if (token2.equalsIgnoreCase("NCHAR")) {

                        if (block == TAXA_BLOCK) {
                            throw new BadFormatException("NCHAR subcommand in TAXA block");
                        }

                        siteCount = readInteger(";");
                        nchar = true;

                    } else {
                        throw new BadFormatException("Unknown subcommand, '" + token2 + "', in DIMENSIONS command");
                    }

                } while (getLastDelimiter() != ';');

                if (!ntax) {
                    throw new BadFormatException("NTAX subcommand missing from DIMENSIONS command");
                }
                if (!nchar) {
                    throw new BadFormatException("NCHAR subcommand missing from DIMENSIONS command");
                }
                dim = true;

            } else if (token.equalsIgnoreCase("FORMAT")) {

                if (fmt) {
                    throw new DuplicateFieldException("FORMAT");
                }

                dataType = null;

                do {
                    String token2 = readToken("=;");

                    if (token2.equalsIgnoreCase("GAP")) {

                        if (getLastDelimiter() != '=') {
                            throw new BadFormatException("Expecting '=' after GAP subcommand in FORMAT command");
                        }

                        gapCharacters = readToken(";");

                    } else if (token2.equalsIgnoreCase("MISSING")) {

                        if (getLastDelimiter() != '=') {
                            throw new BadFormatException("Expecting '=' after MISSING subcommand in FORMAT command");
                        }

                        missingCharacters = readToken(";");

                    } else if (token2.equalsIgnoreCase("MATCHCHAR")) {

                        if (getLastDelimiter() != '=') {
                            throw new BadFormatException("Expecting '=' after MATCHCHAR subcommand in FORMAT command");
                        }

                        matchCharacters = readToken(";");

                    } else if (token2.equalsIgnoreCase("DATATYPE")) {

                        if (getLastDelimiter() != '=') {
                            throw new BadFormatException("Expecting '=' after DATATYPE subcommand in FORMAT command");
                        }

                        // an unrecognised data type leaves dataType null, which
                        // is reported as a missing DATATYPE at the end
                        String token3 = readToken(";");
                        if (token3.equalsIgnoreCase("NUCLEOTIDE") ||
                                token3.equalsIgnoreCase("DNA") ||
                                token3.equalsIgnoreCase("RNA")) {

                            dataType = Nucleotides.INSTANCE;

                        } else if (token3.equalsIgnoreCase("STANDARD") || token3.equalsIgnoreCase("BINARY")) {

                            dataType = TwoStates.INSTANCE;

                        } else if (token3.equalsIgnoreCase("PROTEIN")) {

                            dataType = AminoAcids.INSTANCE;

                        } else if (token3.equalsIgnoreCase("CONTINUOUS")) {

                            throw new UnparsableDataException("Continuous data cannot be parsed at present");

                        }
                    } else if (token2.equalsIgnoreCase("INTERLEAVE")) {

                        // INTERLEAVE is either a bare flag or carries an
                        // explicit value; "INTERLEAVE=NO" must not switch
                        // interleaved reading on
                        if (getLastDelimiter() == '=') {
                            isInterleaved = !readToken(";").equalsIgnoreCase("NO");
                        } else {
                            isInterleaved = true;
                        }
                    }
                    // any other FORMAT subcommand is ignored

                } while (getLastDelimiter() != ';');

                fmt = true;
            }
        } while (!token.equalsIgnoreCase(tokenToLookFor));

        if (!dim) {
            throw new MissingFieldException("DIMENSIONS");
        }
        if (block != TAXA_BLOCK && dataType == null) {
            throw new MissingFieldException("DATATYPE");
        }
    }

    /**
     * Reads the sequences of a 'DATA' or 'CHARACTERS' block into
     * {@code sequences}, using the {@code taxonCount}, {@code siteCount},
     * {@code dataType} and {@code isInterleaved} values set while reading the
     * block header.
     * <p>
     * In interleaved mode the matrix is read in sections of one line per
     * taxon; all lines of a section must have the same length and the taxon
     * labels must repeat in the same order in every section. Otherwise each
     * taxon label is followed by its complete sequence of {@code siteCount}
     * characters.
     *
     * @param sequences receives one sequence per taxon, in file order
     * @param taxonList if not {@code null}, every label must name a taxon of this
     *                  list and that taxon is attached to the sequence; if
     *                  {@code null}, a new taxon is created for each label
     * @throws ImportException if a label is unknown, a sequence is too short,
     *                         taxa are missing or the closing ';' is absent
     * @throws IOException     if reading from the input fails
     */
    private void readSequenceData(Sequences sequences, TaxonList taxonList) throws ImportException, IOException {
        int n, i;
        String firstSequence = null;

        if (isInterleaved) {
            boolean firstLoop = true;

            int readCount = 0;
            while (readCount < siteCount) {

                // line length of the current section; -1 until its first line is read
                n = -1;

                for (i = 0; i < taxonCount; i++) {

                    String token = readToken().trim();

                    Sequence sequence;

                    if (firstLoop) {

                        sequence = new Sequence();
                        sequence.setDataType(dataType);
                        sequences.addSequence(sequence);

                        Taxon taxon;

                        if (taxonList != null) {
                            int index = taxonList.getTaxonIndex(token);
                            if (index == -1) {
                                // taxon not found in taxon list...
                                // ...perhaps it is a numerical taxon reference?
                                throw new UnknownTaxonException(token);
                            } else {
                                taxon = taxonList.getTaxon(index);
                            }
                        } else {
                            taxon = new Taxon(token);
                        }

                        sequence.setTaxon(taxon);

                    } else {

                        // later sections must repeat the labels in the same order
                        sequence = sequences.getSequence(i);
                        Taxon taxon = sequence.getTaxon();
                        if (!taxon.getId().equals(token)) {
                            throw new UnknownTaxonException("Unknown taxon label: expecting '" +
                                    taxon.getId() + "', found '" + token + "'");
                        }
                    }

                    StringBuffer buffer = new StringBuffer();
                    readSequenceLine(buffer, dataType, ";", gapCharacters, missingCharacters,
                            matchCharacters, firstSequence);
                    String seqString = buffer.toString();
                    sequence.appendSequenceString(seqString);
                    if (i == 0) {
                        firstSequence = seqString;
                    }

                    // Fix the section's line length before it is used below:
                    // with a single taxon the ';' can follow the very first
                    // line, and the length check would otherwise run with -1.
                    if (n == -1) {
                        n = seqString.length();
                    }

                    if (getLastDelimiter() == ';') {
                        if (i < taxonCount - 1) {
                            throw new TooFewTaxaException();
                        }
                        if (readCount + n < siteCount) {
                            throw new ShortSequenceException(sequence.getTaxon().getId());
                        }
                    }

                    if (n != seqString.length()) {
                        throw new ShortSequenceException(sequence.getTaxon().getId());
                    }
                }

                firstLoop = false;
                readCount += n;

            }
            if (getLastDelimiter() != ';') {
                throw new BadFormatException("Expecting ';' after sequences data");
            }

        } else {

            for (i = 0; i < taxonCount; i++) {
                String token = readToken().trim();

                Sequence sequence = new Sequence();
                sequence.setDataType(dataType);
                sequences.addSequence(sequence);

                Taxon taxon;

                if (taxonList != null) {
                    int index = taxonList.getTaxonIndex(token);
                    if (index == -1) {
                        // taxon not found in taxon list...
                        // ...perhaps it is a numerical taxon reference?
                        throw new UnknownTaxonException(token);
                    } else {
                        taxon = taxonList.getTaxon(index);
                    }
                } else {
                    taxon = new Taxon(token);
                }

                sequence.setTaxon(taxon);

                StringBuffer buffer = new StringBuffer();
                readSequence(buffer, dataType, ";", siteCount, gapCharacters,
                        missingCharacters, matchCharacters, firstSequence);
                String seqString = buffer.toString();
                if (seqString.length() != siteCount) {
                    throw new ShortSequenceException(sequence.getTaxon().getId());
                }

                sequence.appendSequenceString(seqString);
                if (i == 0) {
                    firstSequence = seqString;
                }

                // the matrix must not end before every taxon has been read
                if (getLastDelimiter() == ';' && i < taxonCount - 1) {
                    throw new TooFewTaxaException();
                }

            }

            if (getLastDelimiter() != ';') {
                throw new BadFormatException("Expecting ';' after sequences data, has '"
                        + (char) getLastDelimiter() + "' in line " + getLineNumber());
            }

        }
    }


    /**
     * Reads a 'TAXA' block: the header up to TAXLABELS, the taxon labels up to
     * the closing ';', and the rest of the block up to its end.
     *
     * @return the taxa listed in the block, in file order
     * @throws ImportException          if NTAX is missing or zero, or the number of
     *                                  labels differs from NTAX
     * @throws IOException              if reading from the input fails
     * @throws IllegalArgumentException if the same taxon name is listed more than once
     */
    private TaxonList readTaxaBlock() throws ImportException, IOException, IllegalArgumentException {

        taxonCount = 0;

        readDataBlockHeader("TAXLABELS", TAXA_BLOCK);

        if (taxonCount == 0) {
            throw new MissingFieldException("NTAX");
        }

        Taxa taxa = new Taxa();

        do {
            String name = readToken(";").trim();
            if (name.length() > 0) {
                taxa.addTaxon(new Taxon(name));
            }
        } while (getLastDelimiter() != ';');

        if (taxa.getTaxonCount() != taxonCount) {
            throw new BadFormatException("Number of taxa (" + taxa.getTaxonCount()
                    + ") doesn't match NTAX field (" + taxonCount + ")");
        }

        findEndBlock();

        int duplicateTaxon = TaxonList.Utils.findDuplicateTaxon(taxa);
        if (duplicateTaxon >= 0) {
            // this is a TAXA block, not a tree, so say so in the message
            throw new IllegalArgumentException("TAXA block contains duplicate taxon name: "
                    + taxa.getTaxon(duplicateTaxon).getId() +
                    "!\nAll taxon names should be unique.");
        }

        return taxa;
    }

    /**
     * Reads a 'CHARACTERS' block: header up to MATRIX, the sequence matrix,
     * and the remainder of the block up to its end.
     *
     * @param taxonList the taxa against which the sequence labels are resolved
     * @return the alignment built from the matrix
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    private Alignment readCharactersBlock(TaxonList taxonList) throws ImportException, IOException {

        // forget what an earlier block may have set; taxonCount is kept
        dataType = null;
        siteCount = 0;

        readDataBlockHeader("MATRIX", CHARACTERS_BLOCK);

        final SimpleAlignment matrix = new SimpleAlignment();
        readSequenceData(matrix, taxonList);
        matrix.updateSiteCount();

        findEndBlock();

        return matrix;
    }

    /**
     * Reads a 'DATA' block: header up to MATRIX, the sequence matrix, and the
     * remainder of the block up to its end. No taxon list is consulted; a new
     * taxon is created for every sequence label.
     *
     * @return the alignment built from the matrix
     * @throws ImportException if the block is malformed
     * @throws IOException     if reading from the input fails
     */
    private Alignment readDataBlock(/*TaxonList taxonList*/) throws ImportException, IOException {

        // forget what an earlier block may have set
        dataType = null;
        siteCount = 0;
        taxonCount = 0;

        readDataBlockHeader("MATRIX", DATA_BLOCK);

        final SimpleAlignment matrix = new SimpleAlignment();
        readSequenceData(matrix, null);
        matrix.updateSiteCount();

        findEndBlock();

        return matrix;
    }


    /**
     * Reads a 'TREES' block: the optional translation list followed by every
     * tree up to the point where {@code readNextTree} returns {@code null}.
     *
     * @param taxonList             the taxa against which labels are resolved; may be
     *                              {@code null}
     * @param useTaxonListNumbering if {@code true}, {@code taxonList} is passed on
     *                              to the tree reader; otherwise {@code null} is
     * @return the trees of the block, in file order
     * @throws ImportException if the block is malformed or defines no tree
     * @throws IOException     if reading from the input fails
     */
    private Tree[] readTreesBlock(TaxonList taxonList, boolean useTaxonListNumbering) throws ImportException, IOException {
        final String[] tokenHolder = new String[1];
        final HashMap<String, Taxon> translations = readTranslationList(taxonList, tokenHolder);
        final TaxonList numbering = useTaxonListNumbering ? taxonList : null;

        final ArrayList<Tree> collected = new ArrayList<Tree>();
        Tree tree;
        while ((tree = readNextTree(translations, tokenHolder, numbering)) != null) {
            collected.add(tree);
        }

        if (collected.isEmpty()) {
            throw new BadFormatException("No trees defined in TREES block");
        }

        final Tree[] result = collected.toArray(new Tree[collected.size()]);

        nextBlock = UNKNOWN_BLOCK;

        return result;
    }

    /**
     * Reads the optional TRANSLATE command at the start of a TREES block and
     * returns the resulting label-to-taxon table.
     * <p>
     * Without a TRANSLATE command the table maps every taxon id of
     * {@code taxonList} to its taxon, or is empty if {@code taxonList} is
     * {@code null}. The first token that follows the command (or the first
     * token of the block, if there is no command) is stored in
     * {@code lastToken[0]} for the tree reader.
     *
     * @param taxonList if not {@code null}, every translated label must name a
     *                  taxon of this list; if {@code null}, a new taxon is created
     *                  for each label
     * @param lastToken one-element array receiving the token read last
     * @return the translation table
     * @throws ImportException if the command is malformed, names an unknown taxon
     *                         or uses a key twice
     * @throws IOException     if reading from the input fails
     */
    private HashMap<String, Taxon> readTranslationList(TaxonList taxonList, String[] lastToken) throws ImportException, IOException {
        final HashMap<String, Taxon> table = new HashMap<String, Taxon>();

        String pending = readToken(";");

        if (!pending.equalsIgnoreCase("TRANSLATE")) {

            // no explicit translation: taxa are referred to by their own ids
            if (taxonList != null) {
                final int taxa = taxonList.getTaxonCount();
                for (int t = 0; t < taxa; t++) {
                    final Taxon known = taxonList.getTaxon(t);
                    table.put(known.getId(), known);
                }
            }

        } else {

            do {
                // each entry is "<key> <label>" terminated by ',' or ';'
                final String key = readToken(",;");

                if (getLastDelimiter() == ',' || getLastDelimiter() == ';') {
                    throw new BadFormatException("Missing taxon label in TRANSLATE command of TREES block");
                }

                final String label = readToken(",;");

                if (getLastDelimiter() != ',' && getLastDelimiter() != ';') {
                    throw new BadFormatException("Expecting ',' or ';' after taxon label in TRANSLATE command of TREES block");
                }

                final Taxon taxon;
                if (taxonList == null) {
                    taxon = new Taxon(label);
                } else {
                    final int position = taxonList.getTaxonIndex(label);
                    if (position == -1) {
                        // taxon not found in taxon list...
                        // ...perhaps it is a numerical taxon reference?
                        throw new UnknownTaxonException(label);
                    }
                    taxon = taxonList.getTaxon(position);
                }

                if (table.containsKey(key)) {
                    throw new BadFormatException("Translation list uses the key, " + key + ", more than once.");
                }
                table.put(key, taxon);

            } while (getLastDelimiter() != ';');

            pending = readToken(";");
        }

        lastToken[0] = pending;

        return table;
    }

    private Tree readNextTree(HashMap<String, Taxon> translationList, String[] lastToken, TaxonList taxonList) throws ImportException, IOException {
        try {
            Tree tree = null;
            String token = lastToken[0];

            if (token.equalsIgnoreCase("UTREE") || token.equalsIgnoreCase("TREE")) {

                if (nextCharacter() == '*') {
                    // Star is used to specify a default tree - ignore it
                    readCharacter();
                }

                String token2 = readToken("=;");
                // Save tree comment and attach it later
                final String comment = getLastMetaComment();
                clearLastMetaComment();

                if (getLastDelimiter() != '=') {
                    throw new BadFormatException("Missing label for tree'" + token2 + "' or missing '=' in TREE command of TREES block");
                }

                try {
                    if (nextCharacter() != '(') {
                        throw new BadFormatException("Missing tree definition in TREE command of TREES block");
                    }

                    // tree special comments
                    final String scomment = getLastMetaComment();
                    clearLastMetaComment();

                    FlexibleNode root = readInternalNode(translationList);

                    if (translationList != null) {
                        // this ensures that if a translation list is used, the external node numbers
                        // of the trees correspond as well.

                        Map<Taxon, Integer> taxonNumberMap = new HashMap<Taxon, Integer>();
                        int count = 0;
                        for (String label : translationList.keySet()) {
                            Taxon taxon = translationList.get(label);
                            int number;

                            if (taxonList != null) { // Map back to original numbering from TaxonList
                                number =  taxonList.getTaxonIndex(taxon);
                            } else { // Old functionality
                                try {
                                    number = Integer.parseInt(label) - 1;
                                } catch (NumberFormatException nfe) {
                                    number = count;
                                }
                            }

                            taxonNumberMap.put(taxon, number);
                            count++;
                        }

                        tree = new FlexibleTree(root, false, true, taxonNumberMap);
                    } else {
                        tree = new FlexibleTree(root, false, true, null);
                    }

                    tree.setId(token2);

                    if (getLastDelimiter() == ':') {
                        // in case the root has a branch length, skip it
                        readToken(";");

                        if (getLastMetaComment() != null) {
                            // There was a meta-comment which should be in the form:
                            // \[&label[=value][,label[=value]>[,/..]]\]
                            try {
                                parseMetaCommentPairs(getLastMetaComment(), root);
                            } catch (BadFormatException bfe) {
                                // ignore it
                            }
                            clearLastMetaComment();
                        }
                    }

                    if (getLastDelimiter() != ';') {
                        throw new BadFormatException("Expecting ';' after tree, '" + token2 + "', TREE command of TREES block");
                    }

                    if (scomment != null) {
                        // below is correct only if [&W] appears on it own
                        String c = scomment;
                        while (c.length() > 0) {
                            final char ch = c.charAt(0);
                            if (ch == ';') {
                                c = c.substring(1);
                                continue;
                            }
                            if (ch == 'R') {
                                // we only have rooted trees anyway
                                c = c.substring(1);
                            } else if (ch == 'W') {
                                int e = c.indexOf(';');
                                if (e < 0) e = c.length();

                                try {
                                    final Float value = new Float(c.substring(2, e));
                                    tree.setAttribute("weight", value);
                                } catch (NumberFormatException ex) {
                                    // don't fail, ignore
                                }
                                c = c.substring(e);
                            } else {
                                c = c.substring(1);
                            }
                        }
                    }

                    if (comment != null) {
                        try {
                            parseMetaCommentPairs(comment, tree);
                        } catch (Importer.BadFormatException e) {
                            // set generic comment attribute
                            tree.setAttribute("comment", comment);
                        }
                    }

                } catch (EOFException e) {
                    // If we reach EOF we may as well return what we have?
                    return tree;
                }

                token = readToken(";");
            } else if (token.equalsIgnoreCase("ENDBLOCK") || token.equalsIgnoreCase("END")) {
                return null;
            } else {
                throw new BadFormatException("Unknown command '" + token + "' in TREES block");
            }

            //added this to escape readNextTree loop correctly -- AJD
            lastToken[0] = token;

            return tree;

        } catch (EOFException e) {
            return null;
        }
    }

    /**
     * Reads a single branch of a tree description and returns the node it leads to.
     * <p>
     * When the next character is '(' the branch leads to an internal node, which is
     * read with {@link #readInternalNode}; otherwise it leads to an external node, which
     * is read with {@link #readExternalNode}. After the node, an optional label and an
     * optional ':'-introduced branch length are consumed. A branch without an explicit
     * length is given a length of 0.0.
     *
     * @param translationList map from the labels used in the tree to taxa; passed on
     *                        unchanged to the node readers
     * @return the node that was read, with its length set and, where present, its
     *         "label" attribute and any attributes parsed from a meta-comment
     * @throws IOException     passed on from the underlying reading calls
     * @throws ImportException passed on from the underlying reading calls
     */
    FlexibleNode readBranch(HashMap<String, Taxon> translationList) throws IOException, ImportException {
        // forget any meta-comment that was picked up before this branch started
        clearLastMetaComment();

        // an opening parenthesis announces a nested subtree, anything else is a tip
        final FlexibleNode node = (nextCharacter() == '(')
                ? readInternalNode(translationList)
                : readExternalNode(translationList);

        // unless the node was already terminated by ':', ',' or ')', a label may follow it
        if (!(getLastDelimiter() == ':' || getLastDelimiter() == ',' || getLastDelimiter() == ')')) {
            final String nodeLabel = readToken(",():;");
            if (nodeLabel.length() != 0) {
                node.setAttribute("label", nodeLabel);
            }
        }

        double branchLength = 0.0;

        if (getLastDelimiter() == ':') {
            branchLength = readDouble(",():;");

            final String metaComment = getLastMetaComment();
            if (metaComment != null) {
                if (!ignoreMetaComments) {
                    // The meta-comment should be in the form:
                    // \[&label[=value][,label[=value]>[,/..]]\]
                    try {
                        parseMetaCommentPairs(metaComment, node);
                    } catch (BadFormatException ignored) {
                        // a malformed meta-comment is deliberately skipped
                    }
                }
                // consumed (or deliberately skipped), so it must not leak to the next node
                clearLastMetaComment();
            }
        }

        node.setLength(branchLength);
        return node;
    }

    /**
     * Reads an internal node: an opening '(', one or more comma-separated child
     * branches (each read with {@link #readBranch}), and the closing ')'. Any number
     * of children is accepted, so polytomies are supported; a node with just one
     * child is accepted too but logged as a warning unless warnings are suppressed.
     *
     * @param translationList map from the labels used in the tree to taxa; passed on
     *                        unchanged to {@link #readBranch}
     * @return the new internal node with its children attached and, where present,
     *         any attributes parsed from a meta-comment
     * @throws IOException     passed on from the underlying reading calls
     * @throws ImportException if the closing ')' is missing, or passed on from the
     *                         underlying reading calls
     */
    FlexibleNode readInternalNode(HashMap<String, Taxon> translationList) throws IOException, ImportException {
        final FlexibleNode parent = new FlexibleNode();

        // consume the '(' that opens this node
        readCharacter();

        // there is always at least one child
        final FlexibleNode firstChild = readBranch(translationList);
        parent.addChild(firstChild);

        final boolean siblingsFollow = (getLastDelimiter() == ',');
        if (!siblingsFollow && !suppressWarnings) {
            final java.util.logging.Logger logger = java.util.logging.Logger.getLogger("dr.evolution.io");
            logger.warning("Internal node only has a single child.");
        }

        // every ',' announces one more child
        while (getLastDelimiter() == ',') {
            final FlexibleNode sibling = readBranch(translationList);
            parent.addChild(sibling);
        }

        // the child list has to be terminated by ')'
        if (getLastDelimiter() != ')') {
            throw new BadFormatException("Missing closing ')' in tree in TREES block");
        }

        // read on to the next delimiter; the token itself is not used here
        readToken(":(),;");

        final String metaComment = getLastMetaComment();
        if (metaComment != null) {
            if (!ignoreMetaComments) {
                // The meta-comment should be in the form:
                // \[&label[=value][,label[=value]>[,/..]]\]
                try {
                    parseMetaCommentPairs(metaComment, parent);
                } catch (BadFormatException ignored) {
                    // a malformed meta-comment is deliberately skipped
                }
            }
            clearLastMetaComment();
        }

        return parent;
    }

//	private void labelNode(FlexibleNode node, String label, String value) {
//		// Attempt to format the value as a number
//		Number number = null;
//		try {
//			number = Integer.valueOf(value);
//		} catch (NumberFormatException nfe1) {
//			try {
//				number = Double.valueOf(value);
//			} catch (NumberFormatException nfe2) {
//				//
//			}
//		}
//		if (number != null) {
//			node.setAttribute(label, number);
//		} else {
//			node.setAttribute(label, value);
//		}
//	}

    /**
     * Reads an external node (a tip) and resolves its label to a taxon.
     * <p>
     * If the translation list is non-empty the label must be one of its keys and the
     * mapped taxon is used; if it is empty a new {@link Taxon} named after the label
     * is created.
     *
     * @param translationList map from the labels used in the tree to taxa; may be empty
     * @return a new node carrying the taxon and, where present, any attributes parsed
     *         from a meta-comment
     * @throws ImportException an {@code UnknownTaxonException} if a non-empty
     *                         translation list has no entry for the label, or passed
     *                         on from the underlying reading calls
     * @throws IOException     passed on from the underlying reading calls
     */
    FlexibleNode readExternalNode(HashMap<String, Taxon> translationList) throws ImportException, IOException {
        final FlexibleNode tip = new FlexibleNode();

        final String name = readToken(":(),;");

        final Taxon taxon;
        if (translationList.isEmpty()) {
            // no translation in force: the label is the taxon name itself
            taxon = new Taxon(name);
        } else {
            taxon = translationList.get(name);
            if (taxon == null) {
                // taxon not found in taxon list...
                throw new UnknownTaxonException("Taxon in tree, '" + name + "' is unknown");
            }
        }

        final String metaComment = getLastMetaComment();
        if (metaComment != null) {
            if (!ignoreMetaComments) {
                // The meta-comment should be in the form:
                // \[&label[=value][,label[=value]>[,/..]]\]
                try {
                    parseMetaCommentPairs(metaComment, tip);
                } catch (BadFormatException ignored) {
                    // a malformed meta-comment is deliberately skipped
                }
            }
            clearLastMetaComment();
        }

        tip.setTaxon(taxon);
        return tip;
    }

    /**
     * Reads a 'CALIBRATION' block.
     * <p>
     * The following commands are recognised (case-insensitively):
     * <ul>
     * <li>{@code OPTIONS} - a list of {@code name=value} subcommands:
     *     {@code SCALE} (DAYS, MONTHS or YEARS), {@code ORIGIN} (a number) and
     *     {@code DIRECTION} (FORWARDS or BACKWARDS). Defaults are YEARS, 0.0 and
     *     FORWARDS. Options only affect TIPCALIBRATION commands read after them.</li>
     * <li>{@code TIPCALIBRATION} - a comma-separated list of entries of the form
     *     {@code label = value : taxon ...}. A date is created for each entry and
     *     stored as the "date" attribute of every taxon listed in it.</li>
     * <li>{@code NODECALIBRATION} - not supported; always rejected.</li>
     * <li>{@code END} / {@code ENDBLOCK} - finishes the block.</li>
     * </ul>
     * On successful completion {@code nextBlock} is reset to {@code UNKNOWN_BLOCK}.
     *
     * @param taxonList the taxa that TIPCALIBRATION entries may refer to
     * @return the dates that were created, one per TIPCALIBRATION entry, in file order
     * @throws ImportException a {@code BadFormatException} for an unknown command or
     *                         subcommand, a missing '=' or ':' or an invalid option
     *                         value; an {@code UnknownTaxonException} for a taxon
     *                         that is not in {@code taxonList}
     * @throws IOException     passed on from the underlying reading calls
     */
    private dr.evolution.util.Date[] readCalibrationBlock(TaxonList taxonList) throws ImportException, IOException {
        // defaults in force until an OPTIONS command overrides them
        double origin = 0.0;
        boolean isBackwards = false;
        Units.Type units = Units.Type.YEARS;
        ArrayList<Date> dates = new ArrayList<Date>();

        String token;

        boolean done = false;
        do {
            token = readToken(";");

            if (token.equalsIgnoreCase("OPTIONS")) {

                // one name=value subcommand per iteration, until a value is terminated by ';'
                do {
                    String token2 = readToken("=;");

                    if (getLastDelimiter() != '=') {
                        throw new BadFormatException("Unknown subcommand, '" + token2 + "', or missing '=' in OPTIONS command of CALIBRATION block");
                    }

                    if (token2.equalsIgnoreCase("SCALE")) {

                        String token3 = readToken(";");
                        if (token3.equalsIgnoreCase("DAYS")) {

                            units = Units.Type.DAYS;

                        } else if (token3.equalsIgnoreCase("MONTHS")) {

                            units = Units.Type.MONTHS;

                        } else if (token3.equalsIgnoreCase("YEARS")) {

                            units = Units.Type.YEARS;

                        } else {
                            throw new BadFormatException("SCALE in OPTIONS command of CALIBRATION block must be one of DAYS, MONTHS or YEARS");
                        }

                    } else if (token2.equalsIgnoreCase("ORIGIN")) {

                        origin = readDouble(";");

                    } else if (token2.equalsIgnoreCase("DIRECTION")) {

                        String token3 = readToken(";");
                        if (token3.equalsIgnoreCase("FORWARDS")) {

                            isBackwards = false;

                        } else if (token3.equalsIgnoreCase("BACKWARDS")) {

                            isBackwards = true;

                        } else {
                            throw new BadFormatException("DIRECTION in OPTIONS command of CALIBRATION block must be either FORWARDS or BACKWARDS");
                        }


                    } else {
                        throw new BadFormatException("Unknown subcommand, '" + token2 + "', in OPTIONS command of CALIBRATION block");
                    }

                } while (getLastDelimiter() != ';');

            } else if (token.equalsIgnoreCase("TIPCALIBRATION")) {

                // one "label = value : taxa" entry per iteration; entries are separated by ','
                do {
                    String token2 = readToken("=;");

                    if (getLastDelimiter() != '=') {
                        throw new BadFormatException("Missing date for label '" + token2 + "' or missing '=' in TIPCALIBRATION command of CALIBRATION block");
                    }

                    double value = readDouble(":;");

                    if (getLastDelimiter() != ':') {
                        throw new BadFormatException("Missing taxon list for label '" + token2 + "' or missing ':' in TIPCALIBRATION command of CALIBRATION block");
                    }

                    // the value is interpreted with whatever options are in force right now
                    dr.evolution.util.Date date;
                    if (isBackwards) {
                        date = dr.evolution.util.Date.createTimeAgoFromOrigin(value, units, origin);
                    } else {
                        date = dr.evolution.util.Date.createTimeSinceOrigin(value, units, origin);
                    }

                    dates.add(date);

                    // keep reading taxon names until one is terminated by ',' or ';'
                    do {
                        String token3 = readToken(",;");
                        Taxon taxon;

                        int index = taxonList.getTaxonIndex(token3);
                        if (index == -1) {
                            // taxon not found in taxon list...
                            throw new UnknownTaxonException("Unknown taxon '" + token3 + "' for label '" + token2 + "' in TIPCALIBRATION command of CALIBRATION block");
                        } else {
                            taxon = taxonList.getTaxon(index);
                        }

                        // every taxon of the entry shares the same date object
                        taxon.setAttribute("date", date);

                    } while (getLastDelimiter() != ',' && getLastDelimiter() != ';');

                } while (getLastDelimiter() == ',');

            } else if (token.equalsIgnoreCase("NODECALIBRATION")) {
                throw new BadFormatException("NODECALIBRATION not supported in CALIBRATION block");
            } else if (token.equalsIgnoreCase("ENDBLOCK") || token.equalsIgnoreCase("END")) {
                done = true;
            } else {
                throw new BadFormatException("Unknown command '" + token + "' in CALIBRATION block");
            }
        } while (!done);

        dr.evolution.util.Date[] dateArray = new dr.evolution.util.Date[dates.size()];
        dates.toArray(dateArray);

        // this block has been consumed, so it is no longer the pending block
        nextBlock = UNKNOWN_BLOCK;

        return dateArray;
    }

    /**
     * Matches one entry of a comma-separated meta-comment: a label, optionally followed
     * by '=' and a value. This can match the following types of meta comment pairs:
     * value=number, value="string", value={item1, item2, item3}
     * (a label must be quoted if it contains spaces, i.e. "my label"=value).
     * Compiled once, because meta-comments are parsed for every annotated node.
     */
    // TODO MAS Minor change in pattern below for nested arrays may cause other unforeseen bugs
    private static final Pattern META_COMMENT_PAIR_PATTERN =
            Pattern.compile("(\"[^\"]*\"+|[^,=\\s]+)\\s*(=\\s*(\\{[^=]*\\}|\"[^\"]*\"+|[^,]+))?");

    /**
     * Parses the content of a meta-comment and stores what it finds as attributes
     * of {@code item}.
     * <p>
     * Two formats are understood:
     * <ul>
     * <li>a MrBayes annotation, recognised by the prefix {@code "B "}: either
     *     {@code B name value} or {@code B name}, the latter setting the attribute to
     *     {@code Boolean.TRUE};</li>
     * <li>a comma-separated list of {@code label[=value]} entries; a label without a
     *     value is set to {@code Boolean.TRUE}.</li>
     * </ul>
     * Values are converted with {@link #parseValue(String)}. Attributes set before a
     * malformed entry is encountered remain set.
     *
     * @param meta the text of the meta-comment
     * @param item the object that receives the attributes
     * @throws Importer.BadFormatException if a MrBayes annotation does not have one of
     *         the two expected shapes, or if a label is empty
     */
    static void parseMetaCommentPairs(String meta, Attributable item) throws Importer.BadFormatException {
        if (meta.startsWith("B ")) {
            // a MrBayes annotation
            String[] parts = meta.split(" ");
            if (parts.length == 3 && parts[1].length() > 0 && parts[2].length() > 0) {
                item.setAttribute(parts[1], parseValue(parts[2]));
            } else if (parts.length == 2 && parts[1].length() > 0) {
                item.setAttribute(parts[1], Boolean.TRUE);
            } else {
                throw new Importer.BadFormatException("Badly formatted attribute: '" + meta + "'");
            }
            return;
        }

        Matcher matcher = META_COMMENT_PAIR_PATTERN.matcher(meta);

        while (matcher.find()) {
            // group 1 is not optional in the pattern, so it is never null for a match
            String label = matcher.group(1);
            if (label.charAt(0) == '\"') {
                if (label.length() < 2) {
                    // a lone quote character: there is nothing to strip the quotes from
                    throw new Importer.BadFormatException("Badly formatted attribute: '" + matcher.group() + "'");
                }
                label = label.substring(1, label.length() - 1);
            }
            if (label.trim().length() == 0) {
                throw new Importer.BadFormatException("Badly formatted attribute: '" + matcher.group() + "'");
            }

            // group 3 is the value without the leading '=' (parseValue trims white space)
            final String value = matcher.group(3);
            if (value != null) {
                // there is a specified value so try to parse it
                item.setAttribute(label, parseValue(value));
            } else {
                item.setAttribute(label, Boolean.TRUE);
            }
        }
    }

    /**
     * Tests whether a string is written like a decimal integer: an optional leading
     * '-' followed by one or more of the digits '0' to '9'.
     * <p>
     * The check is purely syntactic. It does not verify that the number fits into an
     * {@code int}, and a leading '+' is not accepted.
     *
     * @param str the string to test, may be null
     * @return true if the string has the form of an integer; false otherwise,
     *         including for null, the empty string and a lone "-"
     */
    public static boolean isInt(String str)
    {
        if (str == null || str.length() == 0) {
            return false;
        }
        final int firstDigit = (str.charAt(0) == '-') ? 1 : 0;
        if (firstDigit == str.length()) {
            // nothing but the sign
            return false;
        }
        for (int i = firstDigit; i < str.length(); i++) {
            final char c = str.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * This method takes a string and tries to decode it returning the object
     * that best fits the data. It will recognize comma delimited lists enclosed
     * in {..} and call parseValue() on each element. It will also recognize Boolean,
     * Integer and Double. If the value starts with a # then it will attempt to decode
     * the following integer as an RGB colour - see Color.decode(). If nothing else fits
     * then the value will be returned as a string but trimmed of leading and trailing
     * white space.
     * <p>
     * A value is only treated as a list if it both starts with '{' and ends with '}'.
     * An empty list, "{}", yields null. A whole number that is too large for an
     * Integer is returned as a Double.
     *
     * @param value the string, must not be null
     * @return the object: an Object[] for a list, a Color, a Boolean, an Integer, a
     *         Double or the trimmed String; null for an empty list
     */
    public static Serializable parseValue(String value) {

        value = value.trim();

        // an opening brace without the matching closing brace is not a list; such a
        // value falls through and is handled like any other scalar
        if (value.startsWith("{") && value.endsWith("}")) {
            // the value is a list so recursively parse the elements
            // and return an array
            String inside = value.substring(1, value.length() - 1);

            if (inside.length() == 0) {
                return null;
            }

            // Determine depth of further nesting (bounded, in case there is nothing but braces)
            int depth = 0;
            while (depth < inside.length() && inside.charAt(depth) == '{') {
                depth++;
            }

            StringBuilder split;
            if (depth == 0) {
                split = new StringBuilder(",");
            } else {
                // split only at commas that sit between 'depth' closing braces and
                // 'depth' opening braces, so that the nested lists stay intact
                StringBuilder rightBookEnd = new StringBuilder("(?<=");
                StringBuilder leftBookEnd = new StringBuilder("(?=");
                for (int i = 0; i < depth; ++i) {
                    rightBookEnd.append("\\}");
                    leftBookEnd.append("\\{");
                }
                leftBookEnd.append(")");
                rightBookEnd.append(")");
                split = rightBookEnd.append(",").append(leftBookEnd);
            }

            // Non-destructive split
            String[] elements = inside.split(split.toString());
            Object[] values = new Object[elements.length];
            for (int i = 0; i < elements.length; i++) {
                values[i] = parseValue(elements[i]);
            }
            return values;
        }

        if (value.startsWith("#")) {
            // I am not sure whether this is a good idea but
            // I am going to assume that a # denotes an RGB colour
            try {
                return Color.decode(value.substring(1));
            } catch (NumberFormatException nfe1) {
                // not a colour
            }
        }

        if (value.equalsIgnoreCase("TRUE") || value.equalsIgnoreCase("FALSE")) {
            return Boolean.valueOf(value);
        }

        // Attempt to format the value as an integer. Throwing an exception to test for
        // an integer is slow, so the common non-integer case is filtered out up front.
        if (isInt(value)) {
            try {
                return Integer.valueOf(value);
            } catch (NumberFormatException overflow) {
                // written like an integer but outside the int range: treat it as a double
            }
        }

        // Attempt to format the value as a double
        try {
            return Double.valueOf(value);
        } catch (NumberFormatException nfe2) {
            // not a double
        }

        // return the trimmed string
        return value;
    }

    // private stuff

    // Reset to UNKNOWN_BLOCK once a CALIBRATION block has been read completely.
    // NOTE(review): presumably identifies the block the reader is positioned at -- confirm
    // against the block-finding code, which is outside this part of the file.
    private NexusBlock nextBlock = null;

    // NOTE(review): the fields below look like the settings of the data/characters block
    // currently being read (dimensions and FORMAT options) -- confirm against the code
    // that assigns them; only their initial values are visible here.
    private int taxonCount = 0, siteCount = 0;
    private DataType dataType = null;
    // initial value: "-"
    private String gapCharacters = "-";
    // initial value: "."
    private String matchCharacters = ".";
    // initial value: "?"
    private String missingCharacters = "?";
    private boolean isInterleaved = false;

    /**
     * Command-line tree filter: reads trees from a file and prints every
     * {@code sample-frequency}-th one to standard output in Newick format, followed by
     * a line giving the number of trees printed.
     * <p>
     * Arguments, all four required:
     * <ol>
     * <li>the name of the tree file;</li>
     * <li>the sample frequency (an integer): trees whose zero-based index is a multiple
     *     of it are printed;</li>
     * <li>{@code true} to include branch lengths in the output;</li>
     * <li>{@code true} if the file is a NEXUS file; otherwise it is read line by line
     *     and each line is expected to hold one Newick tree, starting at its first '('.
     *     Lines without a '(' are skipped.</li>
     * </ol>
     * With fewer than four arguments a usage message is printed to standard error.
     *
     * @param args the command-line arguments described above
     * @throws IOException     if the file cannot be opened or read
     * @throws ImportException if a tree cannot be parsed
     */
    public static void main(String[] args) throws IOException, ImportException {

        if (args.length < 4) {
            System.err.println("usage: filterTrees <tree-file-name> <sample-frequency> <include-branch-lengths> <is-nexus>");
            return;
        }

        final int sampleFrequency = Integer.parseInt(args[1]);
        // Boolean.parseBoolean parses the argument itself; Boolean.getBoolean would look
        // up a system property of that name instead
        final boolean includeBranchLengths = Boolean.parseBoolean(args[2]);
        final boolean isNexus = Boolean.parseBoolean(args[3]);

        int index = 0;
        int count = 0;

        final FileReader fileReader = new FileReader(args[0]);
        try {
            NexusImporter nexusImporter = null;
            BufferedReader reader = null;

            if (isNexus) {
                nexusImporter = new NexusImporter(fileReader);
            } else {
                reader = new BufferedReader(fileReader);
            }

            String line = isNexus ? null : reader.readLine();
            while (line != null || (isNexus && nexusImporter.hasTree())) {

                Tree tree;
                if (isNexus) {
                    tree = nexusImporter.importNextTree();
                } else {
                    final int treeStart = line.indexOf('(');
                    if (treeStart < 0) {
                        // no tree on this line (e.g. a blank line): move on to the next one
                        line = reader.readLine();
                        continue;
                    }
                    String treeString = line.substring(treeStart).trim();

                    java.io.Reader stringReader = new java.io.StringReader(treeString);
                    NewickImporter importer = new NewickImporter(stringReader);
                    tree = importer.importNextTree();

                    line = reader.readLine();
                }

                if (index % sampleFrequency == 0) {
                    if (includeBranchLengths) {
                        System.out.println(TreeUtils.newick(tree));
                    } else {
                        System.out.println(TreeUtils.newickNoLengths(tree));
                    }
                    // every printed tree is counted, with or without branch lengths
                    count += 1;
                }
                index += 1;
            }
        } finally {
            // closing the file reader also releases the buffered reader wrapped around it
            fileReader.close();
        }

        System.out.println(count + " trees");
    }

}