// Importer.java example
//
// Explorer
// beast-mcmc-master
/*
 * Importer.java
 *
 * Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
 *
 * This file is part of BEAST.
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership and licensing.
 *
 * BEAST is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 *  BEAST is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAST; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA  02110-1301  USA
 */

package dr.evolution.io;

import dr.evolution.datatype.DataType;

import java.io.*;

/**
 * Base class for phylogenetic file format importers
 *
 * @version $Id: Importer.java,v 1.19 2006/04/25 14:41:08 rambaut Exp $
 *
 * @author Andrew Rambaut
 * @author Alexei Drummond
 */
public abstract class Importer {

	/**
	 * Checked exception raised by this class when imported data cannot be
	 * processed. The other exception types nested in this class extend it.
	 */
	public static class ImportException extends Exception {

		private static final long serialVersionUID = 7858834683324203750L;

		/**
		 * Creates an exception that carries no detail message.
		 */
		public ImportException() {
		}

		/**
		 * Creates an exception with the given detail message.
		 *
		 * @param message description of the import failure
		 */
		public ImportException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for a field that occurs more than once (confirm
     * against the concrete importers).
     */
    public static class DuplicateFieldException extends ImportException {

		private static final long serialVersionUID = 8047146381348414810L;

		/**
		 * Creates the exception without a detail message.
		 */
		public DuplicateFieldException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public DuplicateFieldException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for input that violates the expected file format
     * (confirm against the concrete importers).
     */
    public static class BadFormatException extends ImportException {

		private static final long serialVersionUID = -8206831989674620748L;

		/**
		 * Creates the exception without a detail message.
		 */
		public BadFormatException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public BadFormatException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for data that cannot be parsed (confirm against the
     * concrete importers).
     */
    public static class UnparsableDataException extends ImportException {

		private static final long serialVersionUID = 5905130039882401006L;

		/**
		 * Creates the exception without a detail message.
		 */
		public UnparsableDataException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public UnparsableDataException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for a required field that is absent (confirm against
     * the concrete importers).
     */
    public static class MissingFieldException extends ImportException {

		private static final long serialVersionUID = -7576489210458327552L;

		/**
		 * Creates the exception without a detail message.
		 */
		public MissingFieldException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public MissingFieldException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for a sequence shorter than expected (confirm against
     * the concrete importers).
     */
    public static class ShortSequenceException extends ImportException {

		private static final long serialVersionUID = 7460033398106047073L;

		/**
		 * Creates the exception without a detail message.
		 */
		public ShortSequenceException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public ShortSequenceException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for input containing fewer taxa than required
     * (confirm against the concrete importers).
     */
    public static class TooFewTaxaException extends ImportException {

		private static final long serialVersionUID = -6349041350075169247L;

		/**
		 * Creates the exception without a detail message.
		 */
		public TooFewTaxaException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public TooFewTaxaException(String message) {
			super(message);
		}
	}

    /**
     * ImportException subtype. Not thrown by Importer itself; judging by the
     * name it is meant for a reference to a taxon that is not known
     * (confirm against the concrete importers).
     */
    public static class UnknownTaxonException extends ImportException {

		private static final long serialVersionUID = 6611115782536515250L;

		/**
		 * Creates the exception without a detail message.
		 */
		public UnknownTaxonException() {
		}

		/**
		 * Creates the exception with a detail message.
		 *
		 * @param message description of the failure
		 */
		public UnknownTaxonException(String message) {
			super(message);
		}
	}

    /**
	 * Creates an importer that reads from the given reader and has no
	 * comment writer.
	 *
	 * @param reader source of the characters to import; it is wrapped in a
	 *               LineNumberReader
	 */
	public Importer(Reader reader) {
		// same effect as assigning the wrapped reader and a null comment writer here
		this(reader, null);
	}

	/**
	 * Creates an importer that reads from the given reader and, optionally,
	 * copies flagged comments to a writer.
	 *
	 * @param reader        source of the characters to import; it is wrapped
	 *                      in a LineNumberReader
	 * @param commentWriter destination for written comments, wrapped in a
	 *                      BufferedWriter; may be null for no comment output
	 */
	public Importer(Reader reader, Writer commentWriter) {
		this.reader = new LineNumberReader(reader);

		if (commentWriter == null) {
			this.commentWriter = null;
		} else {
			this.commentWriter = new BufferedWriter(commentWriter);
		}
	}

	/**
	 * Switches comment handling on and sets the line-comment character.
	 * The block-comment delimiters keep their current values.
	 *
	 * @param line character that starts a comment running to the end of the line
	 */
	public void setCommentDelimiters(char line) {
		this.lineComment = line;
		this.hasComments = true;
	}

	/**
	 * Switches comment handling on and sets the block-comment delimiters.
	 * The line-comment character keeps its current value.
	 *
	 * @param start character that opens a block comment
	 * @param stop  character that closes a block comment
	 */
	public void setCommentDelimiters(char start, char stop) {
		this.stopComment = stop;
		this.startComment = start;
		this.hasComments = true;
	}

    /**
     * Switches comment handling on and sets both the block-comment
     * delimiters and the line-comment character.
     *
     * @param start character that opens a block comment
     * @param stop  character that closes a block comment
     * @param line  character that starts a comment running to the end of the line
     */
    public void setCommentDelimiters(char start, char stop, char line) {
        this.lineComment = line;
        this.stopComment = stop;
        this.startComment = start;
        this.hasComments = true;
    }

	/**
	 * Switches comment handling on and sets every comment-related character.
	 *
	 * @param start character that opens a block comment
	 * @param stop  character that closes a block comment
	 * @param line  character that starts a comment running to the end of the line
	 * @param write marker which, when it is the first significant character of
	 *              a comment, causes skipComments to copy the comment to the
	 *              comment writer
	 * @param meta  marker which, when it is the first significant character of
	 *              a comment, causes skipComments to keep the comment text as
	 *              the last meta comment
	 */
	public void setCommentDelimiters(char start, char stop, char line, char write, char meta) {
		this.metaComment = meta;
		this.writeComment = write;
		this.lineComment = line;
		this.stopComment = stop;
		this.startComment = start;
		this.hasComments = true;
	}

	/**
	 * Sets or clears the destination for comments flagged with the write marker.
	 *
	 * @param commentWriter writer to receive such comments (it is wrapped in a
	 *                      BufferedWriter), or null to stop writing comments
	 */
	public void setCommentWriter(Writer commentWriter) {
		// new BufferedWriter(null) throws NullPointerException, so null is
		// handled explicitly, exactly as the two-argument constructor does.
		this.commentWriter = commentWriter != null ? new BufferedWriter(commentWriter) : null;
	}

	/**
	 * Reports the current position in the input.
	 *
	 * @return the line number maintained by the underlying LineNumberReader
	 */
	public int getLineNumber() {
		return this.reader.getLineNumber();
	}

	/**
	 * Returns the delimiter most recently recorded by readLine, readSequence,
	 * readSequenceLine or readToken.
	 *
	 * @return the recorded delimiter as an int; '\0' if none has been recorded yet
	 */
	public int getLastDelimiter() {
		return this.lastDelimiter;
	}

	/**
	 * Peeks at the next character without consuming it. If a character is
	 * already pending it is returned as is; otherwise one is fetched with
	 * readCharacter() (which skips whitespace and comments) and kept pending.
	 *
	 * @return the pending character
	 * @throws IOException if reading fails; EOFException at end of input
	 */
	public char nextCharacter() throws IOException {
		if (lastChar != '\0') {
			return (char)lastChar;
		}

		lastChar = readCharacter();
		return (char)lastChar;
	}

	/**
	 * Consumes and returns the next character that is neither whitespace
	 * (as skipped by skipSpace()) nor part of a comment. Comments are only
	 * recognised once comment delimiters have been set.
	 *
	 * @return the next significant character
	 * @throws IOException if reading fails; EOFException at end of input
	 */
	public char readCharacter() throws IOException {
		while (true) {
			skipSpace();

			final char candidate = read();

			final boolean opensComment =
					hasComments && (candidate == startComment || candidate == lineComment);
			if (!opensComment) {
				return candidate;
			}

			// consume the comment, then look again
			skipComments(candidate);
		}
	}

	/**
	 * Makes ch the pending character, so that it is what the next read()
	 * returns. Any character that was already pending is replaced. Because
	 * '\0' marks "nothing pending", passing '\0' leaves nothing pending.
	 *
	 * @param ch the character to push back
	 */
	public void unreadCharacter(char ch) {
		this.lastChar = ch;
	}

	/**
	 * Peeks at the very next character without consuming it. Unlike
	 * nextCharacter(), nothing is skipped: whitespace and comment characters
	 * are returned as they come.
	 *
	 * @return the pending character
	 * @throws IOException if reading fails; EOFException at end of input
	 */
	public char next() throws IOException {
		if (lastChar != '\0') {
			return (char)lastChar;
		}

		lastChar = read();
		return (char)lastChar;
	}

	/**
	 * Consumes and returns one character: the pending character if there is
	 * one, otherwise the next character from the underlying reader.
	 *
	 * @return the character read
	 * @throws EOFException if the underlying reader returns a value that is
	 *                      zero or negative (end of stream, or a NUL character)
	 * @throws IOException  if the underlying reader fails
	 */
	public char read() throws IOException {
		if (lastChar != '\0') {
			// hand back the pushed-back character and empty the slot
			final char pending = (char)lastChar;
			lastChar = '\0';
			return pending;
		}

		final int fresh = reader.read();
		if (fresh <= 0) {
			throw new EOFException();
		}
		return (char)fresh;
	}

	/**
	 * Reads the rest of the current line, leaving out any comments.
	 * The line terminator ('\n', '\r' or "\r\n") is consumed but not returned.
	 * A line comment ends the line; a block comment is dropped and reading
	 * continues after it.
	 *
	 * @return the text of the line without comments and without terminator;
	 *         if end of input is reached part-way, the text read so far
	 * @throws EOFException if end of input is hit before any character is read
	 * @throws IOException  if reading fails
	 */
	public String readLine() throws IOException {

		StringBuilder line = new StringBuilder();

		// This first read is outside the try block, so an EOFException raised
		// here propagates to the caller.
		char ch = read();

		try {

			while (ch != '\n' && ch != '\r') {

				if (hasComments) {
					if (ch == lineComment) {
						// skipComments reads the remainder of the line for a line comment
						skipComments(ch);
						break;
					}
					if (ch == startComment) {
						skipComments(ch);
						ch = read();
						// Re-test the character that follows the comment: it may
						// be the line terminator or the start of another comment,
						// and must not be appended blindly.
						continue;
					}
				}

				line.append(ch);
				ch = read();
			}

			// accommodate DOS line endings..
			if (ch == '\r') {
				if (next() == '\n') read();
			}

			lastDelimiter = ch;

		} catch (EOFException e) {
			// We catch an EOF and return the line we have so far
		}

		return line.toString();
	}

	/**
	 * Reads sequence characters into a buffer, skipping comments and
	 * whitespace, until a delimiter is met or maxSites sites have been read.
	 * Gap and missing characters are normalised and match characters are
	 * replaced by the corresponding site of matchSequence. If end of input is
	 * reached part-way, the sites read so far are kept and no exception is
	 * thrown.
	 *
	 * @param sequence a StringBuffer into which the sequences is put
	 * @param dataType the dataType of the sequences (not consulted by this method)
	 * @param delimiters list of characters that will stop the reading
	 * @param maxSites maximum number of sites to read
	 * @param gapCharacters characters stored as DataType.GAP_CHARACTER
	 * @param missingCharacters characters stored as DataType.UNKNOWN_CHARACTER
	 * @param matchCharacters characters replaced by the site at the same index of matchSequence
	 * @param matchSequence sequence to copy matched sites from; may be null if no match characters occur
	 * @throws EOFException if end of input is hit before any character is read
	 * @throws IOException if reading fails
	 * @throws ImportException if a match character occurs while matchSequence is null or too short
	 */
	public void readSequence(StringBuffer sequence, DataType dataType, 
								String delimiters, int maxSites,
								String gapCharacters, String missingCharacters,
								String matchCharacters, String matchSequence) throws IOException, ImportException {
			
		char ch = read();

		try {
			int n = 0;

			while (n < maxSites && delimiters.indexOf(ch) == -1) {

				if (hasComments && (ch == startComment || ch == lineComment)) {
					skipComments(ch);
					ch = read();
					// Re-test the character that follows the comment: it may be
					// a delimiter or another comment and must not be stored as a site.
					continue;
				}

				if (!Character.isWhitespace(ch)) {

					char ch1 = ch;

					if (gapCharacters.indexOf(ch) != -1) {
						ch1 = DataType.GAP_CHARACTER;
					} else if (missingCharacters.indexOf(ch) != -1) {
						ch1 = DataType.UNKNOWN_CHARACTER;
					} else if (matchCharacters.indexOf(ch) != -1) {
						if (matchSequence == null) {
							throw new ImportException("Match character in first sequences");
						}
						if (n >= matchSequence.length()) {
							throw new ImportException("Match sequences too short");
						}

						ch1 = matchSequence.charAt(n);
					}

					sequence.append(ch1);

					n++;
				}

				ch = read();
			}

			lastDelimiter = ch;

			// If reading stopped on whitespace, look at the next significant
			// character; when that is a delimiter, consume and record it.
			if (Character.isWhitespace((char)lastDelimiter)) {
				ch = nextCharacter();
				if (delimiters.indexOf(ch) != -1) {
					lastDelimiter = readCharacter();
				}
			}

		} catch (EOFException e) {
			// We catch an EOF and return the sequences we have so far
		}
	}

	/**
	 * Reads one line of sequence characters into a buffer, skipping comments,
	 * spaces and tabs, until the end of the line or a delimiter is met.
	 * Gap and missing characters are normalised and match characters are
	 * replaced by the corresponding site of matchSequence. A line comment ends
	 * the line. If end of input is reached part-way, the sites read so far are
	 * kept and no exception is thrown.
	 *
	 * @param sequence a StringBuffer into which the sequences is put
	 * @param dataType the dataType of the sequences (not consulted by this method)
	 * @param delimiters list of characters that will stop the reading
	 * @param gapCharacters characters stored as DataType.GAP_CHARACTER
	 * @param missingCharacters characters stored as DataType.UNKNOWN_CHARACTER
	 * @param matchCharacters characters replaced by the site at the same index of matchSequence
	 * @param matchSequence sequence to copy matched sites from; may be null if no match characters occur
	 * @throws EOFException if end of input is hit before any character is read
	 * @throws IOException if reading fails
	 * @throws ImportException if a match character occurs while matchSequence is null or too short
	 */
	public void readSequenceLine(StringBuffer sequence, DataType dataType,
								String delimiters,
								String gapCharacters, String missingCharacters,
								String matchCharacters, String matchSequence) throws IOException, ImportException {

		char ch = read();

		try {
			int n = 0;

			while (ch != '\r' && ch != '\n' && delimiters.indexOf(ch) == -1) {

				if (hasComments) {
					if (ch == lineComment) {
						// skipComments reads the remainder of the line for a line comment
						skipComments(ch);
						break;
					}
					if (ch == startComment) {
						skipComments(ch);
						ch = read();
						// Re-test the character that follows the comment: it may be
						// the line terminator, a delimiter or another comment and
						// must not be stored as a site.
						continue;
					}
				}

				if (ch != ' ' && ch != '\t') {
					char ch1 = ch;

					if (gapCharacters.indexOf(ch) != -1) {
						ch1 = DataType.GAP_CHARACTER;
					} else if (missingCharacters.indexOf(ch) != -1) {
						ch1 = DataType.UNKNOWN_CHARACTER;
					} else if (matchCharacters.indexOf(ch) != -1) {
						if (matchSequence == null) {
							throw new ImportException("Match character in first sequences");
						}
						if (n >= matchSequence.length()) {
							throw new ImportException("Match sequences too short");
						}

						ch1 = matchSequence.charAt(n);
					}

					sequence.append(ch1);

					n++;
				}

				ch = read();
			}

			// accommodate DOS line endings
			if (ch == '\r') {
				if (next() == '\n') read();
			}

			lastDelimiter = ch;

			// If reading stopped on whitespace, look at the next significant
			// character; when that is a delimiter, consume and record it.
			if (Character.isWhitespace((char)lastDelimiter)) {
				ch = nextCharacter();
				if (delimiters.indexOf(ch) != -1) {
					lastDelimiter = readCharacter();
				}
			}

		} catch (EOFException e) {
			// We catch an EOF and return the sequences we have so far
		}
	}

	/**
	 * Attempts to read and parse an integer delimited by whitespace.
	 *
	 * @return the parsed value
	 * @throws IOException if reading the token fails
	 * @throws ImportException if the token is not a valid integer; the
	 *         NumberFormatException is attached as the cause
	 */
	public int readInteger() throws IOException, ImportException {
		String token = readToken();
		try {
			return Integer.parseInt(token);
		} catch (NumberFormatException nfe) {
			// ImportException has no cause-taking constructor, so attach it afterwards
			ImportException failure = new ImportException("Number format error: " + nfe.getMessage());
			failure.initCause(nfe);
			throw failure;
		}
	}

	/**
	 * Attempts to read and parse an integer delimited by whitespace or by
	 * any character in delimiters.
	 *
	 * @param delimiters characters that end the token in addition to whitespace
	 * @return the parsed value
	 * @throws IOException if reading the token fails
	 * @throws ImportException if the token is not a valid integer; the
	 *         NumberFormatException is attached as the cause
	 */
	public int readInteger(String delimiters) throws IOException, ImportException {
		String token = readToken(delimiters);
		try {
			return Integer.parseInt(token);
		} catch (NumberFormatException nfe) {
			// ImportException has no cause-taking constructor, so attach it afterwards
			ImportException failure = new ImportException("Number format error: " + nfe.getMessage());
			failure.initCause(nfe);
			throw failure;
		}
	}

	/**
	 * Attempts to read and parse a double delimited by whitespace.
	 *
	 * @return the parsed value
	 * @throws IOException if reading the token fails
	 * @throws ImportException if the token is not a valid double; the
	 *         NumberFormatException is attached as the cause
	 */
	public double readDouble() throws IOException, ImportException {
		String token = readToken();
		try {
			return Double.parseDouble(token);
		} catch (NumberFormatException nfe) {
			// ImportException has no cause-taking constructor, so attach it afterwards
			ImportException failure = new ImportException("Number format error: " + nfe.getMessage());
			failure.initCause(nfe);
			throw failure;
		}
	}

	/**
	 * Attempts to read and parse a double delimited by whitespace or by
	 * any character in delimiters.
	 *
	 * @param delimiters characters that end the token in addition to whitespace
	 * @return the parsed value
	 * @throws IOException if reading the token fails
	 * @throws ImportException if the token is not a valid double; the
	 *         NumberFormatException is attached as the cause
	 */
	public double readDouble(String delimiters) throws IOException, ImportException {
		String token = readToken(delimiters);
		try {
			return Double.parseDouble(token);
		} catch (NumberFormatException nfe) {
			// ImportException has no cause-taking constructor, so attach it afterwards
			ImportException failure = new ImportException("Number format error: " + nfe.getMessage());
			failure.initCause(nfe);
			throw failure;
		}
	}

	/**
	 * Reads one token that ends at whitespace or at a comment; no further
	 * delimiter characters apply. A token that opens with a quote character
	 * runs to the matching closing quote and may then contain whitespace and
	 * comment characters.
	 *
	 * @return the token text
	 * @throws IOException if reading fails
	 */
	public String readToken() throws IOException {
		final String noExtraDelimiters = "";
		return readToken(noExtraDelimiters);
	}

	/**
	 * Reads a token stopping when any whitespace, a comment or when any character
	 * in delimiters is found. If the token begins with a quote char
	 * then all characters will be included in token until a matching
	 * quote is found (including whitespace or comments). Inside a quoted
	 * token a doubled quote character stands for one literal quote and runs
	 * of whitespace are compressed to a single space.
	 * <p>
	 * If the input ends while the token, or the whitespace after it, is being
	 * read, the token collected so far is returned. When the input ends in
	 * the middle of the token the recorded last delimiter is left unchanged.
	 *
	 * @param delimiters characters that end the token in addition to whitespace
	 * @return the token text, without quotes and without the delimiter
	 * @throws EOFException if no significant character remains before the token starts
	 * @throws IOException if reading fails
	 */
	public String readToken(String delimiters) throws IOException {
		int space = 0;
		char ch, ch2, quoteChar = '\0';
		boolean done = false, first = true, quoted = false, isSpace;
		boolean endOfInput = false;

		// Move to the first significant character. An EOFException here
		// propagates, since there is no token to return.
		nextCharacter();

		StringBuilder token = new StringBuilder();

		while (!done) {
			try {
				// Read inside the try so that running out of input ends the
				// token rather than discarding what has been collected.
				ch = read();

				isSpace = Character.isWhitespace(ch);

				if (quoted && ch == quoteChar) { // Found the closing quote
					ch2 = read();

					if (ch == ch2) {
						// A repeated quote character so add this to the token
						token.append(ch);
					} else {
						// otherwise it terminates the token

						lastDelimiter = ' ';
						unreadCharacter(ch2);
						done = true;
						quoted = false;
					}
				} else if (first && (ch == '\'' || ch == '"')) {
					// if the opening character is a quote
					// read everything up to the closing quote
					quoted = true;
					quoteChar = ch;
					first = false;
					space = 0;
				} else if ( hasComments && (ch == startComment || ch == lineComment) ) {
					// comments are only recognised once delimiters have been
					// configured, as in the other read methods
					skipComments(ch);
					lastDelimiter = ' ';
					done = true;
				} else {
					if (quoted) {
						// compress multiple spaces into one
						if (isSpace) {
							space++;
							ch = ' ';
						} else {
							space = 0;
						}

						if (space < 2) {
							token.append(ch);
						}
					} else if (isSpace) {
						lastDelimiter = ' ';
						done = true;
					} else if (delimiters.indexOf(ch) != -1) {
						done = true;
						lastDelimiter = ch;
					} else {
						token.append(ch);
						first = false;
					}
				}
			} catch (EOFException e) {
				// We catch an EOF and return the token we have so far
				done = true;
				endOfInput = true;
			}
		}

		// After a whitespace-terminated token, skip further whitespace; if the
		// next significant character is a delimiter, consume and record it.
		// Skipped when the input has already ended: nothing is left to look at
		// and lastDelimiter may not belong to this token.
		if (!endOfInput && Character.isWhitespace((char)lastDelimiter)) {
			try {
				ch = nextCharacter();
				while (Character.isWhitespace(ch)) {
					read();
					ch = nextCharacter();
				}

				if (delimiters.indexOf(ch) != -1) {
					lastDelimiter = readCharacter();
				}
			} catch (EOFException e) {
				// Only whitespace or comments followed the token; it is complete.
			}
		}

		return token.toString();
	}

	/**
	 * Skips over a comment whose opening delimiter has already been consumed.
	 * A line comment runs to the end of the line; a block comment runs to the
	 * matching stop delimiter, with nested block comments counted.
	 * <p>
	 * If the first significant character of the comment is the write marker
	 * and a comment writer is set, the comment text is copied to that writer
	 * and flushed. If it is the meta marker, the text is kept as the last meta
	 * comment, appended after a ';' to a meta comment that immediately
	 * preceded it. Any other comment clears the last meta comment.
	 *
	 * @param delimiter the opening comment delimiter that was read
	 * @throws IOException if reading or writing fails; EOFException if the
	 *         input ends inside a block comment
	 */
	protected void skipComments(char delimiter) throws IOException {

		char ch;
		int n=1; // block-comment nesting depth
		boolean write = false;
		StringBuilder meta = null;

		if (nextCharacter() == writeComment) {
			read();
			write = true;
		} else if (nextCharacter() == metaComment) {
			read();
			// combine two consecutive meta comments
			meta = lastMetaComment != null ? new StringBuilder(lastMetaComment + ";") : new StringBuilder();
		}

		lastMetaComment = null;

		if (delimiter == lineComment) {
			String line = readLine();
			if (write && commentWriter != null) {
				commentWriter.write(line, 0, line.length());
				commentWriter.newLine();
			} else if (meta != null) {
				meta.append(line);
			}
		} else {
			do {
				ch = read();
				if (ch == startComment) {
					n++;
				} else if (ch == stopComment) {
					if (write && commentWriter != null) {
						commentWriter.newLine();
					}
					n--;
				} else if (write && commentWriter != null) {
					commentWriter.write(ch);
				} else if (meta != null) {
					meta.append(ch);
				}
			} while (n > 0);
		}

		if (write && commentWriter != null) {
			// The BufferedWriter is private to this class and flushed nowhere
			// else, so flush here or the text may never reach the caller's writer.
			commentWriter.flush();
		}

		if (meta != null) {
			lastMetaComment = meta.toString();
		}
	}

	/**
	 * Skips to the end of the line. If a comment is found then this is read.
	 * The line terminator ('\n', '\r' or "\r\n") is consumed; nothing beyond
	 * it is. A line comment ends the line; a block comment is skipped, so a
	 * line terminator inside it does not end the line.
	 *
	 * @throws IOException if reading fails; EOFException if the input ends
	 *         before a line terminator is found
	 */
	public void skipToEndOfLine() throws IOException {

		char ch = read();

		while (ch != '\n' && ch != '\r') {
			if (hasComments) {
				if (ch == lineComment) {
					// skipComments reads the remainder of the line for a line comment
					skipComments(ch);
					return;
				}
				if (ch == startComment) {
					skipComments(ch);
				}
			}
			// The character after a block comment is tested at the top of the
			// loop like any other, so a following comment is recognised too.
			ch = read();
		}

		if (ch == '\r') {
			// Peek with next(), not nextCharacter(): the latter skips
			// whitespace and comments and would consume the start of the
			// following line(s).
			if (next() == '\n') read();
		}
	}

	/**
	 * Consumes characters for as long as they occur in skip. The first
	 * character that does not occur in skip is pushed back, so it is what
	 * the next read() returns. Comments are not treated specially here.
	 *
	 * @param skip the characters to skip over
	 * @throws IOException if reading fails; EOFException if the input ends
	 *         before a character outside skip is found
	 */
	public void skipWhile(String skip) throws IOException {

		char current = read();

		while (skip.indexOf(current) != -1) {
			current = read();
		}

		unreadCharacter(current);
	}

	/**
	 * Skips over spaces, tabs, carriage returns and line feeds by
	 * delegating to skipWhile.
	 *
	 * @throws IOException if reading fails; EOFException if the input ends
	 *         while skipping
	 */
	public void skipSpace() throws IOException {
		final String blanks = " \t\r\n";
		skipWhile(blanks);
	}

	/**
	 * Skips over any contiguous run of characters from skip, together with
	 * spaces, tabs, carriage returns and line feeds, by delegating to
	 * skipWhile.
	 *
	 * @param skip the characters to skip in addition to whitespace
	 * @throws IOException if reading fails; EOFException if the input ends
	 *         while skipping
	 */
	public void skipCharacters(String skip) throws IOException {
		final String skipSet = skip + " \t\r\n";
		skipWhile(skipSet);
	}

	/**
	 * Consumes input until one of the given characters is found and returns
	 * it. Characters are fetched with readCharacter(), so whitespace and
	 * comments are passed over and characters inside comments are not
	 * considered. The character found is consumed, not pushed back.
	 *
	 * @param skip the characters to stop at
	 * @return the character from skip that was found
	 * @throws IOException if reading fails; EOFException if the input ends
	 *         before one of the characters is found
	 */
	public char skipUntil(String skip) throws IOException {
		char found = readCharacter();

		while (skip.indexOf(found) < 0) {
			found = readCharacter();
		}

		return found;
	}

    /**
     * Returns the text of the most recent meta comment.
     *
     * @return the stored meta comment text, or null if none is stored
     */
    public String getLastMetaComment() {
        return this.lastMetaComment;
    }

    /**
     * Discards the stored meta comment, so that getLastMetaComment()
     * returns null until another meta comment is read.
     */
    public void clearLastMetaComment() {
        this.lastMetaComment = null;
    }

	// Private stuff

	// Source of characters. Assigned once by the constructors and never replaced.
	private final LineNumberReader reader;
	// Destination for comments flagged with the write marker; null means none.
	private BufferedWriter commentWriter = null;

	// One-character pushback slot; '\0' means nothing is pending.
	private int lastChar = '\0';
	// Delimiter recorded by the most recent read method; '\0' until one is recorded.
	private int lastDelimiter = '\0';

	// Comment recognition stays off until a setCommentDelimiters overload is called.
	private boolean hasComments = false;
	// (char)-1 evaluates to '\uFFFF'; it is the default for every delimiter below.
	private char startComment = (char)-1;
	private char stopComment = (char)-1;
	private char lineComment = (char)-1;

	// Markers tested against the first significant character of a comment.
	private char writeComment = (char)-1;
	private char metaComment = (char)-1;

	// Text of the most recent meta comment, or null.
	private String lastMetaComment = null;
}