/*
* Importer.java
*
* Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
*
* This file is part of BEAST.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* BEAST is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package dr.evolution.io;
import dr.evolution.datatype.DataType;
import java.io.*;
/**
* Base class for phylogenetic file format importers
*
* @version $Id: Importer.java,v 1.19 2006/04/25 14:41:08 rambaut Exp $
*
* @author Andrew Rambaut
* @author Alexei Drummond
*/
public abstract class Importer {
public static class ImportException extends Exception {
/**
*
*/
private static final long serialVersionUID = 7858834683324203750L;
public ImportException() { super(); }
public ImportException(String message) { super(message); }
}
public static class DuplicateFieldException extends ImportException {
/**
*
*/
private static final long serialVersionUID = 8047146381348414810L;
public DuplicateFieldException() { super(); }
public DuplicateFieldException(String message) { super(message); }
}
public static class BadFormatException extends ImportException {
/**
*
*/
private static final long serialVersionUID = -8206831989674620748L;
public BadFormatException() { super(); }
public BadFormatException(String message) { super(message); }
}
public static class UnparsableDataException extends ImportException {
/**
*
*/
private static final long serialVersionUID = 5905130039882401006L;
public UnparsableDataException() { super(); }
public UnparsableDataException(String message) { super(message); }
}
public static class MissingFieldException extends ImportException {
/**
*
*/
private static final long serialVersionUID = -7576489210458327552L;
public MissingFieldException() { super(); }
public MissingFieldException(String message) { super(message); }
}
public static class ShortSequenceException extends ImportException {
/**
*
*/
private static final long serialVersionUID = 7460033398106047073L;
public ShortSequenceException() { super(); }
public ShortSequenceException(String message) { super(message); }
}
public static class TooFewTaxaException extends ImportException {
/**
*
*/
private static final long serialVersionUID = -6349041350075169247L;
public TooFewTaxaException() { super(); }
public TooFewTaxaException(String message) { super(message); }
}
public static class UnknownTaxonException extends ImportException {
/**
*
*/
private static final long serialVersionUID = 6611115782536515250L;
public UnknownTaxonException() { super(); }
public UnknownTaxonException(String message) { super(message); }
}
/**
* Constructor
*/
public Importer(Reader reader) {
this.reader = new LineNumberReader(reader);
this.commentWriter = null;
}
public Importer(Reader reader, Writer commentWriter) {
this.reader = new LineNumberReader(reader);
this.commentWriter = commentWriter != null ? new BufferedWriter(commentWriter) : null;
}
public void setCommentDelimiters(char line) {
hasComments = true;
this.lineComment = line;
}
public void setCommentDelimiters(char start, char stop) {
hasComments = true;
this.startComment = start;
this.stopComment = stop;
}
public void setCommentDelimiters(char start, char stop, char line) {
hasComments = true;
this.startComment = start;
this.stopComment = stop;
this.lineComment = line;
}
public void setCommentDelimiters(char start, char stop, char line, char write, char meta) {
hasComments = true;
this.startComment = start;
this.stopComment = stop;
this.lineComment = line;
this.writeComment = write;
this.metaComment = meta;
}
public void setCommentWriter(Writer commentWriter) {
this.commentWriter = new BufferedWriter(commentWriter);
}
public int getLineNumber() {
return reader.getLineNumber();
}
public int getLastDelimiter() {
return lastDelimiter;
}
public char nextCharacter() throws IOException {
if (lastChar == '\0') {
lastChar = readCharacter();
}
return (char)lastChar;
}
public char readCharacter() throws IOException {
skipSpace();
char ch = read();
while (hasComments && (ch == startComment || ch == lineComment)) {
skipComments(ch);
skipSpace();
ch = read();
}
return ch;
}
public void unreadCharacter(char ch) {
lastChar = ch;
}
public char next() throws IOException {
if (lastChar == '\0') {
lastChar = read();
}
return (char)lastChar;
}
public char read() throws IOException {
int ch;
if (lastChar == '\0') {
ch = reader.read();
if (ch <= 0) {
throw new EOFException();
}
} else {
ch = lastChar;
lastChar = '\0';
}
return (char)ch;
}
/**
* Reads a line, skipping over any comments.
*/
public String readLine() throws IOException {
StringBuffer line = new StringBuffer();
char ch = read();
try {
while (ch != '\n' && ch != '\r') {
if (hasComments) {
if (ch == lineComment) {
skipComments(ch);
break;
}
if (ch == startComment) {
skipComments(ch);
ch = read();
}
}
line.append(ch);
ch = read();
}
// accommodate DOS line endings..
if (ch == '\r') {
if (next() == '\n') read();
}
lastDelimiter = ch;
} catch (EOFException e) {
// We catch an EOF and return the line we have so far
}
return line.toString();
}
/**
* Reads sequences, skipping over any comments and filtering using dataType.
* @param sequence a StringBuffer into which the sequences is put
* @param dataType the dataType of the sequences
* @param delimiters list of characters that will stop the reading
* @param maxSites maximum number of sites to read
*/
public void readSequence(StringBuffer sequence, DataType dataType,
String delimiters, int maxSites,
String gapCharacters, String missingCharacters,
String matchCharacters, String matchSequence) throws IOException, ImportException {
char ch = read();
try {
int n = 0;
while (n < maxSites && delimiters.indexOf(ch) == -1) {
if (hasComments && (ch == startComment || ch == lineComment)) {
skipComments(ch);
ch = read();
}
if (!Character.isWhitespace(ch)) {
char ch1 = ch;
if (gapCharacters.indexOf(ch) != -1) {
ch1 = DataType.GAP_CHARACTER;
} else if (missingCharacters.indexOf(ch) != -1) {
ch1 = DataType.UNKNOWN_CHARACTER;
} else if (matchCharacters.indexOf(ch) != -1) {
if (matchSequence == null) {
throw new ImportException("Match character in first sequences");
}
if (n >= matchSequence.length()) {
throw new ImportException("Match sequences too short");
}
ch1 = matchSequence.charAt(n);
}
sequence.append(ch1);
n++;
}
ch = read();
}
lastDelimiter = ch;
if (Character.isWhitespace((char)lastDelimiter)) {
ch = nextCharacter();
if (delimiters.indexOf(ch) != -1) {
lastDelimiter = readCharacter();
}
}
} catch (EOFException e) {
// We catch an EOF and return the sequences we have so far
}
}
/**
* Reads a line of sequences, skipping over any comments and filtering using dataType.
* @param sequence a StringBuffer into which the sequences is put
* @param dataType the dataType of the sequences
* @param delimiters list of characters that will stop the reading
*/
public void readSequenceLine(StringBuffer sequence, DataType dataType,
String delimiters,
String gapCharacters, String missingCharacters,
String matchCharacters, String matchSequence) throws IOException, ImportException {
char ch = read();
try {
int n = 0;
while (ch != '\r' && ch != '\n' && delimiters.indexOf(ch) == -1) {
if (hasComments) {
if (ch == lineComment) {
skipComments(ch);
break;
}
if (ch == startComment) {
skipComments(ch);
ch = read();
}
}
if (ch != ' ' && ch != '\t') {
char ch1 = ch;
if (gapCharacters.indexOf(ch) != -1) {
ch1 = DataType.GAP_CHARACTER;
} else if (missingCharacters.indexOf(ch) != -1) {
ch1 = DataType.UNKNOWN_CHARACTER;
} else if (matchCharacters.indexOf(ch) != -1) {
if (matchSequence == null) {
throw new ImportException("Match character in first sequences");
}
if (n >= matchSequence.length()) {
throw new ImportException("Match sequences too short");
}
ch1 = matchSequence.charAt(n);
}
sequence.append(ch1);
n++;
}
ch = read();
}
if (ch == '\r') {
if (next() == '\n') read();
}
lastDelimiter = ch;
if (Character.isWhitespace((char)lastDelimiter)) {
ch = nextCharacter();
if (delimiters.indexOf(ch) != -1) {
lastDelimiter = readCharacter();
}
}
} catch (EOFException e) {
// We catch an EOF and return the sequences we have so far
}
}
/**
* Attempts to read and parse an integer delimited by whitespace.
*/
public int readInteger() throws IOException, ImportException {
String token = readToken();
try {
return Integer.parseInt(token);
} catch (NumberFormatException nfe) {
throw new ImportException("Number format error: " + nfe.getMessage());
}
}
/**
* Attempts to read and parse an integer delimited by whitespace or by
* any character in delimiters.
*/
public int readInteger(String delimiters) throws IOException, ImportException {
String token = readToken(delimiters);
try {
return Integer.parseInt(token);
} catch (NumberFormatException nfe) {
throw new ImportException("Number format error: " + nfe.getMessage());
}
}
/**
* Attempts to read and parse a double delimited by whitespace.
*/
public double readDouble() throws IOException, ImportException {
String token = readToken();
try {
return Double.parseDouble(token);
} catch (NumberFormatException nfe) {
throw new ImportException("Number format error: " + nfe.getMessage());
}
}
/**
* Attempts to read and parse a double delimited by whitespace or by
* any character in delimiters.
*/
public double readDouble(String delimiters) throws IOException, ImportException {
String token = readToken(delimiters);
try {
return Double.parseDouble(token);
} catch (NumberFormatException nfe) {
throw new ImportException("Number format error: " + nfe.getMessage());
}
}
/**
* Reads a token stopping when any whitespace or a comment is found.
* If the token begins with a quote char then all characters will be
* included in token until a matching quote is found (including whitespace or comments).
*/
public String readToken() throws IOException {
return readToken("");
}
/**
* Reads a token stopping when any whitespace, a comment or when any character
* in delimiters is found. If the token begins with a quote char
* then all characters will be included in token until a matching
* quote is found (including whitespace or comments).
*/
public String readToken(String delimiters) throws IOException {
int space = 0;
char ch, ch2, quoteChar = '\0';
boolean done = false, first = true, quoted = false, isSpace;
nextCharacter();
StringBuffer token = new StringBuffer();
while (!done) {
ch = read();
try {
isSpace = Character.isWhitespace(ch);
if (quoted && ch == quoteChar) { // Found the closing quote
ch2 = read();
if (ch == ch2) {
// A repeated quote character so add this to the token
token.append(ch);
} else {
// otherwise it terminates the token
lastDelimiter = ' ';
unreadCharacter(ch2);
done = true;
quoted = false;
}
} else if (first && (ch == '\'' || ch == '"')) {
// if the opening character is a quote
// read everything up to the closing quote
quoted = true;
quoteChar = ch;
first = false;
space = 0;
} else if ( ch == startComment || ch == lineComment ) {
skipComments(ch);
lastDelimiter = ' ';
done = true;
} else {
if (quoted) {
// compress multiple spaces into one
if (isSpace) {
space++;
ch = ' ';
} else {
space = 0;
}
if (space < 2) {
token.append(ch);
}
} else if (isSpace) {
lastDelimiter = ' ';
done = true;
} else if (delimiters.indexOf(ch) != -1) {
done = true;
lastDelimiter = ch;
} else {
token.append(ch);
first = false;
}
}
} catch (EOFException e) {
// We catch an EOF and return the token we have so far
done = true;
}
}
if (Character.isWhitespace((char)lastDelimiter)) {
ch = nextCharacter();
while (Character.isWhitespace(ch)) {
read();
ch = nextCharacter();
}
if (delimiters.indexOf(ch) != -1) {
lastDelimiter = readCharacter();
}
}
return token.toString();
}
/**
* Skips over any comments. The opening comment delimiter is passed.
*/
protected void skipComments(char delimiter) throws IOException {
char ch;
int n=1;
boolean write = false;
StringBuffer meta = null;
if (nextCharacter() == writeComment) {
read();
write = true;
} else if (nextCharacter() == metaComment) {
read();
// combine two consecutive meta comments
meta = lastMetaComment!= null ? new StringBuffer(lastMetaComment + ";") : new StringBuffer();
}
lastMetaComment = null;
if (delimiter == lineComment) {
String line = readLine();
if (write && commentWriter != null) {
commentWriter.write(line, 0, line.length());
commentWriter.newLine();
} else if (meta != null) {
meta.append(line);
}
} else {
do {
ch = read();
if (ch == startComment) {
n++;
} else if (ch == stopComment) {
if (write && commentWriter != null) {
commentWriter.newLine();
}
n--;
} else if (write && commentWriter != null) {
commentWriter.write(ch);
} else if (meta != null) {
meta.append(ch);
}
} while (n > 0);
}
if (meta != null) {
lastMetaComment = meta.toString();
}
}
/**
* Skips to the end of the line. If a comment is found then this is read.
*/
public void skipToEndOfLine() throws IOException {
char ch;
do {
ch = read();
if (hasComments) {
if (ch == lineComment) {
skipComments(ch);
break;
}
if (ch == startComment) {
skipComments(ch);
ch = read();
}
}
} while (ch != '\n' && ch != '\r');
if (ch == '\r') {
if (nextCharacter() == '\n') read();
}
}
/**
* Skips char any contiguous characters in skip. Will also skip
* comments.
*/
public void skipWhile(String skip) throws IOException {
char ch;
do {
ch = read();
} while ( skip.indexOf(ch) > -1 );
unreadCharacter(ch);
}
/**
* Skips over any space (plus tabs and returns) in the file. Will also skip
* comments.
*/
public void skipSpace() throws IOException {
skipWhile(" \t\r\n");
}
/**
* Skips over any contiguous characters in skip. Will also skip
* comments and space.
*/
public void skipCharacters(String skip) throws IOException {
skipWhile(skip + " \t\r\n");
}
/**
* Skips over the file until a character from delimiters is found. Returns
* the delimiter found. Will skip comments and will ignore delimiters within
* comments.
*/
public char skipUntil(String skip) throws IOException {
char ch;
do {
ch = readCharacter();
} while ( skip.indexOf(ch) == -1 );
return ch;
}
public String getLastMetaComment() {
return lastMetaComment;
}
public void clearLastMetaComment() {
lastMetaComment = null;
}
// Private stuff
private LineNumberReader reader;
private BufferedWriter commentWriter = null;
private int lastChar = '\0';
private int lastDelimiter = '\0';
private boolean hasComments = false;
private char startComment = (char)-1;
private char stopComment = (char)-1;
private char lineComment = (char)-1;
private char writeComment = (char)-1;
private char metaComment = (char)-1;
private String lastMetaComment = null;
}