/*
* DataType.java
*
* Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
*
* This file is part of BEAST.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* BEAST is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package dr.evolution.datatype;
import java.io.Serializable;
import java.util.*;
/**
* Base class for sequence data types.
*
* @author Andrew Rambaut
* @author Alexei Drummond
* @version $Id: DataType.java,v 1.13 2005/05/24 20:25:56 rambaut Exp $
*/
public abstract class DataType implements Serializable {

    public static final String DATA_TYPE = "dataType";

    // Integer type codes, as returned by getType() and switched on by getName().
    public static final int NUCLEOTIDES = 0;
    public static final int AMINO_ACIDS = 1;
    public static final int CODONS = 2;
    public static final int TWO_STATES = 3;
    public static final int GENERAL = 4;
    public static final int COVARION = 5;
    public static final int MICRO_SAT = 6;
    public static final int P2PTYPE = 7;
    public static final int CONTINUOUS = 8;

    public static final char UNKNOWN_CHARACTER = '?';
    public static final char GAP_CHARACTER = '-';

    /** Number of unique (non-ambiguous) states; see {@link #getStateCount()}. */
    protected int stateCount;

    /** Number of states including ambiguous states; see {@link #getAmbiguousStateCount()}. */
    protected int ambiguousStateCount;

    // this map contains all dataTypes in the class loader that have added themselves
    private static Map<String, DataType> registeredDataTypes = null;

    /**
     * Due to some unpleasant interactions between static initializations in the
     * different classes, I have changed this to a lazy initialization.
     * <p>
     * The map is assigned <em>before</em> the built-in types are registered, so the
     * nested calls to {@link #registerDataType(String, DataType)} (which call back
     * into this method) see a non-null map and do not recurse further.
     * The null check itself is not synchronized.
     */
    private static void lazyRegisterDataTypes() {
        if (registeredDataTypes == null) {
            registeredDataTypes = new Hashtable<String, DataType>();

            registerDataType(Nucleotides.DESCRIPTION, Nucleotides.INSTANCE);
            registerDataType(AminoAcids.DESCRIPTION, AminoAcids.INSTANCE);

            // codon data types are keyed by "<description>-<genetic code name>"
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.UNIVERSAL.getName(), Codons.UNIVERSAL);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.VERTEBRATE_MT.getName(), Codons.VERTEBRATE_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.YEAST.getName(), Codons.YEAST);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.MOLD_PROTOZOAN_MT.getName(), Codons.MOLD_PROTOZOAN_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.MYCOPLASMA.getName(), Codons.MYCOPLASMA);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.INVERTEBRATE_MT.getName(), Codons.INVERTEBRATE_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.CILIATE.getName(), Codons.CILIATE);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.ECHINODERM_MT.getName(), Codons.ECHINODERM_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.EUPLOTID_NUC.getName(), Codons.EUPLOTID_NUC);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.BACTERIAL.getName(), Codons.BACTERIAL);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.ALT_YEAST.getName(), Codons.ALT_YEAST);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.ASCIDIAN_MT.getName(), Codons.ASCIDIAN_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.FLATWORM_MT.getName(), Codons.FLATWORM_MT);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.BLEPHARISMA_NUC.getName(), Codons.BLEPHARISMA_NUC);
            registerDataType(Codons.DESCRIPTION + "-" + GeneticCode.NO_STOPS.getName(), Codons.NO_STOPS);

            registerDataType(TwoStates.DESCRIPTION, TwoStates.INSTANCE);
            registerDataType(OldHiddenNucleotides.DESCRIPTION, OldHiddenNucleotides.INSTANCE);
            registerDataType(TwoStateCovarion.DESCRIPTION, TwoStateCovarion.INSTANCE);

            registerDataType(HiddenCodons.DESCRIPTION + "2-" + GeneticCode.UNIVERSAL.getName(), HiddenCodons.UNIVERSAL_HIDDEN_2);
            registerDataType(HiddenCodons.DESCRIPTION + "3-" + GeneticCode.UNIVERSAL.getName(), HiddenCodons.UNIVERSAL_HIDDEN_3);

            registerDataType(HiddenNucleotides.DESCRIPTION + "1", HiddenNucleotides.NUCLEOTIDE_HIDDEN_1);
            registerDataType(HiddenNucleotides.DESCRIPTION + "2", HiddenNucleotides.NUCLEOTIDE_HIDDEN_2);
            registerDataType(HiddenNucleotides.DESCRIPTION + "3", HiddenNucleotides.NUCLEOTIDE_HIDDEN_3);
            registerDataType(HiddenNucleotides.DESCRIPTION + "4", HiddenNucleotides.NUCLEOTIDE_HIDDEN_4);
            registerDataType(HiddenNucleotides.DESCRIPTION + "8", HiddenNucleotides.NUCLEOTIDE_HIDDEN_8);

            registerDataType(HiddenAminoAcids.DESCRIPTION + "1", HiddenAminoAcids.AMINO_ACIDS_HIDDEN_1);
            registerDataType(HiddenAminoAcids.DESCRIPTION + "2", HiddenAminoAcids.AMINO_ACIDS_HIDDEN_2);
            registerDataType(HiddenAminoAcids.DESCRIPTION + "3", HiddenAminoAcids.AMINO_ACIDS_HIDDEN_3);
            registerDataType(HiddenAminoAcids.DESCRIPTION + "4", HiddenAminoAcids.AMINO_ACIDS_HIDDEN_4);

            registerDataType(GeneralDataType.DESCRIPTION, GeneralDataType.INSTANCE);
            registerDataType(Microsatellite.DESCRIPTION, Microsatellite.INSTANCE);
            registerDataType(P2P.DESCRIPTION, P2P.INSTANCE);
            registerDataType(ContinuousDataType.DESCRIPTION, ContinuousDataType.INSTANCE);
        }
    }

    /**
     * Registers a data type with a (hopefully unique) name. Registering a second
     * data type under an existing name replaces the earlier entry.
     *
     * @param name     the key under which the data type can later be looked up
     * @param dataType the data type to register
     */
    public static void registerDataType(String name, DataType dataType) {
        lazyRegisterDataTypes();
        registeredDataTypes.put(name, dataType);
    }

    /**
     * @param name the name that the datatype was registered under
     * @return the datatype with the given name, or null if no data type is
     *         registered under that name
     */
    public static DataType getRegisteredDataTypeByName(String name) {
        lazyRegisterDataTypes();
        return registeredDataTypes.get(name);
    }

    /**
     * @return a newly allocated array holding the names of all registered data
     *         types, in the (unspecified) iteration order of the registry
     */
    public static String[] getRegisteredDataTypeNames() {
        lazyRegisterDataTypes();
        Set<String> names = registeredDataTypes.keySet();
        return names.toArray(new String[0]);
    }

    /**
     * guess data type suitable for a given sequence
     *
     * @param sequence a string of symbols representing a molecular sequence of unknown data type.
     * @return suitable DataType object
     */
    public static DataType guessDataType(String sequence) {
        // numNucs: characters that map to a nucleotide state other than unknown/gap
        // numChars: characters other than the gap and unknown symbols
        // numBins: characters that are '0' or '1'
        long numNucs = 0;
        long numChars = 0;
        long numBins = 0;

        for (int i = 0; i < sequence.length(); i++) {
            char c = sequence.charAt(i);
            int s = Nucleotides.INSTANCE.getState(c);

            if (s != Nucleotides.UNKNOWN_STATE && s != Nucleotides.GAP_STATE) {
                numNucs++;
            }
            if (c != GAP_CHARACTER && c != UNKNOWN_CHARACTER) {
                numChars++;
            }
            if (c == '0' || c == '1') {
                numBins++;
            }
        }

        // avoid dividing by zero for empty or all-gap/unknown sequences
        if (numChars == 0) {
            numChars = 1;
        }

        // more than 85 % frequency advocates nucleotide data
        if ((double) numNucs / (double) numChars > 0.85) {
            return Nucleotides.INSTANCE;
        }
        // otherwise more than 20 % of '0'/'1' characters advocates binary data
        if ((double) numBins / (double) numChars > 0.2) {
            return TwoStates.INSTANCE;
        }
        return AminoAcids.INSTANCE;
    }

    /**
     * return the set of valid chars if they are defined, if not defined then return null
     * cannot use stateCount and loop, because some data types stateCount is dynamic.
     */
    public abstract char[] getValidChars();

    /**
     * Get number of unique states
     *
     * @return number of unique states
     */
    public int getStateCount() {
        return stateCount;
    }

    /**
     * Get number of states including ambiguous states
     *
     * @return number of states including ambiguous states
     */
    public int getAmbiguousStateCount() {
        return ambiguousStateCount;
    }

    /**
     * Get state corresponding to a string code. Only the first character of the
     * code is used by this default implementation.
     *
     * @param code state code
     * @return state
     */
    public int getState(String code) {
        return getState(code.charAt(0));
    }

    /**
     * Get state corresponding to a character. By default 'A' maps to state 0,
     * 'B' to state 1, and so on.
     *
     * @param c character
     * @return state
     */
    public int getState(char c) {
        return (int) c - 'A';
    }

    /**
     * Get state corresponding to an unknown
     *
     * @return state
     */
    public int getUnknownState() {
        return stateCount;
    }

    /**
     * Get state corresponding to a gap
     *
     * @return state
     */
    public int getGapState() {
        return stateCount + 1;
    }

    /**
     * Get character corresponding to a given state. By default this is the
     * inverse of {@link #getState(char)}: state 0 maps to 'A', 1 to 'B', etc.
     *
     * @param state state
     * @return corresponding character
     */
    public char getChar(int state) {
        return (char) (state + 'A');
    }

    /**
     * Get a string code corresponding to a given state. By default this
     * calls getChar but overriding classes may return multicharacter codes.
     *
     * @param state state
     * @return corresponding code
     */
    public String getCode(int state) {
        return String.valueOf(getChar(state));
    }

    /**
     * Get triplet string corresponding to a given state. By default this is the
     * state's character with one space on either side.
     *
     * @param state state
     * @return corresponding triplet string
     */
    public String getTriplet(int state) {
        return " " + getChar(state) + " ";
    }

    /**
     * returns an array containing the non-ambiguous states that this state represents.
     * By default an ambiguous state represents every non-ambiguous state.
     *
     * @param state state
     * @return a new array holding the represented states
     */
    public int[] getStates(int state) {
        if (!isAmbiguousState(state)) {
            return new int[]{state};
        }

        int[] states = new int[stateCount];
        for (int i = 0; i < stateCount; i++) {
            states[i] = i;
        }
        return states;
    }

    /**
     * returns a boolean array of length stateCount in which element i is true if
     * the non-ambiguous state i is one of the states that this state represents.
     * By default an ambiguous state represents every non-ambiguous state.
     *
     * @param state state
     * @return a new membership array over the non-ambiguous states
     */
    public boolean[] getStateSet(int state) {
        // a new boolean[] is already all-false, so only true entries need to be written
        boolean[] stateSet = new boolean[stateCount];
        if (isAmbiguousState(state)) {
            Arrays.fill(stateSet, true);
        } else {
            stateSet[state] = true;
        }
        return stateSet;
    }

    /**
     * returns the uncorrected distance between two states
     *
     * @return 1.0 if both states are non-ambiguous and differ, otherwise 0.0
     */
    public double getObservedDistance(int state1, int state2) {
        if (!isAmbiguousState(state1) && !isAmbiguousState(state2) && state1 != state2) {
            return 1.0;
        }
        return 0.0;
    }

    /**
     * returns the uncorrected distance between two states with full
     * treatment of ambiguity.
     *
     * @return one minus the number of states shared by both state sets divided
     *         by the product of the two state set sizes
     */
    public double getObservedDistanceWithAmbiguity(int state1, int state2) {
        boolean[] stateSet1 = getStateSet(state1);
        boolean[] stateSet2 = getStateSet(state2);

        double sumMatch = 0.0;
        double sum1 = 0.0;
        double sum2 = 0.0;

        for (int i = 0; i < stateCount; i++) {
            if (stateSet1[i]) {
                sum1 += 1.0;
                // state i is in set 1; it is a match when it is also in set 2
                if (stateSet2[i]) {
                    sumMatch += 1.0;
                }
            }
            if (stateSet2[i]) {
                sum2 += 1.0;
            }
        }

        return (1.0 - (sumMatch / (sum1 * sum2)));
    }

    /**
     * @return the description of this data type, as given by {@link #getDescription()}
     */
    @Override
    public String toString() {
        return getDescription();
    }

    /**
     * description of data type
     *
     * @return string describing the data type
     */
    public abstract String getDescription();

    /**
     * type of data type
     *
     * @return integer code for the data type
     */
    public abstract int getType();

    /**
     * @return true if this character is an ambiguous state
     */
    public boolean isAmbiguousChar(char c) {
        return isAmbiguousState(getState(c));
    }

    /**
     * @return true if this character is an unknown state
     */
    public boolean isUnknownChar(char c) {
        return isUnknownState(getState(c));
    }

    /**
     * @return true if this character is a gap
     */
    public boolean isGapChar(char c) {
        return isGapState(getState(c));
    }

    /**
     * returns true if this state is an ambiguous state, which by default is any
     * state whose index is stateCount or greater.
     */
    public boolean isAmbiguousState(int state) {
        return (state >= stateCount);
    }

    /**
     * @return true if this state is an unknown state
     */
    public boolean isUnknownState(int state) {
        return (state == getUnknownState());
    }

    /**
     * @return true if this state is a gap
     */
    public boolean isGapState(int state) {
        return (state == getGapState());
    }

    /**
     * Get a display name for this data type, chosen from its type code.
     *
     * @return display name for the type code returned by {@link #getType()}
     * @throws IllegalArgumentException if the type code has no name here; note
     *                                  that this includes {@link #P2PTYPE}
     */
    public String getName() {
        switch (getType()) {
            case DataType.NUCLEOTIDES:
                return "Nucleotide";
            case DataType.AMINO_ACIDS:
                return "Amino Acid";
            case DataType.CODONS:
                return "Codon";
            case DataType.TWO_STATES:
                return "Binary";
            case DataType.COVARION:
                return "Covarion";
            case DataType.GENERAL:
                return "Discrete Traits";
            case DataType.CONTINUOUS:
                return "Continuous Traits";
            case DataType.MICRO_SAT:
                return "Microsatellite";
            default:
                throw new IllegalArgumentException("Unsupported data type");
        }
    }

    /**
     * Two data types are considered equal when they report the same type code,
     * regardless of their concrete class or state counts.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof DataType)) return false;

        return getType() == ((DataType) o).getType();
    }

    /**
     * @return the type code, consistent with {@link #equals(Object)}
     */
    @Override
    public int hashCode() {
        return getType();
    }
}