/*
* PlinkImporter.java
*
* Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
*
* This file is part of BEAST.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* BEAST is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package dr.evomodel.continuous.plink;
import dr.evolution.util.Taxa;
import dr.evolution.util.Taxon;
import dr.evolution.util.TaxonList;
import dr.util.*;
import dr.xml.*;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import java.util.logging.Logger;
/**
* @author Marc A. Suchard
*/
/**
 * Imports allele-count data from a PLINK frequency table and converts the
 * observed allele frequencies into logit-transformed continuous values that
 * can be attached to taxa (as a space-separated string attribute) for
 * continuous-trait analysis.
 *
 * Loci whose frequency estimate is invalid (NaN after transform) are either
 * flagged for removal from every taxon's attribute string, or filled in with
 * a default value, depending on {@code removeMissingLoci}.
 */
public class PlinkImporter implements Citable {

    private static final boolean DEBUG = false;

    /** Taxa successfully matched to PLINK rows; populated by {@link #addTaxonAttribute}. */
    private TaxonList taxonList = null;

    /** Taxon name -> transformed frequency values, in the order loci were read. */
    private final Map<String, List<Double>> taxonHash;

    /** Taxon name -> number of loci read for that taxon. */
    private final Map<String, Integer> taxonCounts;

    /**
     * Locus indices flagged for removal because of invalid frequency estimates.
     * A set (rather than a list) gives O(1) membership tests in
     * {@link #makeStringAttribute} and avoids double-counting a locus.
     */
    private final Set<Integer> remove;

    /** When true, invalid loci are excluded; otherwise a default value is substituted. */
    private boolean removeMissingLoci = true;

    /**
     * Reads and parses the PLINK table immediately.
     *
     * @param reader source of the PLINK frequency table
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if the input is empty
     */
    public PlinkImporter(Reader reader) throws IOException {
        taxonHash = new HashMap<String, List<Double>>();
        taxonCounts = new HashMap<String, Integer>();
        remove = new LinkedHashSet<Integer>();
        parse(reader);
    }

    @Override
    public Citation.Category getCategory() {
        return Citation.Category.DATA_MODELS;
    }

    @Override
    public String getDescription() {
        return "PLink";
    }

    @Override
    public List<Citation> getCitations() {
        return Arrays.asList(new Citation(
                new Author[]{
                        new Author("MA", "Suchard"),
                        new Author("A", "Boyko"),
                },
                Citation.Status.IN_PREPARATION)
        );
    }

    /** Formats a transformed value in signed scientific notation, e.g. {@code +1.234e-01}. */
    private String formatTransformedValue(double value) {
        return String.format("%+4.3e", value);
    }

    /**
     * Builds the space-separated attribute string for one taxon, skipping any
     * locus indices flagged for removal.
     */
    private String makeStringAttribute(List<Double> valueList) {
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (int i = 0; i < valueList.size(); i++) {
            if (!remove.contains(i)) {
                if (!first) {
                    sb.append(" ");
                } else {
                    first = false;
                }
                sb.append(formatTransformedValue(valueList.get(i)));
            }
        }
        return sb.toString();
    }

    /**
     * Parses one PLINK table row and appends the logit-transformed allele
     * frequency to the named taxon's value list.
     *
     * Whitespace-delimited columns consumed, in order: two skipped tokens,
     * taxon name, two integer allele codes (validated but otherwise unused),
     * one skipped token, count of allele 1, and the total count.
     * NOTE(review): column semantics inferred from the token order below —
     * confirm against the PLINK stratified frequency (.frq.strat) format.
     */
    private void transformRow(String line, Map<String, List<Double>> taxonHash,
                              Map<String, Integer> taxonCounts, Set<Integer> remove) {
        StringTokenizer token = new StringTokenizer(line);
        token.nextToken(); // skipped
        token.nextToken(); // skipped
        String name = token.nextToken();
        // Allele codes are parsed only to preserve the original input validation
        // (a non-integer code throws NumberFormatException); the values are unused.
        Integer.parseInt(token.nextToken());
        Integer.parseInt(token.nextToken());
        token.nextToken(); // skipped
        int count1 = Integer.parseInt(token.nextToken());
        int total = Integer.parseInt(token.nextToken());
        int count2 = total - count1;

        double value = transform(count1, count2);
        if (Double.isNaN(value)) {
            StringBuilder sb = new StringBuilder();
            sb.append("PLINK line: " + line + " generates invalid frequency estimate;");
            if (removeMissingLoci) {
                sb.append(" marking loci for removal from analysis");
                // The locus index equals the current length of this taxon's value
                // list.  Guard against this being the taxon's very first entry:
                // the list does not exist yet, so the index is 0.  (The previous
                // code dereferenced the possibly-null list here and threw NPE.)
                List<Double> existing = taxonHash.get(name);
                remove.add(existing == null ? 0 : existing.size());
            } else {
                sb.append(" filling in with default value");
                value = defaultMissingValue();
            }
            Logger.getLogger("dr.evomodel.continuous").warning(sb.toString());
        }

        List<Double> valueList = taxonHash.get(name);
        if (valueList == null) {
            valueList = new ArrayList<Double>();
            taxonHash.put(name, valueList);
            taxonCounts.put(name, 1);
        } else {
            taxonCounts.put(name, taxonCounts.get(name) + 1);
        }
        // Note: when removeMissingLoci is true a NaN value is still appended;
        // it is excluded later by makeStringAttribute via the remove set.
        valueList.add(value);
    }

    /**
     * Returns a human-readable summary: number of taxa read, per-taxon entry
     * counts, and the citation string.
     */
    public String toStatisticsString() {
        StringBuilder sb = new StringBuilder();
        sb.append("PLINK importer read ");
        sb.append(taxonHash.size()).append(" taxa with entry counts:\n");
        for (String taxon : taxonHash.keySet()) {
            sb.append("\t");
            sb.append(taxonCounts.get(taxon));
            // Separate the name from the value count (previously they were
            // concatenated with no delimiter, e.g. "name123").
            sb.append(" = ").append(taxon).append(" (")
                    .append(taxonHash.get(taxon).size()).append(" values)\n");
        }
        sb.append("\tPlease cite:\n");
        sb.append(Citable.Utils.getCitationString(this));
        return sb.toString();
    }

    @Override
    public String toString() {
        return toTaxonBlockString();
    }

    /**
     * Reads a PLINK frequency table.  The first line is treated as a header
     * and discarded; every subsequent line is one locus entry.
     *
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if the input is empty
     */
    public void parse(Reader source) throws IOException {
        BufferedReader reader = new BufferedReader(source);
        String line = reader.readLine();
        if (line == null) {
            throw new IllegalArgumentException("Empty file");
        }
        line = reader.readLine(); // skip the header row
        while (line != null) {
            transformRow(line, taxonHash, taxonCounts, remove);
            line = reader.readLine();
        }
        if (!remove.isEmpty()) {
            Logger.getLogger("dr.evomodel.continuous").warning(
                    "Removing " + remove.size() + " loci from analysis...");
        }
    }

    /**
     * Attaches each matched taxon's transformed values as a string attribute
     * named {@code traitName}, and records the matched taxa in the internal
     * taxon list.  Taxa without PLINK data are logged and skipped.
     */
    public void addTaxonAttribute(TaxonList inTaxonList, String traitName) {
        Taxa matched = new Taxa();
        taxonList = matched;
        for (Taxon taxon : inTaxonList) {
            List<Double> valueList = taxonHash.get(taxon.getId());
            if (valueList == null) {
                Logger.getLogger("dr.evolution").warning("Taxon " + taxon.getId() + " not found in PLINK data");
            } else {
                String string = makeStringAttribute(valueList);
                matched.addTaxon(taxon);
                taxon.setAttribute(traitName, string);
            }
            if (DEBUG) {
                System.err.println("Added trait for " + taxon.getId());
            }
        }
    }

    /**
     * Renders the matched taxa as BEAST XML {@code <taxon>} blocks with their
     * attributes; returns the empty string if {@link #addTaxonAttribute} has
     * not been called yet.
     */
    public String toTaxonBlockString() {
        StringBuilder sb = new StringBuilder();
        if (taxonList != null) {
            for (Taxon taxon : taxonList) {
                sb.append("<taxon id=\"").append(taxon.getId()).append("\">\n");
                Iterator<String> attributeIterator = taxon.getAttributeNames();
                while (attributeIterator.hasNext()) {
                    String attribute = attributeIterator.next();
                    sb.append("\t<attribute name=\"").append(attribute).append("\"> ");
                    sb.append(taxon.getAttribute(attribute));
                    sb.append(" </attribute>\n");
                }
                sb.append("</taxon>\n\n");
            }
        }
        return sb.toString();
    }

    /** Logit transform: log(p / (1 - p)). */
    private double innerTransform(double p) {
        return Math.log(p / (1.0 - p));
    }

    /** Value substituted for invalid loci when removeMissingLoci is false (logit of 0.5). */
    private double defaultMissingValue() {
        return 0.0;
    }

    /**
     * Logit-transforms the observed allele frequency {@code count1 / total}.
     * Zero counts are replaced by a half-count pseudo-frequency so the logit
     * stays finite; a total of zero yields NaN, which the caller flags as an
     * invalid estimate.
     */
    private double transform(int count1, int count2) {
        int total = count1 + count2;
        double halfFreq = 0.5 / total;
        double obsFreq;
        if (count1 == 0) {
            obsFreq = halfFreq;
        } else if (count2 == 0) {
            obsFreq = 1.0 - halfFreq;
        } else {
            obsFreq = ((double) count1) / ((double) total);
        }
        return innerTransform(obsFreq);
    }

    // **************************************************************
    // XMLObjectParser
    // **************************************************************

    public static XMLObjectParser PARSER = new AbstractXMLObjectParser() {

        public static final String PLINK_IMPORT = "plinkImport";
        public final static String INPUT_FILE_NAME = FileHelpers.FILE_NAME;
        public static final String TRAIT_NAME = "traitName";

        public String getParserName() {
            return PLINK_IMPORT;
        }

        /**
         * Reads the PLINK file named by the {@code fileName} attribute,
         * collects the child taxa, and attaches the transformed values to
         * them under the {@code traitName} attribute.
         */
        public Object parseXMLObject(XMLObject xo) throws XMLParseException {
            FileReader fileReader = XMLParser.getFileReader(xo, INPUT_FILE_NAME);
            Logger.getLogger("dr.evomodel.continuous").info("Attempting to read PLINK file...\n");
            PlinkImporter importer;
            try {
                importer = new PlinkImporter(fileReader);
                Logger.getLogger("dr.evomodel.continuous").info(importer.toStatisticsString());
            } catch (IOException e) {
                // Report the actual failure (the previous message reported the
                // file encoding, which is useless for diagnosis).
                throw new XMLParseException("Unable to read PLINK data from file: " + e.getMessage());
            }

            // Flatten all child Taxon and TaxonList elements into one list.
            Taxa taxonList = new Taxa();
            for (int i = 0; i < xo.getChildCount(); i++) {
                Object child = xo.getChild(i);
                if (child instanceof Taxon) {
                    Taxon taxon = (Taxon) child;
                    taxonList.addTaxon(taxon);
                } else if (child instanceof TaxonList) {
                    TaxonList taxonList1 = (TaxonList) child;
                    for (int j = 0; j < taxonList1.getTaxonCount(); j++) {
                        taxonList.addTaxon(taxonList1.getTaxon(j));
                    }
                } else {
                    throwUnrecognizedElement(xo);
                }
            }

            importer.addTaxonAttribute(taxonList, xo.getAttribute(TRAIT_NAME, "null"));
            return importer;
        }

        //************************************************************************
        // AbstractXMLObjectParser implementation
        //************************************************************************

        public String getParserDescription() {
            // Previous text described a multidimensional-scaling likelihood,
            // which was a copy-paste error from an unrelated parser.
            return "Imports allele frequency data from a PLINK table and attaches " +
                    "logit-transformed frequencies as continuous trait attributes on taxa.";
        }

        public XMLSyntaxRule[] getSyntaxRules() {
            return rules;
        }

        private final XMLSyntaxRule[] rules = {
                AttributeRule.newStringRule(INPUT_FILE_NAME, false, "The name of the file containing the plink table"),
                new OrRule(
                        new ElementRule(Taxa.class, 1, Integer.MAX_VALUE),
                        new ElementRule(Taxon.class, 1, Integer.MAX_VALUE)
                ),
                AttributeRule.newStringRule(TRAIT_NAME),
        };

        public Class getReturnType() {
            return PlinkImporter.class;
        }
    };
}