package gov.nih.ncgc.bard.pcparser;
// $Id: PubChemAssaySource.java 3488 2009-10-29 15:49:42Z nguyenda $
import gov.nih.ncgc.bard.capextract.CAPUtil;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PubChemAssayParser implements Constants {
private static final Logger logger = Logger.getLogger
(PubChemAssayParser.class.getName());
static final int PUBCHEM_SID = 0;
static final int PUBCHEM_EXT_REGID = 1;
static final int PUBCHEM_CID = 2;
static final int PUBCHEM_OUTCOME = 3;
static final int PUBCHEM_RANK = 4;
static final int PUBCHEM_URL = 5;
static final int PUBCHEM_COMMENT = 6;
static final int PUBCHEM_REVOKE = 7;
static final int PUBCHEM_NUMCOLUMNS = 8;
protected static final Pattern DOSE_RESPONSE_REGEX =
//Pattern.compile("(.+)([\\s_]+at|[\\s_]+@)?[\\s_]+((\\d*\\.\\d*)|(\\d+))[\\s_]*(uMol|MICROM|NANOM|microM|nanoM|nM|uM)(.+)?");
Pattern.compile("((.+)([\\s_]+at|[\\s_]+@)?[\\s_]+)?((\\d*\\.\\d*)|(\\d+))[\\s_]*(uMol|MICROM|NANOM|microM|nanoM|nM|uM)(.+)?");
// If the input expression is a parsable dose-response expression, then
// return an array of four elements denoting the prefix, concentration,
// unit, and postfix, respectively, of the expression. For example,
// consider the following input expression:
// % Inhibition of E. Coli WT @ 10 uMol Rep 1
// the four elements returned are:
// prefix: % Inhibition of E. Coli WT
// concentration: 10
// unit: uMol
// postfix: Rep 1
protected static Object[] parseDoseResponseExpr (String expr) {
Matcher m = DOSE_RESPONSE_REGEX.matcher(expr);
Object[] tokens = {};
if (m.find()) {
String prefix = m.group(2);
if (prefix != null) {
prefix = prefix.replaceAll
("([\\s_]+at|_|[\\s_]+@)$", "").trim();
}
String postfix = m.group(8);
if (postfix != null) {
postfix = postfix.trim();
}
Double conc = Double.valueOf(m.group(4));
String u = m.group(7);
Unit unit = Unit.unspecified;
if (u.startsWith("uM") || u.equalsIgnoreCase("microM")) {
unit = Unit.um;
}
else if (u.equalsIgnoreCase("nM") || u.equalsIgnoreCase("nanoM")) {
unit = Unit.nm;
}
tokens = new Object[]{prefix, conc, unit, postfix};
}
return tokens;
}
public static Assay parseBioassayXML (InputStream is) throws Exception {
return parseBioassayXML (new XmlTwig (is));
}
public static Assay parseBioassayXML (Reader r) throws Exception {
return parseBioassayXML (new XmlTwig (r));
}
protected static Assay parseBioassayXML (XmlTwig twig) throws Exception {
String aid = twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_aid/PC-ID/PC-ID_id");
if (aid == null) aid = twig.getElementValue("PC-AssayDescription/PC-AssayDescription_aid/PC-ID/PC-ID_id");
Assay assay = new Assay (new Integer (aid));
assay.setName(twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_name"));
assay.setSourceName(twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_aid-source/PC-Source/PC-Source_db/PC-DBTracking/PC-DBTracking_name"));
assay.setSourceID(twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_aid-source/PC-Source/PC-Source_db/PC-DBTracking/PC-DBTracking_source-id/Object-id/Object-id_str"));
assay.setDescription(twig.getElementsAsText("PC-AssayDescription_description_E", "\n"));
assay.setProtocol(twig.getElementsAsText("PC-AssayDescription_protocol_E", "\n"));
assay.setComment(twig.getElementsAsText("PC-AssayDescription_comment_E", "\n"));
String aom = twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_activity-outcome-method");
if (aom != null) {
assay.setOutcomeMethod
(Assay.AOM.getInstance(Integer.parseInt(aom)));
}
String grant = twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_grant-number/PC-AssayDescription_grant-number_E");
if (grant != null) {
assay.setGrant(grant);
}
String cat = twig.getElementValue("PC-AssayContainer/PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_project-category");
if (cat != null) {
assay.setCategory(Integer.parseInt(cat));
}
/*
* xref
*/
NodeList xrefNodes = twig.getDocument().getDocumentElement()
.getElementsByTagName("PC-AnnotatedXRef");
for (int i = 0; i < xrefNodes.getLength(); ++i) {
Element xrefElm = (Element)xrefNodes.item(i);
String pmid = XmlTwig.getElementValue
(xrefElm, "PC-AnnotatedXRef/PC-AnnotatedXRef_xref/PC-XRefData/PC-XRefData_pmid");
if (pmid != null) {
//System.out.println("PMID: " + pmid);
assay.addPublication(Long.parseLong(pmid));
}
String gene =XmlTwig.getElementValue
(xrefElm, "PC-AnnotatedXRef/PC-AnnotatedXRef_xref/PC-XRefData/PC-XRefData_gene");
if (gene != null) {
//System.out.println("GENE: " + gene);
//System.out.print("\t" + gene);
try {
assay.addGene(Integer.parseInt(gene));
}
catch (NumberFormatException ex) {
logger.log(Level.SEVERE, "Bad gene id: "+gene, ex);
}
}
String url = XmlTwig.getElementValue
(xrefElm, "PC-AnnotatedXRef/PC-AnnotatedXRef_xref/PC-XRefData/PC-XRefData_dburl");
if (url != null) {
//System.out.println("URL: " + url);
assay.setURL(url);
}
String raid = XmlTwig.getElementValue
(xrefElm, "PC-AnnotatedXRef/PC-AnnotatedXRef_xref/PC-XRefData/PC-XRefData_aid");
if (raid != null) {
try {
assay.addAID(Integer.parseInt(raid));
}
catch (NumberFormatException ex) {
logger.log(Level.SEVERE, "Bad aid: "+raid, ex);
}
//System.out.println("AID: " + raid);
}
}
/*
* now parse result type fields
*/
NodeList resultTypeNodes = twig.getDocument().getDocumentElement()
.getElementsByTagName("PC-ResultType");
// dose concentrations
final Map<Integer, Double> dose = new TreeMap<Integer, Double>();
// dose response curve grouping
Map<String, BitSet> drgrp = new TreeMap<String, BitSet>();
Unit doseUnit = Unit.unspecified;
Map<Integer, ResultType> results = new TreeMap<Integer, ResultType>();
int maxTID = 0;
for (int i = 0; i < resultTypeNodes.getLength(); ++i) {
Element resultTypeElm = (Element)resultTypeNodes.item(i);
ResultType type = new ResultType ();
type.setTID(Integer.parseInt
(XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_tid")));
if (type.getTID() > maxTID) {
maxTID = type.getTID();
}
type.setName(XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_name"));
type.setDescription(XmlTwig.getElementsAsText
(resultTypeElm,
"PC-ResultType_description_E", "\n"));
String rtype = XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_type");
if (rtype != null) {
int t = Integer.parseInt(rtype);
for (Type v : Type.values()) {
if (v.ordinal() == t) {
type.setType(v);
}
}
}
String unit = XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_unit");
if (unit != null) {
type.setUnit(Unit.getInstance(Integer.parseInt(unit)));
}
Element acElm = XmlTwig.getElement
(resultTypeElm, "PC-ResultType/PC-ResultType_ac");
if (acElm != null) {
String ac = acElm.getAttribute("value");
type.setActiveConcentration(ac.equalsIgnoreCase("true"));
}
String tcVal = XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_tc/PC-ConcentrationAttr/PC-ConcentrationAttr_concentration");
if (tcVal != null &&
(type.getType() == ResultType.Type.Float
|| type.getType() == ResultType.Type.Int)) {
dose.put(type.getTID(), Double.valueOf(tcVal));
Double[] concList = {Double.valueOf(tcVal)};
type.setTestConcentration(concList);
}
String tcUnit = XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_tc/PC-ConcentrationAttr/PC-ConcentrationAttr_unit");
if (tcUnit != null) {
Unit u = Unit.getInstance(Integer.parseInt(tcUnit));
if (doseUnit != Unit.unspecified && doseUnit != u) {
throw new IllegalStateException
("Dose response concentrations have different units!");
}
doseUnit = u;
type.setTestConcUnit(u);
}
String tcDrId = XmlTwig.getElementValue
(resultTypeElm, "PC-ResultType/PC-ResultType_tc/PC-ConcentrationAttr/PC-ConcentrationAttr_dr-id");
if (tcDrId != null) {
BitSet group = drgrp.get(tcDrId);
if (group == null) {
drgrp.put(tcDrId, group = new BitSet ());
}
group.set(type.getTID());
}
else if (tcVal != null) {
// if there is concentration but no grouping info, then we
// try to parse the name of this result type to get the
// grouping info
Object[] expr = parseDoseResponseExpr (type.getName());
if (expr.length <= 0) {
// do nothing
}
else if (expr[0] != null) {
String prefix = (String)expr[0];
BitSet group = drgrp.get(prefix);
if (group == null) {
drgrp.put(prefix, group = new BitSet ());
}
group.set(type.getTID());
}
else if (expr[3] != null) {
String postfix = (String)expr[3];
BitSet group = drgrp.get(postfix);
if (group == null) {
drgrp.put(postfix, group = new BitSet ());
}
group.set(type.getTID());
}
}
results.put(type.getTID(), type);
}
/*
do we care what are the different DR???
NodeList assayDrNodes = twig.getDocument().getDocumentElement()
.getElementsByTagName("PC-AssayDRAttr");
for (int i = 0; i < assayDrNodes.getLength(); ++i) {
Element assayDrAttrElm = (Element)assayDrNodes.item(i);
}
*/
if (dose.size() == 0) { //!!!! && assay.getOutcomeMethod() == Assay.AOM.Confirmatory) {
// hmm... confirmation assay with no dose response curve?...
// perhaps older assay, so let's see if we can parse the
// TIDs to figure out if there exist columns that look like
// dr columns
doseUnit = Unit.m; // convert everything to molar
for (ResultType rt : results.values()) {
if (rt.getType() == ResultType.Type.Float
&& rt.getUnit() == ResultType.Unit.percent) {
Object[] expr = parseDoseResponseExpr (rt.getName());
if (expr.length > 0) {
// match as dose-response header
String prefix = (String)expr[0];
Double conc = (Double)expr[1];
Unit unit = (Unit)expr[2];
String postfix = (String)expr[3];
if (prefix != null) {
BitSet group = drgrp.get(prefix);
if (group == null) {
drgrp.put(prefix, group = new BitSet ());
}
group.set(rt.getTID());
}
else if (postfix != null) {
BitSet group = drgrp.get(postfix);
if (group == null) {
drgrp.put(postfix, group = new BitSet ());
}
group.set(rt.getTID());
}
if (unit == Unit.um) {
conc *= 1e-6;
}
else if (unit == Unit.nm) {
conc *= 1e-9;
}
dose.put(rt.getTID(), conc);
Double[] concList = {conc};
rt.setTestConcentration(concList);
rt.setTestConcUnit(Unit.m);
}
}
}
}
// add an additional column for dose response
if (dose.size() > 0) { //!!! > 1) {
if (drgrp.size() <= 1) { // single dose response curve...
ResultType rt = createDoseResponseType
(dose, results, dose.keySet());
rt.setTID(++maxTID);
rt.setTestConcUnit(doseUnit);
assay.addResult(rt);
}
else { // multiple dose-response curves
for (BitSet e : drgrp.values()) {
// only create DoseResult type if we have at least 3 or
// more data columns
if (e.cardinality() > 2) {
Vector<Integer> columns = new Vector<Integer>();
for (int b = e.nextSetBit(0); b >=0;
b = e.nextSetBit(b+1)) {
columns.add(b);
}
ResultType rt = createDoseResponseType
(dose, results, columns);
rt.setTID(++maxTID);
rt.setTestConcUnit(doseUnit);
assay.addResult(rt);
}
}
}
}
dose.clear();
// now add the rest of the types....
for (Map.Entry<Integer, ResultType> e : results.entrySet()) {
assay.addResult(e.getValue());
}
results.clear();
results = null;
return assay;
}
private static ResultType createDoseResponseType
(final Map<Integer, Double> dose, Map<Integer, ResultType> results,
Collection<Integer> columns) {
String title = "";
StringBuffer desc = new StringBuffer ();
// make sure the concentrations are sorted in ascending order
Vector<Integer> sortedTID = new Vector<Integer>(columns);
Collections.sort(sortedTID, new Comparator<Integer>() {
public int compare (Integer a, Integer b) {
Double da = dose.get(a);
Double db = dose.get(b);
if (da == null && db == null) return 0;
if (da == null) return -1;
if (db == null) return 1;
if (da < db) return -1;
else if (da > db) return 1;
return 0;
}
});
for (Integer tid : sortedTID) {
if (title.length() == 0) {
ResultType rt = results.get(tid);
Object[] expr = parseDoseResponseExpr (rt.getName());
if (expr.length > 0) {
if (expr[0] != null) {
title = (String)expr[0]; // prefix
}
else if (expr[3] != null) {
title = (String)expr[3]; // postfix
}
}
}
desc.append((desc.length() == 0 ? "" : " ") + tid);
}
ResultType rt = new ResultType ();
rt.setName(title);
rt.setDescription(desc.toString());
rt.setType(Type.DoseResponse);
rt.setUnit(Unit.none);
Double[] conc = new Double[sortedTID.size()];
for (int i = 0; i < conc.length; ++i) {
conc[i] = dose.get(sortedTID.get(i));
}
rt.setTestConcentration(conc);
return rt;
}
public static void assignDataTypes (Vector<ResultType> result) {
// used TIDs that have been used by dose-response parameters..
Set<Integer> drMask = new HashSet<Integer>();
// first add dose-response data (if any)
for (ResultType rt: result) {
if (rt.getType() == ResultType.Type.DoseResponse) {
rt.setContextGroup(rt.getTID());
Vector<Double> dose = new Vector<Double>();
Vector<Double> response = new Vector<Double>();
Double[] conc = rt.getTestConcentration();
String[] toks = rt.getDescription().split("\\s");
if (conc.length != toks.length) {
System.err.println("** fatal error: inconsistent "
+"dose-response data!!!");
}
for (int i = 0; i < toks.length; ++i) {
int tid = Integer.parseInt(toks[i]);
for (ResultType resultType: result)
if (resultType.getTID() == tid) {
if (resultType.getDataType().equals(Constants.DataType.Unknown)) {
resultType.setDataType(Constants.DataType.CRpt);
resultType.setContextGroup(rt.getTID());
} else {
System.err.println("Can not be CRpt; Data type already assigned for TID:"+tid+" to type "+resultType.getDataType());
System.exit(1);
}
}
}
AssayDataDoseResponseHill4p dr =
new AssayDataDoseResponseHill4p
(rt.getTID(), response.toArray(new Double[0]));
dr.setDose(dose.toArray(new Double[0]));
// figure out the hill/fit parameters!!!!
checkForHillParams
(result, rt.getName(), rt.getTID(), drMask);
// reassign dose response context groups to AC50 item
int oldContext = rt.getTID(), newContext = rt.getTID();
for (ResultType rt2: result) {
if (DataType.Ac50.equals(rt2.getDataType()))
newContext = rt2.getTID();
}
if (newContext != oldContext)
for (ResultType rt2: result) {
if (rt2.getContextGroup() == oldContext)
rt2.setContextGroup(newContext);
}
}
}
}
public static void parse (Assay assay, InputStream is) throws IOException {
BufferedReader br = new BufferedReader (new InputStreamReader (is));
String line = br.readLine();
if (line == null) {
return;
}
String[] header = tokenizer (line, ',');
String[] field = new String[header.length];
for (int rows = 1; (line = br.readLine()) != null; ++rows) {
boolean ok = tokenizer (field, line, ',');
String sid = field[PUBCHEM_SID];
String cid = field[PUBCHEM_CID];
if (!ok) {
System.err.println
("*** AID=" + assay.getAID() + ":line="
+ rows + ": expected " + header.length
+ " columns but got " + field.length
+ " instead; skipping substance " + sid);
continue;
}
long id = 0;
try {
id = Long.parseLong(cid);
}
catch (NumberFormatException ex) {
}
AssayResults ar = new AssayResults (id);
ar.setSID(sid);
String value = field[PUBCHEM_OUTCOME];
if (value != null && value.length() > 0) {
ar.setOutcome(Outcome.instanceOf
(Integer.parseInt(value)));
}
value = field[PUBCHEM_RANK];
if (value != null && value.length() > 0) {
ar.setRank(Integer.parseInt(value));
}
value = field[PUBCHEM_URL];
if (value != null && value.length() > 0) {
ar.setURL(value);
}
Map<Integer, AssayData> data =
new TreeMap<Integer, AssayData>();
for (int i = PUBCHEM_NUMCOLUMNS; i < field.length; ++i) {
int tid = Integer.parseInt(header[i]);
ResultType rt = assay.getResult(tid);
AssayData ad = parseField (rt, field[i]);
if (ad != null) {
data.put(tid, ad);
}
}
// used TIDs that have been used by dose-response parameters..
Set<Integer> drMask = new HashSet<Integer>();
// first add dose-response data (if any)
Enumeration<ResultType> result = assay.getResults();
for (; result.hasMoreElements(); ) {
ResultType rt = result.nextElement();
if (rt.getType() == ResultType.Type.DoseResponse) {
Vector<Double> dose = new Vector<Double>();
Vector<Double> response = new Vector<Double>();
Double[] conc = rt.getTestConcentration();
String[] toks = rt.getDescription().split("\\s");
if (conc.length != toks.length) {
System.err.println("** fatal error: inconsistent "
+"dose-response data!!!");
}
for (int i = 0; i < toks.length; ++i) {
int tid = Integer.parseInt(toks[i]);
AssayData ad = data.get(tid);
if (ad != null) {
double x;
switch (rt.getTestConcUnit()) {
case um: x = conc[i]*1e-6; break;
case m: x = conc[i]; break;
case nm: x = conc[i]*1e-9; break;
default: throw new IllegalStateException
("Not supported concentration unit: "
+ rt.getTestConcUnit());
}
dose.add(x);
response.add
(((Number)ad.getValue()).doubleValue());
}
}
AssayDataDoseResponseHill4p dr =
new AssayDataDoseResponseHill4p
(rt.getTID(), response.toArray(new Double[0]));
dr.setDose(dose.toArray(new Double[0]));
// figure out the hill/fit parameters!!!!
getHillParams
(assay, data.values(), rt.getName(), dr, drMask);
ar.addData(dr);
}
}
// now add the rest of the data
for (Map.Entry<Integer, AssayData> e : data.entrySet()) {
AssayData ad = e.getValue();
if (ad != null) {
ar.addData(ad);
}
}
System.out.println("Hey!");
}
} // parse
static String[] tokenizer (String line, char delim) {
Vector<String> toks = new Vector<String>();
int len = line.length(), parity = 0;
StringBuffer curtok = new StringBuffer ();
for (int i = 0; i < len; ++i) {
char ch = line.charAt(i);
if (ch == '"') {
parity ^= 1;
}
if (ch == delim) {
if (parity == 0) {
String tok = null;
if (curtok.length() > 0) {
tok = curtok.toString();
}
toks.add(tok);
curtok.setLength(0);
}
else {
curtok.append(ch);
}
}
else if (ch != '"') {
curtok.append(ch);
}
}
if (curtok.length() > 0) {
toks.add(curtok.toString());
}
return toks.toArray(new String[0]);
}
static boolean tokenizer (String[] tokens, String line, char delim) {
int len = line.length(), parity = 0;
StringBuffer curtok = new StringBuffer ();
int tokpos = 0;
for (int i = 0; i < len && tokpos < tokens.length; ++i) {
char ch = line.charAt(i);
if (ch == '"') {
parity ^= 1;
}
if (ch == delim) {
if (parity == 0) {
String tok = null;
if (curtok.length() > 0) {
tok = curtok.toString();
}
tokens[tokpos++] = tok;
curtok.setLength(0);
}
else {
curtok.append(ch);
}
}
else if (ch != '"') {
curtok.append(ch);
}
}
if (tokpos < tokens.length) {
tokens[tokpos++] =
curtok.length() > 0 ? curtok.toString() : null;
}
return tokpos == tokens.length;
}
// figure out which of the data is hill slope...
static final Pattern FOC_REGEX = Pattern.compile("\\s+FOC(\\s|$)+");
protected static void checkForHillParams
(Vector<ResultType> rtv, String name, int contextGroup, Set<Integer> mask) {
if (FOC_REGEX.matcher(name).find()) {
// might be something like: 620 nm FOC
return;
}
HashMap<Constants.DataType, ResultType> params = new HashMap<Constants.DataType, ResultType>();
ResultType ac50Rt = null, slopeRt = null, zeroRt = null, infRt = null;
for (ResultType rt: rtv) {
if (mask.contains(rt.getTID())) {
continue;
}
String text = rt.getName();
if (rt.getType() == ResultType.Type.Float) {
if (isHillSlope(text) && slopeRt == null) {
params.put(Constants.DataType.Hslope, rt);
slopeRt = rt;
rt.setDataType(Constants.DataType.Hslope);
// sigh... Emory... e.g., AID 801
if (text.indexOf("Curve Curvature") >= 0
|| text.indexOf("Curve Slope") >= 0) {
rt.setTransform(Constants.Transform.Negative);
}
}
else if (isHillZero (text) && zeroRt == null) {
params.put(Constants.DataType.Hzero, rt);
zeroRt = rt;
rt.setDataType(Constants.DataType.Hzero);
}
else if (isHillInf (text) && infRt == null) {
params.put(Constants.DataType.Hinf, rt);
infRt = rt;
rt.setDataType(Constants.DataType.Hinf);
}
else if ((ac50Rt == null || ac50Rt.getType() == ResultType.Type.String)
&& isHillXC50 (text) &&
(isConcUnit (rt.getUnit())
// a big exception to handle a small
// case of early ncgc's assays that
// don't have property unit
|| text.equals("Qualified AC50"))) {
/// always assume it's in uM???
params.put(Constants.DataType.Ac50, rt);
ac50Rt = rt;
rt.setDataType(Constants.DataType.Ac50);
}
else {
DataType dataType = isOtherType(text);
if (dataType != DataType.Unknown && !params.containsKey(dataType)) {
params.put(isOtherType(text), rt);
rt.setDataType(dataType);
}
}
}
else if (rt.getType() == ResultType.Type.String) {
if (text.equals("Potency")) {
params.put(Constants.DataType.Ac50, rt);
ac50Rt = rt;
mask.add(ac50Rt.getTID());
rt.setDataType(Constants.DataType.Ac50);
}
else {
DataType dataType = isOtherType(text);
if (dataType != DataType.Unknown && !params.containsKey(dataType)) {
params.put(isOtherType(text), rt);
rt.setDataType(dataType);
}
}
}
}
if (params.containsKey(Constants.DataType.Ac50)) {
for (DataType key: params.keySet()) {
ResultType rt = params.get(key);
if (!mask.contains(rt.getTID()))
mask.add(rt.getTID());
rt.setContextGroup(contextGroup);
}
}
// if (ac50Rt != null) { // && slopeRt != null) {
// if (!mask.contains(ac50Rt.getTID())) {
// mask.add(ac50Rt.getTID());
// }
// ac50Rt.setContextGroup(contextGroup);
//
// if (slopeRt != null) {
// mask.add(slopeRt.getTID());
// slopeRt.setContextGroup(contextGroup);
// }
//
// if (zeroRt != null) {
// mask.add(zeroRt.getTID());
// zeroRt.setContextGroup(contextGroup);
// }
//
// if (infRt != null) {
// mask.add(infRt.getTID());
// infRt.setContextGroup(contextGroup);
// }
// }
}
protected static void getHillParams
(Assay assay, Collection<AssayData> data,
String name, AssayDataDoseResponseHill4p dr, Set<Integer> mask) {
if (FOC_REGEX.matcher(name).find()) {
// might be something like: 620 nm FOC
return;
}
Double ac50 = null, slope = null, zero = null, inf = null;
ResultType ac50Rt = null, slopeRt = null, zeroRt = null, infRt = null;
for (AssayData ad : data) {
if (mask.contains(ad.getTID())) {
continue;
}
ResultType rt = assay.getResult(ad.getTID());
if (ad != null) {
String text = rt.getName();
if (rt.getType() == ResultType.Type.Float) {
if (isHillSlope(text) && slopeRt == null) {
slope = (Double)ad.getValue();
// sigh... Emory... e.g., AID 801
if (text.indexOf("Curve Curvature") >= 0
|| text.indexOf("Curve Slope") >= 0) {
slope *= -1;
}
slopeRt = rt;
}
else if (isHillZero (text) && zeroRt == null) {
zero = (Double)ad.getValue();
zeroRt = rt;
}
else if (isHillInf (text) && infRt == null) {
inf = (Double)ad.getValue();
infRt = rt;
}
else if (ac50Rt == null && isHillXC50 (text) &&
(isConcUnit (rt.getUnit())
// a big exception to handle a small
// case of early ncgc's assays that
// don't have property unit
|| text.equals("Qualified AC50"))) {
/// always assume it's in uM???
double scale = 1.;
switch (rt.getUnit()) {
case um: scale = 1e-6; break;
case nm: scale = 1e-9; break;
case mm: scale = 1e-3; break;
}
ac50 = (Double)ad.getValue() * scale;
ac50Rt = rt;
}
}
else if (text.equals("Potency")
&& rt.getType() == ResultType.Type.String) {
// parse the string version...
try {
String value = (String)ad.getValue();
int index = value.indexOf("(");
ac50 = Double.parseDouble
(index < 0 ? value : value.substring(0, index));
ac50Rt = rt;
}
catch (NumberFormatException ex) {}
}
}
}
if (ac50 != null && slope != null) {
mask.add(ac50Rt.getTID());
mask.add(slopeRt.getTID());
if (zeroRt != null) {
mask.add(zeroRt.getTID());
}
if (infRt != null) {
mask.add(infRt.getTID());
}
if (inf == null || zero == null) {
// set inf to be max response....
Double[] res = (Double[])dr.getValue();
if (res != null && res.length > 0) {
Double min = res[0], max = res[0];
for (int i = 1; i < res.length; ++i) {
if (res[i] > max) max = res[i];
if (res[i] < min) min = res[i];
}
if (inf == null) inf = max;
if (zero == null) zero = min;
}
}
if (zero != null && inf != null) {
/*
System.err.println("** dose-response " + name
+": xc50=\""+ac50Rt.getName()+"\""
+" slope=\""+slopeRt.getName()+"\""
+" zero=\""
+(zeroRt!=null?zeroRt.getName():"")+"\""
+" inf=\""
+(infRt!=null?infRt.getName():"")+"\"");
*/
dr.setHillCoef(slope);
dr.setZeroAct(zero);
dr.setInfAct(inf);
dr.setAc50(ac50);
}
}
}
// different incarnations of hill slope
static final String[] HILL_SLOPE_EXPRS = {
"Hill Slope",
"HillSlope",
"HILLSLOPE",
"Hill Coefficient",
"Hill slope",
"Hillslope",
"Curve Curvature",
"Curve Slope"
};
protected static boolean isHillSlope (String text) {
for (int i = 0; i < HILL_SLOPE_EXPRS.length; ++i) {
if (text.indexOf(HILL_SLOPE_EXPRS[i]) >= 0) {
return true;
}
}
return false;
}
static final String[] HILL_INF_EXPRS = {
"Hill Sinf",
"InfiniteActivity",
"EC50 Max",
"IC50 Max",
"Curve Top",
"TOP",
"SInf"
};
protected static boolean isHillInf (String text) {
for (int i = 0; i < HILL_INF_EXPRS.length; ++i) {
if (HILL_INF_EXPRS[i].equals(text)
|| text.indexOf(HILL_INF_EXPRS[i]) >= 0) {
return true;
}
}
return false;
}
static final String[] HILL_ZERO_EXPRS = {
"Hill S0",
"ZeroActivity",
"EC50 Min",
"IC50 Min",
"Curve Bottom",
"BOTTOM",
"S0"
};
protected static boolean isHillZero (String text) {
for (int i = 0; i < HILL_ZERO_EXPRS.length; ++i) {
if (HILL_ZERO_EXPRS[i].equals(text)
|| text.indexOf(HILL_ZERO_EXPRS[i]) >= 0) {
return true;
}
}
return false;
}
static final String[] HILL_XC50_EXPRS = {
"EC50",
"IC50",
"AC50",
"Potency",
"Qualified AC50"
};
protected static boolean isHillXC50 (String text) {
for (int i = 0; i < HILL_XC50_EXPRS.length; ++i) {
if ((HILL_XC50_EXPRS[i].equals(text)
|| text.indexOf(HILL_XC50_EXPRS[i]) >= 0)
// AID 855
&& (text.indexOf("Error") < 0
&& text.indexOf("Std") < 0
&& text.indexOf("Relative") < 0)) {
return true;
}
}
return false;
}
static final HashMap<String, Constants.DataType> OTHER_EXPRS = new HashMap<String, Constants.DataType>();
static {
OTHER_EXPRS.put("Fit_CurveClass", Constants.DataType.CurveClass);
OTHER_EXPRS.put("Fit_R2", Constants.DataType.R2);
OTHER_EXPRS.put("Excluded_Points", Constants.DataType.ExcludedPoints);
}
protected static Constants.DataType isOtherType(String text) {
for (String key: OTHER_EXPRS.keySet())
if (key.equals(text) || text.indexOf(key) >= 0)
return OTHER_EXPRS.get(key);
return Constants.DataType.Unknown;
}
protected static boolean isConcUnit (ResultType.Unit u) {
// only look at field that looks like concentration
return u == ResultType.Unit.um
|| u == ResultType.Unit.m
|| u == ResultType.Unit.nm;
}
protected static AssayData parseField (ResultType result, String value) {
if (value != null && value.length() > 0) {
switch (result.getType()) {
case String:
return new AssayData (result.getTID(), value);
case Float:
return new AssayData (result.getTID(), Double.valueOf(value));
case Int:
return new AssayData (result.getTID(), Integer.valueOf(value));
case Bool:
return new AssayData (result.getTID(), Boolean.valueOf(value));
}
}
return null;
}
private static Vector<ResultType> loadResultsFromJson(JsonNode node, ObjectMapper mapper) throws IllegalArgumentException, IllegalAccessException, InvocationTargetException, JsonParseException, JsonMappingException, IOException {
if (node == null)
return null;
HashMap<String,Method> rtSetMethods = new HashMap<String,Method>();
for (Method method: ResultType.class.getMethods()) {
if (method.getName().startsWith("set"))
rtSetMethods.put(method.getName().substring(3).toLowerCase(), method);
}
Vector<ResultType> rts = new Vector<ResultType>();
for (Iterator<JsonNode> iter =node.iterator(); iter.hasNext();) {
JsonNode child = iter.next();
ResultType rt = new ResultType();
for (Iterator<String> iter2 = child.fieldNames(); iter2.hasNext();) {
String field = iter2.next();
if (rtSetMethods.containsKey(field.toLowerCase())) {
Method method = rtSetMethods.get(field.toLowerCase());
Class<?>[] inputs = method.getParameterTypes();
if (inputs.length != 1) throw new IllegalArgumentException("Method has too many input arguements: "+method.getName());
Object input = mapper.readValue(child.get(field).traverse(), inputs[0]);
method.invoke(rt, input);
}
}
rts.add(rt);
}
return rts;
}
public static void main (String argv[]) throws Exception {
// if (argv.length < 2) {
// System.err.println("usage: PubChemAssayParser AID.xml AID.csv");
// System.exit(1);
// }
//
// Assay assay = parseBioassayXML (new FileReader (argv[0]));
// Enumeration<ResultType> rte = assay.getResults();
// Vector<ResultType> rtv = new Vector<ResultType>();
// while (rte.hasMoreElements()) {
// rtv.add(rte.nextElement());
// }
// assignDataTypes (rtv);
// for (Enumeration<ResultType> en = assay.getResults();
// en.hasMoreElements(); ) {
// System.out.println(en.nextElement());
// }
//
// System.exit(0);
// CAPExtractor c = new CAPExtractor();
// c.setHandlers();
// Dictionary d = (Dictionary)CapResourceHandlerRegistry.getInstance().getHandler(CAPConstants.CapResource.DICTIONARY).
// poll(CAPConstants.CAP_ROOT+"/dictionary", CAPConstants.CapResource.DICTIONARY).get(0);
//
Connection conn;
HashMap<Integer,Integer> exptIDLookup = new HashMap<Integer,Integer>();
HashMap<String,Integer> capDict = new HashMap<String,Integer>();
HashMap<Integer,String> capDictReverse = new HashMap<Integer,String>();
HashMap<String,String> exptTIDName = new HashMap<String,String>();
HashMap<String,String> exptTIDType = new HashMap<String,String>();
HashMap<String,Integer> exptTIDElem = new HashMap<String,Integer>();
HashMap<String,Integer> exptTIDGroup = new HashMap<String,Integer>();
try {
conn = CAPUtil.connectToBARD();
conn.setAutoCommit(false);
Statement st = conn.createStatement();
st.execute("select bard_expt_id, pubchem_aid from bard_experiment"); // where pubchem_aid=2551");
ResultSet result = st.getResultSet();
while (result.next()) {
int BardExptId = result.getInt(1);
int PubChemAID = result.getInt(2);
exptIDLookup.put(PubChemAID, BardExptId);
}
result.close();
st = conn.createStatement();
st.execute("select label, dictid from cap_dict_elem order by ins_date");
result = st.getResultSet();
while (result.next()) {
int capDictID = result.getInt(2);
String capDictLabel = result.getString(1);
capDict.put(capDictLabel, capDictID);
capDictReverse.put(capDictID, capDictLabel);
}
capDict.put("unknown", -1);
result.close();
st = conn.createStatement();
st.execute("select pubchem_aid, tid, data_type, data_type_elem, context_group, name from bard_experiment_tid");
result = st.getResultSet();
while (result.next()) {
int PubChemAID = result.getInt(1);
int tid = result.getInt(2);
String key = PubChemAID+":"+tid;
String name = result.getString(6);
exptTIDName.put(key, name);
String type = result.getString(3);
exptTIDType.put(key, type);
int typeElem = result.getInt(4);
exptTIDElem.put(key, typeElem);
if (!capDict.containsKey(type) || capDict.get(type) != typeElem) {
System.err.println("Stated type does not map to dict elem: " + type +" " + typeElem + ":" + capDict.get(type)+":"+capDictReverse.get(typeElem));
}
int group = result.getInt(5);
exptTIDGroup.put(key, group);
}
result.close();
conn.close();
} catch (Exception e) {e.printStackTrace();}
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("latest_result_mappings.txt")));
String[] header = br.readLine().split("\t");
String line;
while ((line = br.readLine()) != null) {
String[] fline = line.split("\t");
String[] sline = new String[22];
for (int i=0; i<sline.length; i++) {
if (fline.length > i)
sline[i] = fline[i];
else sline[i] = "";
}
int PubChemAID = Integer.valueOf(sline[0]);
int PubChemTID = Integer.valueOf(sline[1]);
String PubChemTIDName = sline[2];
Integer series = sline[3].length() == 0 ? null : Integer.valueOf(sline[3]);
if (sline[4].equals("Derives")) {sline[5] = sline[4]; sline[4] = "";} /* sigh ... minor problem with mappings file */
if (sline[4].indexOf(',') > -1) {sline[4] = sline[4].substring(1, sline[4].indexOf(','));} /* sigh ... minor problems in mappings file */
Integer parentTID = sline[4].length() == 0 ? null : Integer.valueOf(sline[4]);
String relationship = sline[5];
Integer qualifierTID = sline[6].length() == 0 ? null : Integer.valueOf(sline[6]);
String resultType = sline[7];
String statsModifier = sline[8];
Integer contextTID = sline[9].length() == 0 ? null : Integer.valueOf(sline[9]);
String contextItem = sline[10];
Double concentration = sline[11].length() == 0 ? null : Double.valueOf(sline[11]);
String concUnits = sline[12];
Integer panelNumber = sline[16].length() == 0 ? null : Integer.valueOf(sline[16]);
String attr1 = sline[17];
String value1 = sline[18];
Integer excludedSeries = sline[19].length() == 0 ? null : Integer.valueOf(sline[19]);
String attr2 = sline[20];
String value2 = sline[21];
if (!exptIDLookup.containsKey(PubChemAID))
System.err.println("Missing BARD Expt ID for PubChem AID: "+PubChemAID);
else if (!resultType.equals("") && !capDict.containsKey(resultType)) {
System.err.println("Unknown result type:"+resultType+" "+line);
}
else {
String key = PubChemAID+":"+PubChemTID;
String dataType = "unknown";
int dataTypeElem = -1;
if (!resultType.equals("")) {
dataType = resultType;
dataTypeElem = capDict.get(resultType);
if (!exptTIDType.containsKey(key) || (!exptTIDType.get(key).equals(dataType) && !exptTIDType.get(key).equals("unknown")))
System.err.println("Update to exptTIDType: "+key+":"+dataType+":"+dataTypeElem+":"+exptTIDType.get(key));
if (!exptTIDElem.containsKey(key) || (!exptTIDElem.get(key).equals(dataTypeElem) && !exptTIDElem.get(key).equals(-1)))
System.err.println("Update to exptTIDElem: "+key+":"+dataType+":"+dataTypeElem+":"+exptTIDElem.get(key));
}
int contextGroup = -1;
}
}
System.exit(0);
br = new BufferedReader(new InputStreamReader(new FileInputStream("ExptTIDs_merged.txt")));
header = br.readLine().split("\t");
Map<String, Map<String, String>> exptTIDs = new HashMap<String, Map<String,String>>();
while ((line = br.readLine()) != null) {
String[] sline = line.split("\t");
String bardExptID = sline[0];
String pubChemAID = sline[1];
String TID = sline[2];
String key = bardExptID+":"+pubChemAID+":"+TID;
Map<String,String> entry = new HashMap<String,String>();
for (int i=0; i<sline.length; i++) {
entry.put(header[i], sline[i]);
}
exptTIDs.put(key, entry);
}
System.exit(0);
//Connection conn;
try {
conn = CAPUtil.connectToBARD();
conn.setAutoCommit(false);
PreparedStatement erdUpdate = conn.prepareStatement("update bard_experiment set bard_expt_result_def=? where bard_expt_id=?");
Statement st = conn.createStatement();
st.execute("select bard_expt_id, pubchem_aid, expt_result_def from bard_experiment where bard_expt_id"); // where pubchem_aid=2551");
ResultSet result = st.getResultSet();
// System.out.println("BardExptID|PubChemAID|"+ResultType.printHeader());
while (result.next()) {
String json = result.getString(3);
ObjectMapper mapper = new ObjectMapper ();
if (json != null) {
JsonNode node = mapper.readTree(json);
Vector<ResultType> exptDef = loadResultsFromJson(node, mapper);
assignDataTypes(exptDef);
int bardExptId = result.getInt(1);
int pubchemAID = result.getInt(2);
// for (ResultType rt: exptDef) {
// System.out.println(bardExptId+"|"+pubchemAID+"|"+rt.print());
// }
for (ResultType rt: exptDef) {
String key = bardExptId+":"+pubchemAID+":"+rt.getTID();
Map<String,String> entry = exptTIDs.get(key);
if (!entry.get("DataTypeElem").equals(String.valueOf(rt.getDataType().getElem()))) {
System.out.println("Updated data type: "+key+" |"+rt.getDataType()+":"+entry.get("DataType"));
DataType dataType = DataType.getDataType(entry.get("DataType"));
if (dataType == null) {
System.err.println("Data Type not found:"+entry.get("DataType"));
// System.exit(1);
} else {
rt.setDataType(dataType);
}
}
if (!entry.get("ContextGroup").equals(String.valueOf(rt.getContextGroup()))) {
System.out.println("Updated ContextGroup: "+key+" |"+rt.getContextGroup()+":"+entry.get("ContextGroup"));
try {
rt.setContextGroup(Integer.valueOf(entry.get("ContextGroup")));
} catch (Exception e) {e.printStackTrace();}
}
if (!entry.get("TestConcUnit").toLowerCase().equals(rt.getTestConcUnit().toString())) {
System.out.println("Updated TestConcUnit: "+key+" |"+rt.getTestConcUnit()+":"+entry.get("TestConcUnit"));
Unit unit = Unit.getUnit(entry.get("TestConcUnit").toLowerCase());
if (unit == null) {
System.err.println("Unit not found:"+entry.get("TestConcUnit"));
System.exit(1);
}
rt.setTestConcUnit(unit);
}
if (entry.get("TestConcValue") != null && entry.get("TestConcValue").length() > 0 && entry.get("TestConcValue").indexOf(',') == -1) {
Double[] entryValue = new Double[1];
entryValue[0] = Double.valueOf(entry.get("TestConcValue"));
Double[] rtValue = rt.getTestConcentration();
if (rtValue.length == 0 || rtValue.length == 1 && !entryValue[0].equals(rtValue[0])) {
System.out.println("Updated TestConcValue: "+key+" |"+(rtValue.length == 0 ? "" : rtValue[0])+":"+entryValue[0]);
rt.setTestConcentration(entryValue);
}
}
}
// remove DoseResponse resultTypes as they are now out-of-date anyway [contextTIDs, concs, etc.]
for (int i=exptDef.size()-1; i>-1; i--) {
ResultType rt = exptDef.get(i);
if (rt.getType().equals(Type.DoseResponse))
exptDef.remove(rt);
}
erdUpdate.setInt(2, bardExptId);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
mapper.writeValue(baos, exptDef);
String jsonOut = new String(baos.toByteArray());
System.out.println(bardExptId+":"+jsonOut);
erdUpdate.setString(1, jsonOut);
// erdUpdate.executeUpdate();
// conn.commit();
}
}
result.close();
conn.close();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}