/** * See the NOTICE file distributed with this work for additional information * regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it under the * terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This software is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this software; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF * site: http://www.fsf.org. */ package org.ut.biolab.medsavant.server.ontology; import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.rmi.RemoteException; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; import java.util.zip.GZIPInputStream; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.ut.biolab.medsavant.server.MedSavantServerJob; import org.ut.biolab.medsavant.server.MedSavantServerEngine; import org.ut.biolab.medsavant.server.db.MedSavantDatabase; import org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns; import org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyInfoColumns; import org.ut.biolab.medsavant.shared.db.TableSchema; import org.ut.biolab.medsavant.server.db.ConnectionController; import org.ut.biolab.medsavant.server.db.PooledConnection; import org.ut.biolab.medsavant.shared.model.Ontology; import org.ut.biolab.medsavant.shared.model.OntologyTerm; import org.ut.biolab.medsavant.shared.model.OntologyType; import org.ut.biolab.medsavant.shared.serverapi.OntologyManagerAdapter; import org.ut.biolab.medsavant.server.MedSavantServerUnicastRemoteObject; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.ALT_IDS; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.DEF; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.ID; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.NAME; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.ONTOLOGY; import static org.ut.biolab.medsavant.server.db.MedSavantDatabase.OntologyColumns.PARENTS; import org.ut.biolab.medsavant.server.serverapi.SessionManager; import org.ut.biolab.medsavant.shared.model.SessionExpiredException; import org.ut.biolab.medsavant.shared.util.MiscUtils; import org.ut.biolab.medsavant.shared.util.RemoteFileCache; import org.ut.biolab.medsavant.shared.util.WebResources; /** * Concrete implementation of ontology manager. * * @author tarkvara */ public class OntologyManager extends MedSavantServerUnicastRemoteObject implements OntologyManagerAdapter, OntologyColumns, OntologyInfoColumns { private static final Log LOG = LogFactory.getLog(OntologyManager.class); private static OntologyManager instance; private PooledConnection connection; private final TableSchema ontologySchema; private final TableSchema ontologyInfoSchema; public static OntologyManager getInstance() throws RemoteException { if (instance == null) { instance = new OntologyManager(); } return instance; } private OntologyManager() throws RemoteException { ontologySchema = MedSavantDatabase.OntologyTableSchema; ontologyInfoSchema = MedSavantDatabase.OntologyInfoTableSchema; } @Override public void addOntology(String sessID, String name, OntologyType type, URL oboData, URL mappingData) throws IOException, SQLException, SessionExpiredException { switch (type) { case GO: populateGOTables(sessID, name, oboData, mappingData); break; case HPO: populateHPOTables(sessID, name, oboData, mappingData); break; case OMIM: // In the case of OMIM, the mappingData is actually a text-file mapping between OMIM terms and HPO // terms; to be useful, this assumes that populateHPOTables has already created the mapping between HPO // terms and genes. populateOMIMTables(sessID, name, oboData, mappingData); break; } PooledConnection conn = ConnectionController.connectPooled(sessID); try { conn.executePreparedUpdate(ontologyInfoSchema.delete(ONTOLOGY_NAME, name).toString()); conn.executePreparedUpdate(ontologyInfoSchema.preparedInsert(TYPE, ONTOLOGY_NAME, OBO_URL, MAPPING_URL).toString(), type.toString(), name, oboData.toString(), mappingData.toString()); } finally { conn.close(); } } @Override public void removeOntology(String sessID, String ontName) throws IOException, SQLException, RemoteException, SessionExpiredException { PooledConnection conn = ConnectionController.connectPooled(sessID); throw new UnsupportedOperationException("Not supported yet."); } @Override public Ontology[] getOntologies(String sessID) throws SQLException, RemoteException, SessionExpiredException { PooledConnection conn = ConnectionController.connectPooled(sessID); try { List<Ontology> results = new ArrayList<Ontology>(); ResultSet rs = conn.executePreparedQuery(ontologyInfoSchema.distinct().select(TYPE, ONTOLOGY_NAME, OBO_URL, MAPPING_URL).toString()); while (rs.next()) { String name = rs.getString(2); try { results.add(new Ontology(Enum.valueOf(OntologyType.class, rs.getString(1)), rs.getString(2), new URL(rs.getString(3)), new URL(rs.getString(4)))); } catch (MalformedURLException ex) { LOG.warn(String.format("Error parsing URL for %s: %s", name, MiscUtils.getMessage(ex)), ex); } } return results.toArray(new Ontology[0]); } finally { conn.close(); } } @Override public OntologyTerm[] getAllTerms(String sessID, OntologyType ont) throws InterruptedException, SQLException, SessionExpiredException { makeProgress(sessID, "Connecting...", 0.0); List<OntologyTerm> result = new ArrayList<OntologyTerm>(); PooledConnection conn = ConnectionController.connectPooled(sessID); try { double prog = 0.2; makeProgress(sessID, "Executing query...", prog); String q = ontologySchema.where(ONTOLOGY, ont.toString()).whereNotNull(GENES).orderBy(ID).select(ID, NAME, DEF, ALT_IDS, PARENTS).toString(); LOG.info("Getting all ontology terms: " + q); ResultSet rs = conn.executePreparedQuery(q); while (rs.next()) { prog = 0.5 + prog * 0.5; // Just for fun, to converge on 1.0 makeProgress(sessID, "Retrieving ontology terms...", prog); result.add(new OntologyTerm(ont, rs.getString(1), rs.getString(2), rs.getString(3), StringUtils.split(rs.getString(4), ','), StringUtils.split(rs.getString(5), ','))); } } finally { conn.close(); } return result.toArray(new OntologyTerm[0]); } @Override public String[] getGenesForTerm(String sessID, OntologyTerm term, String refID) throws SQLException, SessionExpiredException { List<String> result = new ArrayList<String>(); PooledConnection conn = ConnectionController.connectPooled(sessID); try { ResultSet rs = conn.executePreparedQuery(ontologySchema.where(ID, term.getID(), ONTOLOGY, term.getOntology().toString()).select(GENES)); // Only expecting one row for each term. if (rs.next()) { String geneString = rs.getString(1); if (geneString != null && geneString.length() > 2) { // Gene-string should be of the form "|gene1|gene2|...|geneN|". // We start at position 1 to avoid an empty string at the start. result.addAll(Arrays.asList(geneString.substring(1).split("\\|"))); } } } finally { conn.close(); } return result.toArray(new String[0]); } /** * When loading an ontology it's more efficient to do a single big-ass query * instead of a whack of small ones. * * @param terms an array of terms whose genes are to be fetched * @return an map of term IDs to arrays of genes */ @Override public Map<OntologyTerm, String[]> getGenesForTerms(String sessID, OntologyTerm[] terms, String refID) throws SQLException, SessionExpiredException { Map<OntologyTerm, String[]> result = new HashMap<OntologyTerm, String[]>(); PooledConnection conn = ConnectionController.connectPooled(sessID); try { StringBuilder termsString = new StringBuilder("\'"); if (terms.length > 0) { termsString.append(terms[0].getID()); for (int i = 1; i < terms.length; i++) { termsString.append("\', \'"); termsString.append(terms[i].getID()); } termsString.append('\''); ResultSet rs = conn.executePreparedQuery(String.format("SELECT id,genes FROM ontology WHERE ontology=? and id IN (%s)", termsString), terms[0].getOntology().toString()); while (rs.next()) { OntologyTerm term = findTermByID(terms, rs.getString(1)); String genesString = rs.getString(2); if (genesString != null && genesString.length() > 2) { // Gene-string should be of the form "|gene1|gene2|...|geneN|". // We start splitting at position 1 to avoid an empty string at the start. result.put(term, genesString.substring(1).split("\\|")); } } } } finally { conn.close(); } return result; } /** * @param sessID login session * @param ont ontology to be searched (pass null to search across all * ontologies) * @param geneName name of the gene whose terms we want * @throws SQLException */ @Override public OntologyTerm[] getTermsForGene(String sessID, OntologyType ont, String geneName) throws SQLException, SessionExpiredException { List<OntologyTerm> result = new ArrayList<OntologyTerm>(); PooledConnection conn = ConnectionController.connectPooled(sessID); try { if (ont != null) { ResultSet rs = conn.executePreparedQuery("SELECT id,name,def,alt_ids,parents FROM ontology WHERE ontology=? AND INSTR(genes, ?)", ont.toString(), "|" + geneName + "|"); while (rs.next()) { result.add(new OntologyTerm(ont, rs.getString(1), rs.getString(2), rs.getString(3), StringUtils.split(rs.getString(4), ','), StringUtils.split(rs.getString(5), ','))); } } else { ResultSet rs = conn.executePreparedQuery("SELECT id,ontology.name,def,alt_ids,parents,type FROM ontology RIGHT JOIN ontology_info ON ontology = ontology_info.name WHERE INSTR(genes, ?)", "|" + geneName + "|"); while (rs.next()) { result.add(new OntologyTerm(OntologyType.valueOf(rs.getString(6)), rs.getString(1), rs.getString(2), rs.getString(3), StringUtils.split(rs.getString(4), ','), StringUtils.split(rs.getString(5), ','))); } } } finally { conn.close(); } return result.toArray(new OntologyTerm[0]); } public OntologyTerm getOntologyTerm(String sessID, OntologyType ont, String ontologyId) throws SQLException, SessionExpiredException{ PooledConnection conn = null; ResultSet rs = null; try { conn = ConnectionController.connectPooled(sessID); rs = conn.executePreparedQuery("SELECT id,name,def,alt_ids,parents FROM ontology WHERE ontology=? AND id=?", ont.toString(), ontologyId); if (rs.next()) { return new OntologyTerm(ont, rs.getString(1), rs.getString(2), rs.getString(3), StringUtils.split(rs.getString(4), ','), StringUtils.split(rs.getString(5), ',')); } else { return null; } } finally { if (rs != null) { rs.close(); } if (conn != null) { conn.close(); } } } /** * Populate the tables with Gene Ontology data * * @param oboData an OBO file containing gene ontology terms * @param goToGeneData a gzipped GAF file * (http://www.geneontology.org/GO.format.gaf-2_0.shtml) * @throws IOException */ private void populateGOTables(String sessID, String name, URL oboData, URL goToGeneData) throws IOException, SQLException, SessionExpiredException { LOG.info("Parsing OBO " + oboData); Map<String, OntologyTerm> terms = new OBOParser(OntologyType.GO).load(oboData); connection = ConnectionController.connectPooled(sessID); LOG.info("Session " + sessID + " made connection"); try { populateTable(name, terms); Map<String, Set<String>> allGenes = new HashMap<String, Set<String>>(); // Expecting a GZIPped tab-delimited text file in GAF (GO Annotation File) format. // We are only interested in columns 2 (gene), 3 (qualifier), and 4 (GO term). LOG.info("Reading annotation file " + goToGeneData + " (Cache: " + RemoteFileCache.getCacheFile(goToGeneData) + ")"); BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(RemoteFileCache.getCacheFile(goToGeneData))))); try { String line; while ((line = reader.readLine()) != null) { if (line.length() > 0 && line.charAt(0) != '!') { String[] fields = line.split("\t"); if (fields.length > 4 && !fields[3].equals("NOT")) { addGenesToTerm(terms, allGenes, fields[2], fields[4]); } } } PreparedStatement updateStatement = connection.prepareStatement("UPDATE ontology SET genes=? WHERE id=?"); for (String t : allGenes.keySet()) { Set<String> termGenes = allGenes.get(t); String geneString = StringUtils.join(termGenes, '|'); connection.executePreparedUpdate(updateStatement, "|" + geneString + "|", t); } } finally { reader.close(); } } finally { connection.close(); connection = null; } } private void addGenesToTerm(Map<String, OntologyTerm> allTerms, Map<String, Set<String>> allGenes, String gene, String term) { Set<String> termGenes = allGenes.get(term); if (termGenes == null) { termGenes = new HashSet<String>(); allGenes.put(term, termGenes); } termGenes.add(gene); OntologyTerm term2 = allTerms.get(term); if (term2 != null) { for (String p : term2.getParentIDs()) { addGenesToTerm(allTerms, allGenes, gene, p); } } } private void populateHPOTables(String sessID, String name, URL oboData, URL hpoToGeneData) throws IOException, SQLException, SessionExpiredException { Map<String, OntologyTerm> terms = new OBOParser(OntologyType.HPO).load(oboData); connection = ConnectionController.connectPooled(sessID); try { populateTable(name, terms); PreparedStatement updStmt = connection.prepareStatement("UPDATE ontology SET genes=? WHERE id=?"); // Mapping file from charite.de has HP terms one per line. BufferedReader reader = new BufferedReader(new FileReader(RemoteFileCache.getCacheFile(hpoToGeneData))); String line; while ((line = reader.readLine()) != null) { if (line.length() > 0 && line.charAt(0) != '#') { int hpPos = line.indexOf("(HP:"); if (hpPos > 0) { String term = line.substring(hpPos + 1, hpPos + 11); int genesStart = line.indexOf("\t["); if (genesStart > 0) { // The list of genes will be line = line.substring(genesStart + 2, line.length() - 1); String[] genes = line.split(", "); String geneString = "|"; for (String g : genes) { geneString += g.substring(0, g.indexOf('(')) + "|"; } connection.executePreparedUpdate(updStmt, geneString, term); } } } } } finally { connection.close(); connection = null; } } private void populateOMIMTables(String sessID, String name, URL oboData, URL omimToHPOData) throws IOException, SQLException, SessionExpiredException { Map<String, OntologyTerm> terms = new OBOParser(OntologyType.OMIM).load(oboData); connection = ConnectionController.connectPooled(sessID); try { populateTable(name, terms); PreparedStatement updStmt = connection.prepareStatement("UPDATE ontology SET genes=? WHERE id=?"); PreparedStatement hpoSelStmt = connection.prepareStatement("SELECT genes FROM ontology where ontology='HPO' AND (id=? OR INSTR(alt_ids, ?))"); Map<String, String[]> hpoGeneCache = new HashMap<String, String[]>(); // OMIM mapping file is ordered by OMIM number, with one HPO term on each line. // When the OMIM number changes we write the accumulated genes out for the given term. BufferedReader reader = new BufferedReader(new FileReader(RemoteFileCache.getCacheFile(omimToHPOData))); String line; String omimTerm = null; String omimGenes = null; while ((line = reader.readLine()) != null) { if (line.startsWith("OMIM")) { String[] fields = line.split("\t"); if (fields.length >= 5 && !fields[3].equals("NOT")) { String newTerm = "MIM:" + fields[1]; if (!newTerm.equals(omimTerm)) { if (omimGenes != null) { connection.executePreparedUpdate(updStmt, omimGenes, omimTerm); omimGenes = null; } omimTerm = newTerm; } String hpoTerm = fields[4]; String[] hpoGenes = hpoGeneCache.get(hpoTerm); if (hpoGenes == null) { ResultSet rs = connection.executePreparedQuery(hpoSelStmt, hpoTerm, hpoTerm); if (rs.next()) { String geneString = rs.getString(1); // pipe-delimited list of genes if (geneString != null) { hpoGenes = geneString.split("\\|"); } else { hpoGenes = new String[0]; } } else { hpoGenes = new String[0]; LOG.info("Unable to find HPO term " + hpoTerm + " for " + omimTerm); } hpoGeneCache.put(hpoTerm, hpoGenes); } for (String g : hpoGenes) { if (g.length() > 0) { g = "|" + g + "|"; if (omimGenes == null) { omimGenes = g; } else if (!omimGenes.contains(g)) { omimGenes += g.substring(1); } } } } } } if (omimGenes != null) { connection.executePreparedUpdate(updStmt, omimGenes, omimTerm); } } finally { if (connection != null) { connection.close(); } } } private void populateTable(String name, Map<String, OntologyTerm> terms) throws SQLException { String backupTableName = null; // Insert records for all the terms. Different prepared statement used depending on whether we have parents or not. PreparedStatement prep4 = connection.prepareStatement(ontologySchema.preparedInsert(ONTOLOGY, ID, NAME, DEF).toString()); PreparedStatement prep5a = connection.prepareStatement(ontologySchema.preparedInsert(ONTOLOGY, ID, NAME, DEF, ALT_IDS).toString()); PreparedStatement prep5b = connection.prepareStatement(ontologySchema.preparedInsert(ONTOLOGY, ID, NAME, DEF, PARENTS).toString()); PreparedStatement prep6 = connection.prepareStatement(ontologySchema.preparedInsert(ONTOLOGY, ID, NAME, DEF, ALT_IDS, PARENTS).toString()); int mostAltIDs = 1; for (OntologyTerm t : terms.values()) { PreparedStatement prep; if (t.getAltIDs().length > 0) { if (t.getParentIDs().length > 0) { prep = prep6; prep.setString(6, StringUtils.join(t.getParentIDs(), ',')); } else { prep = prep5a; } prep.setString(5, StringUtils.join(t.getAltIDs(), ',')); if (t.getAltIDs().length > mostAltIDs) { mostAltIDs = t.getAltIDs().length; LOG.info(t.getID() + " had " + mostAltIDs + " alt_ids."); // For debug purposes. } } else if (t.getParentIDs().length > 0) { prep = prep5b; prep.setString(5, StringUtils.join(t.getParentIDs(), ',')); } else { prep = prep4; } prep.setString(1, name); prep.setString(2, t.getID()); prep.setString(3, t.getName()); prep.setString(4, t.getDef()); prep.executeUpdate(); } LOG.debug(String.format("Inserted %d records.", terms.size())); // If we got here, drop the backup table. if (backupTableName != null) { connection.executeUpdate("DROP TABLE ontology_back"); LOG.debug("Dropped ontology_back."); } } private static OntologyTerm findTermByID(OntologyTerm[] terms, String termID) { for (OntologyTerm t : terms) { if (t.getID().equals(termID)) { return t; } } return null; } /** * Called from <code>createDatabase()</code> to create all the ontology * tables on a BLOCKING background thread. * * @param sessID */ public void populate(final String sessID) { try { MedSavantServerEngine.submitShortJob(new MedSavantServerJob(SessionManager.getInstance().getUserForSession(sessID), "Ontology Populator", null) { @Override public boolean run() { try { LOG.info("dbname for connection: " + ConnectionController.getDBName(sessID)); LOG.info("Adding GO Ontology"); addOntology(sessID, OntologyType.GO.toString(), OntologyType.GO, WebResources.GO_OBO_URL, WebResources.GO_TO_GENES_URL); LOG.info("Adding HPO Ontology"); addOntology(sessID, OntologyType.HPO.toString(), OntologyType.HPO, WebResources.HPO_OBO_URL, WebResources.HPO_TO_GENES_URL); LOG.info("Adding OMIM Ontology"); addOntology(sessID, OntologyType.OMIM.toString(), OntologyType.OMIM, WebResources.OMIM_OBO_URL, WebResources.OMIM_TO_HPO_URL); SessionManager.getInstance().unregisterSession(sessID); return true; } catch (Exception ex) { LOG.error("Error populating ontology tables.", ex); return false; } } }); } catch (Exception ex) { LOG.error("Error populating ontology tables.", ex); } } }