package gov.nih.ncgc.bard.resourcemgr.extresource.go; import gov.nih.ncgc.bard.resourcemgr.BardDBUtil; import gov.nih.ncgc.bard.resourcemgr.BardExtResourceLoader; import gov.nih.ncgc.bard.resourcemgr.BardExternalResource; import gov.nih.ncgc.bard.resourcemgr.IBardExtResourceLoader; import gov.nih.ncgc.bard.resourcemgr.util.BardResourceFetch; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.logging.Logger; /** * BardGOAssociationLoader updates or creates a new go_association table that links uniprot accessions * to go terms for a subset of annotated species. * * @author braistedjc * */ public class BardGoAssociationLoader extends BardExtResourceLoader implements IBardExtResourceLoader { private static Logger logger = Logger.getLogger(BardGoAssociationLoader.class.getName()); private String inputFile; private String dbURL; private String driverName = "com.mysql.jdbc.Driver"; private String sqlInsertGoAssociation = "insert into go_association values (?,?,?,?,?,?,?,?,?,?)"; private String sqlInsertTempGoAssociation = "insert into temp_go_association values (?,?,?,?,?,?,?,?,?,?)"; private String sqlCreateTempGoAssocation = "create table if not exists temp_go_association like go_association"; private Connection conn; private long loadCnt; public BardGoAssociationLoader() { } public BardGoAssociationLoader(String inputFile, String dbURL, String driverStr) { this.inputFile = inputFile; this.dbURL = dbURL; driverName = driverStr; } public boolean load() { boolean loaded = false; if(service.getServiceKey().contains("GO-ASSOCIAION-REFRESH")) { loaded = loadAssociation(); } else if(service.getServiceKey().contains("GO-ONTOLOGY-REFRESH")){ log.info("Starting Service: "+service.getServiceKey()); loaded = updateGoTermTables(); } return loaded; } public boolean loadAssociation() { boolean loaded = false; boolean haveFiles = false; haveFiles = fetchGOHTTPAssociationResources(); long assocCnt = 0; loadCnt = 0; try { conn = BardDBUtil.connect(service.getDbURL()); conn.setAutoCommit(false); //get current table size assocCnt = BardDBUtil.getTableRowCount("go_association", service.getDbURL()); //create temp table that is empty BardDBUtil.cloneTableStructure("go_association", "temp_go_association", service.getDbURL()); //need to load any fetched .gz files String baseGoDir = service.getLocalResPath(); File goDir = new File(baseGoDir); String [] fileList = goDir.list(); String decompFileName; for(String fileName: fileList) { if(fileName.endsWith(".gz")) { decompFileName = fileName.replace(".gz", ""); log.info("unzipping file:"+fileName); //gunzip BardResourceFetch.gunzipFile(baseGoDir+"/"+fileName, baseGoDir+"/"+decompFileName); log.info("Loading go association file:"+decompFileName); //process file loadTempGoAssociation(baseGoDir+"/"+decompFileName); log.info("Finished load:"+decompFileName); } } //get final count in temp table compared to go_association assocCnt = BardDBUtil.getTableRowCount("temp_go_association", service.getDbURL()) - assocCnt; log.info("Finshed load into temp_go_association. Assoc count="+assocCnt); //swap tables from temp to production and back BardDBUtil.swapTempTableToProductionIfPassesSizeDelta("temp_go_association", "go_association", 0.90, service.getDbURL()); conn.close(); loaded = true; } catch (SQLException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return loaded; } private boolean fetchGOHTTPAssociationResources() { boolean haveFiles = false; //clear this scratch area //clearBardScratch(destPath); ArrayList <BardExternalResource> resourceList = service.getExtResources(); String goRemoteHTTPServer; String goRemoteFtpAssocDir; String goLocalScratchDir; String resourceFileName; String goURL; for(BardExternalResource resource : resourceList) { goRemoteHTTPServer = resource.getResourceServer(); goRemoteFtpAssocDir = resource.getResourcePath(); goLocalScratchDir = service.getLocalResPath(); resourceFileName = resource.getFileName(); //need to construct the external url and fetch resource goURL = "http://"+goRemoteHTTPServer+"/"+ goRemoteFtpAssocDir+"/"+resourceFileName; try { BardResourceFetch.getHttpFile(goURL, goLocalScratchDir+"/"+resourceFileName); log.info("Have GO Association Resource:"+goURL); } catch (IOException e) { log.warning("Could not retrieve GO Association Resource:"+goURL); e.printStackTrace(); //on any failure, return false, missing a file. haveFiles = false; continue; } } return haveFiles; } private boolean fetchGoOntologyTableResources() { boolean haveFiles = false; ArrayList <BardExternalResource> resourceList = service.getExtResources(); String httpServer, httpPath, fileName; String destDir = service.getLocalResPath(); for(BardExternalResource resource : resourceList) { httpServer = resource.getResourceServer(); httpPath = resource.getResourcePath(); fileName = resource.getFileName(); try { BardResourceFetch.getHttpFile("http://"+httpServer+httpPath+"/"+fileName, destDir+"/"+fileName); log.info("Have GO Table tar.gz"); haveFiles = true; } catch (IOException e) { e.printStackTrace(); log.warning("Could not retrieve GO Table tar.gz"); haveFiles = false; } } return haveFiles; } private boolean updateGoTermTables() { boolean updated = true; boolean haveFiles = fetchGoOntologyTableResources(); if(haveFiles) { String goTermPath = service.getLocalResPath(); ArrayList <BardExternalResource> resources = service.getExtResources(); if(resources.size() > 0) { String goTermTarGZIP = resources.get(0).getFileName(); String goTermTar = goTermTarGZIP.replace(".gz", ""); try { log.info("unzipping and untaring the go term table archive"); //have the files, gunzip and untar BardResourceFetch.gunzipFile(goTermPath+"/"+goTermTarGZIP, goTermPath+"/"+goTermTar); BardResourceFetch.untarFile(goTermPath+"/"+goTermTar); //get the single directory present File dir = new File(goTermPath); String [] files = dir.list(); String contentDir = ""; for(String fileName : files) { if(!fileName.endsWith(".gz") && !fileName.endsWith(".tar")) contentDir = fileName; } contentDir = goTermPath + "/" + contentDir; Connection conn = BardDBUtil.connect(service.getDbURL()); Statement stmt = conn.createStatement(); stmt.execute("create table if not exists term like go_term"); stmt.execute("truncate table term"); stmt.execute("load data infile \'"+contentDir+"/term.txt"+"\' into table term"); stmt.execute("create table if not exists term2term like go_term2term"); stmt.execute("truncate table term2term"); stmt.execute("load data infile \'"+contentDir+"/term2term.txt"+"\' into table term2term"); log.info("Created and loaded temp tables, starting table swap."); //swap temp and productin if the size is > 98% of previous, some terms may be lost but not many. BardDBUtil.swapTempTableToProductionIfPassesSizeDelta("term", "go_term", 0.90, service.getDbURL()); BardDBUtil.swapTempTableToProductionIfPassesSizeDelta("term2term", "go_term2term", 0.90, service.getDbURL()); log.info("COMPLETE: New go term and term2term tables are in production"); conn.close(); } catch (FileNotFoundException e) { e.printStackTrace(); return false; } catch (IOException e) { e.printStackTrace(); return false; } catch (SQLException e) { e.printStackTrace(); return false; } catch (ClassNotFoundException e) { e.printStackTrace(); return false; } } } else { updated = false; } return updated; } private long loadTempGoAssociation(String file) { long entryCnt = 0; try { inputFile = file; BufferedReader br = new BufferedReader(new FileReader(inputFile)); String line = ""; PreparedStatement ps = conn.prepareStatement(sqlInsertTempGoAssociation); String [] toks; while((line = br.readLine()) != null ) { if(line.startsWith("!")) continue; toks = line.split("\t"); if(toks.length < 15) continue; ps.setInt(1, 0); //source ps.setString(2, toks[0].trim()); //accession ps.setString(3, toks[1].trim()); //common_name ps.setString(4, toks[9].trim()); //taxon toks[12] = toks[12].substring(toks[12].indexOf(":")+1); if(toks[12].contains("|")) toks[12] = toks[12].substring(0, toks[12].indexOf('|')); //System.out.println("taxid=**"+toks[12]+"**"); ps.setString(5, toks[12].trim()); //term ps.setString(6, toks[4].trim()); // term type ps.setString(7, toks[8].trim()); //evidence ps.setString(8, toks[6].trim()); //db_ref ps.setString(9, toks[5].trim()); //association date ps.setString(10, toks[13].trim()); ps.addBatch(); loadCnt++; if(loadCnt % 1000 == 0) { ps.executeBatch(); conn.commit(); logger.info("load count="+loadCnt); } entryCnt++; } //finish current job ps.executeBatch(); conn.commit(); ps.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException ioe) { // TODO Auto-generated catch block ioe.printStackTrace(); } catch (SQLException sqle) { // TODO Auto-generated catch block sqle.printStackTrace(); } return entryCnt; } private int getTermIdInt(String goTermId) { int val = 0; goTermId = goTermId.trim(); if(!goTermId.startsWith("GO:")) return -1; goTermId = goTermId.substring(goTermId.indexOf(':')+1); return Integer.parseInt(goTermId); } public static void main(String [] args) { //file, url, driver BardGoAssociationLoader gal = new BardGoAssociationLoader(args[0], args[1], args[2]); System.out.println("load cnt="+gal.load()); } @Override public String getLoadStatusReport() { // TODO Auto-generated method stub return null; } }