package gov.nih.ncgc.bard.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.zip.GZIPInputStream;

/**
 * Download latest Uniprot text dump and generate an Oracle SQL loading file.
 * <p>
 * The generated file starts with a loader header ({@code load data ... begindata})
 * followed by one tab-separated row per UniProt entry, in the column order
 * {@code accession, gene_id, name, taxid, description, uniprot_status}.
 *
 * @author Rajarshi Guha
 */
public class PopulateTargets {

    /** Location of the gzip-compressed Swiss-Prot flat-file dump. */
    private static final String UNIPROT_URL =
            "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz";

    /** Loader header written before the data rows. */
    private static final String LOADER_HEADER = "load data\n"
            + "infile *\n"
            + "append\n"
            + "into table protein_target\n"
            + "fields terminated by '\\t'\n"
            + "trailing nullcols\n"
            + "(accession, gene_id, name, taxid, description, uniprot_status)\n"
            + "begindata\n";

    /** Path of the file to generate; defaults to {@code uniprot.sql}. */
    String ofilename = "uniprot.sql";

    /**
     * Creates a loader-file generator.
     *
     * @param ofilename path of the output file, or {@code null} to keep the
     *                  default {@code uniprot.sql}
     */
    public PopulateTargets(String ofilename) {
        if (ofilename != null) {
            this.ofilename = ofilename;
        }
    }

    /**
     * Downloads the UniProt dump, extracts one row per entry and writes the
     * loading file to {@link #ofilename}.
     * <p>
     * The output file is created (and the header written) before the download
     * starts. Both the output writer and the network stream are closed when the
     * method returns, whether normally or through an exception, so buffered
     * rows are always flushed to disk.
     *
     * @throws IOException if the output file cannot be written or the dump
     *                     cannot be downloaded or decompressed
     */
    public void run() throws IOException {
        // Honour the configured output name (previously the default was always used).
        BufferedWriter writer = new BufferedWriter(new FileWriter(ofilename));
        try {
            writer.write(LOADER_HEADER);

            InputStream rawStream = new URL(UNIPROT_URL).openStream();
            try {
                InputStream gzipStream = new GZIPInputStream(rawStream);
                Reader reader = new InputStreamReader(gzipStream);
                BufferedReader breader = new BufferedReader(reader);
                try {
                    writeEntries(breader, writer);
                } finally {
                    breader.close();
                }
            } finally {
                // Also covers the case where the gzip wrapper could not be built;
                // closing an already-closed stream is harmless.
                rawStream.close();
            }
        } finally {
            // Closing flushes the buffer; without it the last rows can be lost.
            writer.close();
        }
        // Terminate the "\r"-style progress line.
        System.out.println();
    }

    /**
     * Reads flat-file entries from {@code breader} and writes one tab-separated
     * row per entry to {@code writer}. Entries are terminated by a line
     * containing only {@code //}; trailing lines without a terminator are
     * discarded.
     */
    private static void writeEntries(BufferedReader breader, BufferedWriter writer) throws IOException {
        StringBuilder entry = new StringBuilder();
        int n = 0;
        String line;
        while ((line = breader.readLine()) != null) {
            if (!line.trim().equals("//")) {
                entry.append(line).append("\n");
                continue;
            }
            String[] fields = parseEntry(entry.toString().split("\n"));
            writer.write(Util.join(fields, "\t") + "\n");
            entry.setLength(0);
            n++;
            if (n % 100 == 0) {
                System.out.print("\rProcessed " + n + " entries");
            }
        }
    }

    /**
     * Extracts the output columns from the lines of a single entry.
     * <p>
     * Each line is identified by its two-character line code; the remainder of
     * the line is trimmed before parsing, so the amount of padding between the
     * code and the content does not matter. Lines that are too short or lack
     * the expected separators are ignored instead of aborting the run.
     *
     * @param lines the lines of one entry, without the {@code //} terminator
     * @return {@code {accession, geneId, name, taxId, description, status}};
     *         fields that were not found are empty strings (the description is
     *         always empty)
     */
    private static String[] parseEntry(String[] lines) {
        String acc = "";
        String status = "";
        String name = "";
        String geneid = "";
        String taxid = "";
        // No description is extracted; the column is emitted empty.
        String desc = "";

        for (int i = 0; i < lines.length; i++) {
            String aline = lines[i];
            if (aline.length() < 2) {
                continue;
            }
            String code = aline.substring(0, 2);
            String content = aline.substring(2).trim();

            if ("ID".equals(code)) {
                // "<entry name> <status>; <length> AA." -> second token is the status.
                String[] toks = content.split("\\s+");
                if (toks.length > 1) {
                    status = toks[1].replace(";", "");
                }
            } else if ("AC".equals(code)) {
                // Keep the first accession of the first AC line (the primary
                // accession); later AC lines only list secondary accessions.
                if (acc.length() == 0) {
                    int semi = content.indexOf(';');
                    acc = (semi >= 0 ? content.substring(0, semi) : content).trim();
                }
            } else if ("DE".equals(code) && content.startsWith("RecName:")) {
                // Only the first RecName is used, so names of indented
                // sub-records that follow do not overwrite it. The limit of 2
                // keeps any '=' inside the name itself.
                String[] parts = content.split("=", 2);
                if (name.length() == 0 && parts.length > 1) {
                    name = parts[1].replace(";", "");
                }
            } else if ("DR".equals(code) && content.startsWith("GeneID;")) {
                // "GeneID; <id>; -." -> second ';'-separated field is the id.
                String[] parts = content.split(";");
                if (parts.length > 1) {
                    geneid = parts[1].trim();
                }
            } else if ("OX".equals(code)) {
                taxid = content.replace("NCBI_TaxID=", "").replace(";", "").trim();
            }
        }
        return new String[]{acc, geneid, name, taxid, desc, status};
    }

    /**
     * Command-line entry point.
     *
     * @param args optionally a single argument: the output file name. With any
     *             other number of arguments the default name is used.
     * @throws IOException if the download or the file generation fails
     */
    public static void main(String[] args) throws IOException {
        String ofilename = null;
        if (args.length == 1) {
            ofilename = args[0];
        }
        PopulateTargets pt = new PopulateTargets(ofilename);
        pt.run();
    }
}