package org.seqcode.data.motifdb;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.sql.*;
import java.text.ParseException;
import org.seqcode.data.connections.DatabaseConnectionManager;
import org.seqcode.data.connections.DatabaseException;
import org.seqcode.data.connections.UnknownRoleException;
import org.seqcode.data.io.parsing.PWMParser;
import org.seqcode.genome.Species;
import org.seqcode.gseutils.*;
/**
* Usage:
* java org.seqcode.data.motifdb.WeightMatrixImport --species "Saccharomyces cerevisiae" --wmname HSF1 --wmversion MacIsaac06 --wmtype TAMO --wmfile v1.HSF1.wm
*/
public class WeightMatrixImport {
private static int MAX_MOTIF_LEN = 200;
public static void main(String args[]) {
String species = null;
String wmname = null, wmversion = null, wmtype = null;
String wmfile = null;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("--species")) {
species = args[++i];
if (species.indexOf(';') != -1) {
String[] pieces = species.split(";");
species = pieces[0];
}
}
if (args[i].equals("--wmname")) {
wmname = args[++i];
if (wmname.indexOf(';') != -1) {
String[] pieces = wmname.split(";");
wmname = pieces[0];
wmversion = pieces[1];
if (pieces.length >= 3) {
wmtype = pieces[2];
}
}
}
if (args[i].equals("--wmversion")) {
wmversion = args[++i];
}
if (args[i].equals("--wmtype")) {
wmtype = args[++i];
}
if (args[i].equals("--") ||
args[i].equals("--wmfile")) {
wmfile = args[++i];
}
}
// if (species == null) {
// System.err.println("Must supply a --species"); System.exit(1);
// }
if (wmfile == null) {
System.err.println("Must supply a --wmfile"); System.exit(1);
}
try {
if(wmname==null) {
System.err.println("Reading multiple motifs from file " + wmfile);
insertMultiWMFromFile(species,wmtype,wmfile, wmversion);
} else {
if (wmversion == null) {
System.err.println("Must supply a --wmversion"); System.exit(1);
}
insertWMFromFile(species,wmname,wmversion,wmtype,wmfile);
}
} catch (SQLException ex) {
ex.printStackTrace();
} catch (NotFoundException ex) {
ex.printStackTrace();
System.err.println("Must supply a valid species and genome");
} catch (UnknownRoleException ex ){
ex.printStackTrace();
System.err.println("Couldn't connect to role annotations");
} catch (FileNotFoundException ex) {
ex.printStackTrace();
System.err.println("Couldn't find the input file");
} catch (ParseException ex) {
ex.printStackTrace();
} catch (IOException ex) {
ex.printStackTrace();
}
}
public static int insertMatrixIntoDB(WeightMatrix matrix)
throws SQLException, NotFoundException {
java.sql.Connection cxn =DatabaseConnectionManager.getConnection("annotations");
int wmid = -1;
PreparedStatement exists = cxn.prepareStatement("select id from weightmatrix where species = ? and name = ? and version = ? and type = ?");
exists.setInt(1,matrix.speciesid);
exists.setString(2,matrix.name);
exists.setString(3,matrix.version);
exists.setString(4,matrix.type);
ResultSet rs = exists.executeQuery();
if (!rs.next()) {
rs.close();
PreparedStatement insertwm;
if (matrix.bgMapID != -1) {
insertwm = cxn.prepareStatement("insert into weightmatrix(id,species,name,version,type,bg_model_map_id) values (weightmatrix_id.nextval,?,?,?,?,?)");
insertwm.setInt(5, matrix.bgMapID);
}
else {
insertwm = cxn.prepareStatement("insert into weightmatrix(id,species,name,version,type) values (weightmatrix_id.nextval,?,?,?,?)");
}
insertwm.setInt(1,matrix.speciesid);
insertwm.setString(2,matrix.name);
insertwm.setString(3,matrix.version);
insertwm.setString(4,matrix.type);
// System.err.println(String.format("Inserting %s %s %s", matrix.name, matrix.version, matrix.type));
insertwm.execute();
insertwm.close();
PreparedStatement getId = cxn.prepareStatement("select weightmatrix_id.currval from dual");
rs = getId.executeQuery();
if (rs.next()) {
wmid = rs.getInt(1);
} else {
System.err.println("No weight matrix id");
System.exit(1);
}
rs.close();
getId.close();
PreparedStatement insertWMSM = cxn.prepareStatement("insert into weightmatrix_species_map(id, species_id, wm_id) values (weightmatrix_id.nextval, ?, ?)");
insertWMSM.setInt(1, matrix.speciesid);
insertWMSM.setInt(2, wmid);
insertWMSM.execute();
insertWMSM.close();
} else {
wmid = rs.getInt(1);
PreparedStatement deleteold = cxn.prepareStatement("delete from weightmatrixcols where weightmatrix = ?");
deleteold.setInt(1,wmid);
deleteold.execute();
deleteold.close();
}
rs.close();
exists.close();
PreparedStatement insertcol = cxn.prepareStatement("insert into weightmatrixcols(weightmatrix,position,letter,weight) values(?,?,?,?)");
insertcol.setInt(1,wmid);
for (int col = 0; col < matrix.length(); col++) {
// System.err.println(String.format(" Column %d %f %f %f %f", col,
// matrix.matrix[col]['A'],
// matrix.matrix[col]['C'],
// matrix.matrix[col]['T'],
// matrix.matrix[col]['G']));
insertcol.setInt(2,col);
insertcol.setString(3,"A");
insertcol.setFloat(4,matrix.matrix[col]['A']);
insertcol.execute();
insertcol.setString(3,"C");
insertcol.setFloat(4,matrix.matrix[col]['C']);
insertcol.execute();
insertcol.setString(3,"T");
insertcol.setFloat(4,matrix.matrix[col]['T']);
insertcol.execute();
insertcol.setString(3,"G");
insertcol.setFloat(4,matrix.matrix[col]['G']);
insertcol.execute();
}
insertcol.close();
cxn.commit();
if(cxn!=null) try {cxn.close();}catch (Exception ex) {throw new DatabaseException("Couldn't close connection with role annotations", ex); }
return wmid;
}
/**
* Parses in weight matrices in transfac format with counts or frequencies
* (not log odds)
*
* @param wmfile
* @param version
* @return
* @throws IOException
*/
public static List<WeightMatrix> readTRANSFACFreqMatrices(String wmfile, String version) throws IOException {
int[] indices = { 'A', 'C', 'G', 'T' };
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
BufferedReader br = new BufferedReader(new FileReader(new File(wmfile)));
String line;
WeightMatrix matrix = null;
int motifCount = 0;
Vector<float[]> arrays = new Vector<float[]>();
// Read in Transfac format first
Species currentSpecies = null;
String name = null, id = null, accession = null;
Pattern speciesPattern = Pattern.compile(".*Species:.*, (.*)\\.");
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
String[] pieces = line.split("\\s+");
if (pieces[0].equals("AC")) {
accession = pieces[1];
// System.err.println("read AC " + accession);
}
else if (pieces[0].equals("NA")) {
name = pieces[1];
// System.err.println("read NA " + name);
}
else if (pieces[0].equals("ID")) {
id = pieces[1];
// System.err.println("read ID " + id);
arrays.clear();
name = null;
accession = null;
currentSpecies = null;
}
else if (pieces[0].equals("BF") && currentSpecies == null) {
Matcher matcher = speciesPattern.matcher(line);
if (matcher.matches()) {
String specname = matcher.group(1);
try {
currentSpecies = new Species(specname);
// System.err.println("Got species " + specname);
}
catch (NotFoundException e) {
System.err.println("Couldn't find species " + specname);
// ignore it and move on
}
}
}
else if (pieces[0].equals("DE")) {
name = pieces[1];
if (pieces.length >= 3) {
String v_string = pieces[2];
if (pieces.length >= 4) {
for (int v = 3; v < pieces.length; v++) {
v_string = v_string + "," + pieces[v];
}
}
if (version != null) {
version = v_string + "," + version;
}
else {
version = v_string;
}
}
//initialize id and accession if they're still null
if (id == null) {
id = "";
}
if (accession == null) {
accession = "";
}
}
else if (pieces[0].equals("//")||pieces[0].equals("XX")) {
if (name != null && accession != null && id != null && arrays.size() > 0) {
matrix = new WeightMatrix(arrays.size());
for (int i = 0; i < arrays.size(); i++) {
matrix.matrix[i]['A'] = arrays.get(i)[0];
matrix.matrix[i]['C'] = arrays.get(i)[1];
matrix.matrix[i]['G'] = arrays.get(i)[2];
matrix.matrix[i]['T'] = arrays.get(i)[3];
}
matrix.name = name;
matrix.version = version;
if (id.length() > 0) {
matrix.version = matrix.version + " " + id;
}
if (accession.length() > 0) {
matrix.version = matrix.version + " " + accession;
}
// System.err.println("read version " + matrix.version);
if (currentSpecies != null) {
matrix.speciesid = currentSpecies.getDBID();
matrix.species = currentSpecies.getName();
}
matrix.type = "TRANSFAC";
matrix.normalizeFrequencies();
matrices.add(matrix);
// clean up to prepare to parse next pwm
arrays.clear();
name = null;
id = null;
accession = null;
currentSpecies = null;
}
else {
// System.err.println(String.format("name %s id %s species %s",name,id,currentSpecies
// == null ? "null" : currentSpecies.toString()));
}
}
else if (name != null && (pieces.length == 5 || pieces.length == 6) && Character.isDigit(pieces[0].charAt(0))) {
// Load the matrix
float[] fa = new float[4];
// System.err.println(" adding matrix line");
for (int i = 1; i <= 4; i++) {
fa[i - 1] = Float.parseFloat(pieces[i]);
}
arrays.add(fa);
}
}
}
br.close();
return matrices;
}
public static Set<Integer> insertMultiWMFromFile(String species, String wmtype, String wmfile, String wmversion)
throws IOException, SQLException, NotFoundException {
HashSet<Integer> ids = new HashSet<Integer>();
List<WeightMatrix> matrices = null;
System.err.println("type is " + wmtype+ " and file is " + wmfile + " and version is " + wmversion);
if(wmtype.matches(".*TAMO.*")) {
int speciesid = (new Species(species)).getDBID();
matrices = PWMParser.readTamoMatrices(wmfile);
for(WeightMatrix matrix : matrices) {
matrix.speciesid = speciesid;
}
} else if (wmtype.matches(".*TRANSFAC.*")) {
matrices = PWMParser.readTRANSFACFreqMatrices(wmfile, wmversion);
} else if (wmtype.matches(".*JASPAR.*")) {
matrices = PWMParser.readJASPARFreqMatrices(wmfile, wmversion);
} else if (wmtype.toUpperCase().matches(".*GIMME.*")) {
matrices = PWMParser.readGimmeMotifsMatrices(wmfile,wmversion,wmtype);
} else {
throw new NotFoundException("Unknown weight matrix type " + wmtype);
}
for(WeightMatrix matrix : matrices) {
ids.add(insertMatrixIntoDB(matrix));
}
return ids;
}
/* reads a weight matrix from the specified file,
inserts it into the database with the specified name and
version, and returns its dbid. This means that the file may contain
only a single weight matrix*/
public static int insertWMFromFile(String species,
String wmname,
String wmversion,
String wmtype,
String wmfile) throws SQLException, NotFoundException, UnknownRoleException, FileNotFoundException, ParseException, IOException {
WeightMatrix matrix;
System.err.println("name " + wmname + " version " + wmversion + " type " + wmtype);
if (wmtype.matches(".*TAMO.*")) {
matrix = PWMParser.readTamoMatrix(wmfile);
} else if (wmtype.matches(".*MEME.*")) {
matrix = PWMParser.readMemeMatrix(wmfile);
} else if (wmtype.matches(".*SEQ.*")) {
matrix = PWMParser.readAlignedSequenceMatrix(wmfile);
} else if (wmtype.matches(".*TRANSFAC.*")) {
//TODO add a method to read a single transfac matrix
matrix = PWMParser.readTRANSFACFreqMatrices(wmfile, wmversion).get(0);
} else if (wmtype.matches(".*PRIORITY.*")) {
matrix = PWMParser.parsePriorityBestOutput(wmfile);
} else if (wmtype.matches(".*UniProbe.*")) {
matrix = PWMParser.readUniProbeFile(wmfile);
} else if (wmtype.toUpperCase().matches(".*GIMME.*")) {
matrix = PWMParser.readGimmeMotifsMatrices(wmfile,wmversion,wmtype).get(0);
} else {
System.err.println("Didn't see a program I recognize in the type. defaulting to reading TAMO format");
matrix = PWMParser.readTamoMatrix(wmfile);
}
matrix.name = wmname;
matrix.version = wmversion;
matrix.type = wmtype;
matrix.speciesid = (new Species(species)).getDBID();
return insertMatrixIntoDB(matrix);
}
/* reads a weight matrix from the specified file,
inserts it into the database with the specified name and
version, and returns its dbid. This means that the file may contain
only a single weight matrix*/
public static int insertWMFromFile(String species, String wmname, String wmversion, String wmtype, String wmfile, String bgFreq)
throws SQLException, NotFoundException, UnknownRoleException, FileNotFoundException, ParseException, IOException {
WeightMatrix matrix;
if (wmtype.matches(".*TAMO.*")) {
matrix = PWMParser.readTamoMatrix(wmfile);
} else if (wmtype.matches(".*MEME.*")) {
matrix = PWMParser.readMemeMatrix(wmfile);
} else if (wmtype.matches(".*SEQ.*")) {
matrix = PWMParser.readAlignedSequenceMatrix(wmfile);
} else if (wmtype.matches(".*TRANSFAC.*")) {
//TODO add a method to read a single transfac matrix
matrix = PWMParser.readTRANSFACFreqMatrices(wmfile, wmversion).get(0);
} else if (wmtype.matches(".*PRIORITY.*")) {
matrix = PWMParser.parsePriorityBestOutput(wmfile);
} else if (wmtype.toUpperCase().matches(".*GIMME.*")) {
matrix = PWMParser.readGimmeMotifsMatrices(wmfile,wmversion,wmtype).get(0);
} else {
System.err.println("Didn't see a program I recognize in the type. defaulting to reading TAMO format");
matrix = PWMParser.readTamoMatrix(wmfile);
}
matrix.name = wmname;
matrix.version = wmversion;
matrix.type = wmtype;
matrix.speciesid = (new Species(species)).getDBID();
return insertMatrixIntoDB(matrix);
}
/**
* Constructs a matrix from a set of strings. The strings must all have the same length.
* The WeightMatrix returned has the frequencies of the bases at each position given.
*
* @param strings
* @return
* @throws IOException
* @throws ParseException
*/
public static WeightMatrix buildAlignedSequenceMatrix(Collection<String> strings) throws ParseException {
WeightMatrix wm = null;
int[] counts = null;
for(String line : strings) {
line = line.trim().toUpperCase();
if(line.length() > 0) {
if(wm == null) {
wm = new WeightMatrix(line.length());
counts = new int[line.length()];
for(int i = 0; i < wm.length(); i++) {
counts[i] = 0;
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] = (float)0.0;
}
}
}
if(line.length() != wm.length()) {
throw new ParseException("Line \"" + line + "\" was of uneven length (" +
wm.length() + ")", 0);
}
for(int i = 0; i < line.length(); i++) {
char c = line.charAt(i);
if(c != 'N' && c != '-') {
wm.matrix[i][c] += (float)1.0;
counts[i] += 1;
}
}
}
}
for(int i = 0; wm != null && i < wm.length(); i++) {
if(counts[i] > 0) {
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] /= (float)counts[i];
}
} else {
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] = (float)1.0 / (float)(wm.matrix[i].length);
}
}
}
return wm;
}
}