/**
*
*/
package org.seqcode.data.io;
import java.io.IOException;
import java.text.ParseException;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.seqcode.data.motifdb.BackgroundModel;
import org.seqcode.data.motifdb.CountsBackgroundModel;
import org.seqcode.data.motifdb.FrequencyBackgroundModel;
import org.seqcode.data.motifdb.MarkovBackgroundModel;
import org.seqcode.genome.Genome;
import org.seqcode.gseutils.Args;
import org.seqcode.gseutils.NotFoundException;
/**
* @author rca
*
*/
public class BackgroundModelIO {
public static final String BG_LINE_COUNTS_REG_EX = "^([\\d]+)\\s*([ACGTacgt]+)\\s*(\\d+)\\s*";
public static final String BG_LINE_PROBS_REG_EX = "^([\\d]+)\\s*([ACGTacgt]+)\\s*(.+)";
public static final Pattern BG_LINE_COUNTS_PATTERN = Pattern.compile(BG_LINE_COUNTS_REG_EX);
public static final Pattern BG_LINE_PROBS_PATTERN = Pattern.compile(BG_LINE_PROBS_REG_EX);
public static void main(String[] args) {
try {
MarkovBackgroundModel mbg = BackgroundModelIO.parseMarkovBackgroundModel("mm8.back", Genome.findGenome("mm8"));
BackgroundModelIO.printProbsToFile(mbg, "foo.back");
}
catch (NotFoundException nfex) {
nfex.printStackTrace();
}
catch (ParseException pex) {
pex.printStackTrace();
}
catch (IOException ioex) {
ioex.printStackTrace();
}
System.exit(0);
}
/**
* Parse the lines of the background model file and initialize the frequency
* based model
* TODO make integer value header column optional
* @param model
* @param lines
* @throws ParseException
*/
private static void initFreqBackgroundModel(FrequencyBackgroundModel model, String[] lines) throws ParseException {
Matcher bgLineMatcher = BG_LINE_PROBS_PATTERN.matcher("");
int lineIndex = 0;
for (int i = 0; i < model.getMaxKmerLen(); i++) {
HashMap<String, Double> freqs = new HashMap<String, Double>();
for (int j = 0; j < Math.pow(4, i + 1); j++) {
bgLineMatcher.reset(lines[lineIndex]);
if (bgLineMatcher.matches()) {
int intVal = Integer.valueOf(bgLineMatcher.group(1));
String mer = bgLineMatcher.group(2).toUpperCase();
if ((intVal != j) || !mer.equals(BackgroundModel.int2seq(j, (i + 1)))) {
throw new ParseException("Expected index " + j + " and kmer " + BackgroundModel.int2seq(j, (i + 1))
+ ", but got " + lines[lineIndex], 0);
}
double prob = Double.valueOf(bgLineMatcher.group(3));
freqs.put(mer, prob);
lineIndex++;
}
else {
throw new ParseException("Incorrectly formatted line: " + lines[lineIndex], 0);
}
}
model.setKmerFrequencies(freqs);
}
}
/**
* Parse the lines of the background model file and initialize the markov
* model
* TODO make integer value header column optional
* @param model
* @param lines
* @throws ParseException
*/
private static void initMarkovBackgroundModel(MarkovBackgroundModel model, String[] lines) throws ParseException {
// initialize background model object
Matcher bgLineMatcher = BG_LINE_PROBS_PATTERN.matcher("");
int lineIndex = 0;
for (int i = 0; i < model.getMaxKmerLen(); i++) {
for (int j = 0; j < Math.pow(4, i + 1); j += 4) {
double[] probs = new double[4];
for (int k = 0; k < 4; k++) {
bgLineMatcher.reset(lines[lineIndex]);
if (bgLineMatcher.matches()) {
int intVal = Integer.valueOf(bgLineMatcher.group(1));
String mer = bgLineMatcher.group(2).toUpperCase();
if ((intVal != (j+k)) || !mer.equals(BackgroundModel.int2seq((j+k), (i + 1)))) {
throw new ParseException("Expected index " + (j+k) + " and kmer " + BackgroundModel.int2seq((j+k), (i + 1))
+ ", but got " + lines[lineIndex], 0);
}
probs[k] = Double.valueOf(bgLineMatcher.group(3));
lineIndex++;
}
else {
throw new ParseException("Incorrectly formatted line: " + lines[lineIndex], 0);
}
}
model.setMarkovProb(BackgroundModel.int2seq(j, i+1).substring(0, i), probs[0], probs[1], probs[2], probs[3]);
}
}
}
/**
* Parse the lines of the background model file and initialize the count
* based model
* TODO make integer value header column optional
* @param model
* @param lines
* @throws ParseException
*/
private static void initCountsBackgroundModel(CountsBackgroundModel model, String[] lines) throws ParseException {
// initialize background model object
Matcher bgLineMatcher = BG_LINE_COUNTS_PATTERN.matcher("");
int lineIndex = 0;
for (int i = 0; i < model.getMaxKmerLen(); i++) {
for (int j = 0; j < Math.pow(4, i + 1); j++) {
bgLineMatcher.reset(lines[lineIndex]);
if (bgLineMatcher.matches()) {
int intVal = Integer.valueOf(bgLineMatcher.group(1));
String mer = bgLineMatcher.group(2).toUpperCase();
if ((intVal != j) || !mer.equals(BackgroundModel.int2seq(j, (i + 1)))) {
throw new ParseException("Expected index " + j + " and kmer " + BackgroundModel.int2seq(j, (i + 1))
+ ", but got " + lines[lineIndex], 0);
}
int count = Integer.valueOf(bgLineMatcher.group(3));
model.setKmerCount(mer, count);
lineIndex++;
}
else {
throw new ParseException("Incorrectly formatted line: " + lines[lineIndex], 0);
}
}
}
}
/**
* Checks the number of lines to make sure there are a proper number for all
* kmers up to the determined maximum length
* @param lines
* @return
* @throws ParseException
*/
private static int checkModelMaxKmerLen(String[] lines) throws ParseException {
// determine the max kmer length for model and check that the number of lines is valid
int maxKmerLen = 1;
int numLines = 4;
while (numLines < lines.length) {
maxKmerLen++;
numLines = numLines + (int) Math.pow(4, maxKmerLen);
}
if (numLines != lines.length) {
throw new ParseException("Parsed " + lines.length
+ " non-comment lines, but number of lines must be 4 + 4^2 + 4^3 + ... + 4^(k+1)", 0);
}
return maxKmerLen;
}
/**
* @see parseCountsBackgroundModel(String modelName, String filename, Genome gen)
* This version uses the filename as the name for the model
* @param filename
* @return
* @throws IOException
* @throws ParseException
*/
public static CountsBackgroundModel parseCountsBackgroundModel(String filename, Genome gen) throws IOException, ParseException {
return BackgroundModelIO.parseCountsBackgroundModel(filename, filename, gen);
}
/**
*
* @param filename
* @param gen
* @return
* @throws IOException
* @throws ParseException
*/
public static CountsBackgroundModel parseCountsBackgroundModel(String modelName, String filename, Genome gen) throws IOException, ParseException {
String[] lines = LineByLineFileReader.readFile(filename, LineByLineFileReader.DEFAULT_COMMENT_PREFIXES, true);
int maxKmerLen = BackgroundModelIO.checkModelMaxKmerLen(lines);
// construct the Counts Background Model object of that order
CountsBackgroundModel bgModel = new CountsBackgroundModel(modelName, gen, maxKmerLen);
// initialize the model
BackgroundModelIO.initCountsBackgroundModel(bgModel, lines);
return bgModel;
}
/**
* @see parseMarkovBackgroundModel(String modelName, String filename, Genome gen)
* This version uses the filename as the name for the model
* @param filename
* @return
* @throws IOException
* @throws ParseException
*/
public static MarkovBackgroundModel parseMarkovBackgroundModel(String filename, Genome gen) throws IOException, ParseException {
return BackgroundModelIO.parseMarkovBackgroundModel(filename, filename, gen);
}
/**
*
* @param filename
* @return
* @throws IOException
* @throws ParseException
*/
public static MarkovBackgroundModel parseMarkovBackgroundModel(String modelName, String filename, Genome gen) throws IOException, ParseException {
String[] lines = LineByLineFileReader.readFile(filename, LineByLineFileReader.DEFAULT_COMMENT_PREFIXES, true);
int maxKmerLen = BackgroundModelIO.checkModelMaxKmerLen(lines);
/**
* construct the Markov Background Model object for that kmer length
* Note: the markov order will be maxKmerLen - 1
*/
MarkovBackgroundModel bgModel = new MarkovBackgroundModel(modelName, gen, maxKmerLen);
// initialize the model
BackgroundModelIO.initMarkovBackgroundModel(bgModel, lines);
String[] normKmers = bgModel.verifyNormalization();
if (normKmers != null) {
throw new ParseException("Model is not normalized for kmers: " + normKmers[0] + "," + normKmers[1] + ","
+ normKmers[2] + "," + normKmers[3], -1);
}
return bgModel;
}
/**
* @see parseFrequencyBackgroundModel(String modelName, String filename, Genome gen)
* This version uses the filename as the name for the model
* @param filename
* @return
* @throws IOException
* @throws ParseException
*/
public static FrequencyBackgroundModel parseFrequencyBackgroundModel(String filename, Genome gen) throws IOException, ParseException {
return BackgroundModelIO.parseFrequencyBackgroundModel(filename, filename, gen);
}
/**
*
* @param filename
* @return
* @throws IOException
* @throws ParseException
*/
public static FrequencyBackgroundModel parseFrequencyBackgroundModel(String modelName, String filename, Genome gen) throws IOException, ParseException {
String[] lines = LineByLineFileReader.readFile(filename, LineByLineFileReader.DEFAULT_COMMENT_PREFIXES, true);
int maxKmerLen = BackgroundModelIO.checkModelMaxKmerLen(lines);
// construct the Frequency Background Model object for that kmer length
FrequencyBackgroundModel bgModel = new FrequencyBackgroundModel(modelName, gen, maxKmerLen);
// initialize the model
BackgroundModelIO.initFreqBackgroundModel(bgModel, lines);
int normKmers = bgModel.verifyNormalization();
if (normKmers != -1) {
throw new ParseException("Model is not normalized for kmers of length " + normKmers, -1);
}
return bgModel;
}
/**
* Write out the background model to a file
* @param bgModel
* @param filename
* @throws IOException
*/
public static void printProbsToFile(BackgroundModel bgModel, String filename) throws IOException {
LineByLineFileWriter lblfw = null;
try {
lblfw = new LineByLineFileWriter();
lblfw.openFile(filename);
for (int i = 1; i <= bgModel.getMaxKmerLen(); i++) {
for (int j = 0; j < Math.pow(4, i); j++) {
String currKmer = BackgroundModel.int2seq(j, i);
if (bgModel instanceof MarkovBackgroundModel) {
lblfw.writeLine(j + "\t" + currKmer + "\t" + bgModel.getMarkovProb(j, i));
}
else if (bgModel instanceof FrequencyBackgroundModel) {
lblfw.writeLine(j + "\t" + currKmer + "\t" + ((FrequencyBackgroundModel)bgModel).getFrequency(j, i));
}
else if (bgModel instanceof CountsBackgroundModel) {
lblfw.writeLine(j + "\t" + currKmer + "\t" + ((CountsBackgroundModel)bgModel).getKmerCount(j, i));
}
}
}
lblfw.writeLine("");
}
finally {
if (lblfw != null) {
lblfw.closeFile();
}
}
}
}