package org.seqcode.tools.sequence; import java.util.*; import java.io.*; import org.seqcode.data.io.parsing.FASTAStream; import org.seqcode.gseutils.*; import org.seqcode.motifs.CountKmers; import cern.jet.random.ChiSquare; /** * Compare the frequency of kmers in two FASTA files. * * java org.seqcode.genome.sequence.CompareKmers --k 6 --one foo.fasta --two bar.fasta * */ public class CompareKmers { private FASTAStream fastaone, fastatwo; private int k; private CountKmers one, two; public static void main(String args[]) throws Exception { CompareKmers compare = new CompareKmers(); compare.parseArgs(args); compare.read(); compare.report(); } public void parseArgs(String args[]) throws IOException { String fone = Args.parseString(args,"one",null); String ftwo = Args.parseString(args,"two",null); k = Args.parseInteger(args,"k",6); fastaone = new FASTAStream(new File(fone)); fastatwo = new FASTAStream(new File(ftwo)); one = new CountKmers(); one.init(k,k); two = new CountKmers(); two.init(k,k); } /* read the input files and file CountKmers one and two */ public void read() { read(fastaone, one); read(fastatwo, two); } public void read(FASTAStream stream, CountKmers kmers) { while (stream.hasNext()) { Pair<String,String> pair = stream.next(); kmers.addToCounts(pair.cdr()); } } public void report() { Set<String> allKeys = one.getKeySet(k); allKeys.addAll(two.getKeySet(k)); double stat = 0; int toosmall = 0; int zero = 0, inzero = 0; double countone = 0, counttwo = 0; for (String kmer : allKeys) { int obs = one.getCount(kmer, k); int exp = two.getCount(kmer,k); countone += obs; counttwo += exp; } double scale = counttwo / countone; System.out.println(String.format("Count one is %f and two is %f", countone, counttwo)); for (String kmer : allKeys) { double obs = one.getCount(kmer, k) * scale; double exp = two.getCount(kmer, k); if (exp < 5) { toosmall++; if (exp == 0) { zero++; inzero += obs; continue; } } stat += (obs - exp) * (obs - exp) / exp; } int n = allKeys.size() - zero; cern.jet.random.engine.RandomEngine engine = new cern.jet.random.engine.DRand(); ChiSquare csquare = new ChiSquare(n - 1, engine); System.out.println(String.format("chi-squared value is %f with %d DOF. CDF is %f", stat, n-1, csquare.cdf(stat))); System.out.println(String.format("There were %d below 5 and %d that were zero (%d observations)", toosmall, zero, inzero)); } }