package jannovar.io;

import static org.junit.Assert.fail;

import jannovar.common.FeatureType;
import jannovar.exception.FeatureFormatException;
import jannovar.gff.Feature;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;

import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for {@link GFFparser#processFeature(String)}.
 *
 * <p>
 * Each test hands a single feature line to the parser and checks the type,
 * coordinates, strand, sequence id and attributes of the returned
 * {@link Feature}. Two tests use GFF version 3 lines, one uses a GFF version 2
 * line with a blank as key/value separator.
 */
public class GFFparserTest {

	/**
	 * Content of the GFF3 fixture file written by {@link #setUp()}. Every
	 * entry already carries its trailing line break.
	 */
	private static final String[] GFF3_FIXTURE = {
			"##gff-version 3\n",
			"##sequence-region ctg123 1 1497228\n",
			"ctg123\t.\tgene\t1000\t9000\t.\t+\t.\tID=gene00001;Name=EDEN\n",
			"ctg123\t.\tTF_binding_site\t1000\t1012\t.\t+\t.\tID=tfbs00001;Parent=gene00001\n",
			"ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00001;Parent=gene00001;Name=EDEN.1\n",
			"ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00002;Parent=gene00001;Name=EDEN.2\n",
			"ctg123\t.\tmRNA\t1300\t9000\t.\t+\t.\tID=mRNA00003;Parent=gene00001;Name=EDEN.3\n",
			"ctg123\t.\texon\t1300\t1500\t.\t+\t.\tID=exon00001;Parent=mRNA00003\n",
			"ctg123\t.\texon\t1050\t1500\t.\t+\t.\tID=exon00002;Parent=mRNA00001,mRNA00002\n",
			"ctg123\t.\texon\t3000\t3902\t.\t+\t.\tID=exon00003;Parent=mRNA00001,mRNA00003\n",
			"ctg123\t.\texon\t5000\t5500\t.\t+\t.\tID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003\n",
			"ctg123\t.\texon\t7000\t9000\t.\t+\t.\tID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003\n",
			"ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00001;Parent=mRNA00001;Name=edenprotein.1\n",
			"ctg123\t.\tCDS\t3000\t3902\t.\t+\t0\tID=cds00001;Parent=mRNA00001;Name=edenprotein.1\n",
			"ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00001;Parent=mRNA00001;Name=edenprotein.1\n",
			"ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00001;Parent=mRNA00001;Name=edenprotein.1\n",
			"ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00002;Parent=mRNA00002;Name=edenprotein.2\n",
			"ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00002;Parent=mRNA00002;Name=edenprotein.2\n",
			"ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00002;Parent=mRNA00002;Name=edenprotein.2\n",
			"ctg123\t.\tCDS\t3301\t3902\t.\t+\t0\tID=cds00003;Parent=mRNA00003;Name=edenprotein.3\n",
			"ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00003;Parent=mRNA00003;Name=edenprotein.3\n",
			"ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00003;Parent=mRNA00003;Name=edenprotein.3\n",
			"ctg123\t.\tCDS\t3391\t3902\t.\t+\t0\tID=cds00004;Parent=mRNA00003;Name=edenprotein.4\n",
			"ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00004;Parent=mRNA00003;Name=edenprotein.4\n",
			"ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00004;Parent=mRNA00003;Name=edenprotein.4\n" };

	/** Parser under test; replaced by a fresh instance before every test. */
	private static GFFparser reader = null;

	/**
	 * Writes the GFF3 fixture to a temporary file and creates a fresh parser.
	 *
	 * <p>
	 * The parser is currently created with its no-argument constructor, so
	 * the fixture file is not read by any test in this class; it is still
	 * written so that the file-based calls kept below as comments can be
	 * re-enabled. The file is registered for deletion at JVM exit so that
	 * repeated test runs do not pile up temporary files.
	 *
	 * @throws Exception
	 *             if the temporary file cannot be created or opened
	 */
	@Before
	public void setUp() throws Exception {
		File tmp = File.createTempFile("gff3reader-test", "gff3reader-test");
		tmp.deleteOnExit();
		PrintStream ps = new PrintStream(new FileOutputStream(tmp));
		try {
			for (String fixtureLine : GFF3_FIXTURE) {
				ps.append(fixtureLine);
			}
		} finally {
			// close the stream even if writing the fixture fails
			ps.close();
		}
		// reader = new GFFparser(tmp.getAbsolutePath());
		reader = new GFFparser();
		// reader.parse("data/interim_GRCh37.p13_top_level_2013-07-05.gff3.gz");
	}

	/** Drops the reference to the shared parser once all tests have run. */
	@AfterClass
	public static void releaseResources() {
		reader = null;
	}

	// Disabled in the original source:
	// @Test
	// public void testGFFversion() {
	// Assert.assertEquals(3, reader.getGFFversion());
	// }

	/**
	 * Parses a GFF3 exon line that lists three parents and checks every
	 * field of the resulting feature.
	 */
	@Test
	public void testProcessFeatureRNAGFF3() {
		String line = "ctg123\t.\texon\t5000\t5500\t.\t+\t.\tID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003";
		try {
			reader.setGFFversion(3);
			Feature feature = reader.processFeature(line);
			Assert.assertEquals(FeatureType.EXON, feature.getType());
			Assert.assertEquals(5000, feature.getStart());
			Assert.assertEquals(5500, feature.getEnd());
			// Assert.assertEquals('.', feature.getPhase());
			// the line is on the '+' strand, for which the test expects true
			Assert.assertTrue(feature.getStrand());
			// Assert.assertEquals('.', feature.getScore());
			Assert.assertEquals("ctg123", feature.getSequence_id());
			Assert.assertEquals(2, feature.getAttributes().size());
			Assert.assertEquals("exon00004", feature.getAttribute("ID"));
			Assert.assertEquals("mRNA00001,mRNA00002,mRNA00003", feature.getAttribute("Parent"));
		} catch (FeatureFormatException e) {
			throw malformedLine(line, e);
		}
	}

	/**
	 * Parses a GFF3 gene line and checks every field of the resulting
	 * feature.
	 */
	@Test
	public void testProcessFeatureGeneGFF3() {
		String line = "ctg123\t.\tgene\t1000\t9000\t.\t+\t.\tID=gene00001;Name=EDEN";
		try {
			reader.setGFFversion(3);
			Feature feature = reader.processFeature(line);
			Assert.assertEquals(FeatureType.GENE, feature.getType());
			Assert.assertEquals(1000, feature.getStart());
			Assert.assertEquals(9000, feature.getEnd());
			// Assert.assertEquals('.', feature.getPhase());
			// the line is on the '+' strand, for which the test expects true
			Assert.assertTrue(feature.getStrand());
			// Assert.assertEquals('.', feature.getScore());
			Assert.assertEquals("ctg123", feature.getSequence_id());
			Assert.assertEquals(2, feature.getAttributes().size());
			Assert.assertEquals("gene00001", feature.getAttribute("ID"));
			Assert.assertEquals("EDEN", feature.getAttribute("Name"));
		} catch (FeatureFormatException e) {
			throw malformedLine(line, e);
		}
	}

	/**
	 * Parses a GFF2 exon line whose attributes are written as
	 * <code>key "value";</code> pairs and checks every field of the resulting
	 * feature.
	 */
	@Test
	public void testProcessFeature001GFF2() {
		// NOTE(review): the field separators of this literal were
		// whitespace-collapsed in the source as received; tabs are assumed
		// here, matching the GFF3 lines above - confirm against the original.
		String line = "18\tprotein_coding\texon\t246324\t246433\t.\t-\t.\t"
				+ "gene_id \"ENSG00000079134\"; transcript_id \"ENST00000579891\"; exon_number \"1\"; "
				+ "gene_name \"THOC1\"; gene_biotype \"protein_coding\"; transcript_name \"THOC1-020\"; "
				+ "exon_id \"ENSE00002716487\";";
		try {
			reader.setGFFversion(2);
			reader.setValueSeparator(" ");
			Feature feature = reader.processFeature(line);
			Assert.assertEquals(FeatureType.EXON, feature.getType());
			Assert.assertEquals(246324, feature.getStart());
			Assert.assertEquals(246433, feature.getEnd());
			// Assert.assertEquals('.', feature.getPhase());
			// the line is on the '-' strand, for which the test expects false
			Assert.assertFalse(feature.getStrand());
			// Assert.assertEquals('.', feature.getScore());
			Assert.assertEquals("18", feature.getSequence_id());
			Assert.assertEquals(7, feature.getAttributes().size());
			Assert.assertEquals("ENSG00000079134", feature.getAttribute("gene_id"));
			Assert.assertEquals("ENST00000579891", feature.getAttribute("transcript_id"));
			Assert.assertEquals("1", feature.getAttribute("exon_number"));
			Assert.assertEquals("THOC1", feature.getAttribute("gene_name"));
			Assert.assertEquals("protein_coding", feature.getAttribute("gene_biotype"));
			Assert.assertEquals("THOC1-020", feature.getAttribute("transcript_name"));
			Assert.assertEquals("ENSE00002716487", feature.getAttribute("exon_id"));
		} catch (FeatureFormatException e) {
			throw malformedLine(line, e);
		}
	}

	/**
	 * Builds the assertion failure reported when a feature line is rejected.
	 *
	 * <p>
	 * The parser exception is attached as the cause so that its stack trace
	 * shows up in the test report; a statement placed after a call to
	 * <code>fail</code> would never run.
	 *
	 * @param line
	 *            the feature line that was being processed
	 * @param e
	 *            the exception caught while processing the line
	 * @return the error to be thrown by the calling test
	 */
	private static AssertionError malformedLine(String line, FeatureFormatException e) {
		AssertionError failure = new AssertionError("misformed feature line: " + line + "\n" + e);
		failure.initCause(e);
		return failure;
	}
}