/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.extract;

import static org.junit.Assert.*;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
import org.voyanttools.trombone.input.source.FileInputSource;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TestHelper;

/**
 * Exercises {@link StoredDocumentSourceExtractor} on in-memory strings and on
 * the sample files under {@code formats/}, checking the extracted contents and
 * the extracted metadata (title, author, keywords, language code).
 *
 * @author sgs
 */
public class TikaExtractorTest {

	/**
	 * Encoding used to decode the {@code chars_utf8} fixture and the extracted streams.
	 * NOTE(review): assumes stored extracted documents are written as UTF-8 - confirm
	 * against the storage implementation. The previous code used the platform default,
	 * which only matches on machines whose default charset is UTF-8.
	 */
	private static final String ENCODING = "UTF-8";

	/**
	 * Extracts short strings (plain text, HTML, HTML5, XML-looking, and explicitly
	 * declared XML) and checks how markup is treated in each case, as well as the
	 * detected and the overridden language code.
	 */
	@Test
	public void testStrings() throws IOException, URISyntaxException {
		Storage storage = TestHelper.getDefaultTestStorage();
		try {
			StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
			FlexibleParameters parameters = new FlexibleParameters();
			StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, parameters);

			InputSource inputSource;
			StoredDocumentSource extracted;
			String contents;

			// plain text: per the assertion message, the markup must not survive as tags
			// (the previous assertTrue contradicted its own message)
			inputSource = new StringInputSource("This — is <b>a</b> test.");
			extracted = extract(storedDocumentSourceStorage, extractor, inputSource);
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertFalse("Text string shouldn't contain tags", contents.contains("<b>a</b>"));

			// HTML: inline tags are kept and the language is detected
			inputSource = new StringInputSource("<html><body><div>This is <b>a</b> test.</div></body></html>");
			extracted = extract(storedDocumentSourceStorage, extractor, inputSource);
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("HTML string should contain tags", contents.contains("<b>a</b>"));
			assertEquals("en", extracted.getMetadata().getLanguageCode());

			// HTML with an HTML5 element
			inputSource = new StringInputSource("<html><body><section><div>This is <b>a</b> test.</div></section></body></html>");
			extracted = extract(storedDocumentSourceStorage, extractor, inputSource);
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("HTML string should contain tags", contents.contains("<b>a</b>"));
			// TODO: find a way to keep html5 tags with xhtml transformer
			assertTrue("HTML string should contain HTML5 tags", contents.contains("<section>"));

			// XML-looking string without any declared input format
			inputSource = new StringInputSource("<test>This is <b>a</b> test.</test>");
			extracted = extract(storedDocumentSourceStorage, extractor, inputSource);
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("XML-looking string should contain tags", contents.contains("<b>a</b>"));

			// same input, now with an explicit input format and a language override;
			// a new extractor is created after the parameters have been changed
			parameters.setParameter("inputFormat", "XML");
			parameters.setParameter("language", "no"); // make sure we can override language
			extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, parameters);
			extracted = extract(storedDocumentSourceStorage, extractor, inputSource);
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("XML-declared string should contain tags", contents.contains("<b>a</b>"));
			assertEquals("no", extracted.getMetadata().getLanguageCode()); // check override language (from params above)
		} finally {
			// release the test storage even when an assertion fails
			storage.destroy();
		}
	}

	/**
	 * Extracts each sample file under {@code formats/} and checks the extracted
	 * metadata and that the reference line taken from {@code formats/chars_utf8.txt}
	 * is present in the extracted contents.
	 */
	@Test
	public void testFormats() throws IOException, URISyntaxException {
		Storage storage = TestHelper.getDefaultTestStorage();
		try {
			StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
			FlexibleParameters parameters = new FlexibleParameters();
			StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, parameters);

			StoredDocumentSource extracted;
			DocumentMetadata metadata;
			String contents;

			// reference line: first line of the UTF-8 fixture, starting at its first "I"
			// (presumably to skip anything that precedes the text, such as a BOM - verify)
			String line = FileUtils.readLines(TestHelper.getResource("formats/chars_utf8.txt"), ENCODING).get(0).trim();
			line = line.substring(line.indexOf("I"));

			// snippet: only checks that extraction and metadata retrieval complete;
			// no content assertions are made for this file
			extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/snippet.txt"))).getMetadata();

			// plain text
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars_utf8.txt")));
			metadata = extracted.getMetadata();
			assertEquals("chars_utf8", metadata.getTitle());
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertEquals("ensure we have two paragraphs in text", 2, StringUtils.countMatches(contents, "<p>"));
			// look for the entity itself: a bare "&" would also match an unescaped ampersand
			assertTrue("ensure we've escaped & in text", contents.contains("&amp;"));
			assertTrue("ensure we have some content in text", contents.contains(line));

			// Pages
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.pages")));
			assertMetadata("Pages", extracted.getMetadata(), "Titre du document test de Pages", "Stéfan Sinclair", "test, Pages");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("ensure we have some content in Pages", contents.contains(line));

			// MSWord (.doc)
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.doc")));
			assertMetadata("MSWord (.doc)", extracted.getMetadata(), "Titre du document test de MSWord", "Stéfan Sinclair", "test, MSWord");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("ensure we have some content in MSWord (.doc)", contents.contains(line));

			// MSWord (.docx)
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.docx")));
			assertMetadata("MSWord (.docx)", extracted.getMetadata(), "Titre du document test de MSWord", "Stéfan Sinclair", "test, MSWord");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("ensure we have some content in MSWord (.docx)", contents.contains(line));

			// RTF
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.rtf")));
			assertMetadata("RTF", extracted.getMetadata(), "Titre du document test de RTF", "Stéfan Sinclair", "test, RTF");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("ensure we have some content in RTF", contents.contains(line));

			// PDF
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.pdf")));
			assertMetadata("PDF", extracted.getMetadata(), "Titre du document test de PDF", "Stéfan Sinclair", "test, PDF");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertTrue("ensure we have some content in PDF", contents.contains(line));

			// HTML
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars_utf8.htm")));
			metadata = extracted.getMetadata();
			assertEquals("fr", metadata.getLanguageCode());
			assertMetadata("HTML", metadata, "Titre du document test de HTML", "Stéfan Sinclair", "test, HTML");
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertFalse("strip out script tag from html", contents.contains("script"));
			assertFalse("strip out style tag from html", contents.contains("style"));
			assertTrue("ensure we have some content in html", contents.contains(line));

			// XLSX (messages previously said "html", copied from the block above)
			extracted = extract(storedDocumentSourceStorage, extractor, new FileInputSource(TestHelper.getResource("formats/chars.xlsx")));
			assertEquals("title for XLSX document", "chars", extracted.getMetadata().getTitle());
			contents = readContents(storedDocumentSourceStorage, extracted);
			assertFalse("strip out script tag from XLSX", contents.contains("script"));
			assertFalse("strip out style tag from XLSX", contents.contains("style"));
			assertTrue("ensure we have some content in XLSX", contents.contains(line));
		} finally {
			// release the test storage even when an assertion fails
			storage.destroy();
		}
	}

	/**
	 * Stores the given input source and runs the extractor on the stored source.
	 *
	 * @param storedDocumentSourceStorage the storage that receives the input source
	 * @param extractor the extractor to run
	 * @param inputSource the input to store and extract
	 * @return the extracted stored document source
	 */
	private static StoredDocumentSource extract(StoredDocumentSourceStorage storedDocumentSourceStorage,
			StoredDocumentSourceExtractor extractor, InputSource inputSource) throws IOException, URISyntaxException {
		StoredDocumentSource storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
		return extractor.getExtractedStoredDocumentSource(storedDocumentSource);
	}

	/**
	 * Reads the stored contents of the given document source into a string.
	 * The stream is closed even when reading fails.
	 *
	 * @param storedDocumentSourceStorage the storage to read from
	 * @param storedDocumentSource the document source whose id is looked up
	 * @return the stream contents decoded with {@link #ENCODING}
	 */
	private static String readContents(StoredDocumentSourceStorage storedDocumentSourceStorage,
			StoredDocumentSource storedDocumentSource) throws IOException, URISyntaxException {
		InputStream inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
		try {
			return IOUtils.toString(inputStream, ENCODING);
		} finally {
			inputStream.close();
		}
	}

	/**
	 * Asserts the title, author and keywords of an extracted document.
	 *
	 * @param label the format name used in the assertion messages
	 * @param metadata the metadata to check
	 * @param title the expected title
	 * @param author the expected author
	 * @param keywords the expected keywords
	 */
	private static void assertMetadata(String label, DocumentMetadata metadata, String title, String author, String keywords) {
		assertEquals("title for " + label + " document", title, metadata.getTitle());
		assertEquals("author for " + label + " document", author, metadata.getAuthor());
		assertEquals("keywords for " + label + " document", keywords, metadata.getKeywords());
	}
}