/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.extract;
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
import org.voyanttools.trombone.input.source.FileInputSource;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TestHelper;
/**
 * Tests extraction of XML-like documents through
 * {@link StoredDocumentSourceExtractor}: generic XML with and without an
 * {@code xmlContentXpath}, RSS (implicitly detected and explicitly requested),
 * title XPath expressions, and format detection for XML/HTML given as strings.
 *
 * @author sgs
 */
public class XmlExtractorTest {

    /** Encoding used to decode the reference fixture and the extracted contents. */
    private static final String ENCODING = "UTF-8";

    /** Generic XML fixture that contains the reference line inside its body. */
    private static final String CHARS_XML = "formats/chars_utf8.xml";

    /** RSS fixture whose channel title is "Website Feed". */
    private static final String RSS_XML = "xml/rss.xml";

    /**
     * Runs every extraction scenario against a single test storage, which is
     * destroyed afterwards even when a scenario fails.
     *
     * @throws IOException if a fixture or a stored document cannot be read
     */
    @Test
    public void test() throws IOException {
        Storage storage = TestHelper.getDefaultTestStorage();
        try {
            StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
            String line = readReferenceLine();

            assertGenericXmlExtraction(storedDocumentSourceStorage, line);

            // xmlContentXpath matching multiple nodes
            assertContentXpathExtraction(storedDocumentSourceStorage, "//p", "<body>", line,
                    "ensure we have some content in XML with multiple nodes for the xmlContentXPath parameter");
            // xmlContentXpath matching a single node
            assertContentXpathExtraction(storedDocumentSourceStorage, "//body", "<head>", line,
                    "ensure we have some content in XML with a single node xmlContentXpath parameter");

            // RSS input format implicit (no inputFormat)
            assertRssExtraction(storedDocumentSourceStorage);
            // RSS input format explicit
            assertRssExtraction(storedDocumentSourceStorage, "inputFormat=RSS");

            assertRssAsGenericXml(storedDocumentSourceStorage);
            assertTitleXpath(storedDocumentSourceStorage);
            assertStringFormatDetection(storedDocumentSourceStorage);
            assertStringXpathExtraction(storedDocumentSourceStorage);
        } finally {
            // clean up the test storage even if an assertion above failed
            storage.destroy();
        }
    }

    /** Reads the first line of the plain-text fixture, starting at its first "I". */
    private static String readReferenceLine() throws IOException {
        String line = FileUtils.readLines(TestHelper.getResource("formats/chars_utf8.txt"), ENCODING).get(0).trim();
        // presumably skips a byte-order mark or other leading characters -- confirm against the fixture
        int start = line.indexOf("I");
        assertTrue("reference line should contain the letter I", start >= 0);
        return line.substring(start);
    }

    /** Wraps a test resource file as an input source. */
    private static InputSource fileSource(String resourcePath) throws IOException {
        return new FileInputSource(TestHelper.getResource(resourcePath));
    }

    /**
     * Stores the input source and returns its extracted counterpart, using an
     * extractor configured with the given {@code key=value} parameters.
     */
    private static StoredDocumentSource extract(StoredDocumentSourceStorage storedDocumentSourceStorage,
            InputSource inputSource, String... parameters) throws IOException {
        FlexibleParameters flexibleParameters = parameters.length == 0
                ? new FlexibleParameters()
                : new FlexibleParameters(parameters);
        StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, flexibleParameters);
        StoredDocumentSource storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
        return extractor.getExtractedStoredDocumentSource(storedDocumentSource);
    }

    /** Reads the stored contents of a document as a string, always closing the stream. */
    private static String readContents(StoredDocumentSourceStorage storedDocumentSourceStorage,
            StoredDocumentSource storedDocumentSource) throws IOException {
        InputStream inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
        try {
            return IOUtils.toString(inputStream, ENCODING);
        } finally {
            inputStream.close();
        }
    }

    /** Checks generic XML with default parameters: blank title, reference line kept. */
    private static void assertGenericXmlExtraction(StoredDocumentSourceStorage storedDocumentSourceStorage,
            String line) throws IOException {
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, fileSource(CHARS_XML));
        // the title should be blank rather than the title tag (for generic XML)
        assertEquals("", extracted.getMetadata().getTitle());
        String contents = readContents(storedDocumentSourceStorage, extracted);
        assertTrue("ensure we have some content in XML", contents.contains(line));
    }

    /** Checks that xmlContentXpath keeps the reference line and drops the excluded tag. */
    private static void assertContentXpathExtraction(StoredDocumentSourceStorage storedDocumentSourceStorage,
            String xpath, String excludedTag, String line, String contentMessage) throws IOException {
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, fileSource(CHARS_XML), "xmlContentXpath=" + xpath);
        // the title should be blank rather than the title tag (for generic XML)
        assertEquals("title for XML document", "", extracted.getMetadata().getTitle());
        String contents = readContents(storedDocumentSourceStorage, extracted);
        assertFalse("ensure we have stripped out other content", contents.contains(excludedTag));
        assertTrue(contentMessage, contents.contains(line));
    }

    /** Checks RSS handling: feed title as title, only description elements kept. */
    private static void assertRssExtraction(StoredDocumentSourceStorage storedDocumentSourceStorage,
            String... parameters) throws IOException {
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, fileSource(RSS_XML), parameters);
        // for RSS the title is taken from the feed, not left blank
        assertEquals("title for RSS feed", "Website Feed", extracted.getMetadata().getTitle());
        String contents = readContents(storedDocumentSourceStorage, extracted);
        // make sure we've stripped out XML comments during extraction
        assertFalse(contents.contains("<!--"));
        assertFalse("ensure we have stripped out other content in RSS feed", contents.contains("<link>"));
        assertEquals("ensure we have two description elements in RSS feed",
                2, StringUtils.countMatches(contents, "<description>"));
    }

    /** Checks that forcing inputFormat=XML on the RSS file yields a blank title. */
    private static void assertRssAsGenericXml(StoredDocumentSourceStorage storedDocumentSourceStorage) throws IOException {
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, fileSource(RSS_XML), "inputFormat=XML");
        // the title should be blank rather than the title tag (for generic XML)
        assertEquals(0, extracted.getMetadata().getTitle().length());
        String contents = readContents(storedDocumentSourceStorage, extracted);
        // make sure we've stripped out XML comments during extraction
        assertFalse(contents.contains("<!--"));
    }

    /** Checks xmlTitleXpath with a multi-node expression and with string-join. */
    private static void assertTitleXpath(StoredDocumentSourceStorage storedDocumentSourceStorage) throws IOException {
        // make sure that we can keep multiple values for metadata
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, fileSource(RSS_XML), "xmlTitleXpath=//title");
        assertEquals("title for RSS feed", "Website Feed", extracted.getMetadata().getTitle());

        // make sure we can join string values
        extracted = extract(storedDocumentSourceStorage, fileSource(RSS_XML), "xmlTitleXpath=string-join(//title,'--')");
        assertEquals("Website Feed--A Special Event--Announcing new Products", extracted.getMetadata().getTitle());
    }

    /** Checks that XML and HTML passed as strings are recognized as such. */
    private static void assertStringFormatDetection(StoredDocumentSourceStorage storedDocumentSourceStorage) throws IOException {
        // make sure we recognize XML in a string
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage, new StringInputSource("<a><b>c</b><b>d</b></a>"));
        assertEquals(DocumentFormat.XML, extracted.getMetadata().getDocumentFormat());

        // make sure we recognize HTML in a string
        extracted = extract(storedDocumentSourceStorage,
                new StringInputSource("<html><body><div>This is a current sentence.</div><div>d</div></body></html>"));
        DocumentMetadata metadata = extracted.getMetadata();
        assertEquals("en", metadata.getLanguageCode()); // make sure default is English
        assertEquals(DocumentFormat.HTML, metadata.getDocumentFormat());
    }

    /** Checks content/title XPath and an explicit language on XML passed as a string. */
    private static void assertStringXpathExtraction(StoredDocumentSourceStorage storedDocumentSourceStorage) throws IOException {
        // NOTE(review): the bare '&' makes this input not well-formed XML; kept unchanged
        // because the test may rely on lenient handling -- confirm
        StoredDocumentSource extracted = extract(storedDocumentSourceStorage,
                new StringInputSource("<a><b>c</b><b>d & e</b><z>x</z></a>"),
                "xmlContentXpath=//b", "xmlTitleXpath=//b[1]", "language=fr");
        DocumentMetadata metadata = extracted.getMetadata();
        assertEquals("fr", metadata.getLanguageCode()); // make sure our set value is respected
        assertEquals(DocumentFormat.XML, metadata.getDocumentFormat());
        assertEquals("c", metadata.getTitle());
        String contents = readContents(storedDocumentSourceStorage, extracted);
        // asserted separately so a failure shows which condition broke
        assertTrue("extracted content should contain <a>", contents.contains("<a>"));
        assertTrue("extracted content should contain <b>", contents.contains("<b>"));
        assertFalse("extracted content should not contain <z>", contents.contains("<z>"));
    }
}