/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.expand;
import static org.junit.Assert.*;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.voyanttools.trombone.input.source.FileInputSource;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TestHelper;
/**
* @author "Stéfan Sinclair"
*
*/
public class XmlExpanderTest {
@Test
public void test() throws IOException {
Storage storage = TestHelper.getDefaultTestStorage();
StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
StoredDocumentSourceExpander storedDocumentSourceExpander;
InputSource inputSource;
StoredDocumentSource storedDocumentSource;
List<StoredDocumentSource> expandedSourceDocumentSources;
FlexibleParameters parameters;
InputStream inputStream;
FileInputStream fileInputStream;
// make sure we have one document when no xmlDocumentXpath is specified (and input format is XML, not RSS)
parameters = new FlexibleParameters(new String[]{"inputFormat=XML"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals("XML file with no Xpath should contain one document", 1, expandedSourceDocumentSources.size());
// with xmlDocumentXpath we should have two for //item
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//item"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals("XML file with no Xpath should contain one document", 2, expandedSourceDocumentSources.size());
inputStream = null;
fileInputStream = null;
try {
inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(expandedSourceDocumentSources.get(1).getId());
fileInputStream = new FileInputStream(TestHelper.getResource("xml/rss.xml.item2_xpath.xml"));
assertTrue(IOUtils.contentEquals(fileInputStream, inputStream));
}
finally {
if (inputStream!=null) {inputStream.close();}
if (fileInputStream!=null) {fileInputStream.close();}
}
// with no xmlDocumentXpath but splitDocuments we should have two for //item
parameters = new FlexibleParameters(new String[]{"inputFormat=RSS","splitDocuments=true"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals(2, expandedSourceDocumentSources.size());
inputStream = null;
fileInputStream = null;
try {
inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(expandedSourceDocumentSources.get(1).getId());
fileInputStream = new FileInputStream(TestHelper.getResource("xml/rss.xml.item2_xpath.xml"));
assertTrue(IOUtils.contentEquals(fileInputStream, inputStream));
}
finally {
if (inputStream!=null) {inputStream.close();}
if (fileInputStream!=null) {fileInputStream.close();}
}
// with xmlDocumentXpath we should have one for dc:creator
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//dc:creator"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals("XML file with no Xpath should contain one document", 1, expandedSourceDocumentSources.size());
inputStream = null;
fileInputStream = null;
try {
inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(expandedSourceDocumentSources.get(0).getId());
fileInputStream = new FileInputStream(TestHelper.getResource("xml/rss.xml.dc_creator_xpath.xml"));
assertTrue(IOUtils.contentEquals(fileInputStream, inputStream));
}
finally {
if (inputStream!=null) {inputStream.close();}
if (fileInputStream!=null) {fileInputStream.close();}
}
// with xmlDocumentXpath we should have one for local-name()='creator'
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//*[local-name()='creator']"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals("XML file with local name creator should contain 1 document", 1, expandedSourceDocumentSources.size());
// with xmlDocumentXpath we should have none for creator (without namespace
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//creator"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/rss.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.expandXml(storedDocumentSource);
assertEquals("XML file with creator (no namespace) should contain no documents", 0, expandedSourceDocumentSources.size());
// RSS documents within a zip archive (nested expansion)
parameters = new FlexibleParameters(new String[]{"inputFormat=RSS","splitDocuments=true"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("archive/rss.xml.zip"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML file with no Xpath should contain one document", 2, expandedSourceDocumentSources.size());
inputStream = null;
fileInputStream = null;
try {
inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(expandedSourceDocumentSources.get(1).getId());
fileInputStream = new FileInputStream(TestHelper.getResource("xml/rss.xml.item2_xpath.xml"));
assertTrue(IOUtils.contentEquals(fileInputStream, inputStream));
}
finally {
if (inputStream!=null) {inputStream.close();}
if (fileInputStream!=null) {fileInputStream.close();}
}
// make sure a string is recognized as XML
parameters = new FlexibleParameters();
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new StringInputSource("<a><b>c</b><b>d</b></a>");
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML string with no Xpath should contain one document", 1, expandedSourceDocumentSources.size());
assertEquals("input string should be recognized as XML", DocumentFormat.XML, expandedSourceDocumentSources.get(0).getMetadata().getDocumentFormat());
// make sure an XML string is expanded
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//b"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new StringInputSource("<a><b>c</b><b>d</b></a>");
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML string with no documents xpath should contain two documents", 2, expandedSourceDocumentSources.size());
assertEquals("input string should be recognized as XML", DocumentFormat.XML, expandedSourceDocumentSources.get(0).getMetadata().getDocumentFormat());
// make sure namespaces works properly
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//*[local-name()='table']"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/namespaces.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML namespaces example should have three documents", 3, expandedSourceDocumentSources.size());
assertTrue("Make sure our first table still has & entity", IOUtils.toString(storedDocumentSourceStorage.getStoredDocumentSourceInputStream(expandedSourceDocumentSources.get(0).getId())).contains("&"));
// test groupby xpath
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//*[local-name()='table']","xmlGroupByXpath=//*[local-name()='length']"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/namespaces.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML namespaces example should have three documents", 2, expandedSourceDocumentSources.size());
parameters = new FlexibleParameters(new String[]{"xmlDocumentsXpath=//*[local-name()='table']","xmlGroupByXpath=//*[local-name()='width']"});
storedDocumentSourceExpander = new StoredDocumentSourceExpander(storedDocumentSourceStorage, parameters);
inputSource = new FileInputSource(TestHelper.getResource("xml/namespaces.xml"));
storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
expandedSourceDocumentSources = storedDocumentSourceExpander.getExpandedStoredDocumentSources(storedDocumentSource);
assertEquals("XML namespaces example should have three documents", 1, expandedSourceDocumentSources.size());
storage.destroy();
}
}