/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.expand;
import it.svario.xpathapi.jaxp.XPathAPI;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathException;
import net.sf.saxon.lib.NamespaceConstant;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.XmlRootExtractor;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.Source;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
/**
* An expander that looks for sub-documents within an XML document, especially
* if the {@code xmlDocumentsXpath} parameter is set to a valid XPath
* expression. The XPath expression should now support namespaces declared in
* the root element. When a single XPath expression is provided, documents are
* created from each of the matching {@link Node}s. When multiple XPath
* expressions are provided, all the nodes matching each XPath expression are
* combined into one document (so one document per XPath expression). To simply
* extract all of the content from one XML document into one source document,
* don't use an expander, use the xmlContentXpath parameter instead (which will
* be handled by the XML parser).
*
* @author "Stéfan Sinclair"
*/
class XmlExpander implements Expander {
/**
* all parameters sent, only some of which may be relevant to some expanders
*/
private FlexibleParameters parameters;
/**
* the stored document storage strategy
*/
private StoredDocumentSourceStorage storedDocumentSourceStorage;
/**
* the Transformer used to produce XML output from nodes
*/
private Transformer transformer;
/**
* Create a new instance of this expander (this should only be done by
* {@link StoredDocumentSourceExpander}.
*
* @param storedDocumentSourceStorage
* a stored storage strategy
* @param parameters
* that may be relevant to this expander, including
* {@code xmlDocumentsXapth}
*/
XmlExpander(StoredDocumentSourceStorage storedDocumentSourceStorage,
FlexibleParameters parameters) {
this.storedDocumentSourceStorage = storedDocumentSourceStorage;
this.parameters = parameters;
try {
transformer = TransformerFactory.newInstance().newTransformer();
} catch (TransformerConfigurationException e) {
throw new IllegalStateException(
"Unable to create XML transformer.", e);
}
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
}
/*
* (non-Javadoc)
*
* @see org.voyanttools.trombone.input.expand.Expander#
* getExpandedStoredDocumentSources
* (org.voyanttools.trombone.document.StoredDocumentSource)
*/
public List<StoredDocumentSource> getExpandedStoredDocumentSources(
StoredDocumentSource storedDocumentSource) throws IOException {
List<StoredDocumentSource> childStoredDocumentSources = new ArrayList<StoredDocumentSource>();
String xmlDocumentsXpath = parameters.getParameterValue("xmlDocumentsXpath", "");
// no format specified, so let's have a peek at the contents to see if we can determine a sub-format
DocumentFormat guessedFormat = DocumentFormat.UNKNOWN;
if (parameters.getParameterValue("inputFormat","").isEmpty()) {
InputStream is = null;
try {
is = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
XmlRootExtractor xmlRootExtractor = new XmlRootExtractor();
QName qname = xmlRootExtractor.extractRootElement(is);
String name = qname.getLocalPart();
if (name.equals("feed") && qname.getNamespaceURI().toLowerCase().contains("atom")) guessedFormat = DocumentFormat.ATOM;
else if (name.equals("TEI")) guessedFormat = DocumentFormat.TEI;
else if (name.equals("teiCorpus")) guessedFormat = DocumentFormat.TEICORPUS;
else if (name.equals("rss")) guessedFormat = DocumentFormat.RSS;
else if (name.equals("EEBO")) guessedFormat = DocumentFormat.EEBODREAM;
}
finally {
if (is!=null) is.close();
}
}
// check to see if we need to set xmlDocumentsXpath using defaults for format
if (xmlDocumentsXpath.isEmpty() && (parameters.getParameterValue("inputFormat","").isEmpty()==false || guessedFormat!=DocumentFormat.UNKNOWN)) {
String guessedFormatString = guessedFormat==DocumentFormat.UNKNOWN ? parameters.getParameterValue("inputFormat","") : guessedFormat.name();
String resourcePath = "/org/voyanttools/trombone/input-formats/"+guessedFormatString.toLowerCase()+".xml";
Properties properties = new Properties();
URL url = this.getClass().getResource(resourcePath);
if (url!=null) {
File file = new File(url.getPath());
if (file.exists()) {
FileInputStream in = null;
try {
in = new FileInputStream(file);
properties.loadFromXML(in);
}
finally {
if (in!=null) {
in.close();
}
}
}
if (properties.containsKey("xmlDocumentsXpath")) {
xmlDocumentsXpath = properties.getProperty("xmlDocumentsXpath");
}
}
}
String xmlGroupByXpath = parameters.getParameterValue("xmlGroupByXpath", "");
if (xmlDocumentsXpath.isEmpty()) {
childStoredDocumentSources.add(storedDocumentSource);
return childStoredDocumentSources;
}
DocumentMetadata parentMetadata = storedDocumentSource.getMetadata();
String parentId = storedDocumentSource.getId();
String multipleExpandedStoredDocumentSourcesPrefix = DigestUtils.md5Hex(xmlDocumentsXpath+xmlGroupByXpath);
childStoredDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(parentId, multipleExpandedStoredDocumentSourcesPrefix);
if (childStoredDocumentSources != null && childStoredDocumentSources.isEmpty() == false) {
return childStoredDocumentSources;
}
// for some reason XPathAPI doesn't work properly with the default
// XPathFactory, so we'll use Saxon
System.setProperty("javax.xml.xpath.XPathFactory:"
+ NamespaceConstant.OBJECT_MODEL_SAXON,
"net.sf.saxon.xpath.XPathFactoryImpl");
InputStream inputStream = null;
Document doc;
try {
inputStream = storedDocumentSourceStorage
.getStoredDocumentSourceInputStream(storedDocumentSource
.getId());
DocumentBuilderFactory factory = DocumentBuilderFactory
.newInstance();
factory.setFeature("http://xml.org/sax/features/validation", false);
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
factory.setIgnoringComments(true);
DocumentBuilder builder = factory.newDocumentBuilder();
doc = builder.parse(inputStream);
} catch (ParserConfigurationException e) {
throw new IOException("Error with XML parser configuration for "
+ storedDocumentSource, e);
} catch (SAXException e) {
throw new IOException("Error with XML parsing for "
+ storedDocumentSource, e);
} finally {
if (inputStream != null)
inputStream.close();
}
List<NodeInputSource> nodeInputSources = getChildStoredDocumentSources(doc, xmlDocumentsXpath, parentId, parentMetadata);
if (nodeInputSources.isEmpty()==false) {
if (xmlGroupByXpath.isEmpty()==false) {
Map<String, List<NodeInputSource>> groupedNodeInputSources = new HashMap<String, List<NodeInputSource>>();
for (NodeInputSource nodeInputSource : nodeInputSources) {
List<String> keys;
try {
Node fragment = doc.createDocumentFragment();
fragment.appendChild(nodeInputSource.node);
keys = XPathAPI.selectNodeListAsStrings(fragment, xmlGroupByXpath);
} catch (XPathException e) {
throw new IllegalArgumentException("Unable to use this XPath: "+xmlGroupByXpath, e);
}
if (keys.isEmpty()==false) {
String key = StringUtils.join(keys, " ");
if (groupedNodeInputSources.containsKey(key)==false) {
groupedNodeInputSources.put(key, new ArrayList<NodeInputSource>());
}
groupedNodeInputSources.get(key).add(nodeInputSource);
}
}
for (Map.Entry<String, List<NodeInputSource>> mappedNodeInputSources : groupedNodeInputSources.entrySet()) {
List<NodeInputSource> mappedNodeInputSourcesList = mappedNodeInputSources.getValue();
// if (mappedNodeInputSourcesList.size()==1) { // just one, so use it
// childStoredDocumentSources.add(getStoredDocumentSource(mappedNodeInputSourcesList.get(0)));
// }
// else { // multiple, we need to wrap with root node
String key = mappedNodeInputSources.getKey();
Node newParentNode = doc.getDocumentElement().cloneNode(false);
for (NodeInputSource nodeInputSource : mappedNodeInputSourcesList) {
newParentNode.appendChild(nodeInputSource.node);
}
NodeInputSource newNodeInputSource = getChildStoredDocumentSource(newParentNode, parentId, parentMetadata, parentId+";group:"+key);
newNodeInputSource.documentMetadata.setTitle(key);
childStoredDocumentSources.add(getStoredDocumentSource(newNodeInputSource));
// }
}
}
else {
for (NodeInputSource nodeInputSource : nodeInputSources) {
childStoredDocumentSources.add(getStoredDocumentSource(nodeInputSource));
}
}
}
// each node is a separate document
// if (xmlDocumentsXpaths.length == 1) {
// childStoredDocumentSources.addAll(getChildStoredDocumentSources(
// doc, xmlDocumentsXpaths[0], parentId, parentMetadata));
// }
//
// // each xpath is a separate document
// else {
// childStoredDocumentSources.addAll(getChildStoredDocumentSources(
// doc, xmlDocumentsXpaths, parentId, parentMetadata));
// }
storedDocumentSourceStorage.setMultipleExpandedStoredDocumentSources(
parentId, childStoredDocumentSources,
multipleExpandedStoredDocumentSourcesPrefix);
return childStoredDocumentSources;
}
/**
* Get a list of stored document sources. Matching nodes for each XPath
* expression are concatenated into a single document (one document per
* XPath).
*
* @param doc
* the {@link Document} to be searched
* @param xmlDocumentsXpaths
* the list of XPath expressions to find nodes
* @param parentId
* the ID of the stored parent document
* @param parentMetadata
* the metadata of the stored parent document
* @return a list of {@link StoredDocumentSource}s
* @throws IOException
* an exception that occurs during processing
*/
/*
private List<StoredDocumentSource> getChildStoredDocumentSources(
Document doc, String[] xmlDocumentsXpaths, String parentId,
DocumentMetadata parentMetadata) throws IOException {
List<StoredDocumentSource> childStoredDocumentSources = new ArrayList<StoredDocumentSource>();
for (int i = 0, len = xmlDocumentsXpaths.length; i < len; i++) {
List<Node> docs;
try {
docs = XPathAPI.selectListOfNodes(doc.getDocumentElement(),
xmlDocumentsXpaths[i], doc.getDocumentElement());
} catch (XPathException e) {
throw new IllegalArgumentException(
"A problem was encountered proccesing this XPath query: "
+ xmlDocumentsXpaths[i], e);
}
if (docs.isEmpty()) {
continue;
}
Node newParentNode = doc.getDocumentElement().cloneNode(false);
for (Node node : docs) {
newParentNode.appendChild(node);
}
StoredDocumentSource childStoredDocumentSource = getChildStoredDocumentSource(
newParentNode, parentId, parentMetadata,
xmlDocumentsXpaths[i] + "[" + (i) + "]");
childStoredDocumentSources.add(childStoredDocumentSource);
}
return childStoredDocumentSources;
}
*/
/**
* Get a list of stored document sources. Each node matching the specified
* XPath expression becomes a separate document.
*
* @param doc
* the {@link Document} to be searched
* @param xmlDocumentsXpath
* the XPath expressions to find nodes
* @param parentId
* the ID of the stored parent document
* @param parentMetadata
* the metadata of the stored parent document
* @return a list of {@link StoredDocumentSource}s
* @throws IOException
* an exception that occurs during processing
*/
private List<NodeInputSource> getChildStoredDocumentSources(
Document doc, String xmlDocumentsXpath, String parentId,
DocumentMetadata parentMetadata) throws IOException {
List<NodeInputSource> childNodeInputSources = new ArrayList<NodeInputSource>();
List<Node> docs;
try {
docs = XPathAPI.selectListOfNodes(doc.getDocumentElement(),
xmlDocumentsXpath, doc.getDocumentElement());
} catch (XPathException e) {
throw new IllegalArgumentException(
"A problem was encountered proccesing this XPath query: "
+ xmlDocumentsXpath, e);
}
for (int i = 0, len = docs.size(); i < len; i++) {
NodeInputSource childStoredDocumentSource = getChildStoredDocumentSource(
docs.get(i), parentId, parentMetadata, xmlDocumentsXpath
+ "[" + (i) + "]");
childNodeInputSources.add(childStoredDocumentSource);
}
return childNodeInputSources;
}
/**
* Get a {@link StoredDocumentSource} from the specified {@link Node} and
* parent information.
*
* @param node
* the {@link Node} from with to produce an XML document
* @param parentId
* the ID of the stored parent document
* @param parentMetadata
* the metadata of the stored parent document
* @param location
* the approximate XPath location that can help generate a unique
* identifier
* @return a {@link StoredDocumentSource}
* @throws IOException
* an exception that occurs during IO processing
*/
private NodeInputSource getChildStoredDocumentSource(Node node,
String parentId, DocumentMetadata parentMetadata, String location)
throws IOException {
DocumentMetadata metadata = parentMetadata.asParent(parentId, DocumentMetadata.ParentType.EXPANSION);
metadata.setModified(parentMetadata.getModified());
metadata.setSource(Source.STRING);
metadata.setLocation(location);
metadata.setDocumentFormat(parentMetadata.getDocumentFormat()==DocumentFormat.SATORBASE ? parentMetadata.getDocumentFormat() : DocumentFormat.XML);
String id = DigestUtils.md5Hex(parentId + location);
return new NodeInputSource(id, node, metadata);
}
private StoredDocumentSource getStoredDocumentSource(NodeInputSource nodeInputSource) throws IOException {
StringWriter sw = new StringWriter(); // no need to close
Result streamResult = new StreamResult(sw);
try {
transformer.transform(new DOMSource(nodeInputSource.node), streamResult);
} catch (TransformerException e) {
throw new IOException("Unable to transform node from stored document: "+nodeInputSource.documentMetadata);
}
InputSource inputSource = new StringInputSource(nodeInputSource.id, nodeInputSource.documentMetadata, sw.toString());
return storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
}
private class NodeInputSource {
private Node node;
private String id;
private DocumentMetadata documentMetadata;
private NodeInputSource(String id, Node node, DocumentMetadata documentMetadata) {
this.node = node;
this.id = id;
this.documentMetadata = documentMetadata;
}
}
}