/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.extract;

//import it.svario.xpathapi.jaxp.XPathAPI;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.XmlRootExtractor;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.LangDetector;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathFactoryImpl;

/**
 * Extracts content and metadata from a stored XML document. The content and
 * each metadata field are selected with XPath expressions that come either
 * from the caller's parameters or from the bundled defaults of the detected
 * (or requested) input format.
 * 
 * @author sgs
 */
public class XmlExtractor implements Extractor, Serializable {

	private static final long serialVersionUID = -8659873836740839314L;

	private StoredDocumentSourceStorage storedDocumentSourceStorage;

	private FlexibleParameters parameters;

	/**
	 * the Transformer used to produce XML output from nodes
	 */
	private Transformer transformer;

	private XPathFactory xpathFactory;

	/**
	 * Creates an extractor that reads documents from the given storage.
	 * 
	 * @param storedDocumentSourceStorage the storage to read stored documents from
	 * @param parameters the parameters that drive extraction (cloned for each extraction)
	 * @throws IllegalStateException if the XML transformer can't be created
	 */
	public XmlExtractor(
			StoredDocumentSourceStorage storedDocumentSourceStorage,
			FlexibleParameters parameters) {
		this.storedDocumentSourceStorage = storedDocumentSourceStorage;
		this.parameters = parameters;
		try {
			transformer = TransformerFactory.newInstance().newTransformer();
		} catch (TransformerConfigurationException e) {
			throw new IllegalStateException(
					"Unable to create XML transformer.", e);
		}
		transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");

		// for some reason XPathAPI doesn't work properly with the default
		// XPathFactory, so we'll use Saxon
		xpathFactory = new XPathFactoryImpl();
		System.setProperty("javax.xml.xpath.XPathFactory:"
				+ NamespaceConstant.OBJECT_MODEL_SAXON,
				"net.sf.saxon.xpath.XPathFactoryImpl");
	}

	/**
	 * Builds the {@link InputSource} that performs the extraction for the
	 * specified stored document. When no <code>inputFormat</code> parameter is
	 * given, the format is guessed from the document's metadata or root
	 * element; the defaults bundled for the resulting format are then merged
	 * into a local copy of the parameters.
	 * 
	 * @param storedDocumentSource the stored XML document to extract from
	 * @return the input source that extracts content and metadata on demand
	 * @throws IOException if the stored document or the format defaults can't be read
	 * @see org.voyanttools.trombone.input.extract.Extractor#getExtractableInputSource(org.voyanttools.trombone.model.StoredDocumentSource)
	 */
	@Override
	public InputSource getExtractableInputSource(StoredDocumentSource storedDocumentSource) throws IOException {

		FlexibleParameters localParameters = parameters.clone();
		String inputFormat = localParameters.getParameterValue("inputFormat","");

		// no format specified, so let's have a peek at the contents to see if we can determine a sub-format
		DocumentFormat guessedFormat = DocumentFormat.UNKNOWN;
		if (inputFormat.isEmpty()) {
			guessedFormat = guessFormat(storedDocumentSource);
		}

		if (inputFormat.isEmpty()==false || guessedFormat!=DocumentFormat.UNKNOWN) {
			String formatName = guessedFormat==DocumentFormat.UNKNOWN ? inputFormat : guessedFormat.name();
			applyFormatDefaults(localParameters, formatName);
		}

		String[] relevantParameters = new String[]{"xmlContentXpath","xmlTitleXpath","xmlAuthorXpath","xmlPubPlaceXpath","xmlPublisherXpath","xmlPubDateXpath","xmlKeywordXpath","xmlCollectionXpath","xmlExtraMetadataXpath"};
		StringBuilder parametersBuilder = new StringBuilder();
		for (String p : relevantParameters) {
			if (localParameters.getParameterValue(p, "").isEmpty()==false) {
				parametersBuilder.append(p);
				for (String s : localParameters.getParameterValues(p)) {
					parametersBuilder.append(s);
				}
			}
		}

		// Even with no special parameters we still extract (rather than
		// returning the original stored document) since extraction strips XML
		// comments, detects language, etc.

		// The id is derived from the names and values of the relevant
		// parameters. Concatenating the String[] itself would only contribute
		// the array's identity string, which changes from one array instance to
		// the next and says nothing about the parameter values.
		String id = DigestUtils.md5Hex(storedDocumentSource.getId()+parametersBuilder.toString()+String.valueOf(serialVersionUID));
		return new ExtractableXmlInputSource(id, storedDocumentSource, localParameters);
	}

	/**
	 * Guesses the XML sub-format of the stored document: first from the format
	 * recorded in its metadata, otherwise by looking at its root element.
	 * 
	 * @param storedDocumentSource the stored document to inspect
	 * @return the guessed format, or {@link DocumentFormat#UNKNOWN} if no sub-format is recognized
	 * @throws IOException if the stored document can't be read
	 */
	private DocumentFormat guessFormat(StoredDocumentSource storedDocumentSource) throws IOException {
		DocumentFormat df = storedDocumentSource.getMetadata().getDocumentFormat();
		if (df.isXml() && df!=DocumentFormat.XML) {
			return df;
		}

		InputStream is = null;
		try {
			is = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
			QName qname = new XmlRootExtractor().extractRootElement(is);
			if (qname==null) {
				return DocumentFormat.UNKNOWN;
			}
			String name = qname.getLocalPart();
			// Locale.ROOT keeps the lower-casing independent of the default locale
			if (name.equals("feed") && qname.getNamespaceURI().toLowerCase(Locale.ROOT).contains("atom")) {
				return DocumentFormat.ATOM;
			}
			else if (name.equals("TEI")) {
				return DocumentFormat.TEI;
			}
			else if (name.equals("teiCorpus")) {
				return DocumentFormat.TEICORPUS;
			}
			else if (name.equals("rss")) {
				return DocumentFormat.RSS;
			}
			else if (name.equals("EEBO")) {
				return DocumentFormat.EEBODREAM;
			}
			return DocumentFormat.UNKNOWN;
		}
		finally {
			if (is!=null) {
				is.close();
			}
		}
	}

	/**
	 * Merges the defaults bundled for the named input format into the local
	 * parameters. Nothing happens if no defaults resource exists for the format.
	 * Values that are already set are kept, except that when
	 * <code>splitDocuments</code> is true the <code>*.splitDocuments</code>
	 * defaults overwrite the parameter named by their prefix.
	 * 
	 * @param localParameters the parameters to fill in (modified in place)
	 * @param formatName the name of the input format (case insensitive)
	 * @throws IOException if the defaults resource can't be read
	 */
	private void applyFormatDefaults(FlexibleParameters localParameters, String formatName) throws IOException {
		// Locale.ROOT so that e.g. "TEI" maps to "tei" whatever the default locale is
		String resourcePath = "/org/voyanttools/trombone/input-formats/"+formatName.toLowerCase(Locale.ROOT)+".xml";
		URL url = this.getClass().getResource(resourcePath);
		if (url==null) {
			return;
		}

		// read through the URL rather than a File so that this also works when
		// the resource is packaged in a jar or its path is URL-encoded
		Properties properties = new Properties();
		InputStream in = url.openStream();
		try {
			properties.loadFromXML(in);
		}
		finally {
			in.close();
		}

		if (localParameters.getParameterBooleanValue("splitDocuments")) {
			for (String key : properties.stringPropertyNames()) {
				if (key.contains(".splitDocuments")) {
					localParameters.setParameter(key.split("\\.")[0], properties.getProperty(key)); // overwrite prefix key
				}
			}
		}
		for (String key : properties.stringPropertyNames()) {
			if (localParameters.getParameterValue(key,"").isEmpty()) {
				localParameters.setParameter(key, properties.getProperty(key));
			}
		}
	}

	/**
	 * An input source that parses the stored XML document on demand, fills in
	 * metadata from the configured XPath expressions and produces the selected
	 * content serialized as XML.
	 */
	private class ExtractableXmlInputSource implements InputSource {

		private String id;

		private String storedDocumentSourceId;

		private StoredDocumentSource storedDocumentSource;

		private DocumentMetadata metadata;

		/** whether {@link #getInputStream()} has completed and filled in {@link #metadata} */
		private boolean isProcessed = false;

		private FlexibleParameters localParameters;

		private ExtractableXmlInputSource(String id,
				StoredDocumentSource storedDocumentSource, FlexibleParameters localParameters) {
			this.id = id;
			this.storedDocumentSourceId = storedDocumentSource.getId();
			this.storedDocumentSource = storedDocumentSource;
			this.metadata = storedDocumentSource.getMetadata().asParent(storedDocumentSourceId, DocumentMetadata.ParentType.EXTRACTION);
			this.metadata.setLocation(storedDocumentSource.getMetadata().getLocation());
			this.metadata.setDocumentFormat(DocumentFormat.XML);
			this.localParameters = localParameters;
		}

		/**
		 * Parses the stored document, optionally transforms it with the
		 * <code>xmlExtractorTemplate</code> stylesheet, fills in the metadata
		 * and returns the selected content serialized as UTF-8 encoded XML.
		 * 
		 * @return a stream over the extracted XML
		 * @throws IOException if the document can't be read, parsed or transformed
		 * @throws IllegalArgumentException if one of the XPath expressions can't be evaluated
		 */
		@Override
		public InputStream getInputStream() throws IOException {

			Document doc = parseStoredDocument();

			if (localParameters.containsKey("xmlExtractorTemplate")) {
				doc = applyExtractorTemplate(doc, localParameters.getParameterValue("xmlExtractorTemplate"));
			}

			populateMetadata(doc);

			Node contentNode = selectContentNode(doc);

			StringWriter sw = new StringWriter(); // no need to close
			Result streamResult = new StreamResult(sw);
			try {
				transformer.transform(new DOMSource(contentNode), streamResult);
			} catch (TransformerException e) {
				throw new IOException(
						"Unable to transform node during XML extraction: "+storedDocumentSource, e);
			}

			String string = sw.toString();

			// try to determine language
			metadata.setLanguageCode(LangDetector.langDetector.detect(string, parameters));

			isProcessed = true;

			return new ByteArrayInputStream(string.getBytes("UTF-8"));
		}

		/**
		 * Parses the stored document into a DOM, ignoring comments and without
		 * validating or loading external DTDs and general entities.
		 * 
		 * @return the parsed document
		 * @throws IOException if the document can't be read or parsed
		 */
		private Document parseStoredDocument() throws IOException {
			InputStream inputStream = null;
			try {
				inputStream = storedDocumentSourceStorage
						.getStoredDocumentSourceInputStream(storedDocumentSourceId);
				DocumentBuilderFactory factory = DocumentBuilderFactory
						.newInstance();
				factory.setFeature("http://xml.org/sax/features/validation", false);
				factory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
				factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
				factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
				factory.setIgnoringComments(true);
				DocumentBuilder builder = factory.newDocumentBuilder();
				return builder.parse(inputStream);
			} catch (ParserConfigurationException e) {
				throw new IOException("Error with XML parser configuration for "
						+ storedDocumentSource, e);
			} catch (SAXException e) {
				throw new IOException("Error with XML parsing for "
						+ storedDocumentSource, e);
			} finally {
				if (inputStream != null) {
					inputStream.close();
				}
			}
		}

		/**
		 * Transforms the document with the named stylesheet from the bundled
		 * templates directory.
		 * 
		 * @param doc the document to transform
		 * @param xmlExtractorTemplate the template's name, relative to the templates directory
		 * @return the document produced by the transformation
		 * @throws IOException if the template can't be found or the transformation fails
		 */
		private Document applyExtractorTemplate(Document doc, String xmlExtractorTemplate) throws IOException {
			URL templateUrl = this.getClass().getResource("/org/voyanttools/trombone/templates/"+xmlExtractorTemplate);
			if (templateUrl==null) {
				throw new IOException("Unable to find extractor template "+xmlExtractorTemplate);
			}

			// read through the URL rather than a File so that templates packaged
			// in a jar work too; the system id lets relative references in the
			// stylesheet be resolved
			DOMResult result = new DOMResult();
			InputStream templateStream = templateUrl.openStream();
			try {
				Source source = new StreamSource(templateStream, templateUrl.toExternalForm());
				Transformer extractorTransformer = TransformerFactory.newInstance().newTransformer(source);
				extractorTransformer.transform(new DOMSource(doc), result);
			} catch (TransformerException e) {
				throw new IOException("Unable to transform document during expansion "+metadata, e);
			} finally {
				templateStream.close();
			}
			return (Document) result.getNode();
		}

		/**
		 * Fills in the metadata fields for which an XPath expression is
		 * defined and yields at least one value.
		 * 
		 * @param doc the document to query
		 */
		private void populateMetadata(Document doc) {

			// try to find titles if needed
			String[] titles = getNodesAsStringsFromParametersValue(doc, "xmlTitleXpath");
			if (titles.length>0) {
				metadata.setTitles(titles);
			}

			// try to find authors if needed
			String[] authors = getNodesAsStringsFromParametersValue(doc, "xmlAuthorXpath");
			if (authors.length>0) {
				metadata.setAuthors(authors);
			}

			// try to find publication places if needed
			String[] pubPlaces = getNodesAsStringsFromParametersValue(doc, "xmlPubPlaceXpath");
			if (pubPlaces.length>0) {
				metadata.setPubPlaces(pubPlaces);
			}

			// try to find publishers if needed
			String[] publishers = getNodesAsStringsFromParametersValue(doc, "xmlPublisherXpath");
			if (publishers.length>0) {
				metadata.setPublishers(publishers);
			}

			// try to find publication dates if needed
			String[] pubDates = getNodesAsStringsFromParametersValue(doc, "xmlPubDateXpath");
			if (pubDates.length>0) {
				metadata.setPubDates(pubDates);
			}

			// try to find collections if needed
			String[] collections = getNodesAsStringsFromParametersValue(doc, "xmlCollectionXpath");
			if (collections.length>0) {
				metadata.setCollections(collections);
			}

			// try to find keywords if needed
			String[] keywords = getNodesAsStringsFromParametersValue(doc, "xmlKeywordXpath");
			if (keywords.length>0) {
				metadata.setKeywords(keywords);
			}

			// extra metadata is defined one "key=xpath" definition per line
			for (String extra : localParameters.getParameterValues("xmlExtraMetadataXpath")) {
				for (String x : extra.split("(\r\n|\r|\n)+")) {
					x = x.trim();
					String[] parts = x.split("=");
					if (parts.length>1) {
						String key = parts[0].trim();
						// the XPath itself may contain "=", so join everything after the key
						String xpath = StringUtils.join(Arrays.copyOfRange(parts, 1, parts.length), "=").trim();
						String[] values = getNodesAsStringsFromXpath(doc, xpath);
						if (values.length>0) {
							metadata.setExtras(key, values);
						}
					}
				}
			}
		}

		/**
		 * Selects the node to serialize as content. If the content XPath
		 * matches exactly one node that node is used, otherwise the matched
		 * nodes are appended to a shallow clone of the document element.
		 * 
		 * @param doc the document to select from
		 * @return the node to serialize
		 * @throws IllegalArgumentException if the XPath expression can't be evaluated
		 */
		private Node selectContentNode(Document doc) {

			// if no XPath is defined, consider the whole source XML (but allow for additional metadata to be identified)
			String xmlContentXpath = localParameters.getParameterValue("xmlContentXpath","/");

			NodeList nodeList;
			XPath xpath = xpathFactory.newXPath();
			try {
				nodeList = (NodeList) xpath.evaluate(xmlContentXpath, doc.getDocumentElement(), XPathConstants.NODESET);
			}
			catch (XPathExpressionException e) {
				throw new IllegalArgumentException(
						"A problem was encountered proccesing this XPath query: " + xmlContentXpath, e);
			}

			// just use the single node as root
			if (nodeList.getLength()==1) {
				return nodeList.item(0);
			}

			// encapsulate child nodes in document root
			Node newParentNode = doc.getDocumentElement().cloneNode(false);
			for (int i=0, len=nodeList.getLength(); i<len; i++) {
				newParentNode.appendChild(nodeList.item(i));
			}
			return newParentNode;
		}

		/**
		 * Evaluates the XPath expression stored under the given parameter key.
		 * 
		 * @param doc the document to query
		 * @param parameterKey the name of the parameter that holds the XPath expression
		 * @return the distinct values found (empty if the parameter isn't set)
		 */
		private String[] getNodesAsStringsFromParametersValue(Document doc, String parameterKey) {
			String xpathString = localParameters.getParameterValue(parameterKey,"");
			return getNodesAsStringsFromXpath(doc, xpathString);
		}

		/**
		 * Evaluates the XPath expression against the document element and
		 * returns the distinct values found.
		 * 
		 * @param doc the document to query
		 * @param xpathString the XPath expression (may be empty)
		 * @return the distinct values found (empty if the expression is empty)
		 * @throws IllegalArgumentException if the XPath expression can't be evaluated
		 */
		private String[] getNodesAsStringsFromXpath(Document doc, String xpathString) {
			String[] strings = new String[0];
			if (xpathString.isEmpty()) {
				return strings;
			}

			Set<String> values = new HashSet<String>();
			XPath xpath = xpathFactory.newXPath();
			try {
				// this is awful to have to specify the return type, this should probably use a different library
				if (xpathString.startsWith("string") || xpathString.startsWith("concat(") || xpathString.startsWith("replace(")) {
					String s = (String) xpath.evaluate(xpathString, doc.getDocumentElement(), XPathConstants.STRING);
					values.add(s);
				}
				else {
					NodeList nodeList = (NodeList) xpath.evaluate(xpathString, doc.getDocumentElement(), XPathConstants.NODESET);
					for (int i=0, len=nodeList.getLength(); i<len; i++) {
						values.add(nodeList.item(i).getTextContent());
					}
				}
			}
			catch (XPathExpressionException e) {
				throw new IllegalArgumentException(
						"A problem was encountered proccesing this XPath query: " + xpathString, e);
			}
			return values.toArray(strings);
		}

		/**
		 * Returns the metadata filled in by {@link #getInputStream()} if it has
		 * already run, and otherwise the metadata stored under this source's id.
		 */
		@Override
		public DocumentMetadata getMetadata() throws IOException {
			return isProcessed ? this.metadata : storedDocumentSourceStorage.getStoredDocumentSourceMetadata(id);
		}

		@Override
		public String getUniqueId() {
			return this.id;
		}

	}

}