XmlExtractor.java example

Explorer
trombone-master
- src
  - main
    - java
  - test
    - java
      - org
        voyanttools
        trombone
        document
        MetadataTest.java
        input
        expand
        ArchiveExpanderTest.java
        CompressedExpanderTest.java
        XmlExpanderTest.java
        XslExpanderTest.java
        extract
        BagItExtractorTest.java
        TikaExtractorTest.java
        XmlExtractorTest.java
        index
        LuceneIndexerTest.java
        lucene
        StoredToLuceneDocumentMapperTest.java
        analysis
        OpenNlpLemmaTokenizerTest.java
        StanfordNlpLemmaTokenizerTest.java
        search
        FieldPrefixAwareSimpleQueryParserTest.java
        model
        CorpusCollocateTest.java
        CorpusTermMinimalsDBTest.java
        CorpusTermsQueueTest.java
        DocumentTermsTest.java
        KeywordsTest.java
        TableTest.java
        storage
        file
        FileStoredDocumentSourceStorageTest.java
        TromboneMigration.java
        tool
        DocumentCollocatesTest.java
        DocumentTermsTest.java
        KwicsTest.java
        StoredResourceTest.java
        TableCorrelationsTest.java
        TableManagerTest.java
        build
        CorpusBuilderTest.java
        CorpusCreatorTest.java
        DocumentExpanderTest.java
        DocumentExtractorTest.java
        DocumentStorerTest.java
        corpus
        CorpusCollocatesTest.java
        CorpusFacetsTest.java
        CorpusManagerTest.java
        CorpusTermsTest.java
        DocumentContextsTest.java
        DocumentNgramsTest.java
        DocumentTermsTest.java
        DocumentTokensTest.java
        DocumentsMetadataTest.java
        SimpleSortedSetFacetsExample.java
        VelizaTest.java
        util
        EmbeddedWebServer.java
        TestHelper.java
/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.extract;

//import it.svario.xpathapi.jaxp.XPathAPI;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.XmlRootExtractor;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.LangDetector;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathFactoryImpl;

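/**
 * Extractor for XML documents: selects content and metadata from stored XML
 * sources using XPath expressions that are supplied as parameters or loaded
 * from per-format defaults.
 *
 * @author sgs
 *
 */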
public class XmlExtractor implements Extractor, Serializable {
	
	
	/** Serialization version; its value is also mixed into the ids produced by getExtractableInputSource. */
	private static final long serialVersionUID = -8659873836740839314L;
	/** Storage from which stored document contents and metadata are read. */
	private StoredDocumentSourceStorage storedDocumentSourceStorage;
	/** Parameters supplied at construction; cloned for each extraction request before per-document overrides are applied. */
	private FlexibleParameters parameters;
	
	/**
	 * The Transformer used to produce XML output from nodes; the constructor
	 * configures it to keep the XML declaration and to indent its output.
	 * NOTE(review): this field is not transient although the class is declared
	 * Serializable — confirm whether instances are ever actually serialized.
	 */
	private Transformer transformer;
	
	/**
	 * Factory for the XPath evaluators used on parsed documents; the constructor
	 * assigns Saxon's implementation.
	 * NOTE(review): also not transient — same serialization question as above.
	 */
	private XPathFactory xpathFactory;

	public XmlExtractor(
			StoredDocumentSourceStorage storedDocumentSourceStorage,
			FlexibleParameters parameters) {
		this.storedDocumentSourceStorage = storedDocumentSourceStorage;
		this.parameters = parameters;
		try {
			transformer = TransformerFactory.newInstance().newTransformer();
		} catch (TransformerConfigurationException e) {
			throw new IllegalStateException(
					"Unable to create XML transformer.", e);
		}
		transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
		
		xpathFactory = new XPathFactoryImpl();
		
		// for some reason XPathAPI doesn't work properly with the default
		// XPathFactory, so we'll use Saxon
		System.setProperty("javax.xml.xpath.XPathFactory:"
				+ NamespaceConstant.OBJECT_MODEL_SAXON,
				"net.sf.saxon.xpath.XPathFactoryImpl");
	}

	/**
	 * Builds the input source that extracts XML content and metadata from the
	 * given stored document.
	 * <p>
	 * When no {@code inputFormat} parameter is set, the XML sub-format is guessed
	 * from the stored metadata or, failing that, from the document's root element.
	 * If a format is known (given or guessed), its default parameters are loaded
	 * from {@code /org/voyanttools/trombone/input-formats/<format>.xml} and used
	 * to fill in any parameter that is not already set.
	 * <p>
	 * The id of the returned source is an MD5 digest of the stored document id,
	 * the effective extraction parameters and {@code serialVersionUID}.
	 *
	 * @param storedDocumentSource the stored XML document to extract from
	 * @return an input source that performs the extraction when its stream is requested
	 * @throws IOException if the stored document or the format defaults cannot be read
	 * @see org.voyanttools.trombone.input.extract.Extractor
	 */
	@Override
	public InputSource getExtractableInputSource(StoredDocumentSource storedDocumentSource)
			throws IOException {
		
		FlexibleParameters localParameters = parameters.clone();
		String inputFormat = localParameters.getParameterValue("inputFormat","");
		
		// no format specified, so let's have a peek at the contents to see if we can determine a sub-format
		DocumentFormat guessedFormat = DocumentFormat.UNKNOWN;
		if (inputFormat.isEmpty()) {
			guessedFormat = guessFormat(storedDocumentSource);
		}
		
		if (inputFormat.isEmpty()==false || guessedFormat!=DocumentFormat.UNKNOWN) {
			String formatName = guessedFormat==DocumentFormat.UNKNOWN ? inputFormat : guessedFormat.name();
			applyFormatDefaults(formatName, localParameters);
		}
		
		// every parameter that changes what is extracted has to take part in the id;
		// xmlExtractorTemplate is included because it rewrites the document before any XPath is applied
		String[] relevantParameters = new String[]{"xmlContentXpath","xmlTitleXpath","xmlAuthorXpath","xmlPubPlaceXpath","xmlPublisherXpath","xmlPubDateXpath","xmlKeywordXpath","xmlCollectionXpath","xmlExtraMetadataXpath","xmlExtractorTemplate"};
		StringBuilder parametersBuilder = new StringBuilder();
		for (String p : relevantParameters) {
			if (localParameters.getParameterValue(p, "").isEmpty()==false) {
				parametersBuilder.append(p);
				for (String s : localParameters.getParameterValues(p)) {
					parametersBuilder.append(s);
				}
			}
		}
		
		// Extraction is performed even when no XPath parameters are set, because
		// we probably need to extract anyway to strip XML comments, detect language, etc.
		
		// Hash the parameter names and values, not the String[] itself: concatenating
		// the array would only contribute its identity string, which says nothing about
		// the parameters and differs from one array instance to the next.
		String idSeed = storedDocumentSource.getId()+parametersBuilder.toString()+String.valueOf(serialVersionUID);
		return new ExtractableXmlInputSource(DigestUtils.md5Hex(idSeed), storedDocumentSource, localParameters);
	}

	/**
	 * Guesses the XML sub-format of a stored document, first from its metadata
	 * and then from the name (and namespace) of its root element.
	 *
	 * @return the guessed format, or {@link DocumentFormat#UNKNOWN} if none is recognized
	 */
	private DocumentFormat guessFormat(StoredDocumentSource storedDocumentSource) throws IOException {
		DocumentFormat df = storedDocumentSource.getMetadata().getDocumentFormat();
		if (df.isXml() && df!=DocumentFormat.XML) {
			return df;
		}
		InputStream is = null;
		try {
			is = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
			QName qname = new XmlRootExtractor().extractRootElement(is);
			if (qname==null) {
				return DocumentFormat.UNKNOWN;
			}
			String name = qname.getLocalPart();
			// Locale.ROOT keeps the comparison independent of the JVM's default locale
			if (name.equals("feed") && qname.getNamespaceURI().toLowerCase(Locale.ROOT).contains("atom")) {
				return DocumentFormat.ATOM;
			}
			if (name.equals("TEI")) {
				return DocumentFormat.TEI;
			}
			if (name.equals("teiCorpus")) {
				return DocumentFormat.TEICORPUS;
			}
			if (name.equals("rss")) {
				return DocumentFormat.RSS;
			}
			if (name.equals("EEBO")) {
				return DocumentFormat.EEBODREAM;
			}
			return DocumentFormat.UNKNOWN;
		}
		finally {
			if (is!=null) is.close();
		}
	}

	/**
	 * Loads the default parameters for the named format from the classpath and
	 * copies them into {@code localParameters}; does nothing if the format has
	 * no defaults resource.
	 */
	private void applyFormatDefaults(String formatName, FlexibleParameters localParameters) throws IOException {
		// Locale.ROOT: a locale-sensitive lower-casing could produce a resource name that does not exist
		String resourcePath = "/org/voyanttools/trombone/input-formats/"+formatName.toLowerCase(Locale.ROOT)+".xml";
		URL url = this.getClass().getResource(resourcePath);
		if (url==null) {
			return;
		}
		
		// read through the URL rather than java.io.File so that the defaults are
		// also found when the resource is packaged inside a JAR
		Properties properties = new Properties();
		InputStream in = null;
		try {
			in = url.openStream();
			properties.loadFromXML(in);
		}
		finally {
			if (in!=null) {
				in.close();
			}
		}
		
		// when splitting documents, the ".splitDocuments" variant replaces the plain key
		if (localParameters.getParameterBooleanValue("splitDocuments")) {
			for (String key : properties.stringPropertyNames()) {
				if (key.contains(".splitDocuments")) {
					localParameters.setParameter(key.split("\\.")[0], properties.getProperty(key)); // overwrite prefix key
				}
			}
		}
		
		// all other defaults only fill in parameters that are not set yet
		for (String key : properties.stringPropertyNames()) {
			if (localParameters.getParameterValue(key,"").isEmpty()) {
				localParameters.setParameter(key, properties.getProperty(key));
			}
		}
	}

	private class ExtractableXmlInputSource implements InputSource {
		
		
		/** Unique id of this extracted source, as supplied to the constructor. */
		private String id;
		
		/** Id of the stored document this source extracts from. */
		private String storedDocumentSourceId;

		/** The stored document being extracted; used for its metadata and in error messages. */
		private StoredDocumentSource storedDocumentSource;
		
		/** Metadata of the extracted document: derived from the stored document's metadata and filled in by getInputStream(). */
		private DocumentMetadata metadata;
		
		/** Set to true at the end of getInputStream(); until then getMetadata() reads the metadata from storage instead. */
		private boolean isProcessed = false;
		
		/** Per-document parameters (XPath expressions, template name) consulted during extraction. */
		private FlexibleParameters localParameters;
		
		private ExtractableXmlInputSource(String id, StoredDocumentSource storedDocumentSource, FlexibleParameters localParameters) {
			this.id = id;
			this.storedDocumentSourceId = storedDocumentSource.getId();
			this.storedDocumentSource = storedDocumentSource;
			this.metadata = storedDocumentSource.getMetadata().asParent(storedDocumentSourceId, DocumentMetadata.ParentType.EXTRACTION);
			this.metadata.setLocation(storedDocumentSource.getMetadata().getLocation());
			this.metadata.setDocumentFormat(DocumentFormat.XML);
			this.localParameters = localParameters;
		}

		/**
		 * Performs the extraction and returns the extracted content as UTF-8 encoded XML.
		 * <p>
		 * The stored document is parsed (comments are dropped), optionally rewritten
		 * by the XSL template named in {@code xmlExtractorTemplate}, mined for metadata
		 * through the configured XPath expressions, and finally reduced to the nodes
		 * selected by {@code xmlContentXpath} (the whole document by default). The
		 * language of the serialized result is detected and stored in the metadata.
		 *
		 * @return a stream over the serialized extracted XML
		 * @throws IOException if the document cannot be read, parsed, transformed or
		 *         serialized, or if the requested extractor template cannot be found
		 * @throws IllegalArgumentException if one of the XPath expressions is invalid
		 */
		@Override
		public InputStream getInputStream() throws IOException {

			Document doc = parseStoredDocument();
			
			if (localParameters.containsKey("xmlExtractorTemplate")) {
				doc = applyExtractorTemplate(doc, localParameters.getParameterValue("xmlExtractorTemplate"));
			}
			
			// metadata has to be collected before the content nodes are moved out of the document
			extractMetadata(doc);
			
			Node newParentNode = selectContentNode(doc);

			StringWriter sw = new StringWriter(); // no need to close
			Result streamResult = new StreamResult(sw);
			try {
				transformer.transform(new DOMSource(newParentNode), streamResult);
			} catch (TransformerException e) {
				// keep the cause so that the underlying serialization problem is not lost
				throw new IOException(
						"Unable to transform node during XML extraction: "+storedDocumentSource, e);
			}
	
			String string = sw.toString();
			
			// try to determine language
			metadata.setLanguageCode(LangDetector.langDetector.detect(string, parameters));

			isProcessed = true;

			return new ByteArrayInputStream(string.getBytes("UTF-8"));
		}

		/**
		 * Parses the stored document into a DOM, without validation, without
		 * loading external DTDs or entities, and ignoring comments.
		 */
		private Document parseStoredDocument() throws IOException {
			InputStream inputStream = null;
			try {
				inputStream = storedDocumentSourceStorage
						.getStoredDocumentSourceInputStream(storedDocumentSourceId);
				DocumentBuilderFactory factory = DocumentBuilderFactory
						.newInstance();
				factory.setFeature("http://xml.org/sax/features/validation", false);
				factory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
				factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
				factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
				// parameter entities can reach external resources just like general entities
				factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
				factory.setIgnoringComments(true);
				DocumentBuilder builder = factory.newDocumentBuilder();
				return builder.parse(inputStream);
			} catch (ParserConfigurationException e) {
				throw new IOException("Error with XML parser configuration for "
						+ storedDocumentSource, e);
			} catch (SAXException e) {
				throw new IOException("Error with XML parsing for "
						+ storedDocumentSource, e);
			} finally {
				if (inputStream != null)
					inputStream.close();
			}
		}

		/**
		 * Transforms the document with the named XSL template from the classpath
		 * template directory and returns the resulting document.
		 */
		private Document applyExtractorTemplate(Document doc, String xmlExtractorTemplate) throws IOException {
			URL templateUrl = this.getClass().getResource("/org/voyanttools/trombone/templates/"+xmlExtractorTemplate);
			if (templateUrl==null) {
				// getResource returns null for a missing resource; report it instead of failing with a NullPointerException
				throw new IOException("Unable to find extractor template "+xmlExtractorTemplate);
			}
			
			DOMResult result = new DOMResult();
			InputStream templateStream = null;
			try {
				// stream the template from its URL (works inside a JAR too); the system id
				// lets relative references inside the stylesheet be resolved
				templateStream = templateUrl.openStream();
				Source source = new StreamSource(templateStream, templateUrl.toExternalForm());
				Transformer extractorTransformer = TransformerFactory.newInstance().newTransformer(source);
				extractorTransformer.transform(new DOMSource(doc), result);
			} catch (TransformerException e) {
				throw new IOException("Unable to transform document during expansion "+metadata, e);
			} finally {
				if (templateStream != null)
					templateStream.close();
			}
			return (Document) result.getNode();
		}

		/**
		 * Fills the metadata from the XPath expressions configured in the local
		 * parameters; a field is only set when its expression yields values.
		 */
		private void extractMetadata(Document doc) {
			// try to find titles if needed
			String[] titles = getNodesAsStringsFromParametersValue(doc, "xmlTitleXpath");
			if (titles.length>0) {
				metadata.setTitles(titles);
			}
			
			// try to find authors if needed
			String[] authors = getNodesAsStringsFromParametersValue(doc, "xmlAuthorXpath");
			if (authors.length>0) {
				metadata.setAuthors(authors);
			}

			// try to find publication places if needed
			String[] pubPlaces = getNodesAsStringsFromParametersValue(doc, "xmlPubPlaceXpath");
			if (pubPlaces.length>0) {
				metadata.setPubPlaces(pubPlaces);
			}

			// try to find publishers if needed
			String[] publishers = getNodesAsStringsFromParametersValue(doc, "xmlPublisherXpath");
			if (publishers.length>0) {
				metadata.setPublishers(publishers);
			}

			// try to find publication dates if needed
			String[] pubDates = getNodesAsStringsFromParametersValue(doc, "xmlPubDateXpath");
			if (pubDates.length>0) {
				metadata.setPubDates(pubDates);
			}
			
			// try to find collections if needed
			String[] collections = getNodesAsStringsFromParametersValue(doc, "xmlCollectionXpath");
			if (collections.length>0) {
				metadata.setCollections(collections);
			}
			
			// try to find keywords if needed
			String[] keywords = getNodesAsStringsFromParametersValue(doc, "xmlKeywordXpath");
			if (keywords.length>0) {
				metadata.setKeywords(keywords);
			}
			
			// extra metadata: one "key=xpath" definition per line
			for (String extra : localParameters.getParameterValues("xmlExtraMetadataXpath")) {
				for (String x :extra.split("(\r\n|\r|\n)+")) {
					x = x.trim();
					String[] parts = x.split("=");
					if (parts.length>1) {
						String key = parts[0].trim();
						// the XPath itself may contain "=", so everything after the first one is rejoined
						String xpath = StringUtils.join(Arrays.copyOfRange(parts, 1, parts.length), "=").trim();
						String[] values = getNodesAsStringsFromXpath(doc, xpath);
						if (values.length>0) {
							metadata.setExtras(key, values);
						}
					}
				}
			}
		}

		/**
		 * Evaluates {@code xmlContentXpath} and returns the node to serialize: the
		 * single match itself, or otherwise a shallow copy of the root element
		 * holding all matches.
		 */
		private Node selectContentNode(Document doc) {
			// if no XPath is defined, consider the whole source XML (but allow for additional metadata to be identified)
			String xmlContentXpath = localParameters.getParameterValue("xmlContentXpath","/");
			
			NodeList nodeList;
			XPath xpath = xpathFactory.newXPath();
			try {
				nodeList = (NodeList) xpath.evaluate(xmlContentXpath, doc.getDocumentElement(), XPathConstants.NODESET);
			} catch (XPathExpressionException e) {
				throw new IllegalArgumentException(
						"A problem was encountered proccesing this XPath query: " + xmlContentXpath, e);
			}
			
			// just use the single node as root
			if (nodeList.getLength()==1) {
				return nodeList.item(0);
			}
			
			// encapsulate child nodes in document root
			Node newParentNode = doc.getDocumentElement().cloneNode(false);
			for (int i=0, len=nodeList.getLength(); i<len; i++) {
				newParentNode.appendChild(nodeList.item(i));
			}
			return newParentNode;
		}

		/**
		 * Looks up the XPath expression stored under the given parameter key and
		 * returns the strings it selects from the document.
		 *
		 * @param doc the document to query
		 * @param parameterKey name of the parameter holding the XPath expression
		 * @return the selected strings; empty if the parameter is not set
		 */
		private String[] getNodesAsStringsFromParametersValue(Document doc, String parameterKey) {
			// a missing parameter becomes the empty expression
			return getNodesAsStringsFromXpath(doc, localParameters.getParameterValue(parameterKey,""));
		}

		/**
		 * Evaluates an XPath expression against the document's root element and
		 * returns the distinct strings it yields, in the order they were found.
		 * <p>
		 * Expressions starting with {@code string}, {@code concat(} or
		 * {@code replace(} are evaluated as a single string; all others are
		 * evaluated as a node set whose text contents are collected.
		 *
		 * @param doc the document to query
		 * @param xpathString the XPath expression; an empty string selects nothing
		 * @return the distinct values in encounter order, possibly empty
		 * @throws IllegalArgumentException if the expression cannot be evaluated
		 */
		private String[] getNodesAsStringsFromXpath(Document doc, String xpathString) {
			if (xpathString.isEmpty()) {
				return new String[0];
			}
			
			// LinkedHashSet removes duplicates while keeping document order, so that
			// e.g. the first title or author in the source stays first in the metadata
			Set<String> values = new LinkedHashSet<String>();
			XPath xpath = xpathFactory.newXPath();
			try {
				// this is awful to have to specify the return type, this should probably use a different library
				if (xpathString.startsWith("string") || xpathString.startsWith("concat(") || xpathString.startsWith("replace(")) {
					String s = (String) xpath.evaluate(xpathString, doc.getDocumentElement(), XPathConstants.STRING);
					values.add(s);
				}
				else {
					NodeList nodeList = (NodeList) xpath.evaluate(xpathString, doc.getDocumentElement(), XPathConstants.NODESET);
					for (int i=0, len=nodeList.getLength(); i<len; i++) {
						values.add(nodeList.item(i).getTextContent());
					}
				}
			}
			catch (XPathExpressionException e) {
				throw new IllegalArgumentException(
						"A problem was encountered proccesing this XPath query: " + xpathString, e);
			}
			return values.toArray(new String[0]);
		}
		
		/**
		 * Returns the metadata of the extracted document. Once getInputStream()
		 * has run, this is the locally built metadata; before that, the metadata
		 * is read from storage under this source's id.
		 *
		 * @throws IOException if the metadata has to be read from storage and that fails
		 */
		@Override
		public DocumentMetadata getMetadata() throws IOException {
			if (isProcessed) {
				return this.metadata;
			}
			return storedDocumentSourceStorage.getStoredDocumentSourceMetadata(id);
		}

		/**
		 * Returns the id given to this source at construction.
		 */
		@Override
		public String getUniqueId() {
			return id;
		}
	}
	

}