/*
* #%L
* Common package for I/O and related utilities
* %%
* Copyright (C) 2005 - 2015 Open Microscopy Environment:
* - Board of Regents of the University of Wisconsin-Madison
* - Glencoe Software, Inc.
* - University of Dundee
* %%
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* #L%
*/
package loci.common.xml;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.ErrorListener;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;
import javax.xml.validation.Validator;
import loci.common.Constants;
import loci.common.RandomAccessInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* A utility class for working with XML.
*
* @author Curtis Rueden ctrueden at wisc.edu
* @author Chris Allan callan at blackcat.ca
* @author Melissa Linkert melissa at glencoesoftware.com
*/
public final class XMLTools {
// -- Constants --

/** Logger used throughout this class, including by {@link XMLListener}. */
static final Logger LOGGER = LoggerFactory.getLogger(XMLTools.class);

/** Namespace URI written as the {@code xmlns:xsi} attribute value. */
private static final String XSI_NS =
  "http://www.w3.org/2001/XMLSchema-instance";

/** Schema language identifier passed to {@link SchemaFactory#newInstance(String)}. */
private static final String XML_SCHEMA_PATH =
  "http://www.w3.org/2001/XMLSchema";

/**
 * Shared factory used to compile schemas during validation.
 * NOTE(review): the JAXP documentation describes SchemaFactory as not
 * thread-safe, yet this instance is shared by all threads; confirm whether
 * callers need external synchronization.
 */
private static final SchemaFactory FACTORY =
  SchemaFactory.newInstance(XML_SCHEMA_PATH);

/**
 * Shared factory used to create transformers and stylesheet templates.
 * NOTE(review): shared across threads in the same way as {@link #FACTORY};
 * confirm that concurrent use is acceptable.
 */
private static final TransformerFactory transformFactory =
  createTransformFactory();

/**
 * Creates the shared {@link TransformerFactory} and installs an
 * {@link XMLListener} as its error listener.
 */
private static TransformerFactory createTransformFactory() {
  TransformerFactory factory = TransformerFactory.newInstance();
  factory.setErrorListener(new XMLListener());
  return factory;
}

// -- Fields --

/**
 * Per-thread cache of compiled schemas keyed by schema location, so that
 * each thread compiles a given schema at most once.
 */
private static final ThreadLocal<HashMap<URI, Schema>> schemas =
  new ThreadLocal<HashMap<URI, Schema>>()
{
  @Override
  protected HashMap<URI, Schema> initialValue() {
    return new HashMap<URI, Schema>();
  }
};

// -- Constructor --

/** Private constructor: this utility class is never instantiated here. */
private XMLTools() { }
// -- XML to/from DOM --
/**
 * Builds a fresh {@link DocumentBuilder} from a newly obtained
 * {@link DocumentBuilderFactory}, using the factory's default settings.
 *
 * @return a new document builder.
 * @throws RuntimeException wrapping the {@link ParserConfigurationException}
 *   (which is also logged at error level) if no builder can be created.
 */
public static DocumentBuilder createBuilder() {
  final DocumentBuilderFactory builderFactory =
    DocumentBuilderFactory.newInstance();
  try {
    return builderFactory.newDocumentBuilder();
  }
  catch (ParserConfigurationException configFailure) {
    LOGGER.error("Cannot create DocumentBuilder", configFailure);
    throw new RuntimeException(configFailure);
  }
}
/**
 * Creates an empty DOM document.
 *
 * @return the result of {@link DocumentBuilder#newDocument()} invoked on a
 *   builder obtained from {@link #createBuilder()}.
 */
public static Document createDocument() {
  final DocumentBuilder builder = createBuilder();
  return builder.newDocument();
}
/**
 * Parses a DOM from the given XML file on disk.
 * The file is opened here and closed again before this method returns,
 * whether or not parsing succeeds.
 *
 * @param file the XML file to read.
 * @return the parsed document.
 */
public static Document parseDOM(File file)
  throws ParserConfigurationException, SAXException, IOException
{
  final InputStream fileStream = new FileInputStream(file);
  try {
    return parseDOM(fileStream);
  }
  finally {
    fileStream.close();
  }
}

/**
 * Parses a DOM from the given XML string.
 * The string is converted to bytes using {@code Constants.ENCODING} and
 * then parsed as a stream.
 *
 * @param xml the XML text to parse.
 * @return the parsed document.
 */
public static Document parseDOM(String xml)
  throws ParserConfigurationException, SAXException, IOException
{
  final InputStream byteStream =
    new ByteArrayInputStream(xml.getBytes(Constants.ENCODING));
  try {
    return parseDOM(byteStream);
  }
  finally {
    byteStream.close();
  }
}

/**
 * Parses a DOM from the given XML input stream.
 * A leading UTF-8 byte order mark, if present, is skipped before parsing.
 * This method does not close the stream.
 *
 * @param is the stream to read XML from.
 * @return the parsed document.
 */
public static Document parseDOM(InputStream is)
  throws ParserConfigurationException, SAXException, IOException
{
  // the BOM check relies on mark/reset, so wrap streams that lack it
  InputStream in = is;
  if (!in.markSupported()) {
    in = new BufferedInputStream(is);
  }
  checkUTF8(in);

  // Java XML factories are not declared to be thread safe,
  // so a new factory is obtained on every call
  final DocumentBuilder db =
    DocumentBuilderFactory.newInstance().newDocumentBuilder();
  db.setErrorHandler(new ParserErrorHandler());
  return db.parse(in);
}
/**
 * Serializes the given DOM to a string, including the XML declaration.
 *
 * @param doc the document to serialize.
 * @return the document as XML text.
 */
public static String getXML(Document doc)
  throws TransformerConfigurationException, TransformerException
{
  final StringWriter buffer = new StringWriter();
  writeXML(new StreamResult(buffer), doc, true);
  return buffer.getBuffer().toString();
}
/**
 * Dumps the given OME-XML DOM tree to a string, including the XML
 * declaration.
 *
 * @param schemaLocation if non-null, written to the root element as
 *   {@code xsi:schemaLocation} (together with {@code xmlns:xsi}); if null,
 *   neither attribute is added.
 * @param doc the document that receives {@code r} as a child.
 * @param r the root element to append and serialize.
 * @return OME-XML as a string.
 */
public static String dumpXML(String schemaLocation, Document doc, Element r) {
  final boolean withDeclaration = true;
  return dumpXML(schemaLocation, doc, r, withDeclaration);
}

/**
 * Dumps the given OME-XML DOM tree to a string.
 * Note that this mutates its arguments: {@code r} may gain attributes and
 * is appended to {@code doc}.
 *
 * @param schemaLocation if non-null, written to the root element as
 *   {@code xsi:schemaLocation} (together with {@code xmlns:xsi}); if null,
 *   neither attribute is added.
 * @param doc the document that receives {@code r} as a child.
 * @param r the root element to append and serialize.
 * @param includeXMLDeclaration whether the output starts with an XML
 *   declaration.
 * @return OME-XML as a string.
 * @throws RuntimeException if the document cannot be serialized; the
 *   underlying failure is logged as a warning and attached as the cause.
 */
public static String dumpXML(String schemaLocation, Document doc, Element r,
  boolean includeXMLDeclaration)
{
  if (schemaLocation != null) {
    r.setAttribute("xmlns:xsi", XSI_NS);
    r.setAttribute("xsi:schemaLocation", schemaLocation);
  }
  doc.appendChild(r);

  final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  try {
    writeXML(buffer, doc, includeXMLDeclaration);
    return buffer.toString(Constants.ENCODING);
  }
  catch (TransformerException transformFailure) {
    LOGGER.warn("Failed to create XML", transformFailure);
    throw new RuntimeException(transformFailure);
  }
  catch (UnsupportedEncodingException encodingFailure) {
    LOGGER.warn("Failed to create XML", encodingFailure);
    throw new RuntimeException(encodingFailure);
  }
}
// -- Filtering --
/** Escape special characters. */
public static String escapeXML(String s) {
StringBuffer sb = new StringBuffer();
for (int i=0; i<s.length(); i++) {
char c = s.charAt(i);
if (c == '<') {
sb.append("<");
}
else if (c == '>') {
sb.append(">");
}
else if (c == '&') {
sb.append("&");
}
else if (c == '\"') {
sb.append(""");
}
else if (c == '\'') {
sb.append("'");
}
else {
sb.append(c);
}
}
return sb.toString();
}
/**
 * Replaces characters that are not wanted in an XML string with spaces.
 * ISO control characters other than newline, tab and carriage return, as
 * well as undefined characters, become a space. In addition, the
 * ampersand of every {@code &#} sequence is blanked out so that no
 * numeric character reference survives. The result always has the same
 * length as the input.
 *
 * @param s the text to clean; must not be null.
 * @return the cleaned text.
 */
public static String sanitizeXML(String s) {
  final char[] chars = s.toCharArray();
  for (int pos = 0; pos < chars.length; pos++) {
    final char current = chars[pos];
    final boolean keptWhitespace =
      current == '\n' || current == '\t' || current == '\r';
    final boolean strayControl =
      Character.isISOControl(current) && !keptWhitespace;
    if (strayControl || !Character.isDefined(current)) {
      chars[pos] = ' ';
    }
    // eliminate invalid sequences; '#' itself is never replaced above,
    // so testing the unmodified character is sufficient
    if (pos > 0 && current == '#' && chars[pos - 1] == '&') {
      chars[pos - 1] = ' ';
    }
  }
  return new String(chars);
}
/**
 * Indents XML to be more readable, using three spaces per nesting level
 * and without preserving character data.
 */
public static String indentXML(String xml) {
  final int defaultSpacing = 3;
  return indentXML(xml, defaultSpacing, false);
}

/**
 * Indents XML by the given number of spaces per nesting level, without
 * preserving character data.
 */
public static String indentXML(String xml, int spacing) {
  final boolean keepCData = false;
  return indentXML(xml, spacing, keepCData);
}

/**
 * Indents XML to be more readable using three spaces per nesting level,
 * avoiding any whitespace injection into CDATA if the preserveCData flag
 * is set.
 */
public static String indentXML(String xml, boolean preserveCData) {
  final int defaultSpacing = 3;
  return indentXML(xml, defaultSpacing, preserveCData);
}

/**
 * Indents XML by the given spacing to be more readable, avoiding any
 * whitespace injection into CDATA if the preserveCData flag is set.
 * <p>
 * The input is split on {@code <} and {@code >}; every tag and every run
 * of text is trimmed and written on its own line, indented according to
 * the current nesting depth. When {@code preserveCData} is set, a run of
 * text and the tag that follows it stay on the line of the preceding tag.
 * The result always ends with a newline.
 * </p>
 *
 * @param xml the XML text to format; null yields null.
 * @param spacing number of spaces added per nesting level.
 * @param preserveCData whether text content is kept free of injected
 *   whitespace.
 * @return the formatted XML, or null if {@code xml} was null.
 */
public static String indentXML(String xml, int spacing,
  boolean preserveCData)
{
  if (xml == null) return null; // garbage in, garbage out

  final StringBuilder out = new StringBuilder();
  final StringTokenizer tokens = new StringTokenizer(xml, "<>", true);
  int depth = 0;        // current indentation, in spaces
  int suppress = 0;     // number of upcoming pieces written without layout
  boolean atStart = true;
  boolean inTag = false;

  while (tokens.hasMoreTokens()) {
    final String piece = tokens.nextToken().trim();
    if (piece.length() == 0) continue;
    if ("<".equals(piece)) {
      inTag = true;
      continue;
    }
    if (inTag && ">".equals(piece)) {
      inTag = false;
      continue;
    }

    // text content: keep it and the following tag on the current line
    if (preserveCData && !inTag) suppress = 2;
    final boolean layout = suppress == 0;
    final boolean closingTag = inTag && piece.startsWith("/");

    // advance to next line, except before the very first piece
    if (layout) {
      if (atStart) atStart = false;
      else out.append("\n");
    }

    // adjust indent backwards
    if (closingTag) depth -= spacing;

    // apply indent
    if (layout) {
      for (int pad = 0; pad < depth; pad++) out.append(' ');
    }

    // output element contents
    if (inTag) out.append('<').append(piece).append('>');
    else out.append(piece);

    // adjust indent forwards, but only for genuine opening tags
    final boolean opensLevel = inTag && !closingTag &&
      !piece.startsWith("?") && // ?xml tag, probably
      !piece.endsWith("/") &&   // standalone element
      !piece.startsWith("!");   // comment
    if (layout && opensLevel) depth += spacing;

    if (suppress > 0) suppress--;
  }
  out.append("\n");
  return out.toString();
}
// -- Parsing --
/**
 * Parses the given XML string into a table of key/value pairs, as
 * collected by a {@link MetadataHandler}.
 *
 * @param xml the XML text to parse.
 * @return the metadata gathered by the handler.
 * @throws IOException if reading or parsing fails.
 */
public static Hashtable<String, String> parseXML(String xml)
  throws IOException
{
  final MetadataHandler collector = new MetadataHandler();
  parseXML(xml, collector);
  return collector.getMetadata();
}

/**
 * Parses the given XML string using the specified XML handler.
 * The string is converted to bytes using {@code Constants.ENCODING}.
 *
 * @param xml the XML text to parse.
 * @param handler receives the SAX events.
 * @throws IOException if reading or parsing fails.
 */
public static void parseXML(String xml, DefaultHandler handler)
  throws IOException
{
  final byte[] encoded = xml.getBytes(Constants.ENCODING);
  parseXML(encoded, handler);
}

/**
 * Parses the XML contained in the given input stream using the specified
 * XML handler.
 * Be very careful, as 'stream' <b>will</b> be closed by the SAX parser.
 *
 * @param stream the stream to read XML from.
 * @param handler receives the SAX events.
 * @throws IOException if reading or parsing fails.
 */
public static void parseXML(RandomAccessInputStream stream,
  DefaultHandler handler) throws IOException
{
  // widen the static type so that the generic stream overload is chosen
  final InputStream plainStream = stream;
  parseXML(plainStream, handler);
}

/**
 * Parses the XML contained in the given byte array using the specified
 * XML handler.
 *
 * @param xml the encoded XML document.
 * @param handler receives the SAX events.
 * @throws IOException if reading or parsing fails.
 */
public static void parseXML(byte[] xml, DefaultHandler handler)
  throws IOException
{
  final InputStream byteStream = new ByteArrayInputStream(xml);
  parseXML(byteStream, handler);
}

/**
 * Parses the XML contained in the given InputStream using the
 * specified XML handler.
 *
 * @param xml the stream to read XML from.
 * @param handler receives the SAX events.
 * @throws IOException if reading fails, or wrapping the underlying
 *   {@link ParserConfigurationException} or {@link SAXException} as its
 *   cause if the parser cannot be created or the document cannot be parsed.
 */
public static void parseXML(InputStream xml, DefaultHandler handler)
  throws IOException
{
  try {
    // Java XML factories are not declared to be thread safe,
    // so a new factory is obtained on every call
    final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
    saxParser.parse(xml, handler);
  }
  catch (ParserConfigurationException configFailure) {
    throw (IOException) new IOException().initCause(configFailure);
  }
  catch (SAXException parseFailure) {
    throw (IOException) new IOException().initCause(parseFailure);
  }
}
// -- I/O --
/**
 * Writes the specified DOM to the given output stream, including the XML
 * declaration.
 *
 * @param os destination stream.
 * @param doc the document to serialize.
 * @throws TransformerException if serialization fails.
 */
public static void writeXML(OutputStream os, Document doc)
  throws TransformerException
{
  final boolean withDeclaration = true;
  writeXML(os, doc, withDeclaration);
}

/**
 * Writes the specified DOM to the given output stream.
 *
 * @param os destination stream.
 * @param doc the document to serialize.
 * @param includeXMLDeclaration whether the output starts with an XML
 *   declaration.
 * @throws TransformerException if serialization fails.
 */
public static void writeXML(OutputStream os, Document doc,
  boolean includeXMLDeclaration)
  throws TransformerException
{
  final Result target = new StreamResult(os);
  writeXML(target, doc, includeXMLDeclaration);
}

/**
 * Writes the specified DOM to the given transformation result, using a
 * transformer created without a stylesheet.
 *
 * @param output destination of the serialized document.
 * @param doc the document to serialize.
 * @param includeXMLDeclaration whether the output starts with an XML
 *   declaration.
 * @throws TransformerException if the transformer cannot be created or
 *   serialization fails.
 */
public static void writeXML(Result output, Document doc,
  boolean includeXMLDeclaration)
  throws TransformerException
{
  final Transformer identity = transformFactory.newTransformer();
  if (!includeXMLDeclaration) {
    identity.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
  }
  identity.transform(new DOMSource(doc), output);
}
// -- XSLT --
/**
 * Gets an XSLT template from the given resource location.
 * <p>
 * When {@code sourceClass} is null, {@code resourcePath} is opened as a
 * file on disk; otherwise it is looked up via
 * {@link Class#getResourceAsStream(String)} on {@code sourceClass}. The
 * stream opened here is always closed before returning.
 * </p>
 *
 * @param resourcePath file path or class resource name of the stylesheet.
 * @param sourceClass class used to locate the resource, or null to read
 *   {@code resourcePath} from the file system.
 * @return the compiled stylesheet, or null if the stylesheet could not be
 *   opened, located or compiled (the reason is logged at debug level).
 */
public static Templates getStylesheet(String resourcePath,
  Class<?> sourceClass)
{
  final InputStream xsltStream;
  if (sourceClass == null) {
    try {
      xsltStream = new FileInputStream(resourcePath);
    }
    catch (IOException exc) {
      LOGGER.debug("Could not open file", exc);
      return null;
    }
  }
  else {
    xsltStream = sourceClass.getResourceAsStream(resourcePath);
    if (xsltStream == null) {
      // a missing resource yields null rather than an exception; stop here
      // instead of handing an empty source to the transformer factory
      LOGGER.debug("Could not find resource {}", resourcePath);
      return null;
    }
  }

  try {
    // NOTE(review): Java XML factories are not declared to be thread safe,
    // yet the factory used here is shared; confirm this is acceptable
    return transformFactory.newTemplates(new StreamSource(xsltStream));
  }
  catch (TransformerConfigurationException exc) {
    LOGGER.debug("Could not construct template", exc);
    return null;
  }
  finally {
    try {
      xsltStream.close();
    }
    catch (IOException e) {
      LOGGER.debug("Could not close file", e);
    }
  }
}
/** Matches a namespace declaration and captures the declared prefix. */
private static final Pattern NAMESPACE_DECLARATION =
  Pattern.compile(" xmlns:(\\w+)");

/** Matches the start of a prefixed opening or closing tag. */
private static final Pattern PREFIXED_TAG = Pattern.compile("</?(\\w+):");

/** Matches a namespace declaration whose value is the empty string. */
private static final Pattern EMPTY_NAMESPACE_DECLARATION =
  Pattern.compile(" xmlns:(\\w+)=\"\"");

/**
 * Replaces NS:tag with NS_tag for undeclared namespaces.
 * <p>
 * Prefixes are considered declared if they appear in an {@code xmlns:}
 * attribute of the first real tag (after any XML declaration and leading
 * comments); the comparison is case-insensitive. Prefixes equal to
 * {@code OME} (ignoring case) or starting with {@code ns} are always left
 * alone. Finally, declarations with an empty value
 * ({@code xmlns:prefix=""}) are removed from the first tag, leaving their
 * leading space in place.
 * </p>
 *
 * @param xml the XML text to rewrite; must not be null.
 * @return the rewritten XML, or {@code xml} itself if no tag was found.
 */
public static String avoidUndeclaredNamespaces(String xml) {
  int gt = xml.indexOf('>');
  if (gt > 0 && xml.startsWith("<?xml ")) {
    gt = xml.indexOf('>', gt + 1);
  }
  if (gt <= 0) return xml;

  String firstTag = xml.substring(0, gt + 1).toLowerCase();
  // the first tag is a comment; we need to find the first "real" tag
  while (firstTag.endsWith("-->")) {
    gt = xml.indexOf('>', gt + 1);
    firstTag = xml.substring(0, gt + 1).toLowerCase();
  }

  // collect the (lower-cased) prefixes declared on the first tag
  final Set<String> namespaces = new HashSet<String>();
  Matcher matcher = NAMESPACE_DECLARATION.matcher(firstTag);
  while (matcher.find()) {
    namespaces.add(matcher.group(1));
  }

  // swapping ':' for '_' keeps the length unchanged, so offsets found in
  // the original text remain valid for the buffer being edited
  final StringBuilder result = new StringBuilder(xml);
  matcher = PREFIXED_TAG.matcher(xml);
  while (matcher.find()) {
    final String namespace = matcher.group(1);
    if (!namespace.equalsIgnoreCase("OME") && !namespace.startsWith("ns") &&
      !namespaces.contains(namespace.toLowerCase()))
    {
      result.setCharAt(matcher.end() - 1, '_');
    }
  }

  // each removal shortens the buffer, so later match offsets (which refer
  // to the unmodified first tag) must be shifted by what was already cut
  int removed = 0;
  matcher = EMPTY_NAMESPACE_DECLARATION.matcher(firstTag);
  while (matcher.find()) {
    final int cutStart = matcher.start() + 1; // keep the leading space
    final int cutEnd = matcher.end();
    result.delete(cutStart - removed, cutEnd - removed);
    removed += cutEnd - cutStart;
  }
  return result.toString();
}
/**
 * Transforms the given XML string using the specified XSLT stylesheet.
 * Undeclared namespace prefixes are neutralized via
 * {@link #avoidUndeclaredNamespaces(String)} before the transformation.
 *
 * @param xml the XML text to transform.
 * @param xslt the compiled stylesheet to apply.
 * @return the transformation output.
 * @throws IOException if the transformation fails.
 */
public static String transformXML(String xml, Templates xslt)
  throws IOException
{
  final String cleaned = avoidUndeclaredNamespaces(xml);
  final Source cleanedSource = new StreamSource(new StringReader(cleaned));
  return transformXML(cleanedSource, xslt);
}

/**
 * Transforms the given XML data using the specified XSLT stylesheet.
 *
 * @param xmlSource the XML input.
 * @param xslt the compiled stylesheet to apply.
 * @return the transformation output.
 * @throws IOException wrapping the underlying {@link TransformerException}
 *   as its cause if the transformer cannot be created or the
 *   transformation fails.
 */
public static String transformXML(Source xmlSource, Templates xslt)
  throws IOException
{
  final StringWriter output = new StringWriter();
  try {
    final Transformer transformer = xslt.newTransformer();
    transformer.setErrorListener(new XMLListener());
    transformer.transform(xmlSource, new StreamResult(output));
  }
  catch (TransformerException failure) {
    // also covers TransformerConfigurationException, which is a subclass
    final IOException wrapped = new IOException();
    wrapped.initCause(failure);
    throw wrapped;
  }
  return output.toString();
}
// -- Validation --
/**
 * Attempts to validate the given XML string using
 * Java's XML validation facility. Requires Java 1.5+.
 * The document is described as "XML" in log messages.
 *
 * @param xml The XML string to validate.
 * @return whether or not validation was successful.
 */
public static boolean validateXML(String xml) {
  return validateXML(xml, null);
}

/**
 * Attempts to validate the given XML string using
 * Java's XML validation facility. Requires Java 1.5+.
 * <p>
 * The schema location is read from the document itself by a
 * {@link ValidationSAXHandler}; the schema is then compiled (or taken from
 * the per-thread cache) and the document validated against it. Every
 * failure along the way is logged and reported as {@code false}; no
 * exception is propagated for parse, schema or I/O problems.
 * </p>
 *
 * @param xml The XML string to validate.
 * @param label String describing the type of XML being validated; null is
 *   treated as "XML".
 * @return whether or not validation was successful.
 */
public static boolean validateXML(String xml, String label) {
  if (label == null) label = "XML";
  Exception exception = null;

  // get path to schema from root element using SAX
  LOGGER.info("Parsing schema path");
  ValidationSAXHandler saxHandler = new ValidationSAXHandler();
  try {
    // Java XML factories are not declared to be thread safe
    SAXParserFactory factory = SAXParserFactory.newInstance();
    SAXParser saxParser = factory.newSAXParser();
    InputStream xmlBytes =
      new ByteArrayInputStream(xml.getBytes(Constants.ENCODING));
    saxParser.parse(xmlBytes, saxHandler);
  }
  catch (ParserConfigurationException exc) { exception = exc; }
  catch (SAXException exc) { exception = exc; }
  catch (IOException exc) { exception = exc; }
  if (exception != null) {
    LOGGER.warn("Error parsing schema path from {}", label, exception);
    return false;
  }
  String schemaPath = saxHandler.getSchemaPath();
  if (schemaPath == null) {
    LOGGER.error("No schema path found. Validation cannot continue.");
    return false;
  }
  LOGGER.info(schemaPath);

  LOGGER.info("Validating {}", label);

  // compile the schema
  URI schemaLocation = null;
  try {
    schemaLocation = new URI(schemaPath);
  }
  catch (URISyntaxException exc) {
    LOGGER.info("Error accessing schema at {}", schemaPath, exc);
    return false;
  }
  // compiled schemas are cached per thread, keyed by their location
  Schema schema = schemas.get().get(schemaLocation);
  if (schema == null) {
    try {
      schema = FACTORY.newSchema(schemaLocation.toURL());
      schemas.get().put(schemaLocation, schema);
    }
    catch (MalformedURLException exc) {
      LOGGER.info("Error parsing schema at {}", schemaPath, exc);
      return false;
    }
    catch (SAXException exc) {
      LOGGER.info("Error parsing schema at {}", schemaPath, exc);
      return false;
    }
  }

  // get a validator from the schema
  Validator validator = schema.newValidator();

  // prepare the XML source
  StringReader reader = new StringReader(xml);
  InputSource is = new InputSource(reader);
  SAXSource source = new SAXSource(is);

  // validate the XML
  ValidationErrorHandler errorHandler = new ValidationErrorHandler();
  validator.setErrorHandler(errorHandler);
  try {
    validator.validate(source);
  }
  catch (IOException exc) { exception = exc; }
  catch (SAXException exc) { exception = exc; }
  final int errors = errorHandler.getErrorCount();
  if (errors > 0) {
    LOGGER.info("Error validating document: {} errors found", errors);
    return false;
  }
  if (exception != null) {
    // validation was aborted without the error handler counting anything
    // (for instance on an I/O failure); the document was not shown to be
    // valid, so do not report success
    LOGGER.warn("Error validating {}", label, exception);
    return false;
  }
  LOGGER.info("No validation errors found.");
  return errorHandler.ok();
}
// -- Helper methods --
/**
 * Skips a UTF-8 byte order mark at the start of the given stream, if there
 * is one. When the stream does not begin with the three BOM bytes, it is
 * rewound so that its position is unchanged.
 * <p>
 * We must discard this character because <a href=
 * "http://www.rgagnon.com/javadetails/java-handle-utf8-file-with-bom.html"
 * >Java does not handle it correctly</a>.
 * </p>
 *
 * @param is the stream to inspect; it is marked and possibly reset here,
 *   so it needs to support {@link InputStream#mark(int)}.
 * @throws IOException if reading from or resetting the stream fails.
 */
private static void checkUTF8(InputStream is) throws IOException {
  // remember the start so that up to three probed bytes can be given back
  is.mark(3);
  final boolean startsWithBom =
    is.read() == 0xef && is.read() == 0xbb && is.read() == 0xbf;
  if (!startsWithBom) {
    // NB: Data stream does not start with the UTF-8 BOM; reset it.
    is.reset();
  }
}
// -- Helper class --
/**
 * {@link ErrorListener} that reports every transformer problem through
 * SLF4J at debug level. Warnings, errors and fatal errors are treated
 * alike, and none of them is rethrown by the listener.
 */
static class XMLListener implements ErrorListener {

  @Override
  public void error(TransformerException e) {
    report(e);
  }

  @Override
  public void fatalError(TransformerException e) {
    report(e);
  }

  @Override
  public void warning(TransformerException e) {
    report(e);
  }

  /** Logs the given problem with an empty message at debug level. */
  private static void report(TransformerException problem) {
    LOGGER.debug("", problem);
  }
}
}