/** * Copyright 2015 StreamSets Inc. * * Licensed under the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.streamsets.pipeline.lib.xml; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.streamsets.pipeline.api.Field; import com.streamsets.pipeline.api.ext.io.ObjectLengthException; import com.streamsets.pipeline.api.impl.Utils; import com.streamsets.pipeline.lib.xml.xpath.MatchStatus; import com.streamsets.pipeline.lib.xml.xpath.XPathMatchingEventReader; import org.apache.commons.lang3.StringUtils; import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.Characters; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; public class StreamingXmlParser { public static final String VALUE_KEY = "value"; public static final String ATTR_PREFIX_KEY = "attr|"; private static final String NS_PREFIX_KEY = "ns|"; public static final String GENERATED_NAMESPACE_PREFIX = "ns"; public static final String XPATH_KEY = "xpath"; public static final String XMLATTR_ATTRIBUTE_PREFIX = "xmlAttr:"; private final Reader reader; private final XPathMatchingEventReader xmlEventReader; private final boolean useFieldAttributesInsteadOfFields; private String recordElement; private boolean closed; private String lastParsedFieldXpathPrefix; private final LinkedList<String> elementNameStack = new LinkedList<>(); private int generatedNsPrefixCount = 1; private final Map<String, String> namespaceUriToPrefix = new HashMap<>(); // reads a full XML document as a single Field public StreamingXmlParser(Reader xmlEventReader) throws IOException, XMLStreamException { this(xmlEventReader, null, null, 0, true); } // reads an XML document producing a Field for each first level 'recordElement' element, other first level elements // are ignored public StreamingXmlParser(Reader xmlEventReader, String recordElement) throws IOException, XMLStreamException { this(xmlEventReader, recordElement, null, 0, true); } public StreamingXmlParser(Reader xmlEventReader, String recordElement, Map<String, String> namespaces) throws IOException, XMLStreamException { this(xmlEventReader, recordElement, namespaces, 0, true); } // reads an XML document producing a Field for each first level 'recordElement' element, other first level elements // are ignored public StreamingXmlParser(Reader reader, String recordElement, long initialPosition) throws IOException, XMLStreamException { this(reader, recordElement, null, initialPosition, true); } public StreamingXmlParser( Reader reader, String recordElement, Map<String, String> namespaces, long initialPosition, boolean useFieldAttributesInsteadOfFields ) throws IOException, XMLStreamException { this.reader = reader; this.useFieldAttributesInsteadOfFields = useFieldAttributesInsteadOfFields; if (Strings.isNullOrEmpty(recordElement)) { this.recordElement = Constants.ROOT_ELEMENT_PATH; } else { this.recordElement = recordElement; } XMLInputFactory factory = XMLInputFactory.newFactory(); factory.setProperty("javax.xml.stream.isCoalescing", true); factory.setProperty("javax.xml.stream.isSupportingExternalEntities", false); factory.setProperty("javax.xml.stream.supportDTD", false); this.xmlEventReader = new XPathMatchingEventReader(factory.createXMLEventReader(reader), this.recordElement, namespaces); while (hasNext(xmlEventReader) && !peek(xmlEventReader).isEndDocument() && !peek(xmlEventReader).isStartElement()) { read(xmlEventReader); } if (recordElement == null || recordElement.isEmpty()) { StartElement startE = (StartElement) peek(xmlEventReader); this.recordElement = startE.getName().getLocalPart(); } else { //consuming root StartElement startE = (StartElement) read(xmlEventReader); elementNameStack.addFirst(getNameAndTrackNs(startE.getName())); } if (initialPosition > 0) { //fastforward to initial position while (hasNext(xmlEventReader) && peek(xmlEventReader).getLocation().getCharacterOffset() < initialPosition) { read(xmlEventReader); fastForwardLeaseReader(); } xmlEventReader.clearLastMatch(); } } public Reader getReader() { return reader; } public String getLastParsedFieldXpathPrefix() { return lastParsedFieldXpathPrefix; } public Map<String, String> getNamespaceUriToPrefixMappings() { return Collections.unmodifiableMap(namespaceUriToPrefix); } public void close() { closed = true; try { xmlEventReader.close(); } catch (Exception ex) { // NOP } elementNameStack.clear(); generatedNsPrefixCount = 1; namespaceUriToPrefix.clear(); } private String getNameAndTrackNs(QName name) { final String uri = name.getNamespaceURI(); if (!Strings.isNullOrEmpty(uri)) { String prefix; if (!namespaceUriToPrefix.containsKey(uri)) { prefix = name.getPrefix(); if (Strings.isNullOrEmpty(prefix)) { //generate a new namespace prefix for it prefix = GENERATED_NAMESPACE_PREFIX + generatedNsPrefixCount++; } //else the element already came with a prefix, so just use that namespaceUriToPrefix.put(uri, prefix); } else { prefix = namespaceUriToPrefix.get(uri); } return prefix + ":" + name.getLocalPart(); } else { // element is in no namespace return name.getLocalPart(); } } public Field read() throws IOException, XMLStreamException { if (closed) { throw new IOException("The parser has been closed"); } Field field = null; if (hasNext(xmlEventReader)) { int depth = 0; // we need to skip first level elements that are not the record delimiter and we have to ignore record delimiter // elements deeper than first level while (hasNext(xmlEventReader) && !isStartOfRecord(peek(xmlEventReader), depth)) { XMLEvent event = read(xmlEventReader); if (event.isStartElement()) { elementNameStack.addFirst(getNameAndTrackNs(event.asStartElement().getName())); depth++; } else if (event.getEventType() == XMLEvent.END_ELEMENT) { elementNameStack.removeFirst(); depth--; } } if (hasNext(xmlEventReader)) { StartElement startE = (StartElement) xmlEventReader.getLastMatchingEvent(); field = parse(xmlEventReader, startE); // the while loop consumes the start element for a record, and the parse method above consumes the end // so remove it from the stack elementNameStack.removeFirst(); } // if advancing, don't evaluate XPath matches xmlEventReader.clearLastMatch(); } return field; } protected void fastForwardLeaseReader() { } public long getReaderPosition() throws XMLStreamException { return (hasNext(xmlEventReader)) ? peek(xmlEventReader).getLocation().getCharacterOffset() : -1; } public String getXpathPrefix() { return "/" + StringUtils.join(Lists.reverse(elementNameStack), "/"); } private boolean isStartOfRecord(XMLEvent event, int depth) { return xmlEventReader.getLastElementMatchResult() == MatchStatus.ELEMENT_MATCH; } boolean isIgnorable(XMLEvent event) { return event.getEventType() == XMLEvent.PROCESSING_INSTRUCTION || event.getEventType() == XMLEvent.COMMENT; } void skipIgnorable(XMLEventReader reader) throws XMLStreamException { while (reader.hasNext() && isIgnorable(reader.peek())) { reader.nextEvent(); } } boolean hasNext(XMLEventReader reader) throws XMLStreamException { skipIgnorable(reader); return reader.hasNext(); } XMLEvent peek(XMLEventReader reader) throws XMLStreamException { skipIgnorable(reader); return reader.peek(); } XMLEvent read(XMLEventReader reader) throws XMLStreamException { skipIgnorable(reader); return reader.nextEvent(); } String getName(String namePrefix, Attribute element) { return getName(element.getName(), namePrefix); } String getName(StartElement element) { return getName(element.getName(), null); } private String getName(QName name, String namePrefix) { StringBuilder sb = new StringBuilder(); if (!Strings.isNullOrEmpty(namePrefix)) { sb.append(namePrefix); } sb.append(getNameAndTrackNs(name)); return sb.toString(); } Map<String, Field> toField(StartElement startE) { Map<String, Field> map = new LinkedHashMap<>(); Iterator attrs = startE.getAttributes(); while (attrs.hasNext()) { Attribute attr = (Attribute) attrs.next(); map.put(getName(ATTR_PREFIX_KEY, attr), Field.create(attr.getValue())); } Iterator nss = startE.getNamespaces(); while (nss.hasNext()) { Namespace ns = (Namespace) nss.next(); map.put(getName(NS_PREFIX_KEY, ns), Field.create(ns.getNamespaceURI())); } return map; } protected boolean isOverMaxObjectLength() throws XMLStreamException { return false; } @SuppressWarnings("unchecked") private void addContent(Map<String, Object> contents, String name, Field field) throws XMLStreamException, ObjectLengthException { throwIfOverMaxObjectLength(); List<Field> list = (List<Field>) contents.get(name); if (list == null) { list = new ArrayList<>(); contents.put(name, list); } list.add(field); } @SuppressWarnings("unchecked") Field parse(XMLEventReader reader, StartElement startE) throws XMLStreamException, ObjectLengthException { Map<String, Field> map = this.useFieldAttributesInsteadOfFields ? new LinkedHashMap<>() : toField(startE); Map<String, Field> startEMap = map; Map<String, Object> contents = new LinkedHashMap<>(); boolean maybeText = true; while (hasNext(reader) && !peek(reader).isEndElement()) { XMLEvent next = read(reader); if (next.isCharacters()) { // If this set of characters is all whitespace, ignore. if (next.asCharacters().isWhiteSpace()) { continue; } else if (peek(reader).isEndElement() && maybeText) { contents.put(VALUE_KEY, Field.create(((Characters)next).getData())); } else if (peek(reader).isStartElement()) { StartElement subStartE = (StartElement) read(reader); Field subField = parse(reader, subStartE); addContent(contents, getName(subStartE), subField); if (hasNext(reader) && peek(reader).isCharacters()) { read(reader); } } else if (maybeText) { throw new XMLStreamException(Utils.format( "Unexpected XMLEvent '{}', it should be START_ELEMENT or END_ELEMENT", next), next.getLocation()); } } else if (next.isStartElement()) { String name = getName((StartElement) next); Field field = parse(reader, (StartElement) next); addContent(contents, name, field); } else { throw new XMLStreamException(Utils.format("Unexpected XMLEvent '{}', it should be START_ELEMENT or CHARACTERS", next), next.getLocation()); } maybeText = false; } if (hasNext(reader)) { EndElement endE = (EndElement) read(reader); if (!endE.getName().equals(startE.getName())) { throw new XMLStreamException(Utils.format("Unexpected EndElement '{}', it should be '{}'", endE.getName().getLocalPart(), startE.getName().getLocalPart()), endE.getLocation()); } for (Map.Entry<String, Object> entry : contents.entrySet()) { if (entry.getValue() instanceof Field) { startEMap.put(entry.getKey(), (Field) entry.getValue()); } else { startEMap.put(entry.getKey(), Field.create((List<Field>)entry.getValue())); } } } final Field field = Field.create(startEMap); if (this.useFieldAttributesInsteadOfFields) { Iterator attrs = startE.getAttributes(); while (attrs.hasNext()) { Attribute attr = (Attribute) attrs.next(); field.setAttribute(getName(XMLATTR_ATTRIBUTE_PREFIX, attr), attr.getValue()); } Iterator nss = startE.getNamespaces(); while (nss.hasNext()) { Namespace ns = (Namespace) nss.next(); field.setAttribute(getName(null, ns), ns.getNamespaceURI()); } } lastParsedFieldXpathPrefix = getXpathPrefix(); return field; } protected void throwIfOverMaxObjectLength() throws XMLStreamException, ObjectLengthException { } }