/**
* Copyright 2015 StreamSets Inc.
*
* Licensed under the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.streamsets.pipeline.lib.xml;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.streamsets.pipeline.api.Field;
import com.streamsets.pipeline.api.ext.io.ObjectLengthException;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.lib.xml.xpath.MatchStatus;
import com.streamsets.pipeline.lib.xml.xpath.XPathMatchingEventReader;
import org.apache.commons.lang3.StringUtils;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class StreamingXmlParser {
public static final String VALUE_KEY = "value";
public static final String ATTR_PREFIX_KEY = "attr|";
private static final String NS_PREFIX_KEY = "ns|";
public static final String GENERATED_NAMESPACE_PREFIX = "ns";
public static final String XPATH_KEY = "xpath";
public static final String XMLATTR_ATTRIBUTE_PREFIX = "xmlAttr:";
private final Reader reader;
private final XPathMatchingEventReader xmlEventReader;
private final boolean useFieldAttributesInsteadOfFields;
private String recordElement;
private boolean closed;
private String lastParsedFieldXpathPrefix;
private final LinkedList<String> elementNameStack = new LinkedList<>();
private int generatedNsPrefixCount = 1;
private final Map<String, String> namespaceUriToPrefix = new HashMap<>();
// reads a full XML document as a single Field
public StreamingXmlParser(Reader xmlEventReader) throws IOException, XMLStreamException {
this(xmlEventReader, null, null, 0, true);
}
// reads an XML document producing a Field for each first level 'recordElement' element, other first level elements
// are ignored
public StreamingXmlParser(Reader xmlEventReader, String recordElement)
throws IOException, XMLStreamException {
this(xmlEventReader, recordElement, null, 0, true);
}
public StreamingXmlParser(Reader xmlEventReader, String recordElement, Map<String, String> namespaces)
throws IOException, XMLStreamException {
this(xmlEventReader, recordElement, namespaces, 0, true);
}
// reads an XML document producing a Field for each first level 'recordElement' element, other first level elements
// are ignored
public StreamingXmlParser(Reader reader, String recordElement, long initialPosition)
throws IOException, XMLStreamException {
this(reader, recordElement, null, initialPosition, true);
}
public StreamingXmlParser(
Reader reader,
String recordElement,
Map<String, String> namespaces,
long initialPosition,
boolean useFieldAttributesInsteadOfFields
)
throws IOException, XMLStreamException {
this.reader = reader;
this.useFieldAttributesInsteadOfFields = useFieldAttributesInsteadOfFields;
if (Strings.isNullOrEmpty(recordElement)) {
this.recordElement = Constants.ROOT_ELEMENT_PATH;
} else {
this.recordElement = recordElement;
}
XMLInputFactory factory = XMLInputFactory.newFactory();
factory.setProperty("javax.xml.stream.isCoalescing", true);
factory.setProperty("javax.xml.stream.isSupportingExternalEntities", false);
factory.setProperty("javax.xml.stream.supportDTD", false);
this.xmlEventReader = new XPathMatchingEventReader(factory.createXMLEventReader(reader), this.recordElement, namespaces);
while (hasNext(xmlEventReader) && !peek(xmlEventReader).isEndDocument() && !peek(xmlEventReader).isStartElement()) {
read(xmlEventReader);
}
if (recordElement == null || recordElement.isEmpty()) {
StartElement startE = (StartElement) peek(xmlEventReader);
this.recordElement = startE.getName().getLocalPart();
} else {
//consuming root
StartElement startE = (StartElement) read(xmlEventReader);
elementNameStack.addFirst(getNameAndTrackNs(startE.getName()));
}
if (initialPosition > 0) {
//fastforward to initial position
while (hasNext(xmlEventReader) && peek(xmlEventReader).getLocation().getCharacterOffset() < initialPosition) {
read(xmlEventReader);
fastForwardLeaseReader();
}
xmlEventReader.clearLastMatch();
}
}
public Reader getReader() {
return reader;
}
public String getLastParsedFieldXpathPrefix() {
return lastParsedFieldXpathPrefix;
}
public Map<String, String> getNamespaceUriToPrefixMappings() {
return Collections.unmodifiableMap(namespaceUriToPrefix);
}
public void close() {
closed = true;
try {
xmlEventReader.close();
} catch (Exception ex) {
// NOP
}
elementNameStack.clear();
generatedNsPrefixCount = 1;
namespaceUriToPrefix.clear();
}
private String getNameAndTrackNs(QName name) {
final String uri = name.getNamespaceURI();
if (!Strings.isNullOrEmpty(uri)) {
String prefix;
if (!namespaceUriToPrefix.containsKey(uri)) {
prefix = name.getPrefix();
if (Strings.isNullOrEmpty(prefix)) {
//generate a new namespace prefix for it
prefix = GENERATED_NAMESPACE_PREFIX + generatedNsPrefixCount++;
} //else the element already came with a prefix, so just use that
namespaceUriToPrefix.put(uri, prefix);
} else {
prefix = namespaceUriToPrefix.get(uri);
}
return prefix + ":" + name.getLocalPart();
} else {
// element is in no namespace
return name.getLocalPart();
}
}
public Field read() throws IOException, XMLStreamException {
if (closed) {
throw new IOException("The parser has been closed");
}
Field field = null;
if (hasNext(xmlEventReader)) {
int depth = 0;
// we need to skip first level elements that are not the record delimiter and we have to ignore record delimiter
// elements deeper than first level
while (hasNext(xmlEventReader) && !isStartOfRecord(peek(xmlEventReader), depth)) {
XMLEvent event = read(xmlEventReader);
if (event.isStartElement()) {
elementNameStack.addFirst(getNameAndTrackNs(event.asStartElement().getName()));
depth++;
} else if (event.getEventType() == XMLEvent.END_ELEMENT) {
elementNameStack.removeFirst();
depth--;
}
}
if (hasNext(xmlEventReader)) {
StartElement startE = (StartElement) xmlEventReader.getLastMatchingEvent();
field = parse(xmlEventReader, startE);
// the while loop consumes the start element for a record, and the parse method above consumes the end
// so remove it from the stack
elementNameStack.removeFirst();
}
// if advancing, don't evaluate XPath matches
xmlEventReader.clearLastMatch();
}
return field;
}
protected void fastForwardLeaseReader() {
}
public long getReaderPosition() throws XMLStreamException {
return (hasNext(xmlEventReader)) ? peek(xmlEventReader).getLocation().getCharacterOffset() : -1;
}
public String getXpathPrefix() {
return "/" + StringUtils.join(Lists.reverse(elementNameStack), "/");
}
private boolean isStartOfRecord(XMLEvent event, int depth) {
return xmlEventReader.getLastElementMatchResult() == MatchStatus.ELEMENT_MATCH;
}
boolean isIgnorable(XMLEvent event) {
return event.getEventType() == XMLEvent.PROCESSING_INSTRUCTION || event.getEventType() == XMLEvent.COMMENT;
}
void skipIgnorable(XMLEventReader reader) throws XMLStreamException {
while (reader.hasNext() && isIgnorable(reader.peek())) {
reader.nextEvent();
}
}
boolean hasNext(XMLEventReader reader) throws XMLStreamException {
skipIgnorable(reader);
return reader.hasNext();
}
XMLEvent peek(XMLEventReader reader) throws XMLStreamException {
skipIgnorable(reader);
return reader.peek();
}
XMLEvent read(XMLEventReader reader) throws XMLStreamException {
skipIgnorable(reader);
return reader.nextEvent();
}
String getName(String namePrefix, Attribute element) {
return getName(element.getName(), namePrefix);
}
String getName(StartElement element) {
return getName(element.getName(), null);
}
private String getName(QName name, String namePrefix) {
StringBuilder sb = new StringBuilder();
if (!Strings.isNullOrEmpty(namePrefix)) {
sb.append(namePrefix);
}
sb.append(getNameAndTrackNs(name));
return sb.toString();
}
Map<String, Field> toField(StartElement startE) {
Map<String, Field> map = new LinkedHashMap<>();
Iterator attrs = startE.getAttributes();
while (attrs.hasNext()) {
Attribute attr = (Attribute) attrs.next();
map.put(getName(ATTR_PREFIX_KEY, attr), Field.create(attr.getValue()));
}
Iterator nss = startE.getNamespaces();
while (nss.hasNext()) {
Namespace ns = (Namespace) nss.next();
map.put(getName(NS_PREFIX_KEY, ns), Field.create(ns.getNamespaceURI()));
}
return map;
}
protected boolean isOverMaxObjectLength() throws XMLStreamException {
return false;
}
@SuppressWarnings("unchecked")
private void addContent(Map<String, Object> contents, String name, Field field) throws
XMLStreamException,
ObjectLengthException {
throwIfOverMaxObjectLength();
List<Field> list = (List<Field>) contents.get(name);
if (list == null) {
list = new ArrayList<>();
contents.put(name, list);
}
list.add(field);
}
@SuppressWarnings("unchecked")
Field parse(XMLEventReader reader, StartElement startE) throws XMLStreamException, ObjectLengthException {
Map<String, Field> map = this.useFieldAttributesInsteadOfFields ? new LinkedHashMap<>() : toField(startE);
Map<String, Field> startEMap = map;
Map<String, Object> contents = new LinkedHashMap<>();
boolean maybeText = true;
while (hasNext(reader) && !peek(reader).isEndElement()) {
XMLEvent next = read(reader);
if (next.isCharacters()) {
// If this set of characters is all whitespace, ignore.
if (next.asCharacters().isWhiteSpace()) {
continue;
} else if (peek(reader).isEndElement() && maybeText) {
contents.put(VALUE_KEY, Field.create(((Characters)next).getData()));
} else if (peek(reader).isStartElement()) {
StartElement subStartE = (StartElement) read(reader);
Field subField = parse(reader, subStartE);
addContent(contents, getName(subStartE), subField);
if (hasNext(reader) && peek(reader).isCharacters()) {
read(reader);
}
} else if (maybeText) {
throw new XMLStreamException(Utils.format(
"Unexpected XMLEvent '{}', it should be START_ELEMENT or END_ELEMENT", next), next.getLocation());
}
} else if (next.isStartElement()) {
String name = getName((StartElement) next);
Field field = parse(reader, (StartElement) next);
addContent(contents, name, field);
} else {
throw new XMLStreamException(Utils.format("Unexpected XMLEvent '{}', it should be START_ELEMENT or CHARACTERS",
next), next.getLocation());
}
maybeText = false;
}
if (hasNext(reader)) {
EndElement endE = (EndElement) read(reader);
if (!endE.getName().equals(startE.getName())) {
throw new XMLStreamException(Utils.format("Unexpected EndElement '{}', it should be '{}'",
endE.getName().getLocalPart(), startE.getName().getLocalPart()),
endE.getLocation());
}
for (Map.Entry<String, Object> entry : contents.entrySet()) {
if (entry.getValue() instanceof Field) {
startEMap.put(entry.getKey(), (Field) entry.getValue());
} else {
startEMap.put(entry.getKey(), Field.create((List<Field>)entry.getValue()));
}
}
}
final Field field = Field.create(startEMap);
if (this.useFieldAttributesInsteadOfFields) {
Iterator attrs = startE.getAttributes();
while (attrs.hasNext()) {
Attribute attr = (Attribute) attrs.next();
field.setAttribute(getName(XMLATTR_ATTRIBUTE_PREFIX, attr), attr.getValue());
}
Iterator nss = startE.getNamespaces();
while (nss.hasNext()) {
Namespace ns = (Namespace) nss.next();
field.setAttribute(getName(null, ns), ns.getNamespaceURI());
}
}
lastParsedFieldXpathPrefix = getXpathPrefix();
return field;
}
protected void throwIfOverMaxObjectLength() throws XMLStreamException, ObjectLengthException {
}
}