/** * Copyright 2015 StreamSets Inc. * * Licensed under the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.streamsets.pipeline.lib.parser.xml; import com.google.common.base.Strings; import com.streamsets.pipeline.api.Field; import com.streamsets.pipeline.api.Record; import com.streamsets.pipeline.api.Stage; import com.streamsets.pipeline.api.ext.io.OverrunReader; import com.streamsets.pipeline.lib.parser.AbstractDataParser; import com.streamsets.pipeline.lib.parser.DataParserException; import com.streamsets.pipeline.lib.xml.OverrunStreamingXmlParser; import com.streamsets.pipeline.lib.xml.StreamingXmlParser; import javax.xml.stream.XMLStreamException; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.regex.MatchResult; import java.util.regex.Matcher; import java.util.regex.Pattern; public class XmlCharDataParser extends AbstractDataParser { private static final Pattern INDEX_PATTERN = Pattern.compile("((.*)/\\S+)(\\[\\d+\\]).*"); private static final Pattern VALUE_PATTERN = Pattern.compile("'?(\\S+\\[\\d+\\])?'?/value$"); private static final Pattern ATTR_PATTERN = Pattern.compile("'?(\\S+\\[\\d+\\])?'?/'attr\\|(\\S+)'"); public static final String REMOVE_FIELD_PATH_SINGLE_QUOTE_PATTERN = "'?([^']*)'?(\\[\\d+\\])?"; public static final String RECORD_ATTRIBUTE_NAMESPACE_PREFIX = "xmlns:"; private final Stage.Context context; private final String readerId; private final int maxObjectLen; private final OverrunStreamingXmlParser parser; private final boolean includeXpath; private long readerOffset; public XmlCharDataParser(Stage.Context context, String readerId, OverrunReader reader, long readerOffset, String recordElement, int maxObjectLen) throws IOException { this(context, readerId, reader, readerOffset, recordElement, false, null, maxObjectLen, true); } public XmlCharDataParser(Stage.Context context, String readerId, OverrunReader reader, long readerOffset, String recordElement, boolean includeXpath, int maxObjectLen) throws IOException { this(context, readerId, reader, readerOffset, recordElement, includeXpath, null, maxObjectLen, true); } public XmlCharDataParser( Stage.Context context, String readerId, OverrunReader reader, long readerOffset, String recordElement, boolean includeXpath, Map<String, String> namespaces, int maxObjectLen, boolean useFieldAttributesInsteadOfFields ) throws IOException { this.context = context; this.readerId = readerId; this.readerOffset = readerOffset; this.maxObjectLen = maxObjectLen; this.includeXpath = includeXpath; try { parser = new OverrunStreamingXmlParser( reader, recordElement, namespaces, readerOffset, maxObjectLen, useFieldAttributesInsteadOfFields ); } catch (XMLStreamException ex) { throw new IOException(ex); } } @Override public Record parse() throws IOException, DataParserException { Record record = null; long offset = -1; try { offset = getOffsetAsLong(); Field field = parser.read(); readerOffset = -1; if (field != null) { record = createRecord(offset, field); } } catch (XMLStreamException ex) { throw new DataParserException(Errors.XML_PARSER_02, readerId, offset, maxObjectLen); } return record; } protected Record createRecord(long offset, Field field) throws DataParserException { Record record = context.createRecord(readerId + "::" + offset); record.set(field); if (includeXpath) { setFieldXpathAttributes(record); } return record; } private void setFieldXpathAttributes(Record record) { for (String path : record.getEscapedFieldPaths()) { // Only interested in leaves of the path tree so pass any complex types. // This check is needed because an XML element may be named as "value". if (record.get(path).getType() == Field.Type.LIST || record.get(path).getType() == Field.Type.LIST_MAP || record.get(path).getType() == Field.Type.MAP) { continue; } Matcher matcher = VALUE_PATTERN.matcher(path); Field field = record.get(path); String xpath = null; if (matcher.matches()) { String fieldPath = removeSingleQuotesFromFieldPath(matcher.group(1)); xpath = toXpath(fieldPath, record); } else { matcher = ATTR_PATTERN.matcher(path); if (matcher.matches()) { String fieldPath = removeSingleQuotesFromFieldPath(matcher.group(1)); String attribute = matcher.group(2); xpath = toXpath(fieldPath, record) + "/@" + attribute; } } if (!Strings.isNullOrEmpty(xpath)) { field.setAttribute(StreamingXmlParser.XPATH_KEY, xpath); } } Record.Header header = record.getHeader(); for (Map.Entry<String, String> nsEntry : parser.getNamespaceUriToPrefixMappings().entrySet()) { header.setAttribute(RECORD_ATTRIBUTE_NAMESPACE_PREFIX + nsEntry.getValue(), nsEntry.getKey()); } } private static String removeSingleQuotesFromFieldPath(String fieldPath) { if (Strings.isNullOrEmpty(fieldPath)) { return fieldPath; } else { return fieldPath.replaceAll(REMOVE_FIELD_PATH_SINGLE_QUOTE_PATTERN, "$1$2"); } } private String toXpath(String fieldPath, Record record) { if (fieldPath == null) { fieldPath = ""; } String xpath = fieldPath; List<MatchResult> matchResults = new ArrayList<>(); Matcher matcher = INDEX_PATTERN.matcher(fieldPath); while (matcher.matches()) { MatchResult matchResult = matcher.toMatchResult(); matchResults.add(matchResult); String parentPath = matchResult.group(2); matcher = INDEX_PATTERN.matcher(parentPath); } for (MatchResult matchResult : matchResults) { String currentPath = matchResult.group(1); String fieldIndex = matchResult.group(3); // If the field is an array of a single value, flatten it out // to make it comply with the XPath syntax. if (record.get(currentPath).getValueAsList().size() == 1) { xpath = xpath.replace(currentPath + fieldIndex, currentPath); } } return parser.getLastParsedFieldXpathPrefix() + xpath; } @Override public String getOffset() throws DataParserException { return String.valueOf(getOffsetAsLong()); } private long getOffsetAsLong() throws DataParserException { try { return (readerOffset > -1) ? readerOffset : parser.getReaderPosition(); } catch (XMLStreamException ex) { throw new DataParserException(Errors.XML_PARSER_01, ex.toString(), ex); } } @Override public void close() throws IOException { parser.close(); } }