/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.parser.delimited;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.streamsets.pipeline.api.Field;
import com.streamsets.pipeline.api.Record;
import com.streamsets.pipeline.api.Stage;
import com.streamsets.pipeline.api.ext.io.OverrunReader;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.config.CsvHeader;
import com.streamsets.pipeline.config.CsvRecordType;
import com.streamsets.pipeline.lib.csv.OverrunCsvParser;
import com.streamsets.pipeline.lib.parser.AbstractDataParser;
import com.streamsets.pipeline.lib.parser.DataParserException;
import com.streamsets.pipeline.lib.parser.RecoverableDataParserException;
import org.apache.commons.csv.CSVFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Parser that reads delimited (CSV-style) character data and produces records either as a
 * list of header/value cells ({@link CsvRecordType#LIST}) or as an ordered list-map keyed by
 * header name (or column index when no header is available).
 */
public class DelimitedCharDataParser extends AbstractDataParser {
  private final Stage.Context context;
  private final String readerId;
  private final OverrunCsvParser parser;
  private final CsvRecordType recordType;
  private final String nullConstant;
  // Header fields captured from the first row; null when the stream has no usable header
  // (NO_HEADER, or IGNORE_HEADER where the header row is skipped but not retained).
  private List<Field> headers;
  private boolean eof;

  /**
   * Creates a parser over the given reader.
   *
   * @param context        stage context used to create records
   * @param readerId       identifier prefixed onto every generated record id
   * @param reader         character stream to parse
   * @param readerOffset   offset to resume reading from
   * @param skipStartLines number of leading lines to skip before parsing
   * @param format         base CSV format; header/skip settings are derived from {@code header}
   * @param header         how the header row should be treated
   * @param maxObjectLen   maximum length of a single parsed object
   * @param recordType     shape of the produced record (LIST or list-map)
   * @param parseNull      whether {@code nullConstant} should be translated to a null value
   * @param nullConstant   string that represents null when {@code parseNull} is true
   * @throws IOException if the underlying parser cannot be initialized
   */
  public DelimitedCharDataParser(
      Stage.Context context,
      String readerId,
      OverrunReader reader,
      long readerOffset,
      int skipStartLines,
      CSVFormat format,
      CsvHeader header,
      int maxObjectLen,
      CsvRecordType recordType,
      boolean parseNull,
      String nullConstant)
    throws IOException {
    this.context = context;
    this.readerId = readerId;
    this.recordType = recordType;
    this.nullConstant = parseNull ? nullConstant : null;
    switch (header) {
      case WITH_HEADER:
      case IGNORE_HEADER:
        // Both modes consume the header row; IGNORE_HEADER simply discards it below.
        format = format.withHeader((String[]) null).withSkipHeaderRecord(true);
        break;
      case NO_HEADER:
        format = format.withHeader((String[]) null).withSkipHeaderRecord(false);
        break;
      default:
        throw new RuntimeException(Utils.format("Unknown header error: {}", header));
    }
    parser = new OverrunCsvParser(reader, format, readerOffset, skipStartLines, maxObjectLen);
    String[] hs = parser.getHeaders();
    if (header != CsvHeader.IGNORE_HEADER && hs != null) {
      headers = new ArrayList<>();
      for (String h : hs) {
        headers.add(Field.create(h));
      }
    }
  }

  /**
   * Reads the next row and converts it into a record, or returns null at end of input.
   *
   * @return the next record, or null once the stream is exhausted
   * @throws IOException         on read failure
   * @throws DataParserException on parse failure (including recoverable column-count mismatches)
   */
  @Override
  public Record parse() throws IOException, DataParserException {
    Record record = null;
    long offset = parser.getReaderPosition();
    String[] columns = parser.read();
    if (columns != null) {
      record = createRecord(offset, columns);
    } else {
      eof = true;
    }
    return record;
  }

  /**
   * Builds a record from one parsed row.
   *
   * @param offset  reader position at the start of the row; embedded in the record id
   * @param columns raw column values for the row
   * @throws DataParserException recoverable error when the row has more columns than the header
   */
  protected Record createRecord(long offset, String[] columns) throws DataParserException {
    Record record = context.createRecord(readerId + "::" + offset);

    // In case that the number of columns does not equal the number of expected columns from
    // header, report the parsing error as recoverable issue - it's safe to continue reading
    // the stream.
    if (headers != null && columns.length > headers.size()) {
      record.set(Field.create(Field.Type.MAP, ImmutableMap.builder()
        .put("columns", getListField(columns))
        .put("headers", Field.create(Field.Type.LIST, headers))
        .build()
      ));
      throw new RecoverableDataParserException(record, Errors.DELIMITED_PARSER_01, offset, columns.length, headers.size());
    }

    if (recordType == CsvRecordType.LIST) {
      List<Field> row = new ArrayList<>();
      for (int i = 0; i < columns.length; i++) {
        Map<String, Field> cell = new HashMap<>();
        Field header = (headers != null) ? headers.get(i) : null;
        if (header != null) {
          cell.put("header", header);
        }
        Field value = getField(columns[i]);
        cell.put("value", value);
        row.add(Field.create(cell));
      }
      record.set(Field.create(row));
    } else {
      LinkedHashMap<String, Field> listMap = new LinkedHashMap<>();
      for (int i = 0; i < columns.length; i++) {
        String key;
        Field header = (headers != null) ? headers.get(i) : null;
        if (header != null) {
          key = header.getValueAsString();
        } else {
          // No header available: fall back to the column index as the field name.
          key = i + "";
        }
        listMap.put(key, getField(columns[i]));
      }
      record.set(Field.createListMap(listMap));
    }

    return record;
  }

  // Wraps the raw column values into a LIST field of STRING fields.
  private Field getListField(String... values) {
    ImmutableList.Builder<Field> listBuilder = ImmutableList.builder();
    for (String value : values) {
      listBuilder.add(Field.create(Field.Type.STRING, value));
    }
    return Field.create(Field.Type.LIST, listBuilder.build());
  }

  // Converts a raw column value to a STRING field, mapping the configured null constant
  // (when null parsing is enabled) to a null-valued field.
  private Field getField(String value) {
    if (nullConstant != null && nullConstant.equals(value)) {
      return Field.create(Field.Type.STRING, null);
    }
    return Field.create(Field.Type.STRING, value);
  }

  /**
   * @return the current reader position as a string, or "-1" once end of input was reached
   */
  @Override
  public String getOffset() {
    return (eof) ? String.valueOf(-1) : String.valueOf(parser.getReaderPosition());
  }

  @Override
  public void close() throws IOException {
    parser.close();
  }
}