/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.csv;

import com.streamsets.pipeline.api.ext.io.CountingReader;
import com.streamsets.pipeline.api.ext.io.ObjectLengthException;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.lib.util.ExceptionUtils;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.IOUtils;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

/**
 * Record-at-a-time CSV reader built on Commons CSV's {@link CSVParser}.
 *
 * <p>On top of the plain parser it keeps track of a character position (see
 * {@link #getReaderPosition()}) that can be handed back to the constructor as
 * {@code initialPosition} to resume reading, can skip a number of leading lines,
 * can capture the first record as headers, and can reject records whose length in
 * characters exceeds {@code maxObjectLen}.
 */
public class CsvParser implements Closeable, AutoCloseable {
  /** Position reported by {@link #getReaderPosition()}; updated on every {@link #read()}. */
  private long currentPos;
  /** Characters consumed by {@link #skipLines}; added to the positions reported by the CSV records. */
  private long skipLinesPosCorrection;
  private final CSVParser parser;
  private final CountingReader reader;
  /** Maximum record length in characters; any value below zero disables the check. */
  private final int maxObjectLen;
  /** Created lazily on the first {@link #read()}. */
  private Iterator<CSVRecord> iterator;
  /** One-record look-ahead; its character position becomes the position after the current record. */
  private CSVRecord nextRecord;
  private final String[] headers;
  private boolean closed;

  /**
   * Creates a parser that starts at the beginning of {@code reader} and skips no lines.
   *
   * @param reader source of CSV characters; it is wrapped in a new {@link CountingReader}
   * @param format CSV format to parse with
   * @param maxObjectLen maximum record length in characters, negative to disable the check
   * @throws IOException if the header record cannot be read
   */
  public CsvParser(Reader reader, CSVFormat format, int maxObjectLen) throws IOException {
    this(new CountingReader(reader), format, maxObjectLen, 0, 0);
  }

  /**
   * Creates a parser and positions it at {@code initialPosition}.
   *
   * <p>When {@code initialPosition} is zero, {@code skipStartLines} lines are skipped first and,
   * if the format asks for the header record to be skipped, the first record is read and kept as
   * the headers. When {@code initialPosition} is greater than zero, {@code skipStartLines} is not
   * applied: with a header-skipping format the headers are read from the start and records are
   * consumed until the position is reached, otherwise the reader is skipped ahead directly.
   *
   * @param reader counting reader, must not be {@code null} and must be at position zero
   * @param format CSV format to parse with, must not be {@code null}
   * @param maxObjectLen maximum record length in characters, negative to disable the check
   * @param initialPosition position to resume from, must be zero or greater
   * @param skipStartLines number of leading lines to skip, must be zero or greater
   * @throws IOException if lines cannot be skipped, the reader cannot be positioned exactly at
   *     {@code initialPosition}, or reading fails
   */
  @SuppressWarnings("unchecked")
  public CsvParser(
      CountingReader reader,
      CSVFormat format,
      int maxObjectLen,
      long initialPosition,
      int skipStartLines
  ) throws IOException {
    Utils.checkNotNull(reader, "reader");
    // A boolean condition has to go through checkArgument: passing it to checkNotNull boxes it
    // into a Boolean that is never null, so the precondition would silently never be enforced.
    Utils.checkArgument(reader.getPos() == 0,
        "reader must be in position zero, the CsvParser will fast-forward to the initialPosition");
    Utils.checkNotNull(format, "format");
    Utils.checkArgument(initialPosition >= 0, "initialPosition must be greater or equal than zero");
    Utils.checkArgument(skipStartLines >= 0, "skipStartLines must be greater or equal than zero");
    this.reader = reader;
    currentPos = initialPosition;
    this.maxObjectLen = maxObjectLen;
    if (initialPosition == 0) {
      if (skipStartLines > 0) {
        skipLinesPosCorrection = skipLines(reader, skipStartLines);
        currentPos = skipLinesPosCorrection;
      }
      if (format.getSkipHeaderRecord()) {
        // The header record is read explicitly below, so the underlying parser must not skip it.
        format = format.withSkipHeaderRecord(false);
        parser = new CSVParser(reader, format, 0, 0);
        headers = read();
      } else {
        parser = new CSVParser(reader, format, 0, 0);
        headers = null;
      }
    } else {
      if (format.getSkipHeaderRecord()) {
        format = format.withSkipHeaderRecord(false);
        parser = new CSVParser(reader, format, 0, 0);
        headers = read();
        while (getReaderPosition() < initialPosition && read() != null) {
          // Intentionally empty: records are consumed only to advance to initialPosition.
        }
        if (getReaderPosition() != initialPosition) {
          throw new IOException(Utils.format("Could not position reader at position '{}', got '{}' instead",
              initialPosition, getReaderPosition()));
        }
      } else {
        IOUtils.skipFully(reader, initialPosition);
        parser = new CSVParser(reader, format, initialPosition, 0);
        headers = null;
      }
    }
  }

  /**
   * Consumes characters from {@code reader} until {@code lines} newline characters were seen.
   *
   * @param reader reader to consume from
   * @param lines number of lines to skip
   * @return number of characters consumed
   * @throws IOException if the end of the stream is reached before all lines were skipped
   */
  private long skipLines(Reader reader, int lines) throws IOException {
    long count = 0;
    int skipped = 0;
    while (skipped < lines) {
      int c = reader.read();
      if (c == -1) {
        throw new IOException(Utils.format("Could not skip '{}' lines, reached EOF", lines));
      }
      // this is enough to handle \n and \r\n EOL files
      if (c == '\n') {
        skipped++;
      }
      count++;
    }
    return count;
  }

  /** Returns the counting reader this parser reads from. */
  protected Reader getReader() {
    return reader;
  }

  /**
   * Fetches the next record from the record iterator.
   *
   * @return the next record, or {@code null} if the iterator has no more records
   * @throws IOException declared for overriding implementations
   */
  protected CSVRecord nextRecord() throws IOException {
    return (iterator.hasNext()) ? iterator.next() : null;
  }

  /**
   * Returns the header record captured at construction time.
   *
   * @return the header values, or {@code null} if the format did not ask to skip the header record
   */
  public String[] getHeaders() throws IOException {
    return headers;
  }

  /** Returns the position as of the last {@link #read()}, or the starting position if nothing was read. */
  public long getReaderPosition() {
    return currentPos;
  }

  /**
   * Reads the next record and advances the tracked position.
   *
   * <p>The position is set to the start of the following record (corrected by the characters
   * consumed while skipping lines) or, when there is no following record, to the position of the
   * underlying reader. If the length check is enabled and the position advanced by more than
   * {@code maxObjectLen}, an {@link ObjectLengthException} is raised through
   * {@link ExceptionUtils#throwUndeclared}.
   *
   * @return the values of the record, or {@code null} if there are no more records
   * @throws IOException if the parser has been closed or reading fails
   */
  public String[] read() throws IOException {
    if (closed) {
      throw new IOException("Parser has been closed");
    }
    if (iterator == null) {
      iterator = parser.iterator();
      nextRecord = nextRecord();
    }
    CSVRecord record = nextRecord;
    if (nextRecord != null) {
      // Look one record ahead so the start of the next record is known.
      nextRecord = nextRecord();
    }
    long prevPos = currentPos;
    currentPos = (nextRecord != null)
        ? nextRecord.getCharacterPosition() + skipLinesPosCorrection
        : reader.getPos();
    if (maxObjectLen > -1) {
      if (currentPos - prevPos > maxObjectLen) {
        ExceptionUtils.throwUndeclared(new ObjectLengthException(Utils.format(
            "CSV Object at offset '{}' exceeds max length '{}'", prevPos, maxObjectLen), prevPos));
      }
    }
    return toArray(record);
  }

  /**
   * Copies the values of a record into a new array.
   *
   * @param record record to convert, may be {@code null}
   * @return the record values in order, or {@code null} if {@code record} is {@code null}
   */
  private String[] toArray(CSVRecord record) {
    if (record == null) {
      return null;
    }
    String[] array = new String[record.size()];
    for (int i = 0; i < array.length; i++) {
      array[i] = record.get(i);
    }
    return array;
  }

  /**
   * Marks this parser as closed and closes the underlying CSV parser. Subsequent calls to
   * {@link #read()} fail with an {@link IOException}.
   */
  @Override
  public void close() {
    try {
      closed = true;
      parser.close();
    } catch (IOException ignored) {
      // Best-effort close: a failure while closing is deliberately not propagated.
    }
  }
}