/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.zillabyte.motherbrain.utils.csv;

import static com.zillabyte.motherbrain.utils.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

/**
 * Parses CSV files according to the specified format.
 *
 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
 * specification of a {@link CSVFormat}.
 *
 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
 *
 * <h2>Creating instances</h2>
 * <p>
 * There are several static factory methods that can be used to create instances for various types of resources:
 * </p>
 * <ul>
 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
 *     <li>{@link #parse(String, CSVFormat)}</li>
 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
 * </ul>
 * <p>
 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
 *
 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
 * </p>
 * <pre>
 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
 *     ...
 * }
 * </pre>
 *
 * <h2>Parsing record wise</h2>
 * <p>
 * To parse a CSV input from a file, you write:
 * </p>
 *
 * <pre>
 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, Charset.forName("UTF-8"), CSVFormat.RFC4180);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * This will read and parse the contents of the file using the
 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
 * </p>
 *
 * <p>
 * To parse CSV input in a format like Excel, you write:
 * </p>
 *
 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, Charset.forName("UTF-8"), CSVFormat.EXCEL);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
 * </p>
 *
 * <h2>Parsing into memory</h2>
 * <p>
 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List&lt;CSVRecord&gt; list = parser.getRecords();
 * </pre>
 *
 * <p>
 * There are two constraints that have to be kept in mind:
 * </p>
 *
 * <ol>
 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
 * </ol>
 *
 * <h2>Notes</h2>
 * <p>
 * Internal parser state is completely covered by the format and the reader-state.
 * </p>
 *
 * @version $Id: CSVParser.java 1610772 2014-07-15 17:52:53Z britter $
 *
 * @see <a href="package-summary.html">package documentation for more details</a>
 */
public final class CSVParser implements Iterable<CSVRecord>, Closeable {

    /**
     * Creates a parser for the given {@link File}.
     *
     * <p>
     * The file is decoded with the supplied {@code charset}; the JVM default encoding is never used. The file is
     * opened by this method, so the returned parser should be {@link #close() closed} once it is no longer needed.
     * If the parser cannot be created (for example because the header record is invalid) the file is closed before
     * the exception is propagated.
     * </p>
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param charset
     *            the charset used to decode the file. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(file, "file");
        // Validate the charset before opening the file: otherwise a null charset would only fail inside
        // InputStreamReader, after the FileInputStream had already been opened (and then leaked).
        Assertions.notNull(charset, "charset");
        Assertions.notNull(format, "format");
        return createOwningParser(new InputStreamReader(new FileInputStream(file), charset), format);
    }

    /**
     * Creates a parser for the given {@link String}.
     *
     * @param string
     *            a CSV string. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
        Assertions.notNull(string, "string");
        Assertions.notNull(format, "format");
        return new CSVParser(new StringReader(string), format);
    }

    /**
     * Creates a parser for the given URL.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
     * you close the {@code url}. If the parser cannot be created (for example because the header record is invalid)
     * the stream opened from the URL is closed before the exception is propagated.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(url, "url");
        Assertions.notNull(charset, "charset");
        Assertions.notNull(format, "format");
        return createOwningParser(new InputStreamReader(url.openStream(), charset), format);
    }

    /**
     * Builds a parser around a reader that was opened by one of the static factory methods.
     *
     * <p>
     * The constructor already reads from the input (header initialisation), so it can fail after the underlying
     * stream has been opened. In that case no parser is handed to the caller and nobody else could close the stream,
     * therefore it is closed here before the failure is propagated.
     * </p>
     *
     * @param reader
     *            the reader opened by the calling factory method
     * @param format
     *            the CSVFormat used for CSV parsing
     * @return a new parser reading from {@code reader}
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     */
    private static CSVParser createOwningParser(final Reader reader, final CSVFormat format) throws IOException {
        boolean created = false;
        try {
            final CSVParser parser = new CSVParser(reader, format);
            created = true;
            return parser;
        } finally {
            if (!created) {
                try {
                    reader.close();
                } catch (final IOException ignored) {
                    // The failure that prevented the parser from being created is the one worth reporting;
                    // a secondary error while closing must not mask it.
                }
            }
        }
    }

    // the following objects are shared to reduce garbage

    private final CSVFormat format;

    /** A mapping of column names to column indices */
    private final Map<String, Integer> headerMap;

    private final Lexer lexer;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> record = new ArrayList<String>();

    private long recordNumber;

    private final Token reusableToken = new Token();

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     */
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        Assertions.notNull(reader, "reader");
        Assertions.notNull(format, "format");

        this.format = format;
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        // Must run last: reading the header pulls records through the lexer created above.
        this.headerMap = this.initializeHeader();
    }

    /**
     * Appends the content of the current token to the record buffer.
     *
     * <p>
     * If the format defines a null string, a value matching it (compared case-insensitively) is stored as
     * {@code null} instead of the literal text.
     * </p>
     */
    private void addRecordValue() {
        final String input = this.reusableToken.content.toString();
        final String nullString = this.format.getNullString();
        if (nullString == null) {
            this.record.add(input);
        } else {
            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
        }
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    public void close() throws IOException {
        if (this.lexer != null) {
            this.lexer.close();
        }
    }

    /**
     * Returns the current line number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the record number.
     * </p>
     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        return this.lexer.getCurrentLineNumber();
    }

    /**
     * Returns a copy of the header map that iterates in column order.
     * <p>
     * The map keys are column names. The map values are 0-based indices.
     * </p>
     * @return a copy of the header map that iterates in column order, or {@code null} if the format defines no
     *         header.
     */
    public Map<String, Integer> getHeaderMap() {
        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
    }

    /**
     * Returns the current record number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        return this.recordNumber;
    }

    /**
     * Parses the CSV input according to the given format and returns the content as a list of
     * {@link CSVRecord CSVRecords}.
     *
     * <p>
     * The returned content starts at the current parse-position in the stream.
     * </p>
     *
     * @return list of {@link CSVRecord CSVRecords}, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() throws IOException {
        // can not use Java 7 diamond notation here, since JavaNCSS will fail,
        // see https://jira.codehaus.org/browse/JAVANCSS-51
        final List<CSVRecord> records = new ArrayList<CSVRecord>();
        CSVRecord rec;
        while ((rec = this.nextRecord()) != null) {
            records.add(rec);
        }
        return records;
    }

    /**
     * Initializes the name to index mapping if the format defines a header.
     *
     * <p>
     * An empty header array in the format means "take the names from the first record of the input"; a non-empty
     * one supplies the names directly, optionally skipping the first record of the input.
     * </p>
     *
     * @return null if the format has no header.
     * @throws IllegalArgumentException
     *             if a header name occurs twice, unless it is an empty name and the format ignores empty headers
     * @throws IOException if there is a problem reading the header or skipping the first record
     */
    private Map<String, Integer> initializeHeader() throws IOException {
        final String[] formatHeader = this.format.getHeader();
        if (formatHeader == null) {
            return null;
        }

        final Map<String, Integer> hdrMap = new LinkedHashMap<String, Integer>();

        String[] headerRecord = null;
        if (formatHeader.length == 0) {
            // read the header from the first line of the file
            final CSVRecord nextRecord = this.nextRecord();
            if (nextRecord != null) {
                headerRecord = nextRecord.values();
            }
        } else {
            if (this.format.isSkippingHeaderRecord()) {
                this.nextRecord();
            }
            headerRecord = formatHeader;
        }

        // build the name to index mappings
        if (headerRecord != null) {
            for (int i = 0; i < headerRecord.length; i++) {
                final String header = headerRecord[i];
                final boolean containsHeader = hdrMap.containsKey(header);
                final boolean emptyHeader = header == null || header.trim().isEmpty();
                // Duplicates are only tolerated for empty names, and only when the format says to ignore those.
                if (containsHeader && (!emptyHeader || !this.format.isIgnoringEmptyHeaders())) {
                    throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
                            "\" in " + Arrays.toString(headerRecord));
                }
                hdrMap.put(header, Integer.valueOf(i));
            }
        }
        return hdrMap;
    }

    /**
     * Reports whether this parser has been closed, as indicated by the underlying lexer.
     *
     * @return the closed state reported by the lexer
     */
    public boolean isClosed() {
        return this.lexer.isClosed();
    }

    /**
     * Returns an iterator on the records.
     *
     * <p>IOExceptions occurring during the iteration are wrapped in a
     * RuntimeException.
     * If the parser is closed a call to {@code next()} will throw a
     * NoSuchElementException.</p>
     *
     * @return an iterator over the remaining records; {@code remove()} is not supported
     */
    public Iterator<CSVRecord> iterator() {
        return new Iterator<CSVRecord>() {
            /** Record fetched by hasNext() but not yet handed out by next(); null if none is buffered. */
            private CSVRecord current;

            private CSVRecord getNextRecord() {
                try {
                    return CSVParser.this.nextRecord();
                } catch (final IOException e) {
                    // TODO: This is not great, throw an ISE instead?
                    throw new RuntimeException(e);
                }
            }

            public boolean hasNext() {
                if (CSVParser.this.isClosed()) {
                    return false;
                }
                if (this.current == null) {
                    this.current = this.getNextRecord();
                }

                return this.current != null;
            }

            public CSVRecord next() {
                if (CSVParser.this.isClosed()) {
                    throw new NoSuchElementException("CSVParser has been closed");
                }
                CSVRecord next = this.current;
                this.current = null;

                if (next == null) {
                    // hasNext() wasn't called before
                    next = this.getNextRecord();
                    if (next == null) {
                        throw new NoSuchElementException("No more CSV records available");
                    }
                }

                return next;
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * <p>
     * Comment tokens encountered while reading the record are collected, joined with {@code Constants.LF}, and
     * passed to the resulting {@link CSVRecord}.
     * </p>
     *
     * @return the next record, or {@code null} if no values could be read (the end of the stream has been reached)
     * @throws IOException
     *             on parse error or input read-failure
     */
    CSVRecord nextRecord() throws IOException {
        CSVRecord result = null;
        this.record.clear();
        StringBuilder sb = null;
        do {
            this.reusableToken.reset();
            this.lexer.nextToken(this.reusableToken);
            switch (this.reusableToken.type) {
            case TOKEN:
                this.addRecordValue();
                break;
            case EORECORD:
                this.addRecordValue();
                break;
            case EOF:
                if (this.reusableToken.isReady) {
                    this.addRecordValue();
                }
                break;
            case INVALID:
                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
            case COMMENT:
                // Accumulate the comment text; it is attached to the record built below.
                if (sb == null) { // first comment for this record
                    sb = new StringBuilder();
                } else {
                    sb.append(Constants.LF);
                }
                sb.append(this.reusableToken.content);
                this.reusableToken.type = TOKEN; // Read another token
                break;
            default:
                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
            }
        } while (this.reusableToken.type == TOKEN);

        if (!this.record.isEmpty()) {
            this.recordNumber++;
            final String comment = sb == null ? null : sb.toString();
            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
                    this.recordNumber);
        }
        return result;
    }

}