CSVParser.java example

Explorer

motherbrain-master
- src
  - com
    - zillabyte
      - motherbrain
        api
        APIException.java
        APIService.java
        LocalAPIService.java
        RelationsHelper.java
        RestAPIHelper.java
        benchmarking
        Benchmark.java
        BenchmarkFactory.java
        GraphiteBenchmarker.java
        MovingBenchmarker.java
        NoopBenchmarker.java
        StderrBenchmarker.java
        container
        Container.java
        ContainerCacher.java
        ContainerEnvironmentHelper.java
        ContainerException.java
        ContainerExecuteBuilder.java
        ContainerFactory.java
        ContainerPathHelper.java
        ContainerSerializer.java
        ContainerWrapper.java
        NoopRemoteContainerCleaner.java
        PipeGateway.java
        RemoteContainerCleaner.java
        TcpSocketHelper.java
        UnixSocketHelper.java
        local
        InplaceContainer.java
        InplaceContainerFactory.java
        InplaceSerializer.java
        coordination
        AskHandler.java
        AskWrapper.java
        CoordinationException.java
        CoordinationService.java
        CoordinationServiceWrapper.java
        Lock.java
        MessageHandler.java
        RemoteCoordinationException.java
        Watcher.java
        mock
        MockStateService.java
        redis
        RedisException.java
        TransactionalMessageWrapper.java
        flow
        ActionTimeoutException.java
        App.java
        Component.java
        EndCyclePolicy.java
        Fields.java
        Flow.java
        FlowCompilationException.java
        FlowException.java
        FlowInstance.java
        FlowInstanceSetBuilder.java
        FlowInstanceWrapper.java
        FlowOperationInstance.java
        FlowOperationInstanceCollection.java
        FlowRecoveryException.java
        FlowService.java
        FlowState.java
        FlowStateCoordinator.java
        FlowStateException.java
        MapTuple.java
        StateMachine.java
        StateMachineException.java
        StateMachineHelper.java
        StreamBuilder.java
        aggregation
        AggregationException.java
        AggregationKey.java
        AggregationStore.java
        AggregationStoreFactory.java
        AggregationStoreWrapper.java
        Aggregator.java
        CachedStore.java
        CachedStoreFactory.java
        DefaultAggregationStoreFactory.java
        DiskBackedStore.java
        FlowAggregationSplitter.java
        MemoryStore.java
        MemoryStoreFactory.java
        PartialAggregator.java
        StoreType.java
        TuplePage.java
        buffer
        BufferClientFactory.java
        BufferConsumer.java
        BufferFlusher.java
        BufferProducer.java
        BufferService.java
        SinkToBuffer.java
        SourceFromBuffer.java
        mock
        LocalBufferClientFactory.java
        LocalBufferConsumer.java
        LocalBufferProducer.java
        LocalDevBufferClientFactory.java
        LocalDevBufferFlusher.java
        LocalDevBufferProducer.java
        MockBufferProducer.java
        MockBufferService.java
        collectors
        OutputCollector.java
        coordinated
        AckTuple.java
        BaseCoordTuple.java
        BatchCompleteAckTuple.java
        BatchCompleteTuple.java
        BatchState.java
        BatchTracker.java
        BatchedTuple.java
        CoordTupleOptions.java
        CoordinatedOutputCollector.java
        DeadNodeDetectedException.java
        ExplicitAckRequestTuple.java
        MaxIterationsExceededException.java
        ObserveIncomingTupleAction.java
        PingTuple.java
        PongTuple.java
        QueuedTuple.java
        support
        CoordinatedOutputCollectorSupportFactory.java
        EmptyTupleIdSet.java
        FailedTupleHandler.java
        TupleIdGenerator.java
        TupleIdMapper.java
        TupleIdSet.java
        naive
        DoNothingFailedTupleHandler.java
        NaiveCoordinatedOutputCollectorSupportFactory.java
        SerialTupleIdGenerator.java
        UncompressedTupleIdMapper.java
        UncompressedTupleIdSet.java
        components
        ComponentInput.java
        ComponentOutput.java
        builtin
        BuiltinComponents.java
        FetchUrlComponent.java
        config
        FlowConfig.java
        OperationConfig.java
        UserConfig.java
        error
        strategies
        ErrorStrategyFactory.java
        ErrorThresholdExceededException.java
        FakeLocalException.java
        FlowErrorStrategy.java
        ForgivingFlowErrorStrategy.java
        OperationErrorStrategy.java
        PassiveWorkerPercentageAndAbsoluteOperationErrorStrategy.java
        StrictFlowErrorStrategy.java
        WorkerThrowsErrorPercentageAndAbsoluteOperationErrorStrategy.java
        graph
        Connection.java
        FlowGraph.java
        heartbeats
        Heartbeat.java
        HeartbeatException.java
        local
        LocalFlowController.java
        LocalFlowOutputCollector.java
        LocalFlowService.java
        LocalOperationSlot.java
        operations
        AggregationOperation.java
        AggregationState.java
        Function.java
        FunctionState.java
        GroupBy.java
        Join.java
        JoinType.java
        MockOperationLogger.java
        Operation.java
        OperationDeadException.java
        OperationException.java
        OperationLogger.java
        OperationLoggerException.java
        OperationMessage.java
        OperationSleeper.java
        ProcessableOperation.java
        Sink.java
        SinkState.java
        Source.java
        SourceState.java
        builtin
        Clumper.java
        Count.java
        RateLimiter.java
        Unique.java
        decorators
        EmitDecorator.java
        RemoveFields.java
        RenameFields.java
        RetainFields.java
        multilang
        MultiLangCleaner.java
        MultiLangErrorHandler.java
        MultiLangException.java
        MultiLangLogHandler.java
        MultiLangMessageHandler.java
        MultiLangProcess.java
        MultiLangProcessDiedUnexpectedlyException.java
        MultiLangProcessException.java
        MultiLangProcessGeneralOperationObserver.java
        MultiLangProcessStartupLogObserver.java
        MultiLangProcessTupleObserver.java
        builder
        APIFlowBuilder.java
        APIFlowBuilderFactory.java
        FlowBuilderFactory.java
        FlowFetcher.java
        FlowValidator.java
        InplaceFlowBuilder.java
        InplaceFlowBuilderFactory.java
        MultilangFlowCompiler.java
        PlaceHolderOperation.java
        RouteBy.java
        operations
        LocalComponent.java
        MultiLangAggregator.java
        MultiLangOperation.java
        MultiLangRunEach.java
        MultiLangRunSource.java
        MultilangClumper.java
        MultilangHandler.java
        rpc
        RPCHelper.java
        RPCRequest.java
        RPCResponse.java
        RPCSink.java
        RPCSource.java
        queues
        InputQueue.java
        MockQueueFactory.java
        OutputQueue.java
        QueueFactory.java
        sourcefromrelation
        ShardReader.java
        ShardType.java
        tests
        helpers
        MockInstanceHelper.java
        metrics
        Metrics.java
        MockMetrics.java
        reactor
        lightweight
        ProcessTimeoutException.java
        relational
        APIRelationDefFactory.java
        AliasedQuery.java
        BufferQuery.java
        ColumnDef.java
        DataType.java
        DefaultStreamException.java
        MissingFieldException.java
        Query.java
        RelationBackend.java
        RelationDef.java
        RelationDefFactory.java
        RelationException.java
        S3OnlyQuery.java
        StreamMarker.java
        StreamReader.java
        UnexpectedFieldException.java
        naivepostgresimpl
        NaivePostgresStreamMarker.java
        redshiftimpl
        RedshiftStreamMarker.java
        RedshiftStreamReader.java
        shell
        LocalOsxShellFactory.java
        MachineType.java
        ShellFactory.java
        UbuntuEc2ShellFactory.java
        UbuntuTeamCityShellFactory.java
        UbuntuVagrantShellFactory.java
        state
        package-info.java
        top
        BasicTopService.java
        CommandLine.java
        LocalCommandLineHelper.java
        LocalServiceMain.java
        MotherbrainException.java
        MotherbrainRuntimeException.java
        TopService.java
        universe
        Config.java
        Environment.java
        ExceptionHandler.java
        ExpectedConfigNotPresent.java
        FileFactory.java
        FileFactoryException.java
        LocalUniverseBuilder.java
        LoggerFactory.java
        MockConfig.java
        S3Exception.java
        SSHException.java
        SSHFactory.java
        Universe.java
        UniverseBuilder.java
        utils
        ByteArrayWrapper.java
        CompressUtils.java
        DateHelper.java
        ExceptionAttempt.java
        FileLockUtil.java
        Glob.java
        JSONUtil.java
        JarCompilationException.java
        Log4jWrapper.java
        MapBuilder.java
        MeteredLog.java
        SerializableMonitor.java
        UrlHelper.java
        Utils.java
        VersionComparer.java
        backoff
        BackoffTicker.java
        ExponentialBackoffTicker.java
        csv
        Assertions.java
        CSVFormat.java
        CSVParser.java
        CSVPrinter.java
        CSVRecord.java
        Constants.java
        ExtendedBufferedReader.java
        Lexer.java
        Quote.java
        Token.java
        package-info.java
        dfs
        DFSService.java
        DFSServiceWrapper.java
        LocalDFSService.java
        queue
        ByteSizable.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.zillabyte.motherbrain.utils.csv;

import static com.zillabyte.motherbrain.utils.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

/**
 * Parses CSV files according to the specified format.
 *
 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
 * specification of a {@link CSVFormat}.
 *
 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
 *
 * <h2>Creating instances</h2>
 * <p>
 * There are several static factory methods that can be used to create instances for various types of resources:
 * </p>
 * <ul>
 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
 *     <li>{@link #parse(String, CSVFormat)}</li>
 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
 * </ul>
 * <p>
 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
 *
 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
 * </p>
 * <pre>
 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
 *     ...
 * }
 * </pre>
 *
 * <h2>Parsing record wise</h2>
 * <p>
 * To parse a CSV input from a file, you write:
 * </p>
 *
 * <pre>
 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * This will read and parse the contents of the file using the
 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
 * </p>
 *
 * <p>
 * To parse CSV input in a format like Excel, you write:
 * </p>
 *
 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
 * </p>
 *
 * <h2>Parsing into memory</h2>
 * <p>
 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List<CSVRecord> list = parser.getRecords();
 * </pre>
 *
 * <p>
 * There are two constraints that have to be kept in mind:
 * </p>
 *
 * <ol>
 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
 * </ol>
 *
 * <h2>Notes</h2>
 * <p>
 * Internal parser state is completely covered by the format and the reader-state.
 * </p>
 *
 * @version $Id: CSVParser.java 1610772 2014-07-15 17:52:53Z britter $
 *
 * @see <a href="package-summary.html">package documentation for more details</a>
 */
public final class CSVParser implements Iterable<CSVRecord>, Closeable {

    /**
     * Creates a parser for the given {@link File}.
     *
     * <p>
     * The file content is decoded with the supplied {@code charset}. If the parser cannot be constructed (for
     * example because reading the header record fails), the file stream opened by this method is closed before
     * the exception is propagated, so no file handle is leaked.
     * </p>
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param charset
     *            the charset used to decode the file. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(file, "file");
        // Validate the charset up front: otherwise a null charset is only detected by InputStreamReader
        // after the FileInputStream has already been opened, leaking the file handle.
        Assertions.notNull(charset, "charset");
        Assertions.notNull(format, "format");

        final Reader reader = new InputStreamReader(new FileInputStream(file), charset);
        boolean created = false;
        try {
            // The constructor may read the header record and can therefore fail after the file is open.
            final CSVParser parser = new CSVParser(reader, format);
            created = true;
            return parser;
        } finally {
            if (!created) {
                try {
                    reader.close();
                } catch (final IOException ignored) {
                    // Deliberately ignored so the original construction failure is the one reported.
                }
            }
        }
    }

    /**
     * Creates a parser that reads its CSV input from an in-memory {@link String}.
     *
     * @param string
     *            the CSV text to parse. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser reading from the given text
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
        Assertions.notNull(string, "string");
        Assertions.notNull(format, "format");

        final Reader source = new StringReader(string);
        return new CSVParser(source, format);
    }

    /**
     * Creates a parser for the given URL.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
     * you close the {@code url}.
     * </p>
     *
     * <p>
     * If the parser cannot be constructed (for example because reading the header record fails), the stream opened
     * from the URL by this method is closed before the exception is propagated.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(url, "url");
        Assertions.notNull(charset, "charset");
        Assertions.notNull(format, "format");

        final Reader reader = new InputStreamReader(url.openStream(), charset);
        boolean created = false;
        try {
            // The constructor may read the header record and can therefore fail after the stream is open.
            final CSVParser parser = new CSVParser(reader, format);
            created = true;
            return parser;
        } finally {
            if (!created) {
                try {
                    reader.close();
                } catch (final IOException ignored) {
                    // Deliberately ignored so the original construction failure is the one reported.
                }
            }
        }
    }

    // the following objects are shared to reduce garbage

    /** The format describing how the input is to be parsed; assigned once in the constructor. */
    private final CSVFormat format;

    /** A mapping of column names to column indices; {@code null} when the format defines no header. */
    private final Map<String, Integer> headerMap;

    /** The lexer that fills {@link #reusableToken} from the reader handed to the constructor. */
    private final Lexer lexer;

    /** A record buffer for nextRecord(). Cleared at the start of every record, grows as necessary and is reused. */
    private final List<String> record = new ArrayList<String>();

    /** Count of records produced so far; nextRecord() increments it for each non-empty record. */
    private long recordNumber;

    /** The single token instance that nextRecord() resets and passes to the lexer on every iteration. */
    private final Token reusableToken = new Token();

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     */
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        Assertions.notNull(reader, "reader");
        Assertions.notNull(format, "format");

        this.format = format;
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        this.headerMap = this.initializeHeader();
    }

    /**
     * Appends the content of the current token to the record buffer.
     *
     * <p>
     * When the format defines a null string and the token content matches it (ignoring case), {@code null} is
     * stored instead of the text.
     * </p>
     */
    private void addRecordValue() {
        final String value = this.reusableToken.content.toString();
        final String nullMarker = this.format.getNullString();
        final boolean representsNull = nullMarker != null && value.equalsIgnoreCase(nullMarker);
        this.record.add(representsNull ? null : value);
    }

    /**
     * Closes the underlying lexer, if there is one.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    public void close() throws IOException {
        if (this.lexer == null) {
            return;
        }
        this.lexer.close();
    }

    /**
     * Returns the line number the lexer currently reports for the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the record number.
     * </p>
     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        final long line = this.lexer.getCurrentLineNumber();
        return line;
    }

    /**
     * Returns a copy of the header map that iterates in column order.
     * <p>
     * The map keys are column names. The map values are 0-based indices. Changes made to the returned map do not
     * affect this parser because a fresh copy is handed out on every call.
     * </p>
     * @return a copy of the header map that iterates in column order, or {@code null} if this parser has no
     *         header map.
     */
    public Map<String, Integer> getHeaderMap() {
        if (this.headerMap == null) {
            return null;
        }
        return new LinkedHashMap<String, Integer>(this.headerMap);
    }

    /**
     * Returns the number of records this parser has produced so far.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        final long count = this.recordNumber;
        return count;
    }

    /**
     * Reads all remaining records into memory and returns them as a list of {@link CSVRecord CSVRecords}.
     *
     * <p>
     * Reading starts at the current parse-position in the stream; records that were already consumed are not
     * part of the result.
     * </p>
     *
     * @return list of {@link CSVRecord CSVRecords}, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() throws IOException {
        // explicit type arguments on purpose: Java 7 diamond notation makes JavaNCSS fail,
        // see https://jira.codehaus.org/browse/JAVANCSS-51
        final List<CSVRecord> parsed = new ArrayList<CSVRecord>();
        for (CSVRecord next = this.nextRecord(); next != null; next = this.nextRecord()) {
            parsed.add(next);
        }
        return parsed;
    }

    /**
     * Builds the column-name to column-index mapping if the format defines a header.
     *
     * <p>
     * An empty header array in the format means the names are taken from the first record of the input. A
     * non-empty array is used as-is, optionally skipping the first input record when the format asks for it.
     * </p>
     *
     * @return null if the format has no header, otherwise the (possibly empty) name to index mapping.
     * @throws IOException if there is a problem reading the header or skipping the first record
     * @throws IllegalArgumentException if a name occurs twice, unless the name is empty and the format ignores
     *             empty headers
     */
    private Map<String, Integer> initializeHeader() throws IOException {
        final String[] declaredHeader = this.format.getHeader();
        if (declaredHeader == null) {
            return null;
        }

        final Map<String, Integer> nameToIndex = new LinkedHashMap<String, Integer>();

        final String[] columnNames;
        if (declaredHeader.length == 0) {
            // the names come from the first record of the input
            final CSVRecord firstRecord = this.nextRecord();
            columnNames = firstRecord == null ? null : firstRecord.values();
        } else {
            if (this.format.isSkippingHeaderRecord()) {
                this.nextRecord();
            }
            columnNames = declaredHeader;
        }

        if (columnNames == null) {
            // no first record was available, so there is nothing to map
            return nameToIndex;
        }

        int index = 0;
        for (final String name : columnNames) {
            final boolean duplicate = nameToIndex.containsKey(name);
            final boolean blank = name == null || name.trim().isEmpty();
            // a repeated name is tolerated only when it is blank and the format ignores empty headers
            if (duplicate && !(blank && this.format.isIgnoringEmptyHeaders())) {
                throw new IllegalArgumentException("The header contains a duplicate name: \"" + name +
                        "\" in " + Arrays.toString(columnNames));
            }
            nameToIndex.put(name, Integer.valueOf(index));
            index++;
        }
        return nameToIndex;
    }

    /**
     * Reports whether this parser is closed, as indicated by the underlying lexer.
     *
     * @return the closed state reported by the lexer
     */
    public boolean isClosed() {
        final boolean closed = this.lexer.isClosed();
        return closed;
    }

    /**
     * Returns an iterator over the remaining records.
     *
     * <p>IOExceptions occurring during the iteration are wrapped in a
     * RuntimeException.
     * If the parser is closed, {@code hasNext()} returns {@code false} and a call to {@code next()} throws a
     * NoSuchElementException. Removal is not supported.</p>
     */
    public Iterator<CSVRecord> iterator() {
        return new Iterator<CSVRecord>() {
            /** Record fetched by hasNext() but not yet handed out by next(); null when nothing is buffered. */
            private CSVRecord buffered;

            /** Pulls the next record from the parser, converting the checked IOException. */
            private CSVRecord fetch() {
                try {
                    return CSVParser.this.nextRecord();
                } catch (final IOException e) {
                    // TODO: This is not great, throw an ISE instead?
                    throw new RuntimeException(e);
                }
            }

            public boolean hasNext() {
                if (CSVParser.this.isClosed()) {
                    return false;
                }
                if (this.buffered != null) {
                    return true;
                }
                this.buffered = this.fetch();
                return this.buffered != null;
            }

            public CSVRecord next() {
                if (CSVParser.this.isClosed()) {
                    throw new NoSuchElementException("CSVParser has been closed");
                }
                final CSVRecord ready = this.buffered;
                this.buffered = null;
                if (ready != null) {
                    return ready;
                }

                // hasNext() was not consulted first, so read directly
                final CSVRecord fresh = this.fetch();
                if (fresh == null) {
                    throw new NoSuchElementException("No more CSV records available");
                }
                return fresh;
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * <p>
     * Tokens are pulled from the lexer until one arrives that does not continue the record. Comment tokens met
     * along the way are collected, joined by line feeds, and attached to the resulting record.
     * </p>
     *
     * @return the next record, or {@code null} if no values were collected (the end of the stream has been
     *         reached)
     * @throws IOException
     *             on parse error or input read-failure
     */
    CSVRecord nextRecord() throws IOException {
        this.record.clear();
        final Token token = this.reusableToken;
        StringBuilder comments = null;
        do {
            token.reset();
            this.lexer.nextToken(token);
            switch (token.type) {
            case TOKEN:
            case EORECORD:
                // both carry a value; only TOKEN keeps the loop going
                this.addRecordValue();
                break;
            case EOF:
                if (token.isReady) {
                    this.addRecordValue();
                }
                break;
            case INVALID:
                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
            case COMMENT:
                if (comments == null) { // first comment for this record
                    comments = new StringBuilder();
                } else {
                    comments.append(Constants.LF);
                }
                comments.append(token.content);
                // flag the token as TOKEN so the loop condition requests another token
                token.type = TOKEN;
                break;
            default:
                throw new IllegalStateException("Unexpected Token type: " + token.type);
            }
        } while (token.type == TOKEN);

        if (this.record.isEmpty()) {
            return null;
        }

        this.recordNumber++;
        final String comment = comments == null ? null : comments.toString();
        final String[] values = this.record.toArray(new String[this.record.size()]);
        return new CSVRecord(values, this.headerMap, comment, this.recordNumber);
    }

}