Lexer.java example

Explorer

motherbrain-master
- src
  - com
    - zillabyte
      - motherbrain
        api
        APIException.java
        APIService.java
        LocalAPIService.java
        RelationsHelper.java
        RestAPIHelper.java
        benchmarking
        Benchmark.java
        BenchmarkFactory.java
        GraphiteBenchmarker.java
        MovingBenchmarker.java
        NoopBenchmarker.java
        StderrBenchmarker.java
        container
        Container.java
        ContainerCacher.java
        ContainerEnvironmentHelper.java
        ContainerException.java
        ContainerExecuteBuilder.java
        ContainerFactory.java
        ContainerPathHelper.java
        ContainerSerializer.java
        ContainerWrapper.java
        NoopRemoteContainerCleaner.java
        PipeGateway.java
        RemoteContainerCleaner.java
        TcpSocketHelper.java
        UnixSocketHelper.java
        local
        InplaceContainer.java
        InplaceContainerFactory.java
        InplaceSerializer.java
        coordination
        AskHandler.java
        AskWrapper.java
        CoordinationException.java
        CoordinationService.java
        CoordinationServiceWrapper.java
        Lock.java
        MessageHandler.java
        RemoteCoordinationException.java
        Watcher.java
        mock
        MockStateService.java
        redis
        RedisException.java
        TransactionalMessageWrapper.java
        flow
        ActionTimeoutException.java
        App.java
        Component.java
        EndCyclePolicy.java
        Fields.java
        Flow.java
        FlowCompilationException.java
        FlowException.java
        FlowInstance.java
        FlowInstanceSetBuilder.java
        FlowInstanceWrapper.java
        FlowOperationInstance.java
        FlowOperationInstanceCollection.java
        FlowRecoveryException.java
        FlowService.java
        FlowState.java
        FlowStateCoordinator.java
        FlowStateException.java
        MapTuple.java
        StateMachine.java
        StateMachineException.java
        StateMachineHelper.java
        StreamBuilder.java
        aggregation
        AggregationException.java
        AggregationKey.java
        AggregationStore.java
        AggregationStoreFactory.java
        AggregationStoreWrapper.java
        Aggregator.java
        CachedStore.java
        CachedStoreFactory.java
        DefaultAggregationStoreFactory.java
        DiskBackedStore.java
        FlowAggregationSplitter.java
        MemoryStore.java
        MemoryStoreFactory.java
        PartialAggregator.java
        StoreType.java
        TuplePage.java
        buffer
        BufferClientFactory.java
        BufferConsumer.java
        BufferFlusher.java
        BufferProducer.java
        BufferService.java
        SinkToBuffer.java
        SourceFromBuffer.java
        mock
        LocalBufferClientFactory.java
        LocalBufferConsumer.java
        LocalBufferProducer.java
        LocalDevBufferClientFactory.java
        LocalDevBufferFlusher.java
        LocalDevBufferProducer.java
        MockBufferProducer.java
        MockBufferService.java
        collectors
        OutputCollector.java
        coordinated
        AckTuple.java
        BaseCoordTuple.java
        BatchCompleteAckTuple.java
        BatchCompleteTuple.java
        BatchState.java
        BatchTracker.java
        BatchedTuple.java
        CoordTupleOptions.java
        CoordinatedOutputCollector.java
        DeadNodeDetectedException.java
        ExplicitAckRequestTuple.java
        MaxIterationsExceededException.java
        ObserveIncomingTupleAction.java
        PingTuple.java
        PongTuple.java
        QueuedTuple.java
        support
        CoordinatedOutputCollectorSupportFactory.java
        EmptyTupleIdSet.java
        FailedTupleHandler.java
        TupleIdGenerator.java
        TupleIdMapper.java
        TupleIdSet.java
        naive
        DoNothingFailedTupleHandler.java
        NaiveCoordinatedOutputCollectorSupportFactory.java
        SerialTupleIdGenerator.java
        UncompressedTupleIdMapper.java
        UncompressedTupleIdSet.java
        components
        ComponentInput.java
        ComponentOutput.java
        builtin
        BuiltinComponents.java
        FetchUrlComponent.java
        config
        FlowConfig.java
        OperationConfig.java
        UserConfig.java
        error
        strategies
        ErrorStrategyFactory.java
        ErrorThresholdExceededException.java
        FakeLocalException.java
        FlowErrorStrategy.java
        ForgivingFlowErrorStrategy.java
        OperationErrorStrategy.java
        PassiveWorkerPercentageAndAbsoluteOperationErrorStrategy.java
        StrictFlowErrorStrategy.java
        WorkerThrowsErrorPercentageAndAbsoluteOperationErrorStrategy.java
        graph
        Connection.java
        FlowGraph.java
        heartbeats
        Heartbeat.java
        HeartbeatException.java
        local
        LocalFlowController.java
        LocalFlowOutputCollector.java
        LocalFlowService.java
        LocalOperationSlot.java
        operations
        AggregationOperation.java
        AggregationState.java
        Function.java
        FunctionState.java
        GroupBy.java
        Join.java
        JoinType.java
        MockOperationLogger.java
        Operation.java
        OperationDeadException.java
        OperationException.java
        OperationLogger.java
        OperationLoggerException.java
        OperationMessage.java
        OperationSleeper.java
        ProcessableOperation.java
        Sink.java
        SinkState.java
        Source.java
        SourceState.java
        builtin
        Clumper.java
        Count.java
        RateLimiter.java
        Unique.java
        decorators
        EmitDecorator.java
        RemoveFields.java
        RenameFields.java
        RetainFields.java
        multilang
        MultiLangCleaner.java
        MultiLangErrorHandler.java
        MultiLangException.java
        MultiLangLogHandler.java
        MultiLangMessageHandler.java
        MultiLangProcess.java
        MultiLangProcessDiedUnexpectedlyException.java
        MultiLangProcessException.java
        MultiLangProcessGeneralOperationObserver.java
        MultiLangProcessStartupLogObserver.java
        MultiLangProcessTupleObserver.java
        builder
        APIFlowBuilder.java
        APIFlowBuilderFactory.java
        FlowBuilderFactory.java
        FlowFetcher.java
        FlowValidator.java
        InplaceFlowBuilder.java
        InplaceFlowBuilderFactory.java
        MultilangFlowCompiler.java
        PlaceHolderOperation.java
        RouteBy.java
        operations
        LocalComponent.java
        MultiLangAggregator.java
        MultiLangOperation.java
        MultiLangRunEach.java
        MultiLangRunSource.java
        MultilangClumper.java
        MultilangHandler.java
        rpc
        RPCHelper.java
        RPCRequest.java
        RPCResponse.java
        RPCSink.java
        RPCSource.java
        queues
        InputQueue.java
        MockQueueFactory.java
        OutputQueue.java
        QueueFactory.java
        sourcefromrelation
        ShardReader.java
        ShardType.java
        tests
        helpers
        MockInstanceHelper.java
        metrics
        Metrics.java
        MockMetrics.java
        reactor
        lightweight
        ProcessTimeoutException.java
        relational
        APIRelationDefFactory.java
        AliasedQuery.java
        BufferQuery.java
        ColumnDef.java
        DataType.java
        DefaultStreamException.java
        MissingFieldException.java
        Query.java
        RelationBackend.java
        RelationDef.java
        RelationDefFactory.java
        RelationException.java
        S3OnlyQuery.java
        StreamMarker.java
        StreamReader.java
        UnexpectedFieldException.java
        naivepostgresimpl
        NaivePostgresStreamMarker.java
        redshiftimpl
        RedshiftStreamMarker.java
        RedshiftStreamReader.java
        shell
        LocalOsxShellFactory.java
        MachineType.java
        ShellFactory.java
        UbuntuEc2ShellFactory.java
        UbuntuTeamCityShellFactory.java
        UbuntuVagrantShellFactory.java
        state
        package-info.java
        top
        BasicTopService.java
        CommandLine.java
        LocalCommandLineHelper.java
        LocalServiceMain.java
        MotherbrainException.java
        MotherbrainRuntimeException.java
        TopService.java
        universe
        Config.java
        Environment.java
        ExceptionHandler.java
        ExpectedConfigNotPresent.java
        FileFactory.java
        FileFactoryException.java
        LocalUniverseBuilder.java
        LoggerFactory.java
        MockConfig.java
        S3Exception.java
        SSHException.java
        SSHFactory.java
        Universe.java
        UniverseBuilder.java
        utils
        ByteArrayWrapper.java
        CompressUtils.java
        DateHelper.java
        ExceptionAttempt.java
        FileLockUtil.java
        Glob.java
        JSONUtil.java
        JarCompilationException.java
        Log4jWrapper.java
        MapBuilder.java
        MeteredLog.java
        SerializableMonitor.java
        UrlHelper.java
        Utils.java
        VersionComparer.java
        backoff
        BackoffTicker.java
        ExponentialBackoffTicker.java
        csv
        Assertions.java
        CSVFormat.java
        CSVParser.java
        CSVPrinter.java
        CSVRecord.java
        Constants.java
        ExtendedBufferedReader.java
        Lexer.java
        Quote.java
        Token.java
        package-info.java
        dfs
        DFSService.java
        DFSServiceWrapper.java
        LocalDFSService.java
        queue
        ByteSizable.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.zillabyte.motherbrain.utils.csv;

import static com.zillabyte.motherbrain.utils.csv.Constants.BACKSPACE;
import static com.zillabyte.motherbrain.utils.csv.Constants.CR;
import static com.zillabyte.motherbrain.utils.csv.Constants.END_OF_STREAM;
import static com.zillabyte.motherbrain.utils.csv.Constants.FF;
import static com.zillabyte.motherbrain.utils.csv.Constants.LF;
import static com.zillabyte.motherbrain.utils.csv.Constants.TAB;
import static com.zillabyte.motherbrain.utils.csv.Constants.UNDEFINED;
import static com.zillabyte.motherbrain.utils.csv.Token.Type.COMMENT;
import static com.zillabyte.motherbrain.utils.csv.Token.Type.EOF;
import static com.zillabyte.motherbrain.utils.csv.Token.Type.EORECORD;
import static com.zillabyte.motherbrain.utils.csv.Token.Type.INVALID;
import static com.zillabyte.motherbrain.utils.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.IOException;

/**
 * Lexical analyzer that splits the character stream supplied by an {@link ExtendedBufferedReader} into
 * {@link Token}s, using the delimiter, escape, quote and comment characters of a {@link CSVFormat}.
 * <p>
 * Each call to {@link #nextToken(Token)} consumes characters from the reader and fills the supplied token with the
 * next field, end-of-record marker, comment or end-of-file marker.
 *
 * @version $Id: Lexer.java 1610490 2014-07-14 19:25:03Z britter $
 */
final class Lexer implements Closeable {

    /**
     * Sentinel char used to disable comments, escapes and encapsulation when the format does not define the
     * corresponding character (see {@link #mapNullToDisabled(Character)}). It is the char value of {@code (char) -2}:
     * it cannot be confused with an EOF signal (-1), and {@code U+FFFE} is a Unicode noncharacter, so there should
     * never be a collision with a real text char.
     */
    private static final char DISABLED = '\ufffe';

    private final char delimiter;
    private final char escape;
    private final char quoteChar;
    private final char commentStart;

    private final boolean ignoreSurroundingSpaces;
    private final boolean ignoreEmptyLines;

    /** The input stream */
    private final ExtendedBufferedReader reader;

    /**
     * INTERNAL API. but ctor needs to be called dynamically by PerformanceTest class
     *
     * @param format
     *            supplies the delimiter, the optional escape, quote and comment-start characters and the
     *            whitespace/empty-line flags; optional characters that are {@code null} are disabled
     * @param reader
     *            the reader the tokens are read from
     */
    Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
        this.reader = reader;
        this.delimiter = format.getDelimiter();
        this.escape = mapNullToDisabled(format.getEscape());
        this.quoteChar = mapNullToDisabled(format.getQuoteChar());
        this.commentStart = mapNullToDisabled(format.getCommentStart());
        this.ignoreSurroundingSpaces = format.isIgnoringSurroundingSpaces();
        this.ignoreEmptyLines = format.isIgnoringEmptyLines();
    }

    /**
     * Returns the next token.
     * <p>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param token
     *            an existing Token object to reuse. The caller is responsible to initialize the Token: field
     *            parsing only runs while its type is {@code INVALID}, and content is appended to whatever
     *            {@code token.content} already holds.
     * @return the next token found (the same instance that was passed in)
     * @throws java.io.IOException
     *             on stream access error
     */
    Token nextToken(final Token token) throws IOException {

        // get the last read char (required for empty line detection)
        int lastChar = reader.getLastChar();

        // read the next char and set eol
        int c = reader.read();
        /*
         * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF
         * - they are equivalent here.
         */
        boolean eol = readEndOfLine(c);

        // empty line detection: eol AND (last char was EOL or beginning)
        if (ignoreEmptyLines) {
            while (eol && isStartOfLine(lastChar)) {
                // go one char ahead ...
                lastChar = c;
                c = reader.read();
                eol = readEndOfLine(c);
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    token.type = EOF;
                    // don't set token.isReady here because no content
                    return token;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        // (EOF directly after a delimiter is not handled here: it still yields a trailing empty field below)
        if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) {
            token.type = EOF;
            // don't set token.isReady here because no content
            return token;
        }

        // a comment is only recognised when the comment-start char is the first char of a line
        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            final String line = reader.readLine();
            if (line == null) {
                token.type = EOF;
                // don't set token.isReady here because no content
                return token;
            }
            final String comment = line.trim();
            token.content.append(comment);
            token.type = COMMENT;
            return token;
        }

        // important: make sure a new char gets consumed in each iteration
        while (token.type == INVALID) {
            // ignore whitespaces at beginning of a token
            if (ignoreSurroundingSpaces) {
                while (isWhitespace(c) && !eol) {
                    c = reader.read();
                    eol = readEndOfLine(c);
                }
            }

            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token return TOKEN("")
                token.type = TOKEN;
            } else if (eol) {
                // empty token return EORECORD("")
                // noop: token.content.append("");
                token.type = EORECORD;
            } else if (isQuoteChar(c)) {
                // consume encapsulated token
                parseEncapsulatedToken(token);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                // noop: token.content.append("");
                token.type = EOF;
                token.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                parseSimpleToken(token, c);
            }
        }
        return token;
    }

    /**
     * Parses a simple token.
     * <p>
     * Simple token are tokens which are not surrounded by encapsulators. A simple token might contain escaped
     * delimiters (as \, or \;). The token is finished when one of the following conditions become true:
     * <ul>
     * <li>end of line has been reached (EORECORD)</li>
     * <li>end of stream has been reached (EOF)</li>
     * <li>an unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     * Trailing whitespace is removed from the token content when surrounding spaces are ignored.
     *
     * @param token
     *            the current token
     * @param ch
     *            the current character
     * @return the filled token
     * @throws IOException
     *             on stream access error
     */
    private Token parseSimpleToken(final Token token, int ch) throws IOException {
        // Faster to use while(true)+break than while(token.type == INVALID)
        while (true) {
            if (readEndOfLine(ch)) {
                token.type = EORECORD;
                break;
            } else if (isEndOfFile(ch)) {
                token.type = EOF;
                token.isReady = true; // There is data at EOF
                break;
            } else if (isDelimiter(ch)) {
                token.type = TOKEN;
                break;
            } else if (isEscape(ch)) {
                appendEscapeSequence(token, ch);
                ch = reader.read(); // continue
            } else {
                token.content.append((char) ch);
                ch = reader.read(); // continue
            }
        }

        if (ignoreSurroundingSpaces) {
            trimTrailingSpaces(token.content);
        }

        return token;
    }

    /**
     * Parses an encapsulated token.
     * <p>
     * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included
     * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespace between the
     * closing encapsulator and the following delimiter or line end is skipped. The token is finished when an
     * unescaped encapsulator has been reached, and is followed by optional whitespace then:
     * <ul>
     * <li>delimiter (TOKEN)</li>
     * <li>end of line (EORECORD)</li>
     * <li>end of stream (EOF)</li>
     * </ul>
     *
     * @param token
     *            the current token
     * @return a valid token object
     * @throws IOException
     *             on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL
     */
    private Token parseEncapsulatedToken(final Token token) throws IOException {
        // save current line number in case needed for IOE
        final long startLineNumber = getCurrentLineNumber();
        int c;
        while (true) {
            c = reader.read();

            if (isEscape(c)) {
                appendEscapeSequence(token, c);
            } else if (isQuoteChar(c)) {
                if (isQuoteChar(reader.lookAhead())) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = reader.read();
                    token.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = reader.read();
                        if (isDelimiter(c)) {
                            token.type = TOKEN;
                            return token;
                        } else if (isEndOfFile(c)) {
                            token.type = EOF;
                            token.isReady = true; // There is data at EOF
                            return token;
                        } else if (readEndOfLine(c)) {
                            token.type = EORECORD;
                            return token;
                        } else if (!isWhitespace(c)) {
                            // error invalid char between token and next delimiter
                            throw new IOException("(line " + getCurrentLineNumber() +
                                    ") invalid char between encapsulated token and delimiter");
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber +
                        ") EOF reached before encapsulated token finished");
            } else {
                // consume character
                token.content.append((char) c);
            }
        }
    }

    /**
     * Resolves the escape sequence that starts at the escape character just read and appends the result to the
     * token content. A recognised sequence contributes its single unescaped character; an unrecognised one is kept
     * verbatim as the escape character followed by the character that came after it.
     *
     * @param token
     *            the token whose content receives the characters
     * @param escapeChar
     *            the escape character that was just read
     * @throws IOException
     *             on stream access error, or if the stream ends right after the escape character
     */
    private void appendEscapeSequence(final Token token, final int escapeChar) throws IOException {
        final int unescaped = readEscape();
        if (unescaped == END_OF_STREAM) { // unexpected char after escape
            token.content.append((char) escapeChar).append((char) reader.getLastChar());
        } else {
            token.content.append((char) unescaped);
        }
    }

    /**
     * Converts an optional format character into the char used internally.
     *
     * @param c
     *            the configured character, or {@code null} if the feature is not configured
     * @return the character value, or {@link #DISABLED} when {@code c} is {@code null}
     */
    private char mapNullToDisabled(final Character c) {
        return c == null ? DISABLED : c.charValue();
    }

    /**
     * Returns the current line number
     *
     * @return the current line number, as reported by the underlying reader
     */
    long getCurrentLineNumber() {
        return reader.getCurrentLineNumber();
    }

    // TODO escape handling needs more work
    /**
     * Handle an escape sequence.
     * The current character must be the escape character.
     * On return, the character that followed the escape is available by calling
     * {@link ExtendedBufferedReader#getLastChar()} on the input stream.
     *
     * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is
     *      invalid.
     * @throws IOException if there is a problem reading the stream or the end of stream is detected:
     *      the escape character is not allowed at end of stream
     */
    int readEscape() throws IOException {
        // the escape char has just been read (normally a backslash)
        final int ch = reader.read();
        switch (ch) {
        case 'r':
            return CR;
        case 'n':
            return LF;
        case 't':
            return TAB;
        case 'b':
            return BACKSPACE;
        case 'f':
            return FF;
        case CR:
        case LF:
        case FF: // TODO is this correct?
        case TAB: // TODO is this correct? Do tabs need to be escaped?
        case BACKSPACE: // TODO is this correct?
            return ch;
        case END_OF_STREAM:
            throw new IOException("EOF whilst processing escape sequence");
        default:
            // Now check for meta-characters
            if (isMetaChar(ch)) {
                return ch;
            }
            // indicate unexpected char - available from in.getLastChar()
            return END_OF_STREAM;
        }
    }

    /**
     * Removes trailing whitespace (as defined by {@link Character#isWhitespace(char)}) from the buffer in place.
     *
     * @param buffer
     *            the buffer to trim
     */
    void trimTrailingSpaces(final StringBuilder buffer) {
        int length = buffer.length();
        while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
            length = length - 1;
        }
        if (length != buffer.length()) {
            buffer.setLength(length);
        }
    }

    /**
     * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
     *
     * @param ch
     *            the character that was just read
     * @return true if the given or next character is a line-terminator
     * @throws IOException
     *             on stream access error
     */
    boolean readEndOfLine(int ch) throws IOException {
        // check if we have \r\n...
        if (ch == CR && reader.lookAhead() == LF) {
            // note: does not change ch outside of this method!
            ch = reader.read();
        }
        return ch == LF || ch == CR;
    }

    /**
     * @return true if the underlying reader reports that it has been closed
     */
    boolean isClosed() {
        return reader.isClosed();
    }

    /**
     * The delimiter is never treated as whitespace, even if it is a whitespace character itself.
     *
     * @param ch the character to check
     * @return true if the given char is a whitespace character
     */
    boolean isWhitespace(final int ch) {
        return !isDelimiter(ch) && Character.isWhitespace((char) ch);
    }

    /**
     * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
     *
     * @param ch the character to check
     * @return true if the character is at the start of a line.
     */
    boolean isStartOfLine(final int ch) {
        return ch == LF || ch == CR || ch == UNDEFINED;
    }

    /**
     * @param ch the character to check
     * @return true if the given character indicates end of file
     */
    boolean isEndOfFile(final int ch) {
        return ch == END_OF_STREAM;
    }

    /**
     * @param ch the character to check
     * @return true if the given character is the field delimiter
     */
    boolean isDelimiter(final int ch) {
        return ch == delimiter;
    }

    /**
     * @param ch the character to check
     * @return true if the given character is the escape character; always false when escaping is disabled and
     *         {@code ch} is not {@link #DISABLED}
     */
    boolean isEscape(final int ch) {
        return ch == escape;
    }

    /**
     * @param ch the character to check
     * @return true if the given character is the quote (encapsulator) character
     */
    boolean isQuoteChar(final int ch) {
        return ch == quoteChar;
    }

    /**
     * @param ch the character to check
     * @return true if the given character is the comment-start character
     */
    boolean isCommentStart(final int ch) {
        return ch == commentStart;
    }

    /**
     * @param ch the character to check
     * @return true if the given character is one of the delimiter, escape, quote or comment-start characters
     */
    private boolean isMetaChar(final int ch) {
        return ch == delimiter ||
               ch == escape ||
               ch == quoteChar ||
               ch == commentStart;
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }
}