/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.parser.avro;

import com.streamsets.pipeline.api.Record;
import com.streamsets.pipeline.api.Stage;
import com.streamsets.pipeline.lib.io.OverrunInputStream;
import com.streamsets.pipeline.lib.parser.AbstractDataParser;
import com.streamsets.pipeline.lib.parser.DataParserException;
import com.streamsets.pipeline.lib.util.AvroTypeUtil;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

import java.io.IOException;
import java.io.InputStream;

public class AvroDataStreamParser extends AbstractDataParser {
  private static final String OFFSET_SEPARATOR = "::";

  private final Schema avroSchema;
  private final String streamName;
  private long recordCount;
  private final DatumReader<GenericRecord> datumReader;
  private final DataFileStream<GenericRecord> dataFileStream;
  private final OverrunInputStream overrunInputStream;
  private boolean eof;
  private Stage.Context context;

  public AvroDataStreamParser(Stage.Context context, Schema schema, String streamName, InputStream inputStream,
      long recordCount, int maxObjectLength) throws IOException {
    this.context = context;
    avroSchema = schema;
    this.streamName = streamName;
    this.recordCount = recordCount;
    // Reader schema argument is optional
    datumReader = new GenericDatumReader<>(avroSchema, avroSchema, GenericData.get());
    overrunInputStream = new OverrunInputStream(inputStream, maxObjectLength, true);
    dataFileStream = new DataFileStream<>(overrunInputStream, datumReader);
    seekToOffset();
  }

  @Override
  public Record parse() throws IOException, DataParserException {
    // The constructor has already advanced the stream to the required position via seekToOffset().
    if (dataFileStream.hasNext()) {
      // Reset the count for the next object.
      // maxObjectLength indicates that a single record should not exceed the specified limit. Max 1 MB.
      // The file itself may contain multiple large records and the total file size may be over maxObjectLength.
      overrunInputStream.resetCount();
      GenericRecord avroRecord = dataFileStream.next();
      recordCount++;
      Record record = context.createRecord(streamName + OFFSET_SEPARATOR + recordCount);
      record.set(AvroTypeUtil.avroToSdcField(record, avroRecord.getSchema(), avroRecord));
      return record;
    }
    eof = true;
    return null;
  }

  // Skips records that were already consumed so parsing resumes at the requested offset.
  private void seekToOffset() throws IOException {
    int count = 0;
    while (count < recordCount) {
      if (dataFileStream.hasNext()) {
        overrunInputStream.resetCount();
        dataFileStream.next();
        count++;
      } else {
        break;
      }
    }
  }

  @Override
  public String getOffset() throws DataParserException {
    // Reports -1 once the end of the stream has been reached, otherwise the current record count.
    return eof ? String.valueOf(-1) : String.valueOf(recordCount);
  }

  @Override
  public void close() throws IOException {
    dataFileStream.close();
  }
}
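
/*
 * Illustrative usage sketch (not part of the original source): shows how this parser might be
 * constructed and driven. The stage context, schema JSON, file name, and the 1 MB object limit
 * below are hypothetical placeholders, not values mandated by this class.
 *
 *   Stage.Context context = ...;                         // provided by the stage at runtime
 *   Schema schema = new Schema.Parser().parse(schemaJson);
 *   try (InputStream in = new FileInputStream("records.avro")) {
 *     AvroDataStreamParser parser =
 *         new AvroDataStreamParser(context, schema, "records.avro", in, 0, 1024 * 1024);
 *     Record record;
 *     while ((record = parser.parse()) != null) {
 *       // process the record; parser.getOffset() returns the record count so far, or -1 at EOF
 *     }
 *     parser.close();
 *   }
 */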