package org.rakam.aws.s3;

import com.amazonaws.AmazonClientException;
import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsyncClient;
import com.amazonaws.services.cloudwatch.model.Dimension;
import com.amazonaws.services.cloudwatch.model.MetricDatum;
import com.amazonaws.services.cloudwatch.model.PutMetricDataRequest;
import com.amazonaws.services.kinesis.AmazonKinesisClient;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.google.common.base.Throwables;
import io.airlift.log.Logger;
import io.airlift.slice.BasicSliceInput;
import io.airlift.slice.DynamicSliceOutput;
import org.apache.avro.Schema;
import org.apache.avro.generic.FilteredRecordWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;
import org.rakam.analysis.metadata.Metastore;
import org.rakam.aws.AWSConfig;
import org.rakam.collection.Event;
import org.rakam.collection.FieldDependencyBuilder;
import org.rakam.collection.SchemaField;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import static org.rakam.util.AvroUtil.convertAvroSchema;

/**
 * Stores bulk event batches on S3 as Avro-encoded objects (one object per collection),
 * pushes a small metadata record to Kinesis for each uploaded object and reports the
 * batch size to CloudWatch.
 */
public class S3BulkEventStore
{
    private final static Logger LOGGER = Logger.get(S3BulkEventStore.class);
    private final Metastore metastore;
    private final AmazonS3Client s3Client;
    private final AWSConfig config;
    private final int conditionalMagicFieldsSize;
    private final AmazonCloudWatchAsyncClient cloudWatchClient;
    private final AmazonKinesisClient kinesis;

    public S3BulkEventStore(Metastore metastore, AWSConfig config, FieldDependencyBuilder.FieldDependency fieldDependency)
    {
        this.metastore = metastore;
        this.config = config;

        this.s3Client = new AmazonS3Client(config.getCredentials());
        s3Client.setRegion(config.getAWSRegion());
        if (config.getS3Endpoint() != null) {
            s3Client.setEndpoint(config.getS3Endpoint());
        }

        kinesis = new AmazonKinesisClient(config.getCredentials());
        kinesis.setRegion(config.getAWSRegion());
        if (config.getKinesisEndpoint() != null) {
            kinesis.setEndpoint(config.getKinesisEndpoint());
        }

        cloudWatchClient = new AmazonCloudWatchAsyncClient(config.getCredentials());
        cloudWatchClient.setRegion(config.getAWSRegion());

        this.conditionalMagicFieldsSize = fieldDependency.dependentFields.size();
    }

    public void upload(String project, List<Event> events, int tryCount)
    {
        GenericData data = GenericData.get();
        DynamicSliceOutput buffer = new DynamicSliceOutput(events.size() * 30);

        // Group the batch by collection; each collection is written to its own S3 object.
        Map<String, List<Event>> map = new HashMap<>();
        events.forEach(event -> map.computeIfAbsent(event.collection(), (col) -> new ArrayList<>()).add(event));

        BinaryEncoder encoder = null;
        String batchId = UUID.randomUUID().toString();
        List<String> uploadedFiles = new ArrayList<>();

        try {
            for (Map.Entry<String, List<Event>> entry : map.entrySet()) {
                buffer.reset();

                List<SchemaField> collection = metastore.getCollection(project, entry.getKey());
                Schema avroSchema = convertAvroSchema(collection);
                DatumWriter writer = new FilteredRecordWriter(avroSchema, data);
                encoder = EncoderFactory.get().directBinaryEncoder(buffer, encoder);

                // Header: field count, field names and the number of records that follow.
                encoder.writeInt(collection.size());
                for (SchemaField schemaField : collection) {
                    encoder.writeString(schemaField.getName());
                }
                encoder.writeInt(entry.getValue().size());

                int expectedSchemaSize = collection.size() + conditionalMagicFieldsSize;
                for (Event event : entry.getValue()) {
                    GenericRecord properties = event.properties();

                    List<Schema.Field> existingFields = properties.getSchema().getFields();
                    // If the event was built with an outdated schema, re-map its values into a
                    // record that matches the current collection schema before writing.
                    if (existingFields.size() != expectedSchemaSize) {
                        GenericData.Record record = new GenericData.Record(avroSchema);
                        for (int i = 0; i < existingFields.size(); i++) {
                            if (existingFields.get(i).schema().getType() != Schema.Type.NULL) {
                                record.put(i, properties.get(i));
                            }
                        }
                        properties = record;
                    }

                    writer.write(properties, encoder);
                }

                ObjectMetadata objectMetadata = new ObjectMetadata();
                int bulkSize = buffer.size();
                objectMetadata.setContentLength(bulkSize);

                String key = events.get(0).project() + "/" + entry.getKey() + "/" + batchId;
                PutObjectRequest putObjectRequest = new PutObjectRequest(config.getEventStoreBulkS3Bucket(),
                        key,
                        new SafeSliceInputStream(new BasicSliceInput(buffer.slice())), objectMetadata);
                putObjectRequest.getRequestClientOptions().setReadLimit(bulkSize);

                s3Client.putObject(putObjectRequest);

                // Kinesis metadata record layout: 1-byte marker, 8-byte size, then the S3 object key.
                ByteBuffer allocate = ByteBuffer.allocate(key.length() + 1 + 8);
                allocate.put((byte) 1);
                allocate.putLong(bulkSize);
                allocate.put(key.getBytes(StandardCharsets.UTF_8));
                allocate.clear();

                putMetadataToKinesis(allocate, events.get(0).project(), entry.getKey(), 3);

                uploadedFiles.add(key);
            }

            LOGGER.debug("Stored batch file '%s', %d events in %d collections.", batchId, events.size(), map.size());

            cloudWatchClient.putMetricDataAsync(new PutMetricDataRequest()
                    .withNamespace("rakam-middleware-collection")
                    .withMetricData(new MetricDatum()
                            .withMetricName("bulk")
                            .withValue(((Number) events.size()).doubleValue())
                            .withDimensions(new Dimension().withName("project").withValue(project))));
        }
        catch (IOException | AmazonClientException e) {
            // Roll back the objects that were already uploaded so a retry starts from a clean state.
            for (String uploadedFile : uploadedFiles) {
                s3Client.deleteObject(config.getEventStoreBulkS3Bucket(), uploadedFile);
            }

            if (tryCount <= 0) {
                throw Throwables.propagate(e);
            }

            upload(project, events, tryCount - 1);
        }
    }

    private void putMetadataToKinesis(ByteBuffer allocate, String project, String collection, int tryCount)
    {
        try {
            kinesis.putRecord(config.getEventStoreStreamName(), allocate, project + "|" + collection);
        }
        catch (Exception e) {
            if (tryCount == 0) {
                throw e;
            }
            putMetadataToKinesis(allocate, project, collection, tryCount - 1);
        }
    }

    private class SafeSliceInputStream
            extends InputStream
    {
        private final BasicSliceInput sliceInput;

        public SafeSliceInputStream(BasicSliceInput sliceInput)
        {
            this.sliceInput = sliceInput;
        }

        @Override
        public int read()
                throws IOException
        {
            return sliceInput.read();
        }

        @Override
        public int read(byte[] b)
                throws IOException
        {
            return sliceInput.read(b);
        }

        @Override
        public int read(byte[] b, int off, int len)
                throws IOException
        {
            return sliceInput.read(b, off, len);
        }

        @Override
        public long skip(long n)
                throws IOException
        {
            return sliceInput.skip(n);
        }

        @Override
        public int available()
                throws IOException
        {
            return sliceInput.available();
        }

        @Override
        public void close()
                throws IOException
        {
            sliceInput.close();
        }

        @Override
        public synchronized void mark(int readlimit)
        {
            throw new RuntimeException("mark/reset not supported");
        }

        @Override
        public synchronized void reset()
                throws IOException
        {
            throw new IOException("mark/reset not supported");
        }

        @Override
        public boolean markSupported()
        {
            return false;
        }
    }
}
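
/*
 * A minimal usage sketch (not part of the class above): it assumes a configured Metastore,
 * AWSConfig and FieldDependency are already available, e.g. through the application's
 * dependency injection; the variable names below are illustrative only.
 *
 *   S3BulkEventStore store = new S3BulkEventStore(metastore, awsConfig, fieldDependency);
 *   // Upload a batch of events; on S3/IO failure the partial upload is rolled back and
 *   // retried up to 3 more times before the exception is propagated.
 *   store.upload("my-project", events, 3);
 */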