/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.writer;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Collections;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.google.common.collect.ImmutableList;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.crypto.EncryptionConfigParser;
import gobblin.crypto.EncryptionFactory;
import gobblin.metadata.types.GlobalMetadata;
/**
* Unit tests for {@link SimpleDataWriter}.
*
* @author akshay@nerdwallet.com
*/
@Test(groups = { "gobblin.writer" })
public class SimpleDataWriterTest {
private String filePath;
private final String schema = "";
private final int newLine = "\n".getBytes()[0];
private State properties;
private static final String ENCRYPT_PREFIX = "writer.encrypt.";
@BeforeMethod
public void setUp() throws Exception {
properties = new State();
// Making the staging and/or output dirs if necessary
File stagingDir = new File(TestConstants.TEST_STAGING_DIR);
File outputDir = new File(TestConstants.TEST_OUTPUT_DIR);
if (!stagingDir.exists()) {
stagingDir.mkdirs();
}
if (!outputDir.exists()) {
outputDir.mkdirs();
}
this.filePath = TestConstants.TEST_EXTRACT_NAMESPACE.replaceAll("\\.", "/") + "/" + TestConstants.TEST_EXTRACT_TABLE
+ "/" + TestConstants.TEST_EXTRACT_ID + "_" + TestConstants.TEST_EXTRACT_PULL_TYPE;
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "\n");
properties.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, TestConstants.TEST_FS_URI);
properties.setProp(ConfigurationKeys.WRITER_STAGING_DIR, TestConstants.TEST_STAGING_DIR);
properties.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, TestConstants.TEST_OUTPUT_DIR);
properties.setProp(ConfigurationKeys.WRITER_FILE_PATH, this.filePath);
properties.setProp(ConfigurationKeys.WRITER_FILE_NAME, TestConstants.TEST_FILE_NAME);
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, false);
}
/**
* Test writing records without a delimiter and make sure it works.
* @throws IOException
*/
@Test
public void testWriteBytesNoDelim() throws IOException {
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
// Build a writer to write test records
SimpleDataWriter writer = buildSimpleDataWriter();
byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
writer.write(rec1);
writer.write(rec2);
writer.write(rec3);
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length);
File outputFile = new File(writer.getOutputFilePath());
InputStream is = new FileInputStream(outputFile);
int c, resNum = 0, resi = 0;
byte[][] records = { rec1, rec2, rec3 };
while ((c = is.read()) != -1) {
if (resi >= records[resNum].length) {
resNum++;
resi = 0;
}
Assert.assertEquals(c, records[resNum][resi]);
resi++;
}
}
/**
* Prepend the size to each record without delimiting the record. Each record
* should be prepended by the size of that record and the bytes written should
* include the prepended bytes.
*/
@Test
public void testPrependSizeWithoutDelimiter() throws IOException {
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, true);
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
SimpleDataWriter writer = buildSimpleDataWriter();
byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
byte[][] records = { rec1, rec2, rec3 };
writer.write(rec1);
writer.write(rec2);
writer.write(rec3);
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length + (Long.SIZE / 8 * 3));
File outputFile = new File(writer.getOutputFilePath());
DataInputStream dis = new DataInputStream(new FileInputStream(outputFile));
for (int i = 0; i < 3; i++) {
long size = dis.readLong();
Assert.assertEquals(size, records[i].length);
for (int j = 0; j < size; j++) {
Assert.assertEquals(dis.readByte(), records[i][j]);
}
}
}
/**
* Use the simple data writer to write random bytes to a file and ensure
* they are the same when read back.
*
* @throws IOException
*/
@Test
public void testWriteRandomBytes() throws IOException {
// Build a writer to write test records
SimpleDataWriter writer = buildSimpleDataWriter();
byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
writer.write(rec1);
writer.write(rec2);
writer.write(rec3);
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length + 3); // 3 bytes for newline character
File outputFile = new File(writer.getOutputFilePath());
InputStream is = new FileInputStream(outputFile);
int c, resNum = 0, resi = 0;
byte[][] records = { rec1, rec2, rec3 };
while ((c = is.read()) != -1) {
if (c != newLine) {
Assert.assertEquals(c, records[resNum][resi]);
resi++;
} else {
resNum++;
resi = 0;
}
}
}
/**
* Prepend the size to each record and delimit the record. Each record
* should be prepended by the size of that record and the bytes written should
* include the prepended bytes.
*/
@Test
public void testPrependSizeWithDelimiter() throws IOException {
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, true);
SimpleDataWriter writer = buildSimpleDataWriter();
byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
byte[][] records = { rec1, rec2, rec3 };
writer.write(rec1);
writer.write(rec2);
writer.write(rec3);
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length + (Long.SIZE / 8 * 3) + 3);
File outputFile = new File(writer.getOutputFilePath());
DataInputStream dis = new DataInputStream(new FileInputStream(outputFile));
for (int i = 0; i < 3; i++) {
long size = dis.readLong();
Assert.assertEquals(size, records[i].length + 1);
for (int j = 0; j < size - 1; j++) {
Assert.assertEquals(dis.readByte(), records[i][j]);
}
Assert.assertEquals(dis.readByte(), '\n');
}
}
@Test
public void testSupportsGzip() throws IOException {
properties.setProp(ConfigurationKeys.WRITER_CODEC_TYPE, "gzip");
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
byte[] toWrite = new byte[] { 'a', 'b', 'c', 'd'};
SimpleDataWriter writer = buildSimpleDataWriter();
writer.write(toWrite);
writer.close();
writer.commit();
File outputFile = new File(writer.getOutputFilePath());
InputStream in = new GZIPInputStream(new FileInputStream(outputFile));
byte[] contents = IOUtils.toByteArray(in);
Assert.assertEquals(contents, toWrite, "Expected gzip'd content to be written out");
Assert.assertTrue(outputFile.getName().endsWith(".gzip"), "Expected gzip'd file to end in .gzip");
}
@Test
public void testSupportsGzipAndEncryption() throws IOException {
final String ENCRYPTION_TYPE = "insecure_shift";
final String COMPRESSION_TYPE = "gzip";
properties.setProp(ConfigurationKeys.WRITER_CODEC_TYPE, COMPRESSION_TYPE);
properties.setProp(ENCRYPT_PREFIX + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY,
ENCRYPTION_TYPE);
properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
byte[] toWrite = new byte[] { 'a', 'b', 'c', 'd'};
SimpleDataWriter writer = buildSimpleDataWriter();
writer.write(toWrite);
writer.close();
writer.commit();
File outputFile = new File(writer.getOutputFilePath());
Assert.assertTrue(outputFile.getName().endsWith("." + COMPRESSION_TYPE + "." + ENCRYPTION_TYPE),
"Expected compression & encryption in file name!");
InputStream decryptedFile =
EncryptionFactory.buildStreamCryptoProvider(ENCRYPTION_TYPE, Collections.<String, Object>emptyMap())
.decodeInputStream(new FileInputStream(outputFile));
InputStream uncompressedFile = new GZIPInputStream(decryptedFile);
byte[] contents = IOUtils.toByteArray(uncompressedFile);
Assert.assertEquals(contents, toWrite, "expected to decode same contents");
}
/**
* Use the simple writer to write json entries to a file and ensure that
* they are the same when read back.
*
* @throws IOException
*/
@Test
public void testWrite() throws IOException {
SimpleDataWriter writer = buildSimpleDataWriter();
int totalBytes = 3; // 3 extra bytes for the newline character
// Write all test records
for (String record : TestConstants.JSON_RECORDS) {
byte[] toWrite = record.getBytes();
Assert.assertEquals(toWrite.length, record.length()); // ensure null byte does not get added to end
writer.write(toWrite);
totalBytes += toWrite.length;
}
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), totalBytes);
File outputFile = new File(writer.getOutputFilePath());
BufferedReader br = new BufferedReader(new FileReader(outputFile));
String line;
int lineNumber = 0;
while ((line = br.readLine()) != null) {
Assert.assertEquals(line, TestConstants.JSON_RECORDS[lineNumber]);
lineNumber++;
}
br.close();
Assert.assertEquals(lineNumber, 3);
}
private SimpleDataWriter buildSimpleDataWriter()
throws IOException {
SimpleDataWriterBuilder b = (SimpleDataWriterBuilder)new SimpleDataWriterBuilder()
.writeTo(Destination.of(Destination.DestinationType.HDFS, properties)).writeInFormat(WriterOutputFormat.AVRO)
.withWriterId(TestConstants.TEST_WRITER_ID).withSchema(this.schema).forBranch(0);
return new SimpleDataWriter(b, properties);
}
/**
* If the staging file exists, the simple data writer should overwrite its contents.
*
* @throws IOException
*/
@Test
public void testOverwriteExistingStagingFile() throws IOException {
byte[] randomBytesStage = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
byte[] randomBytesWrite = { 11, 12, 13, 14, 15 };
Path stagingFile = new Path(TestConstants.TEST_STAGING_DIR + Path.SEPARATOR + this.filePath,
TestConstants.TEST_FILE_NAME + "." + TestConstants.TEST_WRITER_ID + "." + "tmp");
Configuration conf = new Configuration();
// Add all job configuration properties so they are picked up by Hadoop
for (String key : properties.getPropertyNames()) {
conf.set(key, properties.getProp(key));
}
FileSystem fs = FileSystem.get(URI.create(TestConstants.TEST_FS_URI), conf);
OutputStream os = fs.create(stagingFile);
os.write(randomBytesStage);
os.flush();
os.close();
SimpleDataWriter writer = buildSimpleDataWriter();
writer.write(randomBytesWrite);
writer.close();
writer.commit();
Assert.assertEquals(writer.recordsWritten(), 1);
Assert.assertEquals(writer.bytesWritten(), randomBytesWrite.length + 1);
File writeFile = new File(writer.getOutputFilePath());
int c, i = 0;
InputStream is = new FileInputStream(writeFile);
while ((c = is.read()) != -1) {
if (i == 5) {
Assert.assertEquals(c, (byte) newLine); // the last byte should be newline
i++;
continue;
}
Assert.assertEquals(randomBytesWrite[i], c);
i++;
}
}
@AfterMethod
public void tearDown() throws IOException {
// Clean up the staging and/or output directories if necessary
File testRootDir = new File(TestConstants.TEST_ROOT_DIR);
if (testRootDir.exists()) {
FileUtil.fullyDelete(testRootDir);
}
}
}