/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.retention.sql;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import org.apache.commons.dbcp.BasicDataSource;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
/**
* A Proof of concept to represent Retention policies as SQL queries. The POC uses Apache Derby in-memory database to
* store directory structure metadata.
*/
public class SqlBasedRetentionPoc {
private static final int TWO_YEARS_IN_DAYS = 365 * 2;
private static final String DAILY_PARTITION_PATTERN = "yyyy/MM/dd";
private BasicDataSource basicDataSource;
private Connection connection;
/**
* <ul>
* <li>Create the in-memory database and connect
* <li>Create tables for snapshots and daily_paritions
* <li>Attach all user defined functions from {@link SqlUdfs}
* </ul>
*
*/
@BeforeClass
public void setup() throws SQLException {
basicDataSource = new BasicDataSource();
basicDataSource.setDriverClassName("org.apache.derby.jdbc.EmbeddedDriver");
basicDataSource.setUrl("jdbc:derby:memory:derbypoc;create=true");
Connection connection = basicDataSource.getConnection();
connection.setAutoCommit(false);
this.connection = connection;
execute("CREATE TABLE Snapshots (dataset_path VARCHAR(255), name VARCHAR(255), path VARCHAR(255), ts TIMESTAMP, row_count bigint)");
execute("CREATE TABLE Daily_Partitions (dataset_path VARCHAR(255), path VARCHAR(255), ts TIMESTAMP)");
// Register UDFs
execute("CREATE FUNCTION TIMESTAMP_DIFF(timestamp1 TIMESTAMP, timestamp2 TIMESTAMP, unitString VARCHAR(50)) RETURNS BIGINT PARAMETER STYLE JAVA NO SQL LANGUAGE JAVA EXTERNAL NAME 'gobblin.data.management.retention.sql.SqlUdfs.timestamp_diff'");
}
@AfterClass
public void cleanUp() throws Exception {
basicDataSource.close();
}
/**
*
* The test inserts a few test snapshots. A query is issued to retrieve the two most recent snapshots
*/
@Test
public void testKeepLast2Snapshots() throws Exception {
insertSnapshot(new Path("/data/databases/Forum/Comments/1453743903767-PT-440505235"));
insertSnapshot(new Path("/data/databases/Forum/Comments/1453830569999-PT-440746131"));
insertSnapshot(new Path("/data/databases/Forum/Comments/1453860526464-PT-440847244"));
insertSnapshot(new Path("/data/databases/Forum/Comments/1453889323804-PT-440936752"));
// Derby does not support LIMIT keyword. The suggested workaround is to setMaxRows in the PreparedStatement
PreparedStatement statement = connection.prepareStatement("SELECT name FROM Snapshots ORDER BY ts desc");
statement.setMaxRows(2);
ResultSet rs = statement.executeQuery();
// Snapshots to be retained
rs.next();
Assert.assertEquals(rs.getString(1), "1453889323804-PT-440936752");
rs.next();
Assert.assertEquals(rs.getString(1), "1453860526464-PT-440847244");
}
/**
* The test inserts a few time partitioned datasets. A query is issued that retrieves the partitions older than 2
* years.
*/
@Test
public void testKeepLast2YearsOfDailyPartitions() throws Exception {
insertDailyPartition(new Path("/data/tracking/MetricEvent/daily/2015/11/25")); //61 days
insertDailyPartition(new Path("/data/tracking/MetricEvent/daily/2015/12/01")); // 55 days
insertDailyPartition(new Path("/data/tracking/MetricEvent/daily/2014/11/21")); // 430 days
insertDailyPartition(new Path("/data/tracking/MetricEvent/daily/2014/01/22")); // 733 days (more than 2 years)
insertDailyPartition(new Path("/data/tracking/MetricEvent/daily/2013/01/25")); // 1095 days (more than 2 years)
// Use the current timestamp for consistent test results.
Timestamp currentTimestamp =
new Timestamp(DateTimeFormat.forPattern(DAILY_PARTITION_PATTERN).parseDateTime("2016/01/25").getMillis());
PreparedStatement statement =
connection.prepareStatement("SELECT path FROM Daily_Partitions WHERE TIMESTAMP_DIFF(?, ts, 'Days') > ?");
statement.setTimestamp(1, currentTimestamp);
statement.setLong(2, TWO_YEARS_IN_DAYS);
ResultSet rs = statement.executeQuery();
// Daily partitions to be cleaned
rs.next();
Assert.assertEquals(rs.getString(1), "/data/tracking/MetricEvent/daily/2014/01/22");
rs.next();
Assert.assertEquals(rs.getString(1), "/data/tracking/MetricEvent/daily/2013/01/25");
}
private void insertSnapshot(Path snapshotPath) throws Exception {
String datasetPath = StringUtils.substringBeforeLast(snapshotPath.toString(), Path.SEPARATOR);
String snapshotName = StringUtils.substringAfterLast(snapshotPath.toString(), Path.SEPARATOR);
long ts = Long.parseLong(StringUtils.substringBefore(snapshotName, "-PT-"));
long recordCount = Long.parseLong(StringUtils.substringAfter(snapshotName, "-PT-"));
PreparedStatement insert = connection.prepareStatement("INSERT INTO Snapshots VALUES (?, ?, ?, ?, ?)");
insert.setString(1, datasetPath);
insert.setString(2, snapshotName);
insert.setString(3, snapshotPath.toString());
insert.setTimestamp(4, new Timestamp(ts));
insert.setLong(5, recordCount);
insert.executeUpdate();
}
private void insertDailyPartition(Path dailyPartitionPath) throws Exception {
String datasetPath = StringUtils.substringBeforeLast(dailyPartitionPath.toString(), Path.SEPARATOR + "daily");
DateTime partition =
DateTimeFormat.forPattern(DAILY_PARTITION_PATTERN).parseDateTime(
StringUtils.substringAfter(dailyPartitionPath.toString(), "daily" + Path.SEPARATOR));
PreparedStatement insert = connection.prepareStatement("INSERT INTO Daily_Partitions VALUES (?, ?, ?)");
insert.setString(1, datasetPath);
insert.setString(2, dailyPartitionPath.toString());
insert.setTimestamp(3, new Timestamp(partition.getMillis()));
insert.executeUpdate();
}
private void execute(String query) throws SQLException {
PreparedStatement insertStatement = connection.prepareStatement(query);
insertStatement.executeUpdate();
}
}