/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.copy.hive;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import org.testng.Assert;
import org.testng.annotations.Test;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import gobblin.configuration.State;
import gobblin.data.management.copy.CopyEntity;
import gobblin.data.management.copy.entities.PostPublishStep;
import gobblin.data.management.copy.hive.HiveCopyEntityHelper.DeregisterFileDeleteMethod;
import gobblin.hive.HiveRegProps;
import gobblin.metrics.event.MultiTimingEvent;
public class HiveCopyEntityHelperTest {
private final Path sourceRoot = new Path("/source");
private final Path targetRoot = new Path("/target");
/**
 * Checks {@link HiveTargetPathHelper#resolvePath} against three templates: one using both {@code $DB} and
 * {@code $TABLE}, one using only {@code $TABLE}, and one without any token (for which the expected result is the
 * template with the table name appended).
 */
@Test
public void testResolvePath() throws Exception {
  final String database = "db";
  final String tableName = "table";

  // Template with both tokens.
  Path resolvedDbAndTable = HiveTargetPathHelper.resolvePath("/data/$DB/$TABLE", database, tableName);
  Assert.assertEquals(resolvedDbAndTable, new Path("/data/db/table"));

  // Template with the table token only.
  Path resolvedTableOnly = HiveTargetPathHelper.resolvePath("/data/$TABLE", database, tableName);
  Assert.assertEquals(resolvedTableOnly, new Path("/data/table"));

  // Template without tokens.
  Path resolvedNoTokens = HiveTargetPathHelper.resolvePath("/data", database, tableName);
  Assert.assertEquals(resolvedNoTokens, new Path("/data/table"));
}
/**
 * Runs {@link HiveCopyEntityHelper#fullPathDiff} with an existing target location that holds a copy of the
 * desired target files, then compares the resulting copy and delete sets with the expectations filled in by
 * {@link #populateSourceAndTargetEntities}.
 */
@Test
public void testFullPathDiff() throws Exception {
  Map<Path, FileStatus> sourceFiles = Maps.newHashMap();
  Map<Path, FileStatus> desiredTargetFiles = Maps.newHashMap();
  List<Path> mustCopy = Lists.newArrayList();
  List<Path> mustNotCopy = Lists.newArrayList();
  List<Path> mustDelete = Lists.newArrayList();
  List<Path> mustNotDelete = Lists.newArrayList();
  populateSourceAndTargetEntities(sourceFiles, desiredTargetFiles, mustCopy, mustNotCopy, mustDelete,
      mustNotDelete);

  // Stubbed target-path computation: replaces the source root string with the target root string.
  Answer<Path> swapRoots = new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation)
        throws Throwable {
      Path requested = (Path) invocation.getArguments()[0];
      String rewritten = requested.toString().replace(sourceRoot.toString(), targetRoot.toString());
      return new Path(rewritten);
    }
  };
  HiveTargetPathHelper pathHelper = Mockito.mock(HiveTargetPathHelper.class);
  Mockito.when(pathHelper.getTargetPath(Mockito.any(Path.class), Mockito.any(FileSystem.class),
      Mockito.any(Optional.class), Mockito.anyBoolean())).then(swapRoots);

  HiveCopyEntityHelper copyHelper = Mockito.mock(HiveCopyEntityHelper.class);
  Mockito.when(copyHelper.getTargetPathHelper()).thenReturn(pathHelper);
  MultiTimingEvent timingEvent = Mockito.mock(MultiTimingEvent.class);

  TestLocationDescriptor source = new TestLocationDescriptor(sourceFiles);
  TestLocationDescriptor desiredTarget = new TestLocationDescriptor(desiredTargetFiles);
  // The existing target starts out with its own copy of the desired target files.
  TestLocationDescriptor existingTarget = new TestLocationDescriptor(Maps.newHashMap(desiredTargetFiles));

  HiveCopyEntityHelper.DiffPathSet diff = HiveCopyEntityHelper.fullPathDiff(source, desiredTarget,
      Optional.<HiveLocationDescriptor>of(existingTarget), Optional.<Partition>absent(), timingEvent, copyHelper);

  // Files to copy: exact count, every expected file present, every skipped file absent.
  Assert.assertEquals(diff.filesToCopy.size(), mustCopy.size());
  for (Path wanted : mustCopy) {
    Assert.assertTrue(containsPath(diff.filesToCopy, wanted));
  }
  for (Path unwanted : mustNotCopy) {
    Assert.assertFalse(containsPath(diff.filesToCopy, unwanted));
  }

  // Paths to delete: same three checks.
  Assert.assertEquals(diff.pathsToDelete.size(), mustDelete.size());
  for (Path wanted : mustDelete) {
    Assert.assertTrue(diff.pathsToDelete.contains(wanted));
  }
  for (Path unwanted : mustNotDelete) {
    Assert.assertFalse(diff.pathsToDelete.contains(unwanted));
  }
}
/**
 * Verifies that {@link HiveCopyEntityHelper#fullPathDiff} fails with an {@link IOException} when the desired
 * target location contains a file ({@code /target/path6}) that is neither in the source nor in the existing
 * target location, and the helper's un-managed data policy is
 * {@link HiveCopyEntityHelper.UnmanagedDataPolicy#ABORT}.
 */
@Test
public void testFullPathDiffWithUnmanagedPathsWithoutDeletePolicy() throws Exception {
  Map<Path, FileStatus> sourceMap = Maps.newHashMap();
  Map<Path, FileStatus> targetDesiredMap = Maps.newHashMap();
  // The expectation lists are required by the populate helper's signature; this test does not assert on them
  // because the diff is expected to abort before producing a result.
  List<Path> expectedFilesToCopy = Lists.newArrayList();
  List<Path> expectedFilesToSkipCopy = Lists.newArrayList();
  List<Path> expectedFilesToDelete = Lists.newArrayList();
  List<Path> expectedFilesToSkipDelete = Lists.newArrayList();
  populateSourceAndTargetEntities(sourceMap, targetDesiredMap, expectedFilesToCopy,
      expectedFilesToSkipCopy, expectedFilesToDelete, expectedFilesToSkipDelete);

  // Add an un-managed file to the desired target location only.
  Path path6 = new Path("path6");
  Path targetPath6 = new Path(targetRoot, path6);
  Map<Path, FileStatus> targetDesiredMapWithExtraFile = Maps.newHashMap(targetDesiredMap);
  targetDesiredMapWithExtraFile.put(targetPath6, getFileStatus(targetPath6, 0, 10));

  TestLocationDescriptor sourceLocation = new TestLocationDescriptor(sourceMap);
  TestLocationDescriptor targetDesiredLocation = new TestLocationDescriptor(targetDesiredMapWithExtraFile);
  TestLocationDescriptor existingTargetLocation = new TestLocationDescriptor(Maps.newHashMap(targetDesiredMap));

  Table table = Mockito.mock(Table.class);
  HiveDataset hiveDataset = Mockito.mock(HiveDataset.class);
  MultiTimingEvent timer = Mockito.mock(MultiTimingEvent.class);
  HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
  HiveTargetPathHelper targetPathHelper = Mockito.mock(HiveTargetPathHelper.class);

  // The table name is stubbed because it appears in the expected exception message below.
  Mockito.when(helper.getDataset()).thenReturn(hiveDataset);
  Mockito.when(hiveDataset.getTable()).thenReturn(table);
  Mockito.when(table.getCompleteName()).thenReturn("table1");

  // Stubbed target-path computation: replaces the source root string with the target root string.
  Mockito.when(targetPathHelper
      .getTargetPath(Mockito.any(Path.class), Mockito.any(FileSystem.class), Mockito.any(Optional.class),
          Mockito.anyBoolean())).then(new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation)
        throws Throwable {
      Path path = (Path) invocation.getArguments()[0];
      return new Path(path.toString().replace(sourceRoot.toString(), targetRoot.toString()));
    }
  });
  Mockito.when(helper.getTargetPathHelper()).thenReturn(targetPathHelper);

  // Policy: do not delete un-managed data, abort instead.
  Mockito.when(helper.getUnmanagedDataPolicy()).thenReturn(HiveCopyEntityHelper.UnmanagedDataPolicy.ABORT);

  // The diff must fail, reporting the un-managed file.
  try {
    HiveCopyEntityHelper.fullPathDiff(sourceLocation, targetDesiredLocation,
        Optional.<HiveLocationDescriptor>of(existingTargetLocation), Optional.<Partition>absent(), timer, helper);
    Assert.fail("Expected an IOException but did not receive any");
  } catch (IOException ex) {
    // The IOException is the expected outcome; the test passes only if its message matches exactly.
    String expectedExceptionMessage = "New table / partition would pick up existing, undesired files in target file "
        + "system. table1, files [/target/path6].";
    Assert.assertEquals(ex.getMessage(), expectedExceptionMessage);
  }
}
/**
 * Runs {@link HiveCopyEntityHelper#fullPathDiff} with an extra file ({@code /target/path6}) present only in the
 * desired target location while the helper's un-managed data policy is
 * {@link HiveCopyEntityHelper.UnmanagedDataPolicy#DELETE_UNMANAGED_DATA}. The call is expected to succeed and the
 * extra file is expected among the paths to delete.
 */
@Test
public void testFullPathDiffWithUnmanagedPathsWithDeletePolicy() throws Exception {
  Map<Path, FileStatus> sourceFiles = Maps.newHashMap();
  Map<Path, FileStatus> desiredTargetFiles = Maps.newHashMap();
  List<Path> mustCopy = Lists.newArrayList();
  List<Path> mustNotCopy = Lists.newArrayList();
  List<Path> mustDelete = Lists.newArrayList();
  List<Path> mustNotDelete = Lists.newArrayList();
  populateSourceAndTargetEntities(sourceFiles, desiredTargetFiles, mustCopy, mustNotCopy, mustDelete,
      mustNotDelete);

  // Un-managed file: present in the desired target location only, and expected to be deleted.
  Path unmanagedTargetFile = new Path(targetRoot, new Path("path6"));
  Map<Path, FileStatus> desiredTargetFilesPlusUnmanaged = Maps.newHashMap(desiredTargetFiles);
  desiredTargetFilesPlusUnmanaged.put(unmanagedTargetFile, getFileStatus(unmanagedTargetFile, 0, 10));
  mustDelete.add(unmanagedTargetFile);

  TestLocationDescriptor source = new TestLocationDescriptor(sourceFiles);
  TestLocationDescriptor desiredTarget = new TestLocationDescriptor(desiredTargetFilesPlusUnmanaged);
  TestLocationDescriptor existingTarget = new TestLocationDescriptor(Maps.newHashMap(desiredTargetFiles));

  // Dataset / table mocks.
  Table hiveTable = Mockito.mock(Table.class);
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  MultiTimingEvent timingEvent = Mockito.mock(MultiTimingEvent.class);
  HiveCopyEntityHelper copyHelper = Mockito.mock(HiveCopyEntityHelper.class);
  HiveTargetPathHelper pathHelper = Mockito.mock(HiveTargetPathHelper.class);
  Mockito.when(copyHelper.getDataset()).thenReturn(dataset);
  Mockito.when(dataset.getTable()).thenReturn(hiveTable);
  Mockito.when(hiveTable.getCompleteName()).thenReturn("table1");

  // Stubbed target-path computation: replaces the source root string with the target root string.
  Answer<Path> swapRoots = new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation)
        throws Throwable {
      Path requested = (Path) invocation.getArguments()[0];
      String rewritten = requested.toString().replace(sourceRoot.toString(), targetRoot.toString());
      return new Path(rewritten);
    }
  };
  Mockito.when(pathHelper.getTargetPath(Mockito.any(Path.class), Mockito.any(FileSystem.class),
      Mockito.any(Optional.class), Mockito.anyBoolean())).then(swapRoots);
  Mockito.when(copyHelper.getTargetPathHelper()).thenReturn(pathHelper);

  // Policy under test: un-managed data gets deleted.
  Mockito.when(copyHelper.getUnmanagedDataPolicy())
      .thenReturn(HiveCopyEntityHelper.UnmanagedDataPolicy.DELETE_UNMANAGED_DATA);

  // With this policy no exception is expected; the un-managed file should be listed in pathsToDelete.
  HiveCopyEntityHelper.DiffPathSet diff = HiveCopyEntityHelper.fullPathDiff(source, desiredTarget,
      Optional.<HiveLocationDescriptor>of(existingTarget), Optional.<Partition>absent(), timingEvent, copyHelper);

  // Files to copy: exact count, every expected file present, every skipped file absent.
  Assert.assertEquals(diff.filesToCopy.size(), mustCopy.size());
  for (Path wanted : mustCopy) {
    Assert.assertTrue(containsPath(diff.filesToCopy, wanted));
  }
  for (Path unwanted : mustNotCopy) {
    Assert.assertFalse(containsPath(diff.filesToCopy, unwanted));
  }

  // Paths to delete: same three checks.
  Assert.assertEquals(diff.pathsToDelete.size(), mustDelete.size());
  for (Path wanted : mustDelete) {
    Assert.assertTrue(diff.pathsToDelete.contains(wanted));
  }
  for (Path unwanted : mustNotDelete) {
    Assert.assertFalse(diff.pathsToDelete.contains(unwanted));
  }
}
/**
 * Fills the given maps with a fixed set of source and desired-target file statuses (files {@code path1} to
 * {@code path5} under the source and target roots) and records, in the four lists, which paths a diff is
 * expected to copy, not copy, delete, and not delete.
 *
 * @param sourceMap receives the source file statuses keyed by path
 * @param targetDesiredMap receives the desired target file statuses keyed by path
 * @param expectedFilesToCopy receives source paths expected to be copied
 * @param expectedFilesToSkipCopy receives source paths expected not to be copied
 * @param expectedFilesToDelete receives target paths expected to be deleted
 * @param expectedFilesToSkipDelete receives target paths expected not to be deleted
 */
private void populateSourceAndTargetEntities(Map<Path, FileStatus> sourceMap, Map<Path, FileStatus> targetDesiredMap,
    List<Path> expectedFilesToCopy, List<Path> expectedFilesToSkipCopy,
    List<Path> expectedFilesToDelete, List<Path> expectedFilesToSkipDelete) {

  // path1: already exists in target -> neither copied nor deleted.
  Path sourceExisting = new Path(sourceRoot, new Path("path1"));
  Path targetExisting = new Path(targetRoot, new Path("path1"));
  sourceMap.put(sourceExisting, getFileStatus(sourceExisting, 0, 0));
  targetDesiredMap.put(targetExisting, getFileStatus(targetExisting, 0, 10));
  expectedFilesToSkipCopy.add(sourceExisting);
  expectedFilesToSkipDelete.add(targetExisting);

  // path2: does not exist in target -> copied, nothing to delete.
  Path sourceMissing = new Path(sourceRoot, new Path("path2"));
  Path targetMissing = new Path(targetRoot, new Path("path2"));
  sourceMap.put(sourceMissing, getFileStatus(sourceMissing, 0, 0));
  expectedFilesToCopy.add(sourceMissing);
  expectedFilesToSkipDelete.add(targetMissing);

  // path3: exists in target with a different length -> copied, target deleted.
  Path sourceOtherLength = new Path(sourceRoot, new Path("path3"));
  Path targetOtherLength = new Path(targetRoot, new Path("path3"));
  sourceMap.put(sourceOtherLength, getFileStatus(sourceOtherLength, 0, 0));
  targetDesiredMap.put(targetOtherLength, getFileStatus(targetOtherLength, 10, 0));
  expectedFilesToCopy.add(sourceOtherLength);
  expectedFilesToDelete.add(targetOtherLength);

  // path4: exists in target, source has the newer modtime -> copied, target deleted.
  Path sourceNewer = new Path(sourceRoot, new Path("path4"));
  Path targetOlder = new Path(targetRoot, new Path("path4"));
  sourceMap.put(sourceNewer, getFileStatus(sourceNewer, 0, 10));
  targetDesiredMap.put(targetOlder, getFileStatus(targetOlder, 0, 0));
  expectedFilesToCopy.add(sourceNewer);
  expectedFilesToDelete.add(targetOlder);

  // path5: only on target -> nothing to copy, target deleted.
  Path sourceAbsent = new Path(sourceRoot, new Path("path5"));
  Path targetOnly = new Path(targetRoot, new Path("path5"));
  targetDesiredMap.put(targetOnly, getFileStatus(targetOnly, 0, 10));
  expectedFilesToSkipCopy.add(sourceAbsent);
  expectedFilesToDelete.add(targetOnly);
}
/**
 * Calls the real {@code addTableDeregisterSteps} on an otherwise mocked {@link HiveCopyEntityHelper} and checks
 * that it returns priority 1 for an initial priority of 0 and appends exactly one {@link PostPublishStep} whose
 * step description names the table and the target metastore URI.
 */
@Test
public void testAddTableDeregisterSteps() throws Exception {
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  Mockito.when(dataset.getProperties()).thenReturn(new Properties());

  // Everything on the helper is stubbed except the method under test, which runs its real implementation.
  HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
  Mockito.when(helper.getDeleteMethod()).thenReturn(DeregisterFileDeleteMethod.NO_DELETE);
  Mockito.when(helper.getTargetURI()).thenReturn(Optional.of("/targetURI"));
  Mockito.when(helper.getHiveRegProps()).thenReturn(new HiveRegProps(new State()));
  Mockito.when(helper.getDataset()).thenReturn(dataset);
  Mockito.when(helper.addTableDeregisterSteps(Mockito.any(List.class), Mockito.any(String.class), Mockito.anyInt(),
      Mockito.any(Table.class))).thenCallRealMethod();

  // The ql.metadata Table wraps a metastore API table that supplies the database and table names.
  org.apache.hadoop.hive.metastore.api.Table apiTable =
      Mockito.mock(org.apache.hadoop.hive.metastore.api.Table.class);
  Mockito.when(apiTable.getDbName()).thenReturn("TestDB");
  Mockito.when(apiTable.getTableName()).thenReturn("TestTable");
  Table metaTable = Mockito.mock(Table.class);
  Mockito.when(metaTable.getTTable()).thenReturn(apiTable);

  List<CopyEntity> copyEntities = new ArrayList<CopyEntity>();
  String fileSet = "testFileSet";
  int initialPriority = 0;

  int priority = helper.addTableDeregisterSteps(copyEntities, fileSet, initialPriority, metaTable);

  // assertEquals (actual, expected) reports the offending value on failure, unlike assertTrue(a == b).
  Assert.assertEquals(priority, 1);
  Assert.assertEquals(copyEntities.size(), 1);
  Assert.assertTrue(copyEntities.get(0) instanceof PostPublishStep);

  PostPublishStep postPublishStep = (PostPublishStep) copyEntities.get(0);
  String stepDescription = postPublishStep.getStep().toString();
  Assert.assertTrue(
      stepDescription.contains("Deregister table TestDB.TestTable on Hive metastore /targetURI"),
      "Unexpected step description: " + stepDescription);
}
/**
 * Checks that {@link HiveCopyEntityHelper#replacedPrefix} maps a file under {@code /data/databases} to the same
 * relative location under {@code /data/databases/_parallel}.
 */
@Test
public void testReplacedPrefix() throws Exception {
  Path oldPrefix = new Path("/data/databases");
  Path newPrefix = new Path("/data/databases/_parallel");
  Path original = new Path("/data/databases/DB1/Table1/SS1/part1.avro");

  Path actual = HiveCopyEntityHelper.replacedPrefix(original, oldPrefix, newPrefix);

  Assert.assertEquals(actual, new Path("/data/databases/_parallel/DB1/Table1/SS1/part1.avro"));
}
/**
 * Tells whether any status in the collection refers to the given path.
 *
 * @param statuses file statuses to search
 * @param path path to look for
 * @return {@code true} if at least one status's path equals {@code path}, {@code false} otherwise
 */
private boolean containsPath(Collection<FileStatus> statuses, Path path) {
  boolean found = false;
  for (FileStatus candidate : statuses) {
    Path candidatePath = candidate.getPath();
    if (candidatePath.equals(path)) {
      found = true;
      // First match is enough.
      break;
    }
  }
  return found;
}
/**
 * Builds a non-directory {@link FileStatus} for {@code path} with the given length and modification time; the
 * two remaining numeric constructor arguments are passed as zero.
 *
 * @param path path the status refers to
 * @param len file length
 * @param modtime modification time
 * @return the new file status
 */
private FileStatus getFileStatus(Path path, long len, long modtime) {
  final boolean isDirectory = false;
  // Third and fourth constructor arguments (block replication and block size per the Hadoop API) are not
  // relevant to these tests.
  final int blockReplication = 0;
  final int blockSize = 0;
  return new FileStatus(len, isDirectory, blockReplication, blockSize, modtime, path);
}
public class TestLocationDescriptor extends HiveLocationDescriptor {
Map<Path, FileStatus> paths;
public TestLocationDescriptor(Map<Path, FileStatus> paths) {
super(null, null, null, null);
this.paths = paths;
}
@Override
public Map<Path, FileStatus> getPaths()
throws IOException {
return this.paths;
}
}
}