/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.util.recordcount;
import java.util.Random;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Preconditions;
import gobblin.util.RecordCountProvider;
/**
 * Implementation of {@link RecordCountProvider} that reads the record count from the file name.
 *
 * <p>The file name should follow the pattern
 * {@code {Prefix}{RecordCount}.{SystemCurrentTimeInMillis}.{RandomInteger}{Suffix}}, where the prefix is either
 * {@link #M_OUTPUT_FILE_PREFIX} or {@link #MR_OUTPUT_FILE_PREFIX} and the suffix is {@code .avro}.
 * For example, given the file path {@code /a/b/c/part-m-123.1444437036.12345.avro}, the record count is 123.
 */
public class CompactionRecordCountProvider extends RecordCountProvider {

  public static final String MR_OUTPUT_FILE_PREFIX = "part-r-";
  public static final String M_OUTPUT_FILE_PREFIX = "part-m-";

  private static final String SEPARATOR = ".";
  private static final String SUFFIX = ".avro";

  // Compiled once: getRecordCount would otherwise recompile the quoted separator for every file name.
  private static final Pattern SEPARATOR_PATTERN = Pattern.compile(Pattern.quote(SEPARATOR));

  private static final Random RANDOM = new Random();

  /**
   * Construct the file name as
   * {@code {filenamePrefix}{recordCount}.{SystemCurrentTimeInMillis}.{RandomInteger}.avro}.
   *
   * @param filenamePrefix either {@link #M_OUTPUT_FILE_PREFIX} or {@link #MR_OUTPUT_FILE_PREFIX}
   * @param recordCount the record count to embed in the file name
   * @return the constructed file name
   * @throws IllegalArgumentException if {@code filenamePrefix} is {@code null} or is not one of the two
   *         supported prefixes
   */
  public static String constructFileName(String filenamePrefix, long recordCount) {
    // Constant-first equals keeps a null prefix on the descriptive IllegalArgumentException path.
    // The message template is only expanded by checkArgument when the check actually fails.
    Preconditions.checkArgument(
        M_OUTPUT_FILE_PREFIX.equals(filenamePrefix) || MR_OUTPUT_FILE_PREFIX.equals(filenamePrefix),
        "%s is not a supported prefix, which should be %s, or %s.", filenamePrefix, M_OUTPUT_FILE_PREFIX,
        MR_OUTPUT_FILE_PREFIX);

    return new StringBuilder(filenamePrefix)
        .append(recordCount)
        .append(SEPARATOR)
        .append(System.currentTimeMillis())
        .append(SEPARATOR)
        .append(RANDOM.nextInt(Integer.MAX_VALUE))
        .append(SUFFIX)
        .toString();
  }

  /**
   * Get the record count through filename.
   *
   * <p>The count is the text between the supported prefix and the first {@code .} of the file name
   * (or the end of the name when it contains no {@code .}).
   *
   * @param filepath path whose last component starts with {@link #M_OUTPUT_FILE_PREFIX} or
   *        {@link #MR_OUTPUT_FILE_PREFIX}
   * @return the record count parsed from the file name
   * @throws IllegalArgumentException if the file name does not start with a supported prefix
   * @throws NumberFormatException if the text following the prefix is not a parsable {@code long}
   */
  @Override
  public long getRecordCount(Path filepath) {
    String filename = filepath.getName();
    boolean isMapOutput = filename.startsWith(M_OUTPUT_FILE_PREFIX);
    Preconditions.checkArgument(isMapOutput || filename.startsWith(MR_OUTPUT_FILE_PREFIX),
        "%s is not a supported filename, which should start with %s, or %s.", filename,
        M_OUTPUT_FILE_PREFIX, MR_OUTPUT_FILE_PREFIX);

    String matchedPrefix = isMapOutput ? M_OUTPUT_FILE_PREFIX : MR_OUTPUT_FILE_PREFIX;
    // Only the part before the first separator is needed, so stop splitting after it (limit = 2).
    String prefixWithCount = SEPARATOR_PATTERN.split(filename, 2)[0];
    return Long.parseLong(prefixWithCount.substring(matchedPrefix.length()));
  }

  /**
   * This method currently supports converting the given {@link Path} from {@link IngestionRecordCountProvider}.
   * The converted {@link Path} will start with {@link #M_OUTPUT_FILE_PREFIX}.
   *
   * <p>A path whose provider is already of this exact class is returned unchanged. For any other provider
   * class, the exception obtained from {@code getNotImplementedException(src)} is thrown.
   *
   * @param path the path to convert
   * @param src the provider whose naming scheme {@code path} currently follows
   * @return {@code path} itself, or a new path under the same parent with a newly constructed file name
   */
  @Override
  public Path convertPath(Path path, RecordCountProvider src) {
    Class<?> srcClass = src.getClass();
    if (this.getClass().equals(srcClass)) {
      return path;
    }
    if (srcClass.equals(IngestionRecordCountProvider.class)) {
      // Re-encode the count read by the source provider into this provider's naming scheme.
      String newFileName = constructFileName(M_OUTPUT_FILE_PREFIX, src.getRecordCount(path));
      return new Path(path.getParent(), newFileName);
    }
    throw getNotImplementedException(src);
  }
}