package gobblin.compaction.verify;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;

import lombok.Getter;
import lombok.extern.slf4j.Slf4j;

import gobblin.compaction.dataset.DatasetHelper;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.util.HadoopUtils;
import gobblin.util.RecordCountProvider;
import gobblin.util.recordcount.IngestionRecordCountProvider;

/**
 * A helper class to calculate, serialize, and deserialize record counts.
 *
 * With the default {@link IngestionRecordCountProvider}, input file names are expected to be in
 * the format {file_name}.{record_count}.{extension}. For example, given the file path
 * "/a/b/c/file.123.avro", the record count is 123.
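 *
 * <p>A typical usage sketch (the {@code state}, {@code paths}, and {@code dir} values are
 * illustrative):
 * <pre>{@code
 *   InputRecordCountHelper helper = new InputRecordCountHelper(state);
 *   long count = helper.calculateRecordCount(paths);
 *   InputRecordCountHelper.writeRecordCount(helper.getFs(), dir, count);
 * }</pre>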
*/
@Slf4j
public class InputRecordCountHelper {
@Getter
private final FileSystem fs;
private final State state;
private final RecordCountProvider inputRecordCountProvider;

  private static final String AVRO = "avro";
  public static final String RECORD_COUNT_FILE = "_record_count";

  /**
   * Constructs a helper from the given {@link State}, which supplies the source file system URI
   * and the {@link RecordCountProvider} implementation to use.
   */
  public InputRecordCountHelper(State state) {
    try {
      this.fs = getSourceFileSystem(state);
      this.state = state;
      // Instantiate the configured RecordCountProvider via reflection
      this.inputRecordCountProvider = (RecordCountProvider) Class
          .forName(state.getProp(MRCompactor.COMPACTION_INPUT_RECORD_COUNT_PROVIDER,
              MRCompactor.DEFAULT_COMPACTION_INPUT_RECORD_COUNT_PROVIDER))
          .newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Failed to instantiate " + InputRecordCountHelper.class.getName(), e);
    }
  }

  /**
   * Calculates the total record count across the given paths.
   *
   * @param paths paths under which record counts are calculated
   * @return the total record count parsed from all applicable files under the given paths
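   *
   * <p>For example (the {@code helper} value is illustrative):
   * <pre>{@code
   *   long total = helper.calculateRecordCount(Lists.newArrayList(new Path("/a/b/c")));
   * }</pre>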
*/
  public long calculateRecordCount(Collection<Path> paths) throws IOException {
    long sum = 0;
    for (Path path : paths) {
      sum += this.inputRecordCountProvider
          .getRecordCount(DatasetHelper.getApplicableFilePaths(this.fs, path, Lists.newArrayList(AVRO)));
    }
    return sum;
  }

  /**
   * Reads the record count from the file named {@link InputRecordCountHelper#RECORD_COUNT_FILE}
   * in the given directory.
   *
   * @param fs file system in use
   * @param dir directory from which the record count file is read
   * @return the record count, or 0 if no record count file exists
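   *
   * <p>For example (the {@code fs} value is illustrative):
   * <pre>{@code
   *   long previous = InputRecordCountHelper.readRecordCount(fs, new Path("/a/b/c"));
   * }</pre>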
*/
  public static long readRecordCount(FileSystem fs, Path dir) throws IOException {
    Path recordCountFile = new Path(dir, RECORD_COUNT_FILE);
    if (!fs.exists(recordCountFile)) {
      return 0;
    }
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(recordCountFile), Charsets.UTF_8))) {
      return Long.parseLong(br.readLine());
    }
  }

  /**
   * Writes the record count to the file named {@link InputRecordCountHelper#RECORD_COUNT_FILE}
   * in the given directory, overwriting any existing record count file.
   *
   * @param fs file system in use
   * @param dir directory where the record count file is saved
   * @param count record count to persist
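   *
   * <p>For example (the {@code fs} value is illustrative):
   * <pre>{@code
   *   InputRecordCountHelper.writeRecordCount(fs, new Path("/a/b/c"), 123L);
   * }</pre>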
*/
  public static void writeRecordCount(FileSystem fs, Path dir, long count) throws IOException {
    try (FSDataOutputStream outputFileStream = fs.create(new Path(dir, RECORD_COUNT_FILE))) {
      outputFileStream.writeBytes(Long.toString(count));
    }
  }
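
  /**
   * Builds the source {@link FileSystem} from {@link ConfigurationKeys#SOURCE_FILEBASED_FS_URI}
   * (defaulting to the local file system), optionally wrapped with throttling by
   * {@link HadoopUtils#getOptionallyThrottledFileSystem}.
   */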
  protected FileSystem getSourceFileSystem(State state) throws IOException {
    Configuration conf = HadoopUtils.getConfFromState(state);
    String uri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    return HadoopUtils.getOptionallyThrottledFileSystem(FileSystem.get(URI.create(uri), conf), state);
  }
}