package gobblin.compaction.parser; import com.google.common.base.Joiner; import gobblin.dataset.FileSystemDataset; import lombok.AllArgsConstructor; import org.apache.commons.lang.StringUtils; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import com.google.common.base.Preconditions; import lombok.Getter; import lombok.Setter; import gobblin.compaction.mapreduce.MRCompactor; import gobblin.configuration.State; /** * A parser which converts {@link FileSystemDataset} to {@link CompactionParserResult} */ @AllArgsConstructor public class CompactionPathParser { State state; /** * A parsed result returned by {@link CompactionPathParser#parse(FileSystemDataset)} */ public static class CompactionParserResult { @Getter @Setter private String srcBaseDir; @Getter @Setter private String dstBaseDir; @Getter @Setter private String srcSubDir; @Getter @Setter private String dstSubDir; @Getter private DateTime time; @Getter private String timeString; @Getter private String datasetName; @Getter private String dstAbsoluteDir; } /** * Parse a {@link FileSystemDataset} to some detailed parts like source base directory, * source sub directory, destination based directory, destination sub directory, and time * information. */ public CompactionParserResult parse (FileSystemDataset dataset) { CompactionParserResult result = new CompactionParserResult(); result.srcBaseDir = getSrcBaseDir (state); result.srcSubDir = getSrcSubDir (state); result.dstBaseDir = getDstBaseDir (state); result.dstSubDir = getDstSubDir (state); parseTimeAndDatasetName(dataset, result); result.dstAbsoluteDir = Joiner.on("/").join (result.dstBaseDir, result.datasetName, result.dstSubDir, result.timeString); return result; } private void parseTimeAndDatasetName (FileSystemDataset dataset, CompactionParserResult rst) { String commonBase = rst.getSrcBaseDir(); String fullPath = dataset.datasetURN(); int startPos = fullPath.indexOf(commonBase) + commonBase.length(); String relative = StringUtils.removeStart(fullPath.substring(startPos), "/"); int delimiterStart = StringUtils.indexOf(relative, rst.getSrcSubDir()); if (delimiterStart == -1) { throw new StringIndexOutOfBoundsException(); } int delimiterEnd = relative.indexOf("/", delimiterStart); String datasetName = StringUtils.removeEnd(relative.substring(0, delimiterStart), "/"); String timeString = StringUtils.removeEnd(relative.substring(delimiterEnd + 1), "/"); rst.datasetName = datasetName; rst.timeString = timeString; rst.time = getTime (timeString); } private DateTime getTime (String timeString) { DateTimeZone timeZone = DateTimeZone.forID(MRCompactor.DEFAULT_COMPACTION_TIMEZONE); int splits = StringUtils.countMatches(timeString, "/"); String timePattern = ""; if (splits == 3) { timePattern = "YYYY/MM/dd/HH"; } else if (splits == 2) { timePattern = "YYYY/MM/dd"; } DateTimeFormatter timeFormatter = DateTimeFormat.forPattern(timePattern).withZone(timeZone); return timeFormatter.parseDateTime (timeString); } private String getSrcBaseDir(State state) { Preconditions.checkArgument(state.contains(MRCompactor.COMPACTION_INPUT_DIR), "Missing required property " + MRCompactor.COMPACTION_INPUT_DIR); return state.getProp(MRCompactor.COMPACTION_INPUT_DIR); } private String getSrcSubDir(State state) { Preconditions.checkArgument(state.contains(MRCompactor.COMPACTION_INPUT_SUBDIR), "Missing required property " + MRCompactor.COMPACTION_INPUT_SUBDIR); return state.getProp(MRCompactor.COMPACTION_INPUT_SUBDIR); } private String getDstBaseDir(State state) { Preconditions.checkArgument(state.contains(MRCompactor.COMPACTION_DEST_DIR), "Missing required property " + MRCompactor.COMPACTION_DEST_DIR); return state.getProp(MRCompactor.COMPACTION_DEST_DIR); } private String getDstSubDir(State state) { Preconditions.checkArgument(state.contains(MRCompactor.COMPACTION_DEST_SUBDIR), "Missing required property " + MRCompactor.COMPACTION_DEST_SUBDIR); return state.getProp(MRCompactor.COMPACTION_DEST_SUBDIR); } }