/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.example.wikipedia;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.google.common.base.Function;
import com.google.common.base.Predicates;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.JsonElement;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.WorkUnitState;
import gobblin.source.Source;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.WatermarkInterval;
import gobblin.source.extractor.extract.AbstractSource;
import gobblin.source.extractor.extract.LongWatermark;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.source.workunit.Extract.TableType;
/**
* An implementation of {@link Source} for the Wikipedia example.
*
* <p>
* This source creates a {@link gobblin.source.workunit.WorkUnit}, and uses
* {@link WikipediaExtractor} to pull the data from Wikipedia.
* </p>
*
* @author Ziyang Liu
*/
public class WikipediaSource extends AbstractSource<String, JsonElement> {
public static final String ARTICLE_TITLE="gobblin.wikipediaSource.workUnit.title";
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().
splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
Iterable<LongWatermark> watermarks =
Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {
@Override
public LongWatermark apply(WorkUnitState wus) {
return wus.getActualHighWatermark(LongWatermark.class);
}
});
watermarks = Iterables.filter(watermarks, Predicates.notNull());
List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
if (watermarkList.size() > 0) {
prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
}
}
Extract extract = createExtract(TableType.SNAPSHOT_ONLY,
state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
List<WorkUnit> workUnits = Lists.newArrayList();
for (String title : titles) {
LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) :
new LongWatermark(-1);
prevHighWatermarks.remove(title);
WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
workUnits.add(workUnit);
}
for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(),
nonProcessedDataset.getValue()));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
workUnits.add(workUnit);
}
return workUnits;
}
@Override
public Extractor<String, JsonElement> getExtractor(WorkUnitState state) throws IOException {
return new WikipediaExtractor(state);
}
@Override
public void shutdown(SourceState state) {
//nothing to do
}
}