package eu.dnetlib.iis.common.pig.udfs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Deduplicates a bag of tuples where tuple[0] is the identifier and tuple[confidenceLevelPosition] is the
 * confidence level. When several tuples share an identifier, the one with the highest confidence level is kept.
 * Whenever deduplication takes place (input bag holding more than one tuple) the resulting tuples are ordered
 * lexicographically by identifier, tuples with a <code>null</code> identifier being placed first.
 *
 * @author mhorst
 */
public class IdConfidenceTupleDeduplicator extends EvalFunc<DataBag> {

    /**
     * Lexicographical identifier ordering which tolerates <code>null</code> identifiers by placing them first.
     * A naturally ordered {@link TreeMap} rejects <code>null</code> keys with {@link NullPointerException},
     * which would make the whole evaluation fail on a single tuple lacking an identifier.
     */
    private static final Comparator<String> NULLS_FIRST_ID_ORDER = new Comparator<String>() {
        @Override
        public int compare(String left, String right) {
            if (left == null) {
                return right == null ? 0 : -1;
            }
            if (right == null) {
                return 1;
            }
            return left.compareTo(right);
        }
    };

    /**
     * Confidence level position in tuple. First element is indexed with 0.
     */
    private final int confidenceLevelPosition;

    //------------------------ CONSTRUCTORS --------------------------

    /**
     * Creates deduplicator expecting confidence level at position 1, right after the identifier.
     */
    public IdConfidenceTupleDeduplicator() {
        this(1);
    }

    /**
     * @param confidenceLevelPosition position of the confidence level in each tuple, first element is indexed with 0
     */
    public IdConfidenceTupleDeduplicator(int confidenceLevelPosition) {
        super();
        this.confidenceLevelPosition = confidenceLevelPosition;
    }

    /**
     * @param confidenceLevelPosition {@link String} representation of {@link Integer} value.
     * Required by PIG.
     * @throws NumberFormatException when the value cannot be parsed as an integer
     */
    public IdConfidenceTupleDeduplicator(String confidenceLevelPosition) {
        this(Integer.parseInt(confidenceLevelPosition));
    }

    //------------------------ PUBLIC --------------------------

    /**
     * Deduplicates tuples by grouping them by identifier stored in tuple[0] and picking the one with highest
     * confidence level stored in tuple[confidenceLevelPosition]. When confidence levels are equal the tuple
     * encountered first is kept.
     *
     * @param tuple input whose first element is the {@link DataBag} holding group of tuples to be deduplicated
     * @return <code>null</code> when <code>tuple</code> is <code>null</code> or empty; the input bag itself
     * (possibly <code>null</code>) when it holds at most one tuple; otherwise a new bag holding one tuple
     * per identifier, ordered by identifier
     */
    @Override
    public DataBag exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0) {
            return null;
        }
        DataBag inputTuples = (DataBag) tuple.get(0);
        if (inputTuples == null || inputTuples.size() <= 1) {
            // nothing to deduplicate, handing the input over untouched
            return inputTuples;
        }
        // sorted map gives both the grouping by identifier and the lexicographical output order
        Map<String, Tuple> deduplicatedTuplesMap = new TreeMap<String, Tuple>(NULLS_FIRST_ID_ORDER);
        for (Tuple currentTuple : inputTuples) {
            updateStoredTupleWhenConfidenceLevelHigher(currentTuple, deduplicatedTuplesMap);
        }
        return BagFactory.getInstance().newDefaultBag(new ArrayList<Tuple>(deduplicatedTuplesMap.values()));
    }

    /**
     * Returns the input schema unchanged.
     */
    @Override
    public Schema outputSchema(Schema input) {
        return input;
    }

    //------------------------ PRIVATE --------------------------

    /**
     * Stores <code>newTuple</code> in <code>deduplicatedTuplesMap</code> when its confidence level is
     * higher than already stored tuple or when tuple for given identifier (stored as first tuple element) was not stored yet.
     * A tuple with undefined (<code>null</code>) confidence level never replaces an already stored tuple.
     * Confidence level may be any {@link Number}, values are compared as doubles.
     */
    private void updateStoredTupleWhenConfidenceLevelHigher(Tuple newTuple, Map<String, Tuple> deduplicatedTuplesMap) throws ExecException {
        String tupleId = (String) newTuple.get(0);
        Tuple storedTuple = deduplicatedTuplesMap.get(tupleId);
        if (storedTuple == null) {
            // first tuple seen for this identifier
            deduplicatedTuplesMap.put(tupleId, newTuple);
            return;
        }
        // casting to Number rather than Float so double or integer confidence levels do not fail with ClassCastException;
        // widening a float to double is lossless, hence float values compare exactly as before
        Number newTupleConfidenceLevel = (Number) newTuple.get(this.confidenceLevelPosition);
        Number storedTupleConfidenceLevel = (Number) storedTuple.get(this.confidenceLevelPosition);
        if (newTupleConfidenceLevel != null &&
                (storedTupleConfidenceLevel == null
                        || newTupleConfidenceLevel.doubleValue() > storedTupleConfidenceLevel.doubleValue())) {
            // replacing stored tuple with new tuple when confidence level is higher or when stored tuple didn't have it defined
            deduplicatedTuplesMap.put(tupleId, newTuple);
        }
    }
}