package eu.dnetlib.iis.common.pig.udfs;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaUtil;
/**
* Computes the difference between two data bags.
* The output data bag contains those elements from the first input bag, that
* were not present in the second one.
*
* @author Dominika Tkaczyk
*/
public class StringBagsDifference extends EvalFunc<DataBag> {
@Override
public DataBag exec(Tuple tuple) throws IOException {
if (tuple == null || tuple.size() != 2) {
return null;
}
DataBag dbMain = (DataBag) tuple.get(0);
DataBag dbSub = (DataBag) tuple.get(1);
if (dbMain == null || dbSub == null) {
return dbMain;
}
List<Tuple> tuples = new ArrayList<Tuple>();
Iterator<Tuple> itMain = dbMain.iterator();
while (itMain.hasNext()) {
tuples.add(itMain.next());
}
Iterator<Tuple> itSub = dbSub.iterator();
while (itSub.hasNext()) {
tuples.remove(itSub.next());
}
if (tuples.isEmpty()) {
return null;
}
BagFactory bagFactory = BagFactory.getInstance();
return bagFactory.newDefaultBag(tuples);
}
@Override
public Schema outputSchema(Schema input) {
try {
return SchemaUtil.newBagSchema(Lists.newArrayList(DataType.CHARARRAY));
} catch (FrontendException ex) {
return null;
}
}
}