package eu.dnetlib.iis.common.pig.udfs;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaUtil;
/**
* Merges two or more data bags containing chararray elements.
* The output data bag contains the elements from all input bags without repetitions.
*
* @author Dominika Tkaczyk
*/
public class StringBagsMerger extends EvalFunc<DataBag> {

    /**
     * Merges every bag found in the fields of the input tuple into a single bag
     * in which each element tuple occurs only once.
     * <p>
     * Elements keep the order of their first occurrence: bags are visited in field
     * order and each bag in its own iteration order. Null fields are skipped.
     *
     * @param tuple input tuple whose non-null fields are cast to {@link DataBag}
     * @return bag with the distinct elements of all input bags, or {@code null} when
     *         the input tuple is null or has no fields, or when no element was collected
     * @throws IOException if a field of the input tuple cannot be read
     */
    @Override
    public DataBag exec(Tuple tuple) throws IOException {
        if (tuple == null || tuple.size() == 0) {
            return null;
        }

        // A LinkedHashSet rejects duplicates in constant time while remembering
        // insertion order, replacing a linear List.contains() scan per element.
        // NOTE(review): relies on Tuple implementations keeping hashCode()
        // consistent with equals() - confirm for custom Tuple types.
        Set<Tuple> merged = new LinkedHashSet<Tuple>();
        for (int i = 0; i < tuple.size(); i++) {
            Object field = tuple.get(i);
            if (field == null) {
                continue;
            }
            for (Tuple element : (DataBag) field) {
                merged.add(element);
            }
        }

        if (merged.isEmpty()) {
            return null;
        }
        return BagFactory.getInstance().newDefaultBag(new ArrayList<Tuple>(merged));
    }

    /**
     * Declares the output as a bag of single-field chararray tuples.
     *
     * @param input schema of the input; not consulted
     * @return the bag schema, or {@code null} if it could not be constructed
     */
    @Override
    public Schema outputSchema(Schema input) {
        try {
            return SchemaUtil.newBagSchema(Lists.newArrayList(DataType.CHARARRAY));
        } catch (FrontendException ex) {
            // Best effort: fall back to reporting no schema instead of failing.
            return null;
        }
    }
}