package eu.dnetlib.iis.common.pig.udfs;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Pig UDF that replaces a single id field of an input record with a new id value.
 * <p>
 * The first two fields of the input tuple are UDF parameters (the path to the id field and the new id);
 * the remaining fields form the record that is returned with the id replaced.
 *
 * @author Dominika Tkaczyk
 * @author Michal Oniszczuk
 */
public class IdReplacerUDF extends EvalFunc<Tuple> {

    /**
     * Number of leading fields of the input tuple that are UDF parameters rather than record fields.
     * These fields are removed from the result (and from the output schema); the rest is the input
     * record in which the id is replaced.
     */
    private static final int FIELDS_TO_REMOVE = 2;

    /**
     * Replaces the original id with a new id string. The input tuple should contain the following elements:
     * a dot-separated list of field names representing a path in the input tree structure (first element),
     * the new id string (second element), and then the consecutive fields of the input record.
     * <p>
     * Replacing multiple id fields is not implemented; exactly one field is replaced.
     * <p>
     * Only the top level of the record is copied into a new tuple. Nested tuples are shared with the input,
     * so replacing an id that lives in a nested tuple modifies that nested tuple in place.
     *
     * @param input a tuple (id path - dot-separated list of fields indicating position in the record schema
     *              tree, new id string, input record fields...)
     * @return the input record without the two leading parameter fields; the field indicated by the id path
     *         is set to the new id, unless the new id is {@code null}, in which case the record is returned
     *         with its original id
     * @throws IOException              if reading from or writing to a tuple, or schema lookup, fails
     * @throws IllegalArgumentException if the input is null or too short, the id path is null, or the path
     *                                  cannot be resolved against the record and its schema
     * @throws IllegalStateException    if the input schema is not available
     */
    @Override
    public Tuple exec(Tuple input) throws IOException {
        checkArguments(input);
        String oldIdPath = (String) input.get(0);
        String newId = (String) input.get(1);
        Tuple record = getRecord(input);
        if (newId == null) {
            // Nothing to substitute: the record keeps its original id.
            return record;
        }
        if (oldIdPath == null) {
            throw new IllegalArgumentException(
                    IdReplacerUDF.class.getName() + ": Id path cannot be null");
        }
        Schema inputSchema = getInputSchema();
        if (inputSchema == null) {
            throw new IllegalStateException(
                    IdReplacerUDF.class.getName() + ": Input schema is not available");
        }
        replaceIdFieldInTuple(newId, oldIdPath, record, getRecordSchema(inputSchema));
        return record;
    }

    /**
     * Validates that the input tuple is present and carries both parameters plus at least one record field.
     */
    private static void checkArguments(Tuple input) throws IllegalArgumentException {
        checkNotNull(input);
        checkInputSize(input);
    }

    /**
     * Ensures the input holds the parameter fields and at least one record field.
     */
    private static void checkInputSize(Tuple input) throws IllegalArgumentException {
        final int minimalInputSize = FIELDS_TO_REMOVE + 1;
        final int actualSize = input.size();
        if (actualSize < minimalInputSize) {
            throw new IllegalArgumentException(
                    "Not enough arguments passed to " + IdReplacerUDF.class.getName() +
                    ". Expected at least " + minimalInputSize +
                    ", but got " + actualSize + " arguments.");
        }
    }

    /**
     * Rejects a null input tuple.
     */
    private static void checkNotNull(Tuple input) throws IllegalArgumentException {
        if (input == null) {
            throw new IllegalArgumentException(
                    IdReplacerUDF.class.getName() + ": Input tuple cannot be null");
        }
    }

    /**
     * Leaves only fields belonging to the input record, removes fields corresponding to other UDF parameters.
     * Field values are copied by reference into a newly created tuple.
     */
    private static Tuple getRecord(Tuple input) throws ExecException {
        List<Object> strippedInput = new ArrayList<Object>(Math.max(0, input.size() - FIELDS_TO_REMOVE));
        for (int i = FIELDS_TO_REMOVE; i < input.size(); i++) {
            strippedInput.add(input.get(i));
        }
        return TupleFactory.getInstance().newTuple(strippedInput);
    }

    /**
     * Builds the schema of the input record: the input schema without the field schemas that correspond
     * to the leading UDF parameters.
     */
    private static Schema getRecordSchema(Schema inputSchema) throws FrontendException {
        Schema output = new Schema();
        for (int i = FIELDS_TO_REMOVE; i < inputSchema.size(); i++) {
            output.add(inputSchema.getField(i));
        }
        return output;
    }

    /**
     * Sets the field indicated by {@code oldIdPath} to {@code newId}; does nothing when {@code newId} is null.
     */
    private static void replaceIdFieldInTuple(String newId, String oldIdPath, Tuple tuple, Schema tupleSchema) throws ExecException, FrontendException {
        if (newId != null) {
            Pair<Tuple, Integer> idTupleAndPos = getIdField(oldIdPath, tuple, tupleSchema);
            idTupleAndPos.getLeft().set(idTupleAndPos.getRight(), newId);
        }
    }

    /**
     * Extracts from an input record the tuple containing the last field in a path and position of that field in that tuple.
     * Traverses the record schema tree using path. Path is a list of field names indicating position in that tree.
     * <p>
     * In other words: returns a (sort-of-a) lens allowing modification of a field indicated by idPath.
     * More on lenses: http://stackoverflow.com/a/5597750/257401
     *
     * @param path dot-separated list of field names
     * @param record input record
     * @param schema input record schema
     * @return a pair (the tuple containing the last field in a path, position of that field in that tuple)
     * @throws FrontendException if a schema lookup fails
     * @throws ExecException if a field cannot be read from a tuple
     * @throws IllegalArgumentException if the path is empty, names a field missing from the schema,
     *         or passes through a field that is not a nested tuple
     */
    private static Pair<Tuple, Integer> getIdField(String path, Tuple record, Schema schema) throws FrontendException, ExecException {
        String[] fields = path.split("\\.");
        if (fields.length == 0) {
            // A path consisting only of dots splits into an empty array.
            throw new IllegalArgumentException(
                    IdReplacerUDF.class.getName() + ": Id path '" + path + "' does not contain any field name");
        }
        Tuple currentTuple = record;
        Schema currentSchema = schema;
        for (int i = 0; i < fields.length - 1; i++) {
            int pos = findPosition(currentSchema, fields[i], path);
            Schema.FieldSchema fieldSchema = currentSchema.getField(fields[i]);
            if (fieldSchema == null || fieldSchema.schema == null) {
                throw new IllegalArgumentException(
                        IdReplacerUDF.class.getName() + ": Field '" + fields[i] + "' from id path '" + path +
                        "' has no nested schema");
            }
            Object nested = currentTuple.get(pos);
            if (!(nested instanceof Tuple)) {
                // Covers both a null nested value and a value of another type (e.g. a bag).
                throw new IllegalArgumentException(
                        IdReplacerUDF.class.getName() + ": Field '" + fields[i] + "' from id path '" + path +
                        "' does not contain a tuple");
            }
            currentSchema = fieldSchema.schema;
            currentTuple = (Tuple) nested;
        }
        int idPos = findPosition(currentSchema, fields[fields.length - 1], path);
        return new ImmutablePair<Tuple, Integer>(currentTuple, idPos);
    }

    /**
     * Looks up the position of a field in a schema, failing with a descriptive message when it is absent.
     */
    private static int findPosition(Schema schema, String fieldName, String path) throws FrontendException {
        int pos = schema.getPosition(fieldName);
        if (pos < 0) {
            throw new IllegalArgumentException(
                    IdReplacerUDF.class.getName() + ": Field '" + fieldName + "' from id path '" + path +
                    "' not found in schema " + schema);
        }
        return pos;
    }

    /**
     * Describes the output as a single tuple field whose schema is the input schema without the leading
     * UDF parameter fields.
     *
     * @param input schema of the UDF input
     * @return the output schema, or {@code null} when the input schema is null or the schema cannot be built
     */
    @Override
    public Schema outputSchema(Schema input) {
        if (input == null) {
            return null;
        }
        try {
            // Locale.ROOT keeps the generated name independent of the JVM default locale.
            String baseName = getClass().getName().toLowerCase(Locale.ROOT).replace('.', '_');
            return new Schema(new Schema.FieldSchema(getSchemaName(baseName, input),
                    getRecordSchema(input), DataType.TUPLE));
        } catch (FrontendException ex) {
            // Deliberately not propagated: null is returned instead.
            // NOTE(review): presumably Pig treats a null output schema as "unknown" - confirm.
            return null;
        }
    }
}