package eu.dnetlib.iis.wf.primary.converters;
import java.io.IOException;
import java.io.InputStream;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;
import eu.dnetlib.iis.wf.primary.schemas.DocumentContentClasspath;
/**
 * Mapper converting {@link DocumentContentClasspath} to {@link DocumentText}
 * by retrieving text content from classpath location.
 *
 * <p>For every input record the resource named by the record's classpath location is
 * looked up through the current thread's context class loader, read fully using the
 * {@code "utf8"} charset and emitted, together with the record id, as a
 * {@link DocumentText} key paired with a {@link NullWritable} value.
 *
 * @author mhorst
 */
public class DocumentClasspathToTextConverter
        extends Mapper<AvroKey<DocumentContentClasspath>, NullWritable, AvroKey<DocumentText>, NullWritable> {

    /**
     * Converts a single classpath-located document into its textual representation.
     *
     * @param key input record holding the document id and the classpath location of its content
     * @param ignore unused input value
     * @param context job context the resulting {@link DocumentText} record is written to
     * @throws IOException when the classpath resource does not exist or cannot be read
     * @throws InterruptedException when writing to the context is interrupted
     */
    @Override
    protected void map(AvroKey<DocumentContentClasspath> key, NullWritable ignore, Context context)
            throws IOException, InterruptedException {
        DocumentContentClasspath source = key.datum();
        String location = source.getClasspathLocation().toString();

        InputStream contentStream =
                Thread.currentThread().getContextClassLoader().getResourceAsStream(location);
        // getResourceAsStream signals a missing resource with null rather than an exception;
        // fail with a message naming the location instead of an anonymous NullPointerException.
        if (contentStream == null) {
            throw new IOException("classpath resource not found: '" + location
                    + "' (document id: " + source.getId() + ")");
        }

        String text;
        try {
            text = IOUtils.toString(contentStream, "utf8");
        } finally {
            // the stream was opened here, so it has to be released here as well
            contentStream.close();
        }

        DocumentText.Builder builder = DocumentText.newBuilder();
        builder.setId(source.getId());
        builder.setText(text);
        context.write(new AvroKey<DocumentText>(builder.build()), NullWritable.get());
    }
}