package eu.dnetlib.iis.wf.metadataextraction;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.apache.zookeeper.server.ByteBufferInputStream;
import org.jdom.Document;
import org.jdom.Element;
import com.itextpdf.text.exceptions.InvalidPdfException;
import eu.dnetlib.iis.audit.schemas.Fault;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.fault.FaultUtils;
import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs;
import eu.dnetlib.iis.importer.schemas.DocumentContent;
import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.wf.importer.content.approver.ContentApprover;
import eu.dnetlib.iis.wf.importer.content.approver.InvalidCountableContentApproverWrapper;
import eu.dnetlib.iis.wf.importer.content.approver.PDFHeaderBasedContentApprover;
import pl.edu.icm.cermine.ContentExtractor;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.tools.timeout.TimeoutException;
/**
 * Metadata extractor mapper. Reads {@link DocumentContent} records, validates the PDF header, runs
 * CERMINE-based metadata and plaintext extraction and writes {@link ExtractedDocumentMetadata} records
 * to the metadata named output and {@link Fault} records to the fault named output.
 *
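 * A minimal sketch of the job configuration properties read by this mapper (values are illustrative):
 * <pre>
 * output.meta=meta
 * output.fault=fault
 * excluded.ids=id-1,id-2
 * log.fault.processing.time.threshold.secs=300
 * interrupt.processing.time.threshold.secs=600
 * </pre>
 *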
* @author Mateusz Kobos
* @author mhorst
*
*/
public class MetadataExtractorMapper extends Mapper<AvroKey<DocumentContent>, NullWritable, NullWritable, NullWritable> {
/** Configuration property holding the named output for extracted document metadata. */
public static final String NAMED_OUTPUT_META = "output.meta";
/** Configuration property holding the named output for fault records. */
public static final String NAMED_OUTPUT_FAULT = "output.fault";
/** Configuration property holding a comma-separated list of document identifiers excluded from processing. */
public static final String EXCLUDED_IDS = "excluded.ids";
/** Configuration property with the processing time threshold (in seconds) above which a fault is logged. */
public static final String LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS = "log.fault.processing.time.threshold.secs";
/** Configuration property with the processing time threshold (in seconds) after which extraction is interrupted. */
public static final String INTERRUPT_PROCESSING_TIME_THRESHOLD_SECS = "interrupt.processing.time.threshold.secs";
/** Fault code indicating the processing time threshold was exceeded. */
public static final String FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED = "ProcessingTimeThresholdExceeded";
/** Key of the fault supplementary data entry holding the processing time in milliseconds. */
public static final String FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME = "processing_time";
protected static final Logger log = Logger.getLogger(MetadataExtractorMapper.class);
/**
* Progress log interval.
*/
private static final int PROGRESS_LOG_INTERVAL = 100;
/** Milliseconds per second, used to convert the configured threshold to milliseconds. */
private static final long SECS_TO_MILLIS = 1000L;
/**
* Multiple outputs.
*/
private MultipleOutputs mos;
/**
* Document metadata named output.
*/
private String namedOutputMeta;
/**
* Fault named output.
*/
private String namedOutputFault;
/**
* Current progress.
*/
private int currentProgress = 0;
/**
* Interval time.
*/
private long intervalTime = 0;
/**
 * Processing timeout threshold in seconds. Metadata extraction for a given record is interrupted when this threshold is exceeded.
*/
private Integer interruptionTimeoutSecs;
/**
 * Processing time threshold in milliseconds. When exceeded, a fault record is
 * written to the error datastore.
*/
private long processingTimeThreshold = Long.MAX_VALUE;
/**
 * Set of identifiers of objects excluded from processing.
*/
private Set<String> excludedIds = Collections.emptySet();
/**
* Content approver module.
*/
private ContentApprover contentApprover;
/**
 * Hadoop counters enum for invalid records.
*/
public enum InvalidRecordCounters {
INVALID_PDF_HEADER
}
private static final String INVALID_PDF_HEADER_MSG = "content PDF header not approved!";
//------------------------ LOGIC --------------------------
@Override
protected void setup(Context context) throws IOException, InterruptedException {
namedOutputMeta = context.getConfiguration().get(NAMED_OUTPUT_META);
if (namedOutputMeta == null || namedOutputMeta.isEmpty()) {
throw new RuntimeException("no named output provided for metadata");
}
namedOutputFault = context.getConfiguration().get(NAMED_OUTPUT_FAULT);
if (namedOutputFault == null || namedOutputFault.isEmpty()) {
throw new RuntimeException("no named output provided for fault");
}
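// optional comma-separated list of document identifiers to be skipped during processing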
String excludedIdsCSV = context.getConfiguration().get(EXCLUDED_IDS);
if (excludedIdsCSV != null && !excludedIdsCSV.trim().isEmpty()
&& !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(excludedIdsCSV)) {
log.info("got excluded ids: " + excludedIdsCSV);
excludedIds = new HashSet<String>(Arrays.asList(StringUtils.split(excludedIdsCSV.trim(), ',')));
} else {
log.info("got no excluded ids");
}
// handling processing time threshold: interruption and fault logging
interruptionTimeoutSecs = WorkflowRuntimeParameters.getIntegerParamValue(
INTERRUPT_PROCESSING_TIME_THRESHOLD_SECS, context.getConfiguration());
Integer processingTimeThresholdSecs = WorkflowRuntimeParameters.getIntegerParamValue(
LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS, context.getConfiguration());
if (processingTimeThresholdSecs != null) {
this.processingTimeThreshold = SECS_TO_MILLIS * processingTimeThresholdSecs;
}
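// counter tracking records rejected due to an unapproved PDF header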
Counter invalidPdfCounter = context.getCounter(InvalidRecordCounters.INVALID_PDF_HEADER);
invalidPdfCounter.setValue(0);
this.contentApprover = new InvalidCountableContentApproverWrapper(new PDFHeaderBasedContentApprover(), invalidPdfCounter);
mos = instantiateMultipleOutputs(context);
currentProgress = 0;
intervalTime = System.currentTimeMillis();
}
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
mos.close();
}
/**
 * Processes a single {@link DocumentContent} record: skips excluded identifiers, checks the PDF header
 * with the content approver and runs metadata extraction on approved content.
 */
@Override
public void map(AvroKey<DocumentContent> key, NullWritable ignore, Context context)
throws IOException, InterruptedException {
DocumentContent content = key.datum();
String documentId = content.getId().toString();
if (excludedIds.contains(documentId)) {
log.info("skipping processing for excluded id " + documentId);
return;
}
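// process only records carrying PDF byte content with an approved header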
if (content.getPdf() != null) {
ByteBuffer byteBuffer = content.getPdf();
if (byteBuffer.hasArray() && contentApprover.approve(byteBuffer.array())) {
try (InputStream inputStream = new ByteBufferInputStream(byteBuffer)) {
processStream(documentId, inputStream);
}
} else {
log.info(INVALID_PDF_HEADER_MSG);
handleException(new InvalidPdfException(INVALID_PDF_HEADER_MSG), documentId);
}
} else {
log.warn("no byte data found for id: " + content.getId());
}
}
/**
 * Instantiates a {@link MultipleOutputs} instance.
*/
protected MultipleOutputs instantiateMultipleOutputs(Context context) {
return new MultipleOutputs(context);
}
/**
 * Processes the content input stream by running metadata extraction. Does not close {@code contentStream}.
*
* @param documentId document identifier
* @param contentStream stream to be processed
*/
protected void processStream(String documentId, InputStream contentStream) throws IOException, InterruptedException {
currentProgress++;
if (currentProgress % PROGRESS_LOG_INTERVAL == 0) {
log.info("metadata extaction progress: " + currentProgress + ", time taken to process "
+ PROGRESS_LOG_INTERVAL + " elements: " + ((System.currentTimeMillis() - intervalTime) / 1000)
+ " secs");
intervalTime = System.currentTimeMillis();
}
log.info("starting processing for id: " + documentId);
long startTime = System.currentTimeMillis();
try {
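// use a time-limited extractor when an interruption timeout was configured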
ContentExtractor extractor = interruptionTimeoutSecs != null ? new ContentExtractor(interruptionTimeoutSecs)
: new ContentExtractor();
extractor.setPDF(contentStream);
handleContent(extractor, documentId);
} catch (Exception e) {
log.error((e.getCause() instanceof InvalidPdfException) ? "Invalid PDF file"
: "got unexpected exception, writing fault and empty metadata record", e);
handleException(e, documentId);
return;
}
handleProcessingTime(System.currentTimeMillis() - startTime, documentId);
}
//------------------------ PRIVATE --------------------------
/**
 * Extracts metadata and plaintext from the content using the given extractor and writes the result to {@code namedOutputMeta}.
*
* @param extractor content extractor holding PDF stream
* @param documentId document identifier
*/
private void handleContent(ContentExtractor extractor, String documentId) throws TimeoutException, AnalysisException, IOException, InterruptedException, TransformationException {
Element resultElem = extractor.getContentAsNLM();
Document doc = new Document(resultElem);
String text = null;
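// plaintext extraction is best-effort: when it fails, metadata is still written without the full text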
try {
text = extractor.getRawFullText();
} catch (AnalysisException e) {
log.error("unable to extract plaintext, writing extracted metadata only", e);
}
mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(
NlmToDocumentWithBasicMetadataConverter.convertFull(documentId, doc, text)));
}
/**
* Handles exception by converting it to {@link Fault} and writing it to fault output.
* Empty {@link ExtractedDocumentMetadata} result is written to metadata output.
*
* @param e Exception to be handled
* @param documentId document identifier
*/
private void handleException(Exception e, String documentId) throws IOException, InterruptedException {
// writing empty result
mos.write(namedOutputMeta, new AvroKey<ExtractedDocumentMetadata>(NlmToDocumentWithBasicMetadataConverter.createEmpty(documentId)));
// writing fault result
mos.write(namedOutputFault, new AvroKey<Fault>(FaultUtils.exceptionToFault(documentId, e, null)));
}
/**
 * Handles document processing time by writing a fault when the processing time exceeds the predefined threshold.
* @param processingTime processing time in milliseconds
* @param documentId document identifier
*/
private void handleProcessingTime(long processingTime, String documentId) throws IOException, InterruptedException {
if (processingTime > processingTimeThreshold) {
Map<CharSequence, CharSequence> supplementaryData = new HashMap<CharSequence, CharSequence>();
supplementaryData.put(FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME, String.valueOf(processingTime));
// writing fault result
mos.write(namedOutputFault,
new AvroKey<Fault>(Fault.newBuilder().setInputObjectId(documentId)
.setTimestamp(System.currentTimeMillis())
.setCode(FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED)
.setSupplementaryData(supplementaryData).build()));
}
log.info("finished processing for id " + documentId + " in " + (processingTime / 1000) + " secs");
}
}