package eu.dnetlib.iis.wf.metadataextraction; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.InvocationTargetException; import java.security.InvalidParameterException; import java.util.Map.Entry; import java.util.Properties; import org.apache.commons.beanutils.PropertyUtils; import org.apache.log4j.Logger; import org.jdom.Document; import org.jdom.Element; import eu.dnetlib.iis.common.java.io.JsonUtils; import eu.dnetlib.iis.common.report.test.ValueSpecMatcher; import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata; import pl.edu.icm.cermine.ContentExtractor; /** * Metadata extractor main class executing extraction for all files provided as arguments. * @author mhorst * */ public class MetadataExtractorMain { private static final long interruptionDefaultTimeoutSecs = 600; private static final Logger log = Logger.getLogger(MetadataExtractorMain.class); private static final String fileNameSuffixContent = ".pdf"; private static final String fileNameSuffixExpectations = ".expectations"; private static final String expectationTimeoutSecs = "timeout.secs"; private static final String expectationExceptionClass = "exception.class"; private static final String expectationExceptionMessage = "exception.message"; private static final String expectationMetadataPrefix = "metadata."; private static final ValueSpecMatcher valueMatcher = new ValueSpecMatcher(); //------------------------ LOGIC -------------------------- public static void main(String[] args) throws Exception { if (args.length>0) { for (String fileLoc : args) { File rootFile = new File(fileLoc); if (rootFile.exists()) { process(rootFile); } else { throw new InvalidParameterException("Location does not exist: " + fileLoc); } } } else { throw new InvalidParameterException("no pdf file path provided"); } } //------------------------ PRIVATE -------------------------- /** * Handles file or directory. */ private static final void process(File file) throws Exception { if (file.isDirectory()) { File[] files = file.listFiles(new FileFilter() { @Override public boolean accept(File pathname) { return pathname.isDirectory() || pathname.getName().toLowerCase().endsWith(fileNameSuffixContent); } }); for (File currentFile : files) { process(currentFile); } } else { processFile(file); } } /** * Handles PDF file and optional expectations file. */ private static final void processFile(File file) throws Exception { log.info("processing file: " + file); long timeout = interruptionDefaultTimeoutSecs; // looking for optional expectations Properties expectations = readExpectations(new File(generateExpectationsFileName(file))); if (expectations.getProperty(expectationTimeoutSecs)!=null) { timeout = Long.parseLong(expectations.getProperty(expectationTimeoutSecs)); log.info("overriding default timeout: " + interruptionDefaultTimeoutSecs + " with new value: " + timeout + " [secs]"); } InputStream inputStream = new FileInputStream(file); ExtractedDocumentMetadata extractedMetadata = null; try { ContentExtractor extractor = new ContentExtractor(timeout); extractor.setPDF(inputStream); Element resultElem = extractor.getContentAsNLM(); String rawText = extractor.getRawFullText(); extractedMetadata = NlmToDocumentWithBasicMetadataConverter.convertFull( generateId(file), new Document(resultElem), rawText); validateContent(extractedMetadata, expectations); } catch (UnmetExpectationException e) { log.error("expectations were unmet for record:\n" + JsonUtils.toPrettyJSON(extractedMetadata.toString())); throw e; } catch (Exception e) { handleException(e, expectations); } finally { inputStream.close(); } } /** * Generates id based on file name. */ private static final String generateId(File file) { return file.getName().substring(0, file.getName().lastIndexOf('.')); } /** * Generates expectations file absolute path based on content file name by replacing file extension. */ private static final String generateExpectationsFileName(File contentFile) { return contentFile.getAbsolutePath().substring(0, contentFile.getAbsolutePath().lastIndexOf('.')) + fileNameSuffixExpectations; } /** * Reads whole set of expectations defined as properties. */ private static final Properties readExpectations(File file) throws IOException { Properties expecations = new Properties(); if (file.exists()) { expecations.load(new FileInputStream(file)); } return expecations; } /** * Validates record fields against specified expectations. RuntimeException is thrown when invalid. * * @param extractedMetadata metadata record to be validated * @param expectations set of field expectations defined as properties where key is field location (prefixed with 'metadata.') and value is expected value */ private static void validateContent(ExtractedDocumentMetadata extractedMetadata, Properties expectations) throws IllegalAccessException, InvocationTargetException, NoSuchMethodException, UnmetExpectationException { for (Entry<Object, Object> fieldExpectation : expectations.entrySet()) { String keyCandidate = (String)fieldExpectation.getKey(); if (keyCandidate.startsWith(expectationMetadataPrefix)) { String fieldPath = keyCandidate.substring(expectationMetadataPrefix.length()); String currentValue = PropertyUtils.getNestedProperty(extractedMetadata, fieldPath).toString(); String expectedValue = fieldExpectation.getValue().toString(); if (!valueMatcher.matches(currentValue, expectedValue)) { throw new UnmetExpectationException("expectation not met: invalid field value for path: " + fieldPath + ", expected: '" + fieldExpectation.getValue() + "', " + "got: '" + currentValue + "'"); } } } } /** * Handles exception by validating against exception expectations if any defined. * When expectations are not defined or conditions are not met exception is rethrown. */ private static void handleException(Exception e, Properties expectations) throws Exception { if (expectations.containsKey(expectationExceptionClass)) { String expectedExceptionClass = expectations.getProperty(expectationExceptionClass); if (!expectedExceptionClass.equals(e.getClass().getName())) { throwExpectationNotMet(expectationExceptionClass, expectations, e.getClass().getName(), e); } else { logExpectationMet(expectationExceptionClass, expectations); } String optionalExceptionMessage = expectations.getProperty(expectationExceptionMessage); if (optionalExceptionMessage != null) { if (!optionalExceptionMessage.equals(e.getMessage())) { throwExpectationNotMet(expectationExceptionMessage, expectations, e.getMessage(), e); } else { logExpectationMet(expectationExceptionMessage, expectations); } } } else { log.error("expectation '" + expectationExceptionClass + "' was not defined but exception occured while handling content, interrupting!"); throw e; } } /** * Logs information about met expectation. */ private static void logExpectationMet(String expectationName, Properties expectations) { log.info("expectation met, name: '" + expectationName + "', expected value: '" + expectations.getProperty(expectationName) + "'"); } /** * Throws {@link UnmetExpectationException} with unmet expectation details. * @throws UnmetExpectationException */ private static void throwExpectationNotMet(String expectationName, Properties expectations, String receivedValue, Exception e) throws UnmetExpectationException { throw new UnmetExpectationException("expecatation not met, name: '" + expectationName + "', expected value: '" + expectations.getProperty(expectationName) + "', got: '" + receivedValue + "'", e); } }