package eu.dnetlib.iis.wf.documentsclassification; import java.io.File; import java.io.IOException; import java.net.URL; import org.apache.commons.io.FileUtils; import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.core.io.Resource; import org.springframework.core.io.support.PathMatchingResourcePatternResolver; import org.springframework.core.io.support.ResourcePatternResolver; import com.google.common.io.Files; import eu.dnetlib.iis.common.schemas.ReportEntry; import eu.dnetlib.iis.common.utils.AvroAssertTestUtil; import eu.dnetlib.iis.common.utils.AvroTestUtils; import eu.dnetlib.iis.common.utils.JsonAvroTestUtils; import eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses; import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal; import pl.edu.icm.sparkutils.test.SparkJob; import pl.edu.icm.sparkutils.test.SparkJobBuilder; import pl.edu.icm.sparkutils.test.SparkJobExecutor; /** * @author Ɓukasz Dumiszewski */ public class DocumentClassificationJobTest { private static final Logger log = LoggerFactory.getLogger(DocumentClassificationJobTest.class); private SparkJobExecutor executor = new SparkJobExecutor(); private File workingDir; private String inputDirPath; private String outputDirPath; private static String scriptDirPath; private String reportDirPath; @BeforeClass public static void beforeClass() throws IOException { scriptDirPath = DocumentClassificationJobTest.class.getResource("/eu/dnetlib/iis/wf/documentsclassification/oozie_app/lib/scripts").getPath(); copyMadis(scriptDirPath + "/madis"); } @Before public void before() { workingDir = Files.createTempDir(); inputDirPath = workingDir + "/document_classification/input"; outputDirPath = workingDir + "/document_classification/output"; reportDirPath = workingDir + "/document_classification/report"; } @After public void after() throws IOException { FileUtils.deleteDirectory(workingDir); } //------------------------ TESTS -------------------------- @Test public void documentClassificationJob() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/input/few_documents.json"; String jsonOutputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/few_document_to_document_classes.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/few_document_report.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executeJob(); // assert AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(outputDirPath, jsonOutputFile, DocumentToDocumentClasses.class); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } @Test public void documentClassificationJob_EMPTY_ABSTRACT() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/input/documents_with_empty_abstract.json"; String jsonOutputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/empty.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/empty_report.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executeJob(); // assert AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(outputDirPath, jsonOutputFile, DocumentToDocumentClasses.class); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } @Test public void documentClassificationJob_EMPTY() throws IOException { // given String avroInputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/input/empty.avro"; String jsonOutputFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/empty.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/documentsclassification/data/expected_output/empty_report.json"; FileUtils.copyFileToDirectory(new File(avroInputFile), new File(inputDirPath)); // execute executeJob(); // assert AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(outputDirPath, jsonOutputFile, DocumentToDocumentClasses.class); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } //------------------------ PRIVATE -------------------------- private void executeJob() { SparkJob sparkJob = SparkJobBuilder .create() .setAppName(getClass().getName()) .setMainClass(DocumentClassificationJob.class) .addArg("-inputAvroPath", inputDirPath) .addArg("-outputAvroPath", outputDirPath) .addArg("-scriptDirPath", scriptDirPath) .addArg("-outputReportPath", reportDirPath) .addJobProperty("spark.driver.host", "localhost") .build(); executor.execute(sparkJob); } private static void copyMadis(String destDirectory) throws IOException { String directoryToScan = "/eu/dnetlib/iis/3rdparty/scripts/madis/"; ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); Resource[] resources = resolver.getResources(ResourcePatternResolver.CLASSPATH_ALL_URL_PREFIX + directoryToScan + "**"); for (Resource resource : resources) { if (resource.exists() & resource.isReadable() & (resource.contentLength() > 0 || resource.getFilename().equals("__init__.py"))) { URL url = resource.getURL(); String urlString = url.toExternalForm(); String targetName = urlString.substring(urlString.indexOf(directoryToScan)+directoryToScan.length()); File destination = new File(destDirectory, targetName); FileUtils.copyURLToFile(url, destination); log.debug("Copied {} to {}", url, destination.getAbsolutePath()); } else { log.debug("Did not copy, seems to be directory: {} ", resource.getDescription()); } } } }