package eu.dnetlib.iis.wf.citationmatching.direct; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.isOneOf; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertThat; import java.io.File; import java.io.IOException; import java.util.Collections; import java.util.List; import org.apache.avro.util.Utf8; import org.apache.commons.io.FileUtils; import org.hamcrest.Matcher; import org.junit.After; import org.junit.Before; import org.junit.Test; import pl.edu.icm.sparkutils.test.SparkJob; import pl.edu.icm.sparkutils.test.SparkJobBuilder; import pl.edu.icm.sparkutils.test.SparkJobExecutor; import com.google.common.io.Files; import eu.dnetlib.iis.common.citations.schemas.Citation; import eu.dnetlib.iis.common.citations.schemas.CitationEntry; import eu.dnetlib.iis.common.schemas.ReportEntry; import eu.dnetlib.iis.common.utils.AvroAssertTestUtil; import eu.dnetlib.iis.common.utils.AvroTestUtils; import eu.dnetlib.iis.common.utils.JsonAvroTestUtils; import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal; /** * * @author madryk * */ public class CitationMatchingDirectJobTest { private SparkJobExecutor executor = new SparkJobExecutor(); private File workingDir; private String inputDirPath; private String outputDirPath; private String reportDirPath; @Before public void before() { workingDir = Files.createTempDir(); inputDirPath = workingDir + "/spark_citation_matching_direct/input"; outputDirPath = workingDir + "/spark_citation_matching_direct/output"; reportDirPath = workingDir + "/spark_citation_matching_direct/report"; } @After public void after() throws IOException { FileUtils.deleteDirectory(workingDir); } //------------------------ TESTS -------------------------- @Test public void citationMatchingDirect() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/input/documents.json"; String jsonOutputFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/expected_output/citations.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/expected_output/report.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executor.execute(buildCitationMatchingDirectJob(inputDirPath, outputDirPath, reportDirPath)); // assert AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(outputDirPath, jsonOutputFile, Citation.class); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } @Test public void citationMatchingDirect_MULTIPLE_SAME_DOI() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/input/documents_multiple_same_doi.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/expected_output/report_one_matched.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executor.execute(buildCitationMatchingDirectJob(inputDirPath, outputDirPath, reportDirPath)); // assert List<Citation> citations = AvroTestUtils.readLocalAvroDataStore(outputDirPath); assertEquals(1, citations.size()); assertCitation(citations.get(0), is(new Utf8("id-1")), 8, isOneOf(new Utf8("id-2"), new Utf8("id-3"), new Utf8("id-4"))); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } @Test public void citationMatchingDirect_MULTIPLE_SAME_PMID() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/input/documents_multiple_same_pmid.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/expected_output/report_one_matched.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executor.execute(buildCitationMatchingDirectJob(inputDirPath, outputDirPath, reportDirPath)); // assert List<Citation> citations = AvroTestUtils.readLocalAvroDataStore(outputDirPath); assertEquals(1, citations.size()); assertCitation(citations.get(0), is(new Utf8("id-1")), 8, isOneOf(new Utf8("id-2"), new Utf8("id-3"), new Utf8("id-4"))); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } @Test public void citationMatchingDirect_MULTIPLE_SAME_PMID_WITH_TYPE() throws IOException { // given String jsonInputFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/input/documents_multiple_same_pmid_with_type.json"; String jsonReportFile = "src/test/resources/eu/dnetlib/iis/wf/citationmatching/direct/data/expected_output/report_one_matched.json"; AvroTestUtils.createLocalAvroDataStore( JsonAvroTestUtils.readJsonDataStore(jsonInputFile, ExtractedDocumentMetadataMergedWithOriginal.class), inputDirPath); // execute executor.execute(buildCitationMatchingDirectJob(inputDirPath, outputDirPath, reportDirPath)); // assert List<Citation> citations = AvroTestUtils.readLocalAvroDataStore(outputDirPath); assertEquals(1, citations.size()); assertCitation(citations.get(0), is(new Utf8("id-1")), 8, is(new Utf8("id-3"))); AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(reportDirPath, jsonReportFile, ReportEntry.class); } //------------------------ PRIVATE -------------------------- private void assertCitation(Citation citation, Matcher<? super CharSequence> sourceDocumentIdMatcher, Integer position, Matcher<? super CharSequence> destinationDocumentIdMatcher) { CitationEntry citationEntry = citation.getEntry(); assertThat(citation.getSourceDocumentId(), sourceDocumentIdMatcher); assertEquals(position, citationEntry.getPosition()); assertThat(citationEntry.getDestinationDocumentId(), destinationDocumentIdMatcher); assertEquals(Float.valueOf(1f), citationEntry.getConfidenceLevel()); assertNull(citationEntry.getRawText()); assertThat(citationEntry.getExternalDestinationDocumentIds(), equalTo(Collections.EMPTY_MAP)); } private SparkJob buildCitationMatchingDirectJob(String inputDirPath, String outputDirPath, String reportDirPath) { SparkJob sparkJob = SparkJobBuilder .create() .setAppName("Spark Citation Matching Direct") .setMainClass(CitationMatchingDirectJob.class) .addArg("-inputAvroPath", inputDirPath) .addArg("-outputAvroPath", outputDirPath) .addArg("-outputReportPath", reportDirPath) .addJobProperty("spark.driver.host", "localhost") .build(); return sparkJob; } }