package eu.dnetlib.iis.wf.citationmatching; import java.io.IOException; import org.apache.hadoop.io.NullWritable; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import eu.dnetlib.iis.citationmatching.schemas.Citation; import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata; import eu.dnetlib.iis.citationmatching.schemas.ReferenceMetadata; import eu.dnetlib.iis.common.java.io.HdfsUtils; import pl.edu.icm.coansys.citations.ConfigurableCitationMatchingService; import pl.edu.icm.coansys.citations.CoreCitationMatchingService; import pl.edu.icm.coansys.citations.CoreCitationMatchingSimpleFactory; /** * Citation matching job * * @author madryk */ public class IisCitationMatchingJob { private static CoreCitationMatchingSimpleFactory coreCitationMatchingFactory = new CoreCitationMatchingSimpleFactory(); //------------------------ LOGIC -------------------------- public static void main(String[] args) throws IOException { IisCitationMatchingJobParameters params = new IisCitationMatchingJobParameters(); JCommander jcommander = new JCommander(params); jcommander.parse(args); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "pl.edu.icm.coansys.citations.MatchableEntityKryoRegistrator"); try (JavaSparkContext sc = new JavaSparkContext(conf)) { ConfigurableCitationMatchingService<String, ReferenceMetadata, String, DocumentMetadata, Citation, NullWritable> citationMatchingService = createConfigurableCitationMatchingService(sc, params); HdfsUtils.remove(sc.hadoopConfiguration(), params.outputDirPath); HdfsUtils.remove(sc.hadoopConfiguration(), params.outputReportPath); citationMatchingService.matchCitations(sc, params.fullDocumentPath, params.fullDocumentPath, params.outputDirPath); } } //------------------------ PRIVATE -------------------------- private static ConfigurableCitationMatchingService<String, ReferenceMetadata, String, DocumentMetadata, Citation, NullWritable> createConfigurableCitationMatchingService(JavaSparkContext sc, IisCitationMatchingJobParameters params) { ConfigurableCitationMatchingService<String, ReferenceMetadata, String, DocumentMetadata, Citation, NullWritable> configurableCitationMatchingService = new ConfigurableCitationMatchingService<>(); CoreCitationMatchingService coreCitationMatchingService = coreCitationMatchingFactory.createCoreCitationMatchingService(sc, params.maxHashBucketSize); configurableCitationMatchingService.setCoreCitationMatchingService(coreCitationMatchingService); configurableCitationMatchingService.setNumberOfPartitions(params.numberOfPartitions); ReferenceMetadataInputReader referenceMetadataInputReader = new ReferenceMetadataInputReader(); ReferenceMetadataInputConverter referenceMetadataInputConverter = new ReferenceMetadataInputConverter(); configurableCitationMatchingService.setInputCitationReader(referenceMetadataInputReader); configurableCitationMatchingService.setInputCitationConverter(referenceMetadataInputConverter); DocumentMetadataInputReader documentMetadataInputReader = new DocumentMetadataInputReader(); DocumentMetadataInputConverter documentMetadataInputConverter = new DocumentMetadataInputConverter(); configurableCitationMatchingService.setInputDocumentReader(documentMetadataInputReader); configurableCitationMatchingService.setInputDocumentConverter(documentMetadataInputConverter); CitationMatchingCounterReporter citationMatchingReporter = new CitationMatchingCounterReporter(); citationMatchingReporter.setSparkContext(sc); citationMatchingReporter.setReportPath(params.outputReportPath); CitationOutputConverter citationOutputConverter = new CitationOutputConverter(); CitationOutputWriter citationOutputWriter = new CitationOutputWriter(); citationOutputWriter.setCitationMatchingReporter(citationMatchingReporter); configurableCitationMatchingService.setOutputConverter(citationOutputConverter); configurableCitationMatchingService.setOutputWriter(citationOutputWriter); return configurableCitationMatchingService; } @Parameters(separators = "=") private static class IisCitationMatchingJobParameters { @Parameter(names = "-fullDocumentPath", required = true, description = "path to directory/file with full documents (document with references") private String fullDocumentPath; @Parameter(names = "-outputDirPath", required = true, description = "path to directory with results") private String outputDirPath; @Parameter(names = "-outputReportPath", required = true, description = "path to directory with report") private String outputReportPath; @Parameter(names="-maxHashBucketSize", required = false, description = "max number of the citation-documents pairs for a given hash") private long maxHashBucketSize = 10000; @Parameter(names="-numberOfPartitions", required = false, description = "number of partitions used for rdds with citations and documents read from input files, if not set it will depend on the input format") private Integer numberOfPartitions = 5; } }