package io.github.infolis.algorithm; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import io.github.infolis.InfolisConfig; import io.github.infolis.datastore.DataStoreClient; import io.github.infolis.datastore.FileResolver; import io.github.infolis.model.Execution; import io.github.infolis.model.ExecutionStatus; import io.github.infolis.model.entity.InfolisFile; import io.github.infolis.util.RegexUtils; import io.github.infolis.util.SerializationUtils; import javax.ws.rs.BadRequestException; import javax.ws.rs.ProcessingException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author kata * */ public class BibliographyExtractor extends BaseAlgorithm { public BibliographyExtractor(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) { super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver); } private static final Logger log = LoggerFactory.getLogger(BibliographyExtractor.class); private static final List<String> executionTags = Arrays.asList("BIB_REMOVED"); protected static List<String> getExecutionTags() { return executionTags; } /** * Compute the ratio of numbers on page: a high number of numbers is assumed * to be typical for bibliographies as they contain many years, page numbers * and dates. * * @param sections sections of text read from a text file * @return text of sections that are not classified as bibliography * @throws IOException */ protected String removeBibliography(List<String> sections) { String textWithoutBib = ""; boolean startedBib = false; for (int i = 0; i < sections.size(); i++) { String section = sections.get(i); double numNumbers = 0.0; double numDecimals = 0.0; double numChars = section.length(); if (numChars == 0.0) continue; // determine the amount of numbers (numeric and decimal) Matcher matcherNumeric = RegexUtils.patternNumeric.matcher(section); Matcher matcherDecimal = RegexUtils.patternDecimal.matcher(section); while (matcherNumeric.find()) numNumbers++; while (matcherDecimal.find()) numDecimals++; boolean containsCueWord = false; for (String s : InfolisConfig.getBibliographyCues()) { if (section.contains(s)) { containsCueWord = true; break; } } // use hasBibNumberRatio_d method from python scripts // TODO learn thresholds if (containsCueWord && ((numNumbers / numChars) >= 0.005) && ((numNumbers / numChars) <= 0.1) && ((numDecimals / numChars) <= 0.004)) { startedBib = true; continue; } if (startedBib) { if (((numNumbers / numChars) >= 0.01) && ((numNumbers / numChars) <= 0.1) && ((numDecimals / numChars) <= 0.004)) { } else { textWithoutBib += section; } } else { if (((numNumbers / numChars) >= 0.08) && ((numNumbers / numChars) <= 0.1) && ((numDecimals / numChars) <= 0.004)) { } else { textWithoutBib += section; } } } return textWithoutBib; } protected List<String> tokenizeSections(String text, int sentencesPerSection) { List<String> sections = new ArrayList<String>(); int n = 0; String section = ""; String[] lines = text.split(System.getProperty("line.separator")); for (int i = 0; i < lines.length; i++) { n++; section += lines[i] + System.getProperty("line.separator"); if (n >= sentencesPerSection) { sections.add(section); section = ""; n = 0; } } if (n < sentencesPerSection) sections.add(section); return sections; } @Override public void validate() throws IllegalAlgorithmArgumentException { Execution exec = this.getExecution(); if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) && (null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) { throw new IllegalArgumentException("Must set at least one inputFile!"); } } public String transformFilename(String filename, String outputDir) { String outFileName = SerializationUtils.changeFileExtension(filename, "bibless.txt"); if (null != outputDir && !outputDir.isEmpty()) { outFileName = SerializationUtils.changeBaseDir(outFileName, outputDir); } return outFileName; } @Override public void execute() throws IOException { Execution tagExec = getExecution().createSubExecution(TagSearcher.class); tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags()); tagExec.instantiateAlgorithm(this).run(); getExecution().getInputFiles().addAll(tagExec.getInputFiles()); int counter = 0; for (String inputFileURI : getExecution().getInputFiles()) { counter++; log.debug(inputFileURI); InfolisFile inputFile; try { inputFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI); } catch (BadRequestException | ProcessingException e) { error(log, "Could not retrieve file " + inputFileURI + ": " + e.getMessage()); getExecution().setStatus(ExecutionStatus.FAILED); return; } if (null == inputFile) { error(log, "File was not registered with the data store: " + inputFileURI); getExecution().setStatus(ExecutionStatus.FAILED); return; } if (null == inputFile.getMediaType() || !inputFile.getMediaType().equals("text/plain")) { error(log, "File \"{}\" is not text/plain but is {} ", inputFileURI, inputFile.getMediaType()); getExecution().setStatus(ExecutionStatus.FAILED); return; } debug(log, "Start removing bib from {}", inputFile); String text; InputStream is = null; try { is = getInputFileResolver().openInputStream(inputFile); text = IOUtils.toString(is); } catch (IOException e) { fatal(log, "Error reading text file: " + e); getExecution().setStatus(ExecutionStatus.FAILED); return; } finally { is.close(); } //TODO: Test optimal section size List<String> inputSections = tokenizeSections(text, 10); text = removeBibliography(inputSections); InfolisFile outFile = new InfolisFile(); // if no output directory is given, create temporary output files if (null == getExecution().getOutputDirectory() || getExecution().getOutputDirectory().equals("")) { String REMOVED_BIB_DIR_PREFIX = "removedBib-"; String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), REMOVED_BIB_DIR_PREFIX).toString(); FileUtils.forceDeleteOnExit(new File(tempDir)); getExecution().setOutputDirectory(tempDir); } // creates a new file for each text document outFile.setFileName(transformFilename(inputFile.getFileName(), getExecution().getOutputDirectory())); outFile.setMediaType("text/plain"); outFile.setMd5(SerializationUtils.getHexMd5(text)); outFile.setFileStatus("AVAILABLE"); Set<String> tagsToSet = getExecution().getTags(); tagsToSet.addAll(inputFile.getTags()); tagsToSet.addAll(executionTags); tagsToSet.remove(TextExtractor.getExecutionTagBibNotRemoved()); outFile.setTags(tagsToSet); OutputStream outStream = null; try { outStream = getOutputFileResolver().openOutputStream(outFile); IOUtils.write(text, outStream); } catch (IOException e) { error(log, "Error copying text to output stream: " + e); getExecution().setStatus(ExecutionStatus.FAILED); return; } finally { outStream.close(); } updateProgress(counter, getExecution().getInputFiles().size()); debug(log, "Removed bibliography from file {}", outFile); outFile.setManifestsEntity(inputFile.getManifestsEntity()); getOutputDataStoreClient().post(InfolisFile.class, outFile); getExecution().getOutputFiles().add(outFile.getUri()); } debug(log, "No of OutputFiles of this execution: {}", getExecution().getOutputFiles().size()); getExecution().setStatus(ExecutionStatus.FINISHED); } }