package io.github.infolis.algorithm; import io.github.infolis.InfolisConfig; import io.github.infolis.datastore.DataStoreClient; import io.github.infolis.datastore.DataStoreClientFactory; import io.github.infolis.datastore.DataStoreStrategy; import io.github.infolis.datastore.FileResolver; import io.github.infolis.datastore.FileResolverFactory; import io.github.infolis.model.Execution; import io.github.infolis.model.ExecutionStatus; import io.github.infolis.model.entity.InfolisFile; import io.github.infolis.util.SerializationUtils; import io.github.infolis.util.TextCleaningUtils; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Set; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.net.MediaType; import javax.ws.rs.BadRequestException; import javax.ws.rs.ProcessingException; /** * * @author kba * @author kata */ public class TextExtractor extends BaseAlgorithm { public TextExtractor(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) { super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver); try { stripper = new PDFTextStripper(); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } } private static final Logger log = LoggerFactory.getLogger(TextExtractor.class); private static final String executionTag = "TEXT_EXTRACTED"; private static final String executionTagUntokenized = "UNTOKENIZED"; private static final String executionTagBibNotRemoved = "BIBNOTREMOVED"; private final List<String> executionTags = new ArrayList<>(Arrays.asList( executionTag, executionTagUntokenized, executionTagBibNotRemoved)); private final PDFTextStripper stripper; protected static String getExecutionTagUntokenized() { return executionTagUntokenized; } protected static String getExecutionTagBibNotRemoved() { return executionTagBibNotRemoved; } private String removeBibSection(String text) { BibliographyExtractor bibExtractor = new BibliographyExtractor( getInputDataStoreClient(), getOutputDataStoreClient(), getInputFileResolver(), getOutputFileResolver()); //TODO: Test optimal section size this.executionTags.addAll(BibliographyExtractor.getExecutionTags()); this.executionTags.remove(getExecutionTagBibNotRemoved()); return bibExtractor.removeBibliography(bibExtractor.tokenizeSections(text, 10)); } private String tokenizeText(String text) throws IOException { Tokenizer tokenizer = new TokenizerStanford( getInputDataStoreClient(), getOutputDataStoreClient(), getInputFileResolver(), getOutputFileResolver()); Execution exec = new Execution(); exec.setTokenizeNLs(getExecution().getTokenizeNLs()); exec.setPtb3Escaping(getExecution().getPtb3Escaping()); tokenizer.setExecution(exec); String tokenizedText = tokenizer.getTokenizedText(tokenizer.getTokenizedSentences(text)); this.executionTags.addAll(tokenizer.getExecutionTags()); this.executionTags.remove(getExecutionTagUntokenized()); return tokenizedText; } public InfolisFile extract(InfolisFile inFile, int startPage, boolean tokenize) throws IOException { String asText = null; // TODO make configurable String outFileName = SerializationUtils.changeFileExtension(inFile.getFileName(), "txt"); // if no output directory is given, create temporary output files if (null == getExecution().getOutputDirectory() || getExecution().getOutputDirectory().equals("")) { String EXTRACTED_DIR_PREFIX = "extracted-"; String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), EXTRACTED_DIR_PREFIX).toString(); FileUtils.forceDeleteOnExit(new File(tempDir)); outFileName = SerializationUtils.changeBaseDir(outFileName, tempDir); } else { outFileName = SerializationUtils.changeBaseDir(outFileName, getExecution().getOutputDirectory()); } InfolisFile outFile = new InfolisFile(); outFile.setFileName(outFileName); outFile.setOriginalName(inFile.getFileName()); outFile.setMediaType("text/plain"); if (getExecution().getOverwriteTextfiles() == false) { File _outFile = new File(outFileName); if (_outFile.exists()) { debug(log, "File exists: {}, skipping text extraction for {}", _outFile, inFile); asText = FileUtils.readFileToString(_outFile, "utf-8"); outFile.setMd5(SerializationUtils.getHexMd5(asText)); outFile.setFileStatus("AVAILABLE"); return outFile; } } InputStream inStream = null; OutputStream outStream = null; PDDocument pdfIn = null; try { inStream = getInputFileResolver().openInputStream(inFile); try { pdfIn = PDDocument.load(inStream); asText = extractText(pdfIn, startPage); if (null == asText) { throw new IOException("extractText returned null!"); } if (getExecution().isRemoveBib()) { asText = removeBibSection(asText); } if (getExecution().isTokenize()) { asText = tokenizeText(asText); } Set<String> tagsToSet = getExecution().getTags(); tagsToSet.addAll(inFile.getTags()); tagsToSet.addAll(executionTags); outFile.setTags(tagsToSet); outFile.setMd5(SerializationUtils.getHexMd5(asText)); outFile.setFileStatus("AVAILABLE"); try { outStream = getOutputFileResolver().openOutputStream(outFile); try { IOUtils.write(asText, outStream); } catch (IOException e) { warn(log, "Error copying text to output stream: " + e); throw e; } } catch (IOException e) { warn(log, "Error opening output stream to text file: " + e); throw e; } return outFile; } catch (IOException e) { warn(log, "Error reading PDF from stream: " + e); throw e; } } catch (IOException e) { warn(log, "Error opening input stream: " + e); throw e; } catch (Exception e) { warn(log, "Error converting PDF to text: " + e); throw e; } finally { if (null != outStream) outStream.close(); if (null != inStream) inStream.close(); if (null != pdfIn) pdfIn.close(); } } /** * Extract the text of a PDF and remove control sequences and line breaks. * * @param pdfIn {@link PDDocument} to extract text from * @return text of the PDF * @throws IOException */ private String extractText(PDDocument pdfIn, int startPage) throws IOException { String asText; stripper.setStartPage(startPage); asText = stripper.getText(pdfIn); if (null == asText) { throw new IOException("PdfStripper returned null!"); } asText = TextCleaningUtils.removeControlSequences(asText); asText = TextCleaningUtils.removeLineBreaks(asText); return asText; } @Override public void execute() { Execution tagExec = getExecution().createSubExecution(TagSearcher.class); tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags()); tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags()); tagExec.instantiateAlgorithm(this).run(); getExecution().getPatterns().addAll(tagExec.getPatterns()); getExecution().getInputFiles().addAll(tagExec.getInputFiles()); int counter =0; for (String inputFileURI : getExecution().getInputFiles()) { counter++; log.debug(inputFileURI); InfolisFile inputFile; try { inputFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI); } catch (BadRequestException | ProcessingException e) { error(log, "Could not retrieve file " + inputFileURI + ": " + e.getMessage()); getExecution().setStatus(ExecutionStatus.FAILED); return; } if (null == inputFile) { error(log, "File was not registered with the data store: " + inputFileURI); getExecution().setStatus(ExecutionStatus.FAILED); return; } if (null == inputFile.getMediaType() || !inputFile.getMediaType().equals(MediaType.PDF.toString())) { error(log, "File is not a PDF: " + inputFileURI); getExecution().setStatus(ExecutionStatus.FAILED); return; } debug(log, "Start extracting from {}", inputFile); InfolisFile outputFile; try { outputFile = extract(inputFile, getExecution().getStartPage(), getExecution().isTokenize()); debug(log, "Converted to file {}", outputFile); } catch (IOException e) { // invalid pdf file cannot be read by pdfBox // log warning, skip file and continue with next file warn(log, "Extraction caused exception in file {} - PdfBox cannot extract from this file, is it a valid pdf file? Trace: \n{}", inputFile, ExceptionUtils.getStackTrace(e)); outputFile = null; continue; } catch (RuntimeException e) { // warn but not error: do not terminate execution but continue with next file. // RuntimeErrors caused by DataFormatExceptions in pdfBox may occur when // pdfBox cannot handle a (valid) pdf file due to its encoding warn(log, "Extraction caused exception in file {} - PdfBox cannot extract from this file due to its encoding or similar issues: \n{}", inputFile, ExceptionUtils.getStackTrace(e)); outputFile = null; continue; } updateProgress(counter, getExecution().getInputFiles().size()); if (null == outputFile) { warn(log, "Conversion failed for input file {}", inputFileURI); } else { outputFile.setManifestsEntity(inputFile.getManifestsEntity()); getOutputDataStoreClient().post(InfolisFile.class, outputFile); getExecution().getOutputFiles().add(outputFile.getUri()); } } debug(log, "No of OutputFiles of this execution: {}", getExecution().getOutputFiles().size()); getExecution().setStatus(ExecutionStatus.FINISHED); } @Override public void validate() throws IllegalAlgorithmArgumentException { Execution exec = this.getExecution(); if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) && (null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) { throw new IllegalArgumentException("Must set at least one inputFile!"); } if (null == exec.isTokenize()) { warn(log, "\"tokenize\" field unspecified. Defaulting to \"false\"."); this.getExecution().setTokenize(false); } } /** * Class for processing command line options using args4j. * * @author kata * @author kba */ static class OptionHandler { @Option(name = "-i", usage = "path to read PDF documents from", metaVar = "INPUT_PATH") private String inputPathOption = System.getProperty("user.dir"); @Option(name = "-o", usage = "directory to save converted documents to", metaVar = "OUTPUT_PATH") private String outputPathOption = System.getProperty("user.dir"); @Option(name = "-b", usage = "remove bibliographies", metaVar = "REMOVE_BIBLIOGRAPHIES") private boolean removeBib = false; @Option(name = "-t", usage = "tokenize", metaVar = "TOKENIZE") private boolean tokenize = false; @Option(name = "-w", usage = "overwrite existing text files", metaVar = "OVERWRITE") private boolean overwriteTextfiles = true; public void parse(String[] args) { CmdLineParser parser = new CmdLineParser(this); try { parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printSingleLineUsage(System.err); parser.printUsage(System.err); System.exit(1); } Execution execution = new Execution(); execution.setAlgorithm(TextExtractor.class); FileResolver ifr = FileResolverFactory.create(DataStoreStrategy.LOCAL); DataStoreClient idsc = DataStoreClientFactory.create(DataStoreStrategy.LOCAL); Algorithm algo = execution.instantiateAlgorithm(idsc, idsc, ifr, ifr); Path inputPath = Paths.get(inputPathOption); if (Files.isDirectory(inputPath)) { try { Iterator<Path> directoryStream = Files.newDirectoryStream(inputPath, "*.pdf").iterator(); while (directoryStream.hasNext()) { InfolisFile fileToPost = new InfolisFile(); fileToPost.setFileName(directoryStream.next().toString()); fileToPost.setMediaType("application/pdf"); algo.getInputDataStoreClient().post(InfolisFile.class, fileToPost); execution.getInputFiles().add(fileToPost.getUri()); } } catch (IOException e) { log.error("Could not read '*.pdf' in directory {}.", inputPath); System.exit(1); } } else { execution.getInputFiles().add(inputPathOption.toString()); } Path outputPath = Paths.get(outputPathOption); if (!Files.exists(outputPath)) { try { Files.createDirectories(outputPath); } catch (IOException e) { log.error("Output directory {} doesn't exist and can't be created.", outputPath); System.exit(1); } } else if (!Files.isDirectory(outputPath)) { log.error("Output directory {} is no directory.", outputPath); System.exit(1); } execution.setOutputDirectory(outputPath.toString()); execution.setRemoveBib(removeBib); execution.setTokenize(tokenize); execution.setOverwriteTextfiles(overwriteTextfiles); algo.run(); } } public static void main(String[] args) { new OptionHandler().parse(args); } }