package io.github.infolis.commandLine;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonObject;
import javax.json.JsonString;
import javax.json.JsonValue;
import javax.ws.rs.BadRequestException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.algorithm.Algorithm;
import io.github.infolis.algorithm.ComplexAlgorithm;
import io.github.infolis.algorithm.Indexer;
import io.github.infolis.algorithm.SearchResultLinker;
import io.github.infolis.algorithm.LuceneSearcher;
import io.github.infolis.algorithm.TextAndMetaDataExtractor;
import io.github.infolis.algorithm.TextExtractor;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.DataStoreClientFactory;
import io.github.infolis.datastore.DataStoreStrategy;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.datastore.FileResolverFactory;
import io.github.infolis.model.BootstrapStrategy;
import io.github.infolis.model.EntityType;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.infolink.querying.QueryService;
import io.github.infolis.util.RegexUtils;
import io.github.infolis.util.SerializationUtils;

/**
 * CLI to Infolis to make it easy to run an execution and store its results in a
 * JSON file.
 */
public class CommandLineExecuter {

    private static final Logger log = LoggerFactory.getLogger(CommandLineExecuter.class);

    private DataStoreClient dataStoreClient = DataStoreClientFactory.create(DataStoreStrategy.TEMPORARY);
    private FileResolver fileResolver = FileResolverFactory.create(DataStoreStrategy.LOCAL);

    @Option(name = "--text-dir", usage = "Directory containing *.txt", metaVar = "TEXTDIR")
    private Path textDir;

    @Option(name = "--pdf-dir", usage = "Directory containing *.pdf", metaVar = "PDFDIR")
    private Path pdfDir;

    @Option(name = "--db-dir", usage = "Directory to hold JSON database dump", metaVar = "OUTPUTDIR", required = true)
    private Path dbDir;

    @Option(name = "--index-dir", usage = "Directory to contain the Lucene index (no index unless specified)", metaVar = "INDEXDIR", depends = {"--tag"})
    private Path indexDir;

    @Option(name = "--meta-dir", usage = "Directory to contain the metadata files", metaVar = "METADIR", depends = {"--pdf-dir"})
    private Path metaDir;

    @Option(name = "--json", usage = "Execution as JSON", metaVar = "JSON")
    private Path json;

    // TODO: support multiple tags (e.g. domain, journal, language)
    @Option(name = "--tag", usage = "tag, also JSON dump basename", metaVar = "TAG", required = true)
    private String tag;

    @Option(name = "--log-level", usage = "minimum log level")
    private String logLevel = "DEBUG";

    @Option(name = "--convert-to-text", usage = "whether to convert to text before execution", depends = {"--pdf-dir"})
    private boolean shouldConvertToText = false;

    @Option(name = "--search-candidates", usage = "look for files that match a set of queries", depends = {"--queries-file", "--tag"})
    private boolean searchCandidatesMode = false;

    @Option(name = "--queries-file", usage = "csv-file containing one query term per line", metaVar = "QUERIESFILE", depends = {"--search-candidates"})
    private String queriesFile;

    // This is set so we can accept --convert-to-text without JSON and not try to execute anything
    private boolean convertToTextMode = false;

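    /**
     * Copies settings from the given JSON object onto the Execution. Keys must
     * name Execution fields; the "algorithm" key, enum- and class-valued fields,
     * and array-valued fields are handled as special cases below.
     *
     * Illustrative example of such a JSON file (the values are placeholders,
     * not a tested configuration):
     * <pre>
     * {
     *   "algorithm": "LuceneSearcher",
     *   "tokenize": true,
     *   "infolisFileTags": ["mytag"],
     *   "tags": ["mytag"]
     * }
     * </pre>
     *
     * @param jsonObject the parsed JSON execution description
     * @param exec the Execution to configure
     */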
    @SuppressWarnings("unchecked")
    private void setExecutionFromJSON(JsonObject jsonObject, Execution exec) {
        try {
            // iterate through the entries in the JSON file
            for (Entry<String, JsonValue> values : jsonObject.entrySet()) {
                switch (values.getValue().getValueType()) {
                case STRING:
                case NUMBER:
                case TRUE:
                case FALSE:
                    // algorithm has to be handled as a special case since we need to find the class
                    if (values.getKey().equals("algorithm")) {
                        String algorithmName = values.getValue().toString();
                        algorithmName = algorithmName.replace("\"", "");
                        if (!algorithmName.startsWith("io.github.infolis.algorithm")) {
                            algorithmName = "io.github.infolis.algorithm." + algorithmName;
                        }
                        try {
                            Class<? extends Algorithm> algoClass;
                            algoClass = (Class<? extends Algorithm>) Class.forName(algorithmName);
                            exec.setAlgorithm(algoClass);
                        } catch (ClassNotFoundException | ClassCastException e1) {
                            throwCLI("No such algorithm: " + algorithmName);
                        }
                        break;
                    }
                    // TODO generic solution for enums?
                    if (values.getKey().equals("bootstrapStrategy")) {
                        BootstrapStrategy b = BootstrapStrategy.valueOf(values.getValue().toString().replace("\"", ""));
                        exec.setBootstrapStrategy(b);
                        break;
                    }
                    if (values.getKey().equals("tokenize")) {
                        exec.setTokenize(Boolean.parseBoolean(values.getValue().toString()));
                        break;
                    }
                    if (values.getKey().equals("searchResultLinkerClass")) {
                        String searchResultLinkerClassName = values.getValue().toString().replace("\"", "");
                        String prefix = "io.github.infolis.algorithm.";
                        if (!searchResultLinkerClassName.startsWith(prefix)) {
                            searchResultLinkerClassName = prefix + searchResultLinkerClassName;
                        }
                        try {
                            Class<? extends SearchResultLinker> searchResultLinkerClass =
                                    (Class<? extends SearchResultLinker>) Class.forName(searchResultLinkerClassName);
                            exec.setSearchResultLinkerClass(searchResultLinkerClass);
                        } catch (ClassNotFoundException | ClassCastException e1) {
                            throwCLI("No such SearchResultLinker class: " + searchResultLinkerClassName);
                        }
                        break;
                    }
                    // all other fields are just set
                    exec.setProperty(values.getKey(), values.getValue().toString().replace("\"", ""));
                    break;
                case ARRAY:
                    if (values.getKey().equals("linkedEntities")) {
                        List<String> entityURIs = new ArrayList<>();
                        JsonArray array = (JsonArray) values.getValue();
                        for (int i = 0; i < array.size(); i++) {
                            JsonObject entityObject = (JsonObject) array.getJsonObject(i);
                            String name = "";
                            String number = "";
                            String identifier = "";
                            Set<String> tags = new HashSet<>();
                            if (entityObject.containsKey("name")) {
                                name = String.valueOf(entityObject.get("name"));
                                name = name.substring(1, name.length() - 1).trim();
                            }
                            if (entityObject.containsKey("number")) {
                                number = String.valueOf(entityObject.get("number"));
                            }
                            if (entityObject.containsKey("identifier")) {
                                identifier = String.valueOf(entityObject.get("identifier"));
                            }
                            // TODO set tags
                            // TODO set entityType
                            Entity entity = new Entity();
                            entity.setTags(tags);
                            entity.setName(name);
                            List<String> numInfo = new ArrayList<>();
                            numInfo.add(number);
                            entity.setNumericInfo(numInfo);
                            entity.addIdentifier(identifier);
                            dataStoreClient.post(Entity.class, entity);
                            entityURIs.add(entity.getUri());
                        }
                        exec.setLinkedEntities(entityURIs);
                        break;
                    }
                    if (values.getKey().equals("queryServiceClasses")) {
                        JsonArray array = (JsonArray) values.getValue();
                        for (int i = 0; i < array.size(); i++) {
                            JsonString stringEntry = array.getJsonString(i);
                            String queryServiceName = stringEntry.getString();
                            queryServiceName = queryServiceName.replace("\"", ""); // XXX why?
                            if (!queryServiceName.startsWith("io.github.infolis.infolink.querying.")) {
                                queryServiceName = "io.github.infolis.infolink.querying." + queryServiceName;
                            }
                            log.debug("queryServiceClass item: " + queryServiceName);
                            try {
                                Class<? extends QueryService> queryServiceClass;
                                queryServiceClass = (Class<? extends QueryService>) Class.forName(queryServiceName);
                                exec.addQueryServiceClasses(queryServiceClass);
                            } catch (ClassNotFoundException | ClassCastException e1) {
                                throwCLI("No such queryService: " + queryServiceName);
                            }
                        }
                        break;
                    }
                    JsonArray array = (JsonArray) values.getValue();
                    List<String> listEntries = new ArrayList<>();
                    for (int i = 0; i < array.size(); i++) {
                        JsonString stringEntry = array.getJsonString(i);
                        listEntries.add(stringEntry.getString());
                    }
                    if (values.getKey().equals("infolisFileTags") || values.getKey().equals("infolisPatternTags")) {
                        exec.setProperty(values.getKey(), new HashSet<>(listEntries));
                    } else if (values.getKey().equals("tags")) {
                        exec.setTags(new HashSet<>(listEntries));
                    } else {
                        exec.setProperty(values.getKey(), listEntries);
                    }
                    break;
                case OBJECT:
                    //$FALL-THROUGH$
                default:
                    throwCLI("Unhandled value type " + values.getValue().getValueType() + " for JSON key " + values.getKey());
                    break;
                }
            }
        } catch (NoSuchFieldException | IllegalAccessException e) {
            throwCLI("No such field", e);
        }
    }

    List<String> getQueryTermsFromFile(String filename) throws IOException {
        return FileUtils.readLines(new File(filename), "UTF-8");
    }

    /**
     * Executes the 'candidate search' mode, which fires a LuceneSearcher
     * execution for every search term provided and writes the names of all
     * matching files to a file in DBDIR named after the queries file.
     *
     * @param parentExec
     * @throws BadRequestException
     * @throws IOException
     */
    private void executeCandidateSearch(Execution parentExec) throws BadRequestException, IOException {
        if (null == queriesFile) {
            throwCLI("Inconsistency: --search-candidates but no --queries-file given.");
        }
        if (null == indexDir) {
            throwCLI("Inconsistency: --search-candidates but no --index-dir given.");
        }
        Set<String> allMatchingFiles = new HashSet<>();
        Map<String, List<String>> matchingFilesByQuery = new HashMap<>();
        for (String query : getQueryTermsFromFile(queriesFile)) {
            // normalize to treat phrase query properly
            String normalizedQuery = RegexUtils.normalizeQuery(query.trim(), true);
            matchingFilesByQuery.put(normalizedQuery, new ArrayList<String>());
            Execution exec = new Execution();
            exec.setAlgorithm(LuceneSearcher.class);
            exec.setTags(new HashSet<>(Arrays.asList(tag)));
            exec.setPhraseSlop(0);
            exec.setIndexDirectory(parentExec.getIndexDirectory());
            InfolisPattern p = new InfolisPattern(normalizedQuery);
            dataStoreClient.post(InfolisPattern.class, p);
            exec.setPatterns(Arrays.asList(p.getUri()));
            dataStoreClient.post(Execution.class, exec);
            exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
            for (InfolisFile f : dataStoreClient.get(InfolisFile.class, exec.getOutputFiles())) {
                String filename = FilenameUtils.getBaseName(f.getFileName());
                allMatchingFiles.add(filename);
                matchingFilesByQuery.get(normalizedQuery).add(filename);
            }
        }
        Path outFile = dbDir.resolve(FilenameUtils.getBaseName(queriesFile));
        try (OutputStream os = Files.newOutputStream(outFile)) {
            IOUtils.write(StringUtils.join(allMatchingFiles, "\n"), os);
        }
        log.warn("ALL MATCHES: {}", allMatchingFiles);
        log.warn("MATCHES BY QUERY: {}", matchingFilesByQuery);
    }

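    /**
     * Runs a TextAndMetaDataExtractor over the parent execution's input files
     * and metadata files so that the metadata is available before the main
     * execution is run.
     *
     * @param parentExec the execution whose input and metadata files are used
     */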
    private void extractMetaData(Execution parentExec) {
        Execution extractExec = new Execution();
        extractExec.setAlgorithm(TextAndMetaDataExtractor.class);
        extractExec.setMetaDataFiles(parentExec.getMetaDataFiles());
        extractExec.setInputFiles(parentExec.getInputFiles());
        dataStoreClient.post(Execution.class, extractExec);
        Algorithm algo = extractExec.instantiateAlgorithm(dataStoreClient, fileResolver);
        algo.run();
        log.debug("Extracted metadata for " + parentExec.getMetaDataFiles().size() + " files");
    }

    /**
     * Runs the execution.
     *
     * @param exec
     * @throws BadRequestException
     * @throws IOException
     */
    private void doExecute(Execution exec) throws BadRequestException, IOException {
        // Set the input files, convert if necessary
        try {
            setExecutionInputFiles(exec);
        } catch (IOException e) {
            throwCLI("Problem setting input files", e);
        }
        // Create index if necessary
        if (indexDir != null) {
            Files.createDirectories(indexDir);
            setExecutionIndexDir(exec);
        }
        try {
            dataStoreClient.post(Execution.class, exec);
            if (convertToTextMode) {
                log.debug("Conversion-only mode, nothing further to execute.");
            } else {
                if (metaDir != null) {
                    extractMetaData(exec);
                }
                if (searchCandidatesMode) {
                    executeCandidateSearch(exec);
                } else {
                    exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
                }
            }
        } catch (BadRequestException e) {
            throwCLI("Execution threw an exception", e);
        }
        dataStoreClient.dump(dbDir, tag);
    }

    /**
     * Indexes the input files in Lucene and sets the indexDirectory of the
     * execution.
     *
     * @param exec
     */
    private void setExecutionIndexDir(Execution exec) {
        Execution indexerExecution = new Execution();
        indexerExecution.setAlgorithm(Indexer.class);
        indexerExecution.setInputFiles(exec.getInputFiles());
        indexerExecution.setPhraseSlop(0);
        indexerExecution.setOutputDirectory(indexDir.toString());
        dataStoreClient.post(Execution.class, indexerExecution);
        indexerExecution.instantiateAlgorithm(dataStoreClient, fileResolver).run();
        exec.setIndexDirectory(indexerExecution.getOutputDirectory());
    }

    /**
     * Sets the input files for an execution and converts PDFs to text depending
     * on the command line arguments.
     *
     * @param exec
     * @throws IOException
     */
    private void setExecutionInputFiles(Execution exec) throws IOException {
        if (null == pdfDir || !Files.exists(pdfDir)) {
            if (shouldConvertToText) {
                throwCLI("Cannot convert to text: Empty/non-existing PDF directory " + pdfDir);
            }
            if (null == textDir || !Files.exists(textDir)) {
                throwCLI("Neither PDFDIR nor TEXTDIR exists");
            } else {
                try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(textDir)) {
                    if (!dirStream.iterator().hasNext()) {
                        dirStream.close();
                        throwCLI("No PDFDIR specified, TEXTDIR exists but is empty.");
                    }
                }
                exec.setInputFiles(postFiles(textDir, "text/plain"));
            }
        } else {
            if (null == textDir || !Files.exists(textDir)) {
                if (shouldConvertToText) {
                    Files.createDirectories(textDir);
                    if (null == exec.isTokenize()) {
                        log.warn("Warning: tokenize parameter not set. Defaulting to true for text extraction and all algorithms to be applied on extracted texts");
                        exec.setTokenize(true);
                    }
                    exec.setInputFiles(convertPDF(postFiles(pdfDir, "application/pdf"), exec.getStartPage(), exec.isRemoveBib(),
                            exec.getOverwriteTextfiles(), exec.isTokenize(), exec.getTokenizeNLs(), exec.getPtb3Escaping()));
                } else {
                    // complex algorithms can convert PDFs on demand, others can't and need the --convert-to-text option
                    if (ComplexAlgorithm.class.isAssignableFrom(exec.getAlgorithm())) {
                        exec.setInputFiles(postFiles(pdfDir, "application/pdf"));
                    } else if (TextExtractor.class.equals(exec.getAlgorithm())) {
                        exec.setInputFiles(postFiles(pdfDir, "application/pdf"));
                    } else {
                        throwCLI("PDFDIR specified, TEXTDIR unspecified/empty, but --convert-to-text not set");
                    }
                }
            } else {
                if (shouldConvertToText) {
                    //System.err.println("WARNING: Both --text-dir '" + textDir + "' and --pdf-dir '" + pdfDir
                    //        + "' were specified. Will possibly clobber text files in conversion!");
                    System.err.println("WARNING: Both --text-dir '" + textDir + "' and --pdf-dir '" + pdfDir
                            + "' were specified. Overwriting text files: " + exec.getOverwriteTextfiles());
                    System.err.println("<Ctrl-C> to stop, <Enter> to continue");
                    System.in.read();
                    if (null == exec.isTokenize()) {
                        log.warn("Warning: tokenize parameter not set. Defaulting to true for text extraction and all algorithms to be applied on extracted texts");
                        exec.setTokenize(true);
                    }
                    exec.setInputFiles(convertPDF(postFiles(pdfDir, "application/pdf"), exec.getStartPage(), exec.isRemoveBib(),
                            exec.getOverwriteTextfiles(), exec.isTokenize(), exec.getTokenizeNLs(), exec.getPtb3Escaping()));
                } else {
                    exec.setInputFiles(postFiles(textDir, "text/plain"));
                }
            }
            // extract the metadata and create the according entities
            if (metaDir != null) {
                List<String> files = new ArrayList<>();
                for (File f : metaDir.toFile().listFiles()) {
                    files.add(f.getAbsolutePath());
                }
                exec.setMetaDataFiles(files);
            }
        }
    }

    /**
     * Checks whether all fields given in the JSON file are fields that can
     * actually be set on an Execution.
     *
     * @param o
     */
    private void checkJsonFile(JsonObject o) {
        List<String> badFields = new ArrayList<>();
        Execution testExecution = new Execution();
        for (Entry<String, JsonValue> values : o.entrySet()) {
            if (values.getKey().equals("inputFiles")) {
                throwCLI("Do not specify inputFiles in JSON, it will be overridden [in " + json + "]");
            }
            try {
                testExecution.getClass().getDeclaredField(values.getKey());
            } catch (NoSuchFieldException ex) {
                try {
                    testExecution.getClass().getSuperclass().getDeclaredField(values.getKey());
                } catch (NoSuchFieldException ex2) {
                    badFields.add(values.getKey());
                }
            }
        }
        if (!badFields.isEmpty()) {
            throwCLI("Unknown fields: " + badFields);
        }
    }

    /**
     * Converts a list of InfolisFiles to text.
     *
     * @param uris URIs of the InfolisFiles
     * @return URIs of the InfolisFiles of the text versions
     */
    private List<String> convertPDF(List<String> uris, int startPage, boolean removeBib, boolean overwriteTextfiles,
            boolean tokenize, boolean tokenizeNLs, boolean ptb3Escaping) {
        Execution convertExec = new Execution();
        convertExec.setAlgorithm(TextExtractor.class);
        convertExec.setOutputDirectory(textDir.toString());
        convertExec.setInputFiles(uris);
        convertExec.setStartPage(startPage);
        convertExec.setRemoveBib(removeBib);
        convertExec.setOverwriteTextfiles(overwriteTextfiles);
        convertExec.setTokenize(tokenize);
        convertExec.setTokenizeNLs(tokenizeNLs);
        convertExec.setPtb3Escaping(ptb3Escaping);
        convertExec.setTags(new HashSet<>(Arrays.asList(tag)));
        Algorithm algo = convertExec.instantiateAlgorithm(dataStoreClient, fileResolver);
        algo.run();
        return convertExec.getOutputFiles();
    }

    public List<String> postFiles(Path dir, String mimetype) {
        List<InfolisFile> infolisFiles = new ArrayList<>();
        try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(dir)) {
            for (Path file : dirStream) {
                InfolisFile infolisFile = new InfolisFile();
                try (InputStream inputStream = Files.newInputStream(file)) {
                    byte[] bytes = IOUtils.toByteArray(inputStream);
                    infolisFile.setMd5(SerializationUtils.getHexMd5(bytes));
                } catch (IOException e) {
                    throwCLI("Could not read file " + file, e);
                }
                infolisFile.setFileName(file.toString());
                infolisFile.setMediaType(mimetype);
                infolisFile.setTags(new HashSet<>(Arrays.asList(tag)));
                infolisFile.setFileStatus("AVAILABLE");
                Entity entity = new Entity();
                entity.setEntityType(EntityType.publication);
                dataStoreClient.post(Entity.class, entity);
                infolisFile.setManifestsEntity(entity.getUri());
                infolisFiles.add(infolisFile);
            }
        } catch (IOException e) {
            throwCLI("Couldn't list directory contents of " + dir, e);
        }
        return dataStoreClient.post(InfolisFile.class, infolisFiles);
    }

    private static void throwCLI(String msg) {
        throwCLI(msg, null);
    }

    private static void throwCLI(String msg, Exception e) {
        if (null != msg) {
            System.err.println("**ERROR** " + msg);
        }
        if (null != e) {
            System.err.println(e.getMessage());
            e.printStackTrace(System.err);
        }
        if (System.getProperty("testing") == null) {
            System.exit(1);
        } else {
            log.error("ERROR: {}", e);
        }
        throw new RuntimeException(e);
    }

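    /**
     * Parses the command line, builds an Execution (optionally configured from
     * a JSON file), runs it and dumps the datastore contents to DBDIR.
     *
     * Illustrative invocation (paths and tag are placeholders; classpath setup
     * is omitted):
     * <pre>
     * java io.github.infolis.commandLine.CommandLineExecuter \
     *     --pdf-dir /data/pdf \
     *     --text-dir /data/text \
     *     --db-dir /data/db \
     *     --tag mytag \
     *     --convert-to-text
     * </pre>
     */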
    public void doMain(String[] args) throws FileNotFoundException, ClassNotFoundException, NoSuchFieldException,
            IllegalAccessException, IOException, CmdLineException {
        CmdLineParser parser = new CmdLineParser(this, ParserProperties.defaults().withUsageWidth(120));
        try {
            parser.parseArgument(args);
            if (null == json && !(shouldConvertToText || searchCandidatesMode)) {
                throwCLI("Must specify JSON if not --convert-to-text|--search-candidates");
            }
            if (null == json && shouldConvertToText && !searchCandidatesMode) {
                convertToTextMode = true;
            }
            // ch.qos.logback.classic.Logger root = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(Logger.ROOT_LOGGER_NAME);
            // root.setLevel(Level.toLevel(logLevel));
        } catch (CmdLineException e) {
            System.err.println("java " + getClass().getSimpleName() + " [options...]");
            parser.printUsage(System.err);
            throwCLI("", e);
            return;
        }
        Files.createDirectories(dbDir);
        Execution exec = new Execution();
        exec.setTags(new HashSet<String>(Arrays.asList(tag)));
        // if no JSON was provided, only convert files and exit
        if (null != json) {
            try (Reader reader = Files.newBufferedReader(json, Charset.forName("UTF-8"))) {
                JsonObject jsonObject = Json.createReader(reader).readObject();
                // Check the JSON
                checkJsonFile(jsonObject);
                // Set the other options from JSON
                setExecutionFromJSON(jsonObject, exec);
            } catch (IOException e) {
                throwCLI("Problem reading JSON " + json, e);
            }
        }
        doExecute(exec);
    }

    public static void main(String[] args) throws Exception {
        new CommandLineExecuter().doMain(args);
    }
}