package io.github.infolis.commandLine;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonObject;
import javax.json.JsonString;
import javax.json.JsonValue;
import javax.ws.rs.BadRequestException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.algorithm.Algorithm;
import io.github.infolis.algorithm.ComplexAlgorithm;
import io.github.infolis.algorithm.Indexer;
import io.github.infolis.algorithm.SearchResultLinker;
import io.github.infolis.algorithm.LuceneSearcher;
import io.github.infolis.algorithm.TextAndMetaDataExtractor;
import io.github.infolis.algorithm.TextExtractor;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.DataStoreClientFactory;
import io.github.infolis.datastore.DataStoreStrategy;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.datastore.FileResolverFactory;
import io.github.infolis.model.BootstrapStrategy;
import io.github.infolis.model.EntityType;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.infolink.querying.QueryService;
import io.github.infolis.util.RegexUtils;
import io.github.infolis.util.SerializationUtils;

/**
 * CLI to Infolis to make it easy to run an execution and store its results in a
 * JSON file.
 */
public class CommandLineExecuter {

    private static final Logger log = LoggerFactory.getLogger(CommandLineExecuter.class);

    private DataStoreClient dataStoreClient = DataStoreClientFactory.create(DataStoreStrategy.TEMPORARY);
    private FileResolver fileResolver = FileResolverFactory.create(DataStoreStrategy.LOCAL);

    @Option(name = "--text-dir", usage = "Directory containing *.txt", metaVar = "TEXTDIR")
    private Path textDir;

    @Option(name = "--pdf-dir", usage = "Directory containing *.pdf", metaVar = "PDFDIR")
    private Path pdfDir;

    @Option(name = "--db-dir", usage = "Directory to hold JSON database dump", metaVar = "OUTPUTDIR", required = true)
    private Path dbDir;

    @Option(name = "--index-dir", usage = "Directory to contain the Lucene index (no index unless specified)", metaVar = "INDEXDIR", depends = {"--tag"})
    private Path indexDir;

    @Option(name = "--meta-dir", usage = "Directory to contain the metadata files", metaVar = "METADIR", depends = {"--pdf-dir"})
    private Path metaDir;

    @Option(name = "--json", usage = "Execution as JSON", metaVar = "JSON")
    private Path json;

    // TODO: support multiple tags (e.g. domain, journal, language)
    @Option(name = "--tag", usage = "tag, also JSON dump basename", metaVar = "TAG", required = true)
    private String tag;

    @Option(name = "--log-level", usage = "minimum log level")
    private String logLevel = "DEBUG";

    @Option(name = "--convert-to-text", usage = "whether to convert to text before execution", depends = {"--pdf-dir"})
    private boolean shouldConvertToText = false;

    @Option(name = "--search-candidates", usage = "look for files that match a set of queries", depends = {"--queries-file", "--tag"})
    private boolean searchCandidatesMode = false;

    @Option(name = "--queries-file", usage = "csv-file containing one query term per line", metaVar = "QUERIESFILE", depends = {"--search-candidates"})
    private String queriesFile;

    // This is set so we can accept --convert-to-text without JSON and not try to execute anything
    private boolean convertToTextMode = false;

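    /**
     * Copies settings from the given JSON object onto the Execution. Keys must
     * name Execution fields; the "algorithm" key, enum- and class-valued fields,
     * and array-valued fields are handled as special cases below.
     *
     * Illustrative example of such a JSON file (the values are placeholders,
     * not a tested configuration):
     * <pre>
     * {
     *   "algorithm": "LuceneSearcher",
     *   "tokenize": true,
     *   "infolisFileTags": ["mytag"],
     *   "tags": ["mytag"]
     * }
     * </pre>
     *
     * @param jsonObject the parsed JSON execution description
     * @param exec the Execution to configure
     */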
    @SuppressWarnings("unchecked")
    private void setExecutionFromJSON(JsonObject jsonObject, Execution exec) {
        try {
            // iterate through the entries in the JSON file
            for (Entry<String, JsonValue> values : jsonObject.entrySet()) {
                switch (values.getValue().getValueType()) {
                case STRING:
                case NUMBER:
                case TRUE:
                case FALSE:
                    // algorithm has to be handled as a special case since we need to find the class
                    if (values.getKey().equals("algorithm")) {
                        String algorithmName = values.getValue().toString();
                        algorithmName = algorithmName.replace("\"", "");
                        if (!algorithmName.startsWith("io.github.infolis.algorithm")) {
                            algorithmName = "io.github.infolis.algorithm." + algorithmName;
                        }
                        try {
                            Class<? extends Algorithm> algoClass;
                            algoClass = (Class<? extends Algorithm>) Class.forName(algorithmName);
                            exec.setAlgorithm(algoClass);
                        } catch (ClassNotFoundException | ClassCastException e1) {
                            throwCLI("No such algorithm: " + algorithmName);
                        }
                        break;
                    }
                    // TODO generic solution for enums?
                    if (values.getKey().equals("bootstrapStrategy")) {
                        BootstrapStrategy b = BootstrapStrategy.valueOf(values.getValue().toString().replace("\"", ""));
                        exec.setBootstrapStrategy(b);
                        break;
                    }
                    if (values.getKey().equals("tokenize")) {
                        exec.setTokenize(Boolean.parseBoolean(values.getValue().toString()));
                        break;
                    }
                    if (values.getKey().equals("searchResultLinkerClass")) {
                        String searchResultLinkerClassName = values.getValue().toString().replace("\"", "");
                        String prefix = "io.github.infolis.algorithm.";
                        if (!searchResultLinkerClassName.startsWith(prefix)) {
                            searchResultLinkerClassName = prefix + searchResultLinkerClassName;
                        }
                        try {
                            Class<? extends SearchResultLinker> searchResultLinkerClass =
                                    (Class<? extends SearchResultLinker>) Class.forName(searchResultLinkerClassName);
                            exec.setSearchResultLinkerClass(searchResultLinkerClass);
                        } catch (ClassNotFoundException | ClassCastException e1) {
                            throwCLI("No such SearchResultLinker class: " + searchResultLinkerClassName);
                        }
                        break;
                    }
                    // all other fields are just set
                    exec.setProperty(values.getKey(), values.getValue().toString().replace("\"", ""));
                    break;
                case ARRAY:
                    if (values.getKey().equals("linkedEntities")) {
                        List<String> entityURIs = new ArrayList<>();
                        JsonArray array = (JsonArray) values.getValue();
                        for (int i = 0; i < array.size(); i++) {
                            JsonObject entityObject = (JsonObject) array.getJsonObject(i);
                            String name = "";
                            String number = "";
                            String identifier = "";
                            Set<String> tags = new HashSet<>();
                            if (entityObject.containsKey("name")) {
                                name = String.valueOf(entityObject.get("name"));
                                name = name.substring(1, name.length() - 1).trim();
                            }
                            if (entityObject.containsKey("number")) {
                                number = String.valueOf(entityObject.get("number"));
                            }
                            if (entityObject.containsKey("identifier")) {
                                identifier = String.valueOf(entityObject.get("identifier"));
                            }
                            // TODO set tags
                            // TODO set entityType
                            Entity entity = new Entity();
                            entity.setTags(tags);
                            entity.setName(name);
                            List<String> numInfo = new ArrayList<>();
                            numInfo.add(number);
                            entity.setNumericInfo(numInfo);
                            entity.addIdentifier(identifier);
                            dataStoreClient.post(Entity.class, entity);
                            entityURIs.add(entity.getUri());
                        }
                        exec.setLinkedEntities(entityURIs);
                        break;
                    }
                    if (values.getKey().equals("queryServiceClasses")) {
                        JsonArray array = (JsonArray) values.getValue();
                        for (int i = 0; i < array.size(); i++) {
                            JsonString stringEntry = array.getJsonString(i);
                            String queryServiceName = stringEntry.getString();
                            queryServiceName = queryServiceName.replace("\"", ""); // XXX why?
                            if (!queryServiceName.startsWith("io.github.infolis.infolink.querying.")) {
                                queryServiceName = "io.github.infolis.infolink.querying." + queryServiceName;
                            }
                            log.debug("queryServiceClass item: " + queryServiceName);
                            try {
                                Class<? extends QueryService> queryServiceClass;
                                queryServiceClass = (Class<? extends QueryService>) Class.forName(queryServiceName);
                                exec.addQueryServiceClasses(queryServiceClass);
                            } catch (ClassNotFoundException | ClassCastException e1) {
                                throwCLI("No such queryService: " + queryServiceName);
                            }
                        }
                        break;
                    }
                    JsonArray array = (JsonArray) values.getValue();
                    List<String> listEntries = new ArrayList<>();
                    for (int i = 0; i < array.size(); i++) {
                        JsonString stringEntry = array.getJsonString(i);
                        listEntries.add(stringEntry.getString());
                    }
                    if (values.getKey().equals("infolisFileTags") || values.getKey().equals("infolisPatternTags")) {
                        exec.setProperty(values.getKey(), new HashSet<>(listEntries));
                    } else if (values.getKey().equals("tags")) {
                        exec.setTags(new HashSet<>(listEntries));
                    } else {
                        exec.setProperty(values.getKey(), listEntries);
                    }
                    break;
                case OBJECT:
                    //$FALL-THROUGH$
                default:
                    throwCLI("Unhandled value type " + values.getValue().getValueType() + " for JSON key " + values.getKey());
                    break;
                }
            }
        } catch (NoSuchFieldException | IllegalAccessException e) {
            throwCLI("No such field", e);
        }
    }

    List<String> getQueryTermsFromFile(String filename) throws IOException {
        return FileUtils.readLines(new File(filename), "UTF-8");
    }

    /**
     * Executes the 'candidate search' mode, which fires a LuceneSearcher
     * execution for every search term provided and writes the names of all
     * matching files to a file in DBDIR named after the queries file.
     *
     * @param parentExec
     * @throws BadRequestException
     * @throws IOException
     */
    private void executeCandidateSearch(Execution parentExec) throws BadRequestException, IOException {
        if (null == queriesFile) {
            throwCLI("Inconsistency: --search-candidates but no --queries-file given.");
        }
        if (null == indexDir) {
            throwCLI("Inconsistency: --search-candidates but no --index-dir given.");
        }
        Set<String> allMatchingFiles = new HashSet<>();
        Map<String, List<String>> matchingFilesByQuery = new HashMap<>();
        for (String query : getQueryTermsFromFile(queriesFile)) {
            // normalize to treat phrase query properly
            String normalizedQuery = RegexUtils.normalizeQuery(query.trim(), true);
            matchingFilesByQuery.put(normalizedQuery, new ArrayList<String>());
            Execution exec = new Execution();
            exec.setAlgorithm(LuceneSearcher.class);
            exec.setTags(new HashSet<>(Arrays.asList(tag)));
            exec.setPhraseSlop(0);
            exec.setIndexDirectory(parentExec.getIndexDirectory());
            InfolisPattern p = new InfolisPattern(normalizedQuery);
            dataStoreClient.post(InfolisPattern.class, p);
            exec.setPatterns(Arrays.asList(p.getUri()));
            dataStoreClient.post(Execution.class, exec);
            exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
            for (InfolisFile f : dataStoreClient.get(InfolisFile.class, exec.getOutputFiles())) {
                String filename = FilenameUtils.getBaseName(f.getFileName());
                allMatchingFiles.add(filename);
                matchingFilesByQuery.get(normalizedQuery).add(filename);
            }
        }
        Path outFile = dbDir.resolve(FilenameUtils.getBaseName(queriesFile));
        try (OutputStream os = Files.newOutputStream(outFile)) {
            IOUtils.write(StringUtils.join(allMatchingFiles, "\n"), os);
        }
        log.warn("ALL MATCHES: {}", allMatchingFiles);
        log.warn("MATCHES BY QUERY: {}", matchingFilesByQuery);
    }

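    /**
     * Runs a TextAndMetaDataExtractor over the parent execution's input files
     * and metadata files so that the metadata is available before the main
     * execution is run.
     *
     * @param parentExec the execution whose input and metadata files are used
     */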
    private void extractMetaData(Execution parentExec) {
        Execution extractExec = new Execution();
        extractExec.setAlgorithm(TextAndMetaDataExtractor.class);
        extractExec.setMetaDataFiles(parentExec.getMetaDataFiles());
        extractExec.setInputFiles(parentExec.getInputFiles());
        dataStoreClient.post(Execution.class, extractExec);
        Algorithm algo = extractExec.instantiateAlgorithm(dataStoreClient, fileResolver);
        algo.run();
        log.debug("Extracted metadata for " + parentExec.getMetaDataFiles().size() + " files");
    }

    /**
     * Runs the execution.
     *
     * @param exec
     * @throws BadRequestException
     * @throws IOException
     */
    private void doExecute(Execution exec) throws BadRequestException, IOException {
        // Set the input files, convert if necessary
        try {
            setExecutionInputFiles(exec);
        } catch (IOException e) {
            throwCLI("Problem setting input files", e);
        }
        // Create index if necessary
        if (indexDir != null) {
            Files.createDirectories(indexDir);
            setExecutionIndexDir(exec);
        }
        try {
            dataStoreClient.post(Execution.class, exec);
            if (convertToTextMode) {
                log.debug("Conversion-only mode, nothing further to execute.");
            } else {
                if (metaDir != null) {
                    extractMetaData(exec);
                }
                if (searchCandidatesMode) {
                    executeCandidateSearch(exec);
                } else {
                    exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
                }
            }
        } catch (BadRequestException e) {
            throwCLI("Execution threw an exception", e);
        }
        dataStoreClient.dump(dbDir, tag);
    }

    /**
     * Indexes the input files in Lucene and sets the indexDirectory of the
     * execution.
     *
     * @param exec
     */
    private void setExecutionIndexDir(Execution exec) {
        Execution indexerExecution = new Execution();
        indexerExecution.setAlgorithm(Indexer.class);
        indexerExecution.setInputFiles(exec.getInputFiles());
        indexerExecution.setPhraseSlop(0);
        indexerExecution.setOutputDirectory(indexDir.toString());
        dataStoreClient.post(Execution.class, indexerExecution);
        indexerExecution.instantiateAlgorithm(dataStoreClient, fileResolver).run();
        exec.setIndexDirectory(indexerExecution.getOutputDirectory());
    }

    /**
     * Sets the input files for an execution and converts PDFs to text depending
     * on the command line arguments.
     *
     * @param exec
     * @throws IOException
     */
    private void setExecutionInputFiles(Execution exec) throws IOException {
        if (null == pdfDir || !Files.exists(pdfDir)) {
            if (shouldConvertToText) {
                throwCLI("Cannot convert to text: Empty/non-existing PDF directory " + pdfDir);
            }
            if (null == textDir || !Files.exists(textDir)) {
                throwCLI("Neither PDFDIR nor TEXTDIR exists");
            } else {
                try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(textDir)) {
                    if (!dirStream.iterator().hasNext()) {
                        dirStream.close();
                        throwCLI("No PDFDIR specified, TEXTDIR exists but is empty.");
                    }
                }
                exec.setInputFiles(postFiles(textDir, "text/plain"));
            }
        } else {
            if (null == textDir || !Files.exists(textDir)) {
                if (shouldConvertToText) {
                    Files.createDirectories(textDir);
                    if (null == exec.isTokenize()) {
                        log.warn("Warning: tokenize parameter not set. Defaulting to true for text extraction and all algorithms to be applied on extracted texts");
                        exec.setTokenize(true);
                    }
                    exec.setInputFiles(convertPDF(postFiles(pdfDir, "application/pdf"), exec.getStartPage(), exec.isRemoveBib(),
                            exec.getOverwriteTextfiles(), exec.isTokenize(), exec.getTokenizeNLs(), exec.getPtb3Escaping()));
                } else {
                    // complex algorithms can convert PDFs on demand, others can't and need the --convert-to-text option
                    if (ComplexAlgorithm.class.isAssignableFrom(exec.getAlgorithm())) {
                        exec.setInputFiles(postFiles(pdfDir, "application/pdf"));
                    } else if (TextExtractor.class.equals(exec.getAlgorithm())) {
                        exec.setInputFiles(postFiles(pdfDir, "application/pdf"));
                    } else {
                        throwCLI("PDFDIR specified, TEXTDIR unspecified/empty, but --convert-to-text not set");
                    }
                }
            } else {
                if (shouldConvertToText) {
                    //System.err.println("WARNING: Both --text-dir '" + textDir + "' and --pdf-dir '" + pdfDir
                    //        + "' were specified. Will possibly clobber text files in conversion!");
                    System.err.println("WARNING: Both --text-dir '" + textDir + "' and --pdf-dir '" + pdfDir
                            + "' were specified. Overwriting text files: " + exec.getOverwriteTextfiles());
                    System.err.println("<Ctrl-C> to stop, <Enter> to continue");
                    System.in.read();
                    if (null == exec.isTokenize()) {
                        log.warn("Warning: tokenize parameter not set. Defaulting to true for text extraction and all algorithms to be applied on extracted texts");
                        exec.setTokenize(true);
                    }
                    exec.setInputFiles(convertPDF(postFiles(pdfDir, "application/pdf"), exec.getStartPage(), exec.isRemoveBib(),
                            exec.getOverwriteTextfiles(), exec.isTokenize(), exec.getTokenizeNLs(), exec.getPtb3Escaping()));
                } else {
                    exec.setInputFiles(postFiles(textDir, "text/plain"));
                }
            }
            // extract the metadata and create the according entities
            if (metaDir != null) {
                List<String> files = new ArrayList<>();
                for (File f : metaDir.toFile().listFiles()) {
                    files.add(f.getAbsolutePath());
                }
                exec.setMetaDataFiles(files);
            }
        }
    }

    /**
     * Checks whether all fields given in the JSON file are fields that can
     * actually be set on an Execution.
     *
     * @param o
     */
    private void checkJsonFile(JsonObject o) {
        List<String> badFields = new ArrayList<>();
        Execution testExecution = new Execution();
        for (Entry<String, JsonValue> values : o.entrySet()) {
            if (values.getKey().equals("inputFiles")) {
                throwCLI("Do not specify inputFiles in JSON, it will be overridden [in " + json + "]");
            }
            try {
                testExecution.getClass().getDeclaredField(values.getKey());
            } catch (NoSuchFieldException ex) {
                try {
                    testExecution.getClass().getSuperclass().getDeclaredField(values.getKey());
                } catch (NoSuchFieldException ex2) {
                    badFields.add(values.getKey());
                }
            }
        }
        if (!badFields.isEmpty()) {
            throwCLI("Unknown fields: " + badFields);
        }
    }

    /**
     * Converts a list of InfolisFiles to text.
     *
     * @param uris URIs of the InfolisFiles
     * @return URIs of the InfolisFiles of the text versions
     */
    private List<String> convertPDF(List<String> uris, int startPage, boolean removeBib, boolean overwriteTextfiles,
            boolean tokenize, boolean tokenizeNLs, boolean ptb3Escaping) {
        Execution convertExec = new Execution();
        convertExec.setAlgorithm(TextExtractor.class);
        convertExec.setOutputDirectory(textDir.toString());
        convertExec.setInputFiles(uris);
        convertExec.setStartPage(startPage);
        convertExec.setRemoveBib(removeBib);
        convertExec.setOverwriteTextfiles(overwriteTextfiles);
        convertExec.setTokenize(tokenize);
        convertExec.setTokenizeNLs(tokenizeNLs);
        convertExec.setPtb3Escaping(ptb3Escaping);
        convertExec.setTags(new HashSet<>(Arrays.asList(tag)));
        Algorithm algo = convertExec.instantiateAlgorithm(dataStoreClient, fileResolver);
        algo.run();
        return convertExec.getOutputFiles();
    }

    public List<String> postFiles(Path dir, String mimetype) {
        List<InfolisFile> infolisFiles = new ArrayList<>();
        try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(dir)) {
            for (Path file : dirStream) {
                InfolisFile infolisFile = new InfolisFile();
                try (InputStream inputStream = Files.newInputStream(file)) {
                    byte[] bytes = IOUtils.toByteArray(inputStream);
                    infolisFile.setMd5(SerializationUtils.getHexMd5(bytes));
                } catch (IOException e) {
                    throwCLI("Could not read file " + file, e);
                }
                infolisFile.setFileName(file.toString());
                infolisFile.setMediaType(mimetype);
                infolisFile.setTags(new HashSet<>(Arrays.asList(tag)));
                infolisFile.setFileStatus("AVAILABLE");
                Entity entity = new Entity();
                entity.setEntityType(EntityType.publication);
                dataStoreClient.post(Entity.class, entity);
                infolisFile.setManifestsEntity(entity.getUri());
                infolisFiles.add(infolisFile);
            }
        } catch (IOException e) {
            throwCLI("Couldn't list directory contents of " + dir, e);
        }
        return dataStoreClient.post(InfolisFile.class, infolisFiles);
    }

    private static void throwCLI(String msg) {
        throwCLI(msg, null);
    }

    private static void throwCLI(String msg, Exception e) {
        if (null != msg) {
            System.err.println("**ERROR** " + msg);
        }
        if (null != e) {
            System.err.println(e.getMessage());
            e.printStackTrace(System.err);
        }
        if (System.getProperty("testing") == null) {
            System.exit(1);
        } else {
            log.error("ERROR: {}", e);
        }
        throw new RuntimeException(e);
    }

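    /**
     * Parses the command line, builds an Execution (optionally configured from
     * a JSON file), runs it and dumps the datastore contents to DBDIR.
     *
     * Illustrative invocation (paths and tag are placeholders; classpath setup
     * is omitted):
     * <pre>
     * java io.github.infolis.commandLine.CommandLineExecuter \
     *     --pdf-dir /data/pdf \
     *     --text-dir /data/text \
     *     --db-dir /data/db \
     *     --tag mytag \
     *     --convert-to-text
     * </pre>
     */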
    public void doMain(String[] args) throws FileNotFoundException, ClassNotFoundException, NoSuchFieldException,
            IllegalAccessException, IOException, CmdLineException {
        CmdLineParser parser = new CmdLineParser(this, ParserProperties.defaults().withUsageWidth(120));
        try {
            parser.parseArgument(args);
            if (null == json && !(shouldConvertToText || searchCandidatesMode)) {
                throwCLI("Must specify JSON if not --convert-to-text|--search-candidates");
            }
            if (null == json && shouldConvertToText && !searchCandidatesMode) {
                convertToTextMode = true;
            }
            // ch.qos.logback.classic.Logger root = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(Logger.ROOT_LOGGER_NAME);
            // root.setLevel(Level.toLevel(logLevel));
        } catch (CmdLineException e) {
            System.err.println("java " + getClass().getSimpleName() + " [options...]");
            parser.printUsage(System.err);
            throwCLI("", e);
            return;
        }
        Files.createDirectories(dbDir);
        Execution exec = new Execution();
        exec.setTags(new HashSet<String>(Arrays.asList(tag)));
        // if no JSON was provided, only convert files and exit
        if (null != json) {
            try (Reader reader = Files.newBufferedReader(json, Charset.forName("UTF-8"))) {
                JsonObject jsonObject = Json.createReader(reader).readObject();
                // Check the JSON
                checkJsonFile(jsonObject);
                // Set the other options from JSON
                setExecutionFromJSON(jsonObject, exec);
            } catch (IOException e) {
                throwCLI("Problem reading JSON " + json, e);
            }
        }
        doExecute(exec);
    }

    public static void main(String[] args) throws Exception {
        new CommandLineExecuter().doMain(args);
    }
}