package technology.tabula;
import java.awt.geom.Point2D;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FilenameFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.DefaultParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.detectors.DetectionAlgorithm;
import technology.tabula.detectors.NurminenDetectionAlgorithm;
import technology.tabula.detectors.SpreadsheetDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.writers.CSVWriter;
import technology.tabula.writers.JSONWriter;
import technology.tabula.writers.TSVWriter;
import technology.tabula.writers.Writer;
public class CommandLineApp {
private static String VERSION = "1.0.0";
private static String VERSION_STRING = String.format("tabula %s (c) 2012-2017 Manuel AristarĂ¡n", VERSION);
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
private Appendable defaultOutput;
private Rectangle pageArea;
private List<Integer> pages;
private OutputFormat outputFormat;
private String password;
private TableExtractor tableExtractor;
public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
this.defaultOutput = defaultOutput;
this.pageArea = CommandLineApp.whichArea(line);
this.pages = CommandLineApp.whichPages(line);
this.outputFormat = CommandLineApp.whichOutputFormat(line);
this.tableExtractor = CommandLineApp.createExtractor(line);
if (line.hasOption('s')) {
this.password = line.getOptionValue('s');
}
}
public static void main(String[] args) {
CommandLineParser parser = new DefaultParser();
try {
// parse the command line arguments
CommandLine line = parser.parse(buildOptions(), args);
if (line.hasOption('h')) {
printHelp();
System.exit(0);
}
if (line.hasOption('v')) {
System.out.println(VERSION_STRING);
System.exit(0);
}
new CommandLineApp(System.out, line).extractTables(line);
} catch (ParseException exp) {
System.err.println("Error: " + exp.getMessage());
System.exit(1);
}
System.exit(0);
}
public void extractTables(CommandLine line) throws ParseException {
if (line.hasOption('b')) {
if (line.getArgs().length != 0) {
throw new ParseException("Filename specified with batch\nTry --help for help");
}
File pdfDirectory = new File(line.getOptionValue('b'));
if (!pdfDirectory.isDirectory()) {
throw new ParseException("Directory does not exist or is not a directory");
}
extractDirectoryTables(line, pdfDirectory);
return;
}
if (line.getArgs().length != 1) {
throw new ParseException("Need exactly one filename\nTry --help for help");
}
File pdfFile = new File(line.getArgs()[0]);
if (!pdfFile.exists()) {
throw new ParseException("File does not exist");
}
extractFileTables(line, pdfFile);
}
public void extractDirectoryTables(CommandLine line, File pdfDirectory) throws ParseException {
File[] pdfs = pdfDirectory.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".pdf");
}
});
for (File pdfFile : pdfs) {
File outputFile = new File(getOutputFilename(pdfFile));
extractFileInto(pdfFile, outputFile);
}
}
public void extractFileTables(CommandLine line, File pdfFile) throws ParseException {
Appendable outFile = this.defaultOutput;
if (!line.hasOption('o')) {
extractFile(pdfFile, this.defaultOutput);
return;
}
File outputFile = new File(line.getOptionValue('o'));
extractFileInto(pdfFile, outputFile);
}
public void extractFileInto(File pdfFile, File outputFile) throws ParseException {
BufferedWriter bufferedWriter = null;
try {
FileWriter fileWriter = new FileWriter(outputFile.getAbsoluteFile());
bufferedWriter = new BufferedWriter(fileWriter);
outputFile.createNewFile();
extractFile(pdfFile, bufferedWriter);
} catch (IOException e) {
throw new ParseException("Cannot create file " + outputFile);
} finally {
if (bufferedWriter != null) {
try {
bufferedWriter.close();
} catch (IOException e) {
System.out.println("Error in closing the BufferedWriter" + e);
}
}
}
}
private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
PDDocument pdfDocument = null;
try {
pdfDocument = PDDocument.load(pdfFile);
PageIterator pageIterator = getPageIterator(pdfDocument);
List<Table> tables = new ArrayList<Table>();
while (pageIterator.hasNext()) {
Page page = pageIterator.next();
if (pageArea != null) {
page = page.getArea(pageArea);
}
tables.addAll(tableExtractor.extractTables(page));
}
writeTables(tables, outFile);
} catch (IOException e) {
throw new ParseException(e.getMessage());
} finally {
try {
if (pdfDocument != null) {
pdfDocument.close();
}
} catch (IOException e) {
System.out.println("Error in closing pdf document" + e);
}
}
}
private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
PageIterator pageIterator = (pages == null) ?
extractor.extract() :
extractor.extract(pages);
return pageIterator;
}
// CommandLine parsing methods
private static OutputFormat whichOutputFormat(CommandLine line) throws ParseException {
if (!line.hasOption('f')) {
return OutputFormat.CSV;
}
try {
return OutputFormat.valueOf(line.getOptionValue('f'));
} catch (IllegalArgumentException e) {
throw new ParseException(String.format(
"format %s is illegal. Available formats: %s",
line.getOptionValue('f'),
Utils.join(",", OutputFormat.formatNames())));
}
}
private static Rectangle whichArea(CommandLine line) throws ParseException {
if (!line.hasOption('a')) {
return null;
}
List<Float> f = parseFloatList(line.getOptionValue('a'));
if (f.size() != 4) {
throw new ParseException("area parameters must be top,left,bottom,right");
}
return new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
}
private static List<Integer> whichPages(CommandLine line) throws ParseException {
String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1";
return Utils.parsePagesOption(pagesOption);
}
private static ExtractionMethod whichExtractionMethod(CommandLine line) {
// -r/--spreadsheet [deprecated; use -l] or -l/--lattice
if (line.hasOption('r') || line.hasOption('l')) {
return ExtractionMethod.SPREADSHEET;
}
// -n/--no-spreadsheet [deprecated; use -t] or -c/--columns or -g/--guess or -t/--stream
if (line.hasOption('n') || line.hasOption('c') || line.hasOption('g') || line.hasOption('t')) {
return ExtractionMethod.BASIC;
}
return ExtractionMethod.DECIDE;
}
private static TableExtractor createExtractor(CommandLine line) throws ParseException {
TableExtractor extractor = new TableExtractor();
extractor.setGuess(line.hasOption('g'));
extractor.setMethod(CommandLineApp.whichExtractionMethod(line));
extractor.setUseLineReturns(line.hasOption('u'));
if (line.hasOption('c')) {
extractor.setVerticalRulingPositions(parseFloatList(line.getOptionValue('c')));
}
return extractor;
}
// utilities, etc.
public static List<Float> parseFloatList(String option) throws ParseException {
String[] f = option.split(",");
List<Float> rv = new ArrayList<Float>();
try {
for (int i = 0; i < f.length; i++) {
rv.add(Float.parseFloat(f[i]));
}
return rv;
} catch (NumberFormatException e) {
throw new ParseException("Wrong number syntax");
}
}
private static void printHelp() {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("tabula", BANNER, buildOptions(), "", true);
}
@SuppressWarnings("static-access")
public static Options buildOptions() {
Options o = new Options();
o.addOption("v", "version", false, "Print version and exit.");
o.addOption("h", "help", false, "Print this help text.");
o.addOption("g", "guess", false, "Guess the portion of the page to analyze per page.");
o.addOption("d", "debug", false, "Print detected table areas instead of processing");
o.addOption("r", "spreadsheet", false, "[Deprecated in favor of -l/--lattice] Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
o.addOption("n", "no-spreadsheet", false, "[Deprecated in favor of -t/--stream] Force PDF not to be extracted using spreadsheet-style extraction (if there are no ruling lines separating each cell)");
o.addOption("l", "lattice", false, "Force PDF to be extracted using lattice-mode extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
o.addOption("t", "stream", false, "Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
o.addOption(Option.builder("b")
.longOpt("batch")
.desc("Convert all .pdfs in the provided directory.")
.hasArg()
.argName("DIRECTORY")
.build());
o.addOption(Option.builder("o")
.longOpt("outfile")
.desc("Write output to <file> instead of STDOUT. Default: -")
.hasArg()
.argName("OUTFILE")
.build());
o.addOption(Option.builder("f")
.longOpt("format")
.desc("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
.hasArg()
.argName("FORMAT")
.build());
o.addOption(Option.builder("s")
.longOpt("password")
.desc("Password to decrypt document. Default is empty")
.hasArg()
.argName("PASSWORD")
.build());
o.addOption(Option.builder("c")
.longOpt("columns")
.desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3")
.hasArg()
.argName("COLUMNS")
.build());
o.addOption(Option.builder("a")
.longOpt("area")
.desc("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
.hasArg()
.argName("AREA")
.build());
o.addOption(Option.builder("p")
.longOpt("pages")
.desc("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
.hasArg()
.argName("PAGES")
.build());
return o;
}
private static class TableExtractor {
private boolean guess = false;
private boolean useLineReturns = false;
private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
private List<Float> verticalRulingPositions = null;
private ExtractionMethod method = ExtractionMethod.BASIC;
public TableExtractor() {
}
public void setVerticalRulingPositions(List<Float> positions) {
this.verticalRulingPositions = positions;
}
public void setGuess(boolean guess) {
this.guess = guess;
}
public void setUseLineReturns(boolean useLineReturns) {
this.useLineReturns = useLineReturns;
}
public void setMethod(ExtractionMethod method) {
this.method = method;
}
public List<Table> extractTables(Page page) {
ExtractionMethod effectiveMethod = this.method;
if (effectiveMethod == ExtractionMethod.DECIDE) {
effectiveMethod = spreadsheetExtractor.isTabular(page) ?
ExtractionMethod.SPREADSHEET :
ExtractionMethod.BASIC;
}
switch (effectiveMethod) {
case BASIC:
return extractTablesBasic(page);
case SPREADSHEET:
return extractTablesSpreadsheet(page);
default:
return new ArrayList<Table>();
}
}
public List<Table> extractTablesBasic(Page page) {
if (guess) {
// guess the page areas to extract using a detection algorithm
// currently we only have a detector that uses spreadsheets to find table areas
DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
List<Rectangle> guesses = detector.detect(page);
List<Table> tables = new ArrayList<Table>();
for (Rectangle guessRect : guesses) {
Page guess = page.getArea(guessRect);
tables.addAll(basicExtractor.extract(guess));
}
return tables;
}
if (verticalRulingPositions != null) {
return basicExtractor.extract(page, verticalRulingPositions);
}
return basicExtractor.extract(page);
}
public List<Table> extractTablesSpreadsheet(Page page) {
// TODO add useLineReturns
return (List<Table>) spreadsheetExtractor.extract(page);
}
}
private void writeTables(List<Table> tables, Appendable out) throws IOException {
Writer writer = null;
switch (outputFormat) {
case CSV:
writer = new CSVWriter();
break;
case JSON:
writer = new JSONWriter();
break;
case TSV:
writer = new TSVWriter();
break;
}
writer.write(out, tables);
}
private String getOutputFilename(File pdfFile) {
String extension = ".csv";
switch (outputFormat) {
case CSV:
extension = ".csv";
break;
case JSON:
extension = ".json";
break;
case TSV:
extension = ".tsv";
break;
}
return pdfFile.getPath().replaceFirst("(\\.pdf|)$", extension);
}
private enum OutputFormat {
CSV,
TSV,
JSON;
static String[] formatNames() {
OutputFormat[] values = OutputFormat.values();
String[] rv = new String[values.length];
for (int i = 0; i < values.length; i++) {
rv[i] = values[i].name();
}
return rv;
}
}
private enum ExtractionMethod {
BASIC,
SPREADSHEET,
DECIDE
}
private class DebugOutput {
private boolean debugEnabled;
public DebugOutput(boolean debug) {
this.debugEnabled = debug;
}
public void debug(String msg) {
if (this.debugEnabled) {
System.err.println(msg);
}
}
}
}