package technology.tabula;

import java.awt.Shape;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;

import org.apache.commons.cli.ParseException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
 * Static helper routines: tolerant numeric comparisons, small collection
 * utilities, page-range parsing, ruling snapping and page rendering.
 *
 * @author manuel
 */
public class Utils {

    /**
     * Tests whether {@code second} lies strictly inside the open interval
     * {@code (first - variance, first + variance)}.
     *
     * @param first    centre of the interval
     * @param second   value to test
     * @param variance half-width of the interval; the bounds are exclusive
     * @return {@code true} if {@code second} is within {@code variance} of {@code first}
     */
    public static boolean within(double first, double second, double variance) {
        return second < first + variance && second > first - variance;
    }

    /**
     * Tests whether two spans overlap. The spans overlap when {@code y1} and
     * {@code y2} are within {@code variance} of each other, when {@code y2}
     * lies in {@code [y1 - height1, y1]}, or when {@code y1} lies in
     * {@code [y2 - height2, y2]}.
     *
     * @param y1       reference coordinate of the first span
     * @param height1  extent of the first span
     * @param y2       reference coordinate of the second span
     * @param height2  extent of the second span
     * @param variance tolerance used when comparing {@code y1} and {@code y2} directly
     * @return {@code true} if any of the three conditions holds
     */
    public static boolean overlap(double y1, double height1, double y2, double height2, double variance) {
        return within(y1, y2, variance)
                || (y2 <= y1 && y2 >= y1 - height1)
                || (y1 <= y2 && y1 >= y2 - height2);
    }

    /**
     * Same as {@link #overlap(double, double, double, double, double)} with a
     * variance of {@code 0.1f}.
     */
    public static boolean overlap(double y1, double height1, double y2, double height2) {
        return overlap(y1, height1, y2, height2, 0.1f);
    }

    /** Tolerance used by {@link #feq(double, double)}. */
    private final static float EPSILON = 0.01f;

    /** Whether {@link #sort(List)} delegates to {@code QuickSort}; computed once at class load. */
    protected static boolean useQuickSort = useCustomQuickSort();

    /**
     * Approximate equality: the absolute difference must be smaller than
     * {@code EPSILON} (0.01).
     */
    public static boolean feq(double f1, double f2) {
        return (Math.abs(f1 - f2) < EPSILON);
    }

    /**
     * Rounds {@code d} half-up to {@code decimalPlace} decimal places.
     *
     * @param d            value to round; it is converted through
     *                     {@code Double.toString}, so NaN and infinities make
     *                     the {@code BigDecimal} constructor throw
     *                     {@code NumberFormatException}
     * @param decimalPlace scale passed to {@link BigDecimal#setScale(int, RoundingMode)}
     * @return the rounded value narrowed to {@code float}
     */
    public static float round(double d, int decimalPlace) {
        BigDecimal bd = new BigDecimal(Double.toString(d));
        // RoundingMode.HALF_UP replaces the deprecated BigDecimal.ROUND_HALF_UP constant.
        bd = bd.setScale(decimalPlace, RoundingMode.HALF_UP);
        return bd.floatValue();
    }

    /**
     * Computes the union of the 2D bounds of all given shapes.
     *
     * @param shapes non-empty collection of shapes
     * @return a new {@code Rectangle} covering every shape's {@code getBounds2D()}
     * @throws IllegalArgumentException if {@code shapes} is empty
     */
    public static Rectangle bounds(Collection<? extends Shape> shapes) {
        if (shapes.isEmpty()) {
            throw new IllegalArgumentException("shapes can't be empty");
        }
        Iterator<? extends Shape> iter = shapes.iterator();
        Rectangle rv = new Rectangle();
        rv.setRect(iter.next().getBounds2D());
        while (iter.hasNext()) {
            Rectangle2D.union(iter.next().getBounds2D(), rv, rv);
        }
        return rv;
    }

    /**
     * Range view: a list whose element at {@code index} is {@code begin + index}
     * and whose size is {@code end - begin} (so {@code end} is exclusive).
     * The list is computed on access and never stores its elements.
     */
    public static List<Integer> range(final int begin, final int end) {
        return new AbstractList<Integer>() {
            @Override
            public Integer get(int index) {
                return begin + index;
            }

            @Override
            public int size() {
                return end - begin;
            }
        };
    }

    /**
     * Returns {@code true} only if {@code cs} is non-null, non-empty and every
     * character satisfies {@link Character#isDigit(char)}.
     * (From apache.commons-lang.)
     */
    public static boolean isNumeric(final CharSequence cs) {
        if (cs == null || cs.length() == 0) {
            return false;
        }
        final int sz = cs.length();
        for (int i = 0; i < sz; i++) {
            if (!Character.isDigit(cs.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Concatenates the given strings, inserting {@code glue} between
     * consecutive elements.
     *
     * @param glue separator
     * @param s    strings to join
     * @return the joined string, or {@code null} when no strings are given
     */
    public static String join(String glue, String... s) {
        int k = s.length;
        if (k == 0) {
            return null;
        }
        StringBuilder out = new StringBuilder();
        out.append(s[0]);
        for (int x = 1; x < k; ++x) {
            out.append(glue).append(s[x]);
        }
        return out.toString();
    }

    /**
     * Transposes a table given as a list of rows into a list of columns.
     * The number of columns is taken from the first row; every row must have
     * at least that many elements.
     *
     * @param table rows of the table
     * @return a new list of columns; empty when {@code table} is empty
     */
    public static <T> List<List<T>> transpose(List<List<T>> table) {
        List<List<T>> ret = new ArrayList<List<T>>();
        if (table.isEmpty()) {
            // Nothing to transpose; previously table.get(0) threw here.
            return ret;
        }
        final int N = table.get(0).size();
        for (int i = 0; i < N; i++) {
            List<T> col = new ArrayList<T>();
            for (List<T> row : table) {
                col.add(row.get(i));
            }
            ret.add(col);
        }
        return ret;
    }

    /**
     * Wrap Collections.sort so we can fallback to a non-stable quicksort
     * if we're running on JDK7+
     */
    public static <T extends Comparable<? super T>> void sort(List<T> list) {
        if (useQuickSort) {
            QuickSort.sort(list);
        } else {
            Collections.sort(list);
        }
    }

    /**
     * Decides whether {@link #sort(List)} should use the custom quicksort.
     * Returns {@code true} unless the runtime reports Java 1.6 or older, and
     * also {@code true} whenever the system property
     * {@code java.util.Arrays.useLegacyMergeSort} equals {@code "true"}.
     */
    private static boolean useCustomQuickSort() {
        // taken from PDFBOX:
        // check if we need to use the custom quicksort algorithm as a
        // workaround to the transitivity issue of TextPositionComparator:
        // https://issues.apache.org/jira/browse/PDFBOX-1512
        boolean is16orLess = false;
        String version = System.getProperty("java.version");
        if (version != null) {
            // some Java version strings are 9-internal, which is dumb.
            String[] head = version.split("-");
            String numberybits = head.length > 0 ? head[0] : "";
            String[] versionComponents = numberybits.split("\\.");
            try {
                int javaMajorVersion;
                int javaMinorVersion;
                if (versionComponents.length >= 2) {
                    javaMajorVersion = Integer.parseInt(versionComponents[0]);
                    javaMinorVersion = Integer.parseInt(versionComponents[1]);
                } else {
                    javaMajorVersion = 1;
                    javaMinorVersion = Integer.parseInt(versionComponents[0]);
                }
                is16orLess = javaMajorVersion == 1 && javaMinorVersion <= 6;
            } catch (NumberFormatException ignored) {
                // Unparsable version string: assume a runtime newer than 1.6
                // rather than failing class initialization.
            }
        }
        // NOTE(review): the legacy-merge-sort property also enables quicksort
        // here; direction kept exactly as in the original expression.
        String useLegacySort = System.getProperty("java.util.Arrays.useLegacyMergeSort");
        return !is16orLess || (useLegacySort != null && useLegacySort.equals("true"));
    }

    /**
     * Parses a page specification such as {@code "1,3-5"} into a sorted list
     * of page numbers.
     *
     * @param pagesSpec {@code "all"}, or comma-separated entries that are
     *                  either a number or an inclusive {@code from-to} range
     * @return {@code null} when {@code pagesSpec} is {@code "all"}; otherwise
     *         the listed pages in ascending order (duplicates are kept)
     * @throws ParseException if an entry is not numeric or a range's start is
     *                        greater than its end
     */
    public static List<Integer> parsePagesOption(String pagesSpec) throws ParseException {
        if (pagesSpec.equals("all")) {
            return null;
        }

        List<Integer> rv = new ArrayList<Integer>();

        String[] ranges = pagesSpec.split(",");
        for (int i = 0; i < ranges.length; i++) {
            String[] r = ranges[i].split("-");
            if (r.length == 0 || !Utils.isNumeric(r[0]) || r.length > 1 && !Utils.isNumeric(r[1])) {
                throw new ParseException("Syntax error in page range specification");
            }

            if (r.length < 2) {
                rv.add(Integer.parseInt(r[0]));
            } else {
                int first = Integer.parseInt(r[0]);
                int last = Integer.parseInt(r[1]);
                if (first > last) {
                    throw new ParseException("Syntax error in page range specification");
                }
                // range() excludes its end, hence last + 1 for an inclusive range.
                rv.addAll(Utils.range(first, last + 1));
            }
        }

        Collections.sort(rv);
        return rv;
    }

    /**
     * Snaps the endpoints of the given lines together, in place. Endpoints are
     * first grouped by X: after sorting, a point joins the current group while
     * its X is less than {@code xThreshold} away from the group's first point,
     * and every point in a group gets the group's mean X. The same is then
     * done for Y with {@code yThreshold}. Finally each line is updated with
     * its two snapped endpoints.
     *
     * @param rulings    lines to modify; an empty list is a no-op
     * @param xThreshold maximum X distance (exclusive) from a group's first point
     * @param yThreshold maximum Y distance (exclusive) from a group's first point
     */
    public static void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
        if (rulings.isEmpty()) {
            return;
        }

        // Keep each line next to its own endpoint pair (index-aligned lists).
        // A hash map keyed by the line would merge lines that compare equal,
        // leaving one of them without its snapped endpoints.
        List<Line2D.Float> lines = new ArrayList<Line2D.Float>(rulings.size());
        List<Point2D[]> endpoints = new ArrayList<Point2D[]>(rulings.size());
        List<Point2D> points = new ArrayList<Point2D>(2 * rulings.size());
        for (Line2D.Float r : rulings) {
            Point2D p1 = r.getP1();
            Point2D p2 = r.getP2();
            lines.add(r);
            endpoints.add(new Point2D[]{p1, p2});
            points.add(p1);
            points.add(p2);
        }

        // The Y pass deliberately runs on the points already moved by the X pass.
        snapAlongAxis(points, true, xThreshold);
        snapAlongAxis(points, false, yThreshold);

        // finally, modify lines
        for (int i = 0; i < lines.size(); i++) {
            Point2D[] p = endpoints.get(i);
            lines.get(i).setLine(p[0], p[1]);
        }
    }

    /**
     * Sorts {@code points} by one coordinate, groups neighbours whose
     * coordinate is closer than {@code threshold} to the first point of the
     * current group, and moves every point of a group to the group's mean.
     *
     * @param points    non-empty list of points; reordered and mutated
     * @param alongX    {@code true} to snap X coordinates, {@code false} for Y
     * @param threshold grouping distance (exclusive)
     */
    private static void snapAlongAxis(List<Point2D> points, final boolean alongX, float threshold) {
        Collections.sort(points, new Comparator<Point2D>() {
            @Override
            public int compare(Point2D arg0, Point2D arg1) {
                return java.lang.Double.compare(coordinate(arg0, alongX), coordinate(arg1, alongX));
            }
        });

        List<List<Point2D>> groupedPoints = new ArrayList<List<Point2D>>();
        List<Point2D> current = new ArrayList<Point2D>();
        current.add(points.get(0));
        groupedPoints.add(current);

        // subList's end index is exclusive: points.size() includes the last
        // point, which the former "size() - 1" bound silently skipped.
        for (Point2D p : points.subList(1, points.size())) {
            if (Math.abs(coordinate(p, alongX) - coordinate(current.get(0), alongX)) < threshold) {
                current.add(p);
            } else {
                current = new ArrayList<Point2D>();
                current.add(p);
                groupedPoints.add(current);
            }
        }

        for (List<Point2D> group : groupedPoints) {
            // Accumulated in float, as before, so results stay numerically unchanged.
            float avgLoc = 0;
            for (Point2D p : group) {
                avgLoc += coordinate(p, alongX);
            }
            avgLoc /= group.size();
            for (Point2D p : group) {
                if (alongX) {
                    p.setLocation(avgLoc, p.getY());
                } else {
                    p.setLocation(p.getX(), avgLoc);
                }
            }
        }
    }

    /** Returns the X coordinate of {@code p} when {@code alongX} is true, otherwise its Y. */
    private static double coordinate(Point2D p, boolean alongX) {
        return alongX ? p.getX() : p.getY();
    }

    /**
     * Renders a single page to an image at the given resolution.
     *
     * @param page      page to render
     * @param dpi       resolution passed to the renderer
     * @param imageType image type passed to the renderer
     * @return the rendered image
     * @throws IOException if rendering fails
     */
    public static BufferedImage pageConvertToImage(PDPage page, int dpi, ImageType imageType) throws IOException {
        // Yeah, this sucks. But PDFBox 2 wants PDFRenderers to have
        // a reference to a PDDocument (unnecessarily, IMHO)
        PDDocument document = new PDDocument();
        document.addPage(page);
        PDFRenderer renderer = new PDFRenderer(document);
        // NOTE(review): the temporary document is closed before rendering;
        // order kept as-is -- confirm against PDFBox before moving the close.
        document.close();
        return renderer.renderImageWithDPI(0, dpi, imageType);
    }
}