/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap.utils;
import at.ac.tuwien.dbai.pdfwrap.model.document.TextSegment;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyEdge;
import javax.media.jai.Histogram;
import javax.media.jai.JAI;
import javax.media.jai.PlanarImage;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.awt.image.renderable.ParameterBlock;
import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
/**
* static utility methods used throughout the project
*
* @author Tamir Hassan, pdfanalyser@tamirhassan.com
* @version PDF Analyser 0.9
*/
public class Utils
{
// Global switch read by sameFontSize(): when true, two font sizes may differ by
// up to 50% of their average (instead of 10%) and still count as "the same".
// NOTE(review): the name suggests this is set for OCR-derived input -- confirm with callers.
public static boolean IS_OCR = false;
// Shared empty-string constant.
public final static String EMPTY_STRING = "";
// in GUI now
// public final static boolean DISPLAY_NAG_SCREEN = false;
// public final static boolean DISPLAY_INSTRUCTIONS = false;
// NOTE(review): not read anywhere in this file; presumably enables timing output elsewhere -- confirm.
public final static boolean DISPLAY_TIMINGS = true;
// NOTE(review): presumably units per inch of PDF user space (72) and of the XML
// output (300); neither is used in this file -- confirm against callers.
public final static float PDF_POINT_RESOLUTION = 72;
public final static float XML_RESOLUTION = 300;
// todo: replace with logical neighbourhood!
// NOTE(review): neighbourhood thresholds; not used in this file, semantics to be confirmed at the call sites.
public final static float neighbourLOSMin = 0.4f;
public final static float neighbourOverlapTolerance = 0.2f;
// float dividerRatio = 0.93f;
// in GUI now
// public final static float dividerRatio = 0.9f;
// public final static boolean horizToolbar = false;
// public final static boolean showWrapperPanel = false;
// public final static boolean standardLookAndFeel = false;
// NOTE(review): tolerance for deciding that two segments share a line; not used in this file -- confirm units with callers.
public final static float sameLineTolerance = 0.3f;
/**
 * Shared rendering hints (taken from DocGBPanel.java), populated once when the
 * class is initialised.
 *
 * Effective settings: fractional metrics off, text anti-aliasing on, shape
 * anti-aliasing on, dithering disabled, and "speed" for general rendering,
 * alpha interpolation and colour rendering.
 */
public static RenderingHints hints;
static {
    hints = new RenderingHints(null);
    // fractional metrics distorts text spacing; not for this application!
    hints.put(RenderingHints.KEY_FRACTIONALMETRICS, RenderingHints.VALUE_FRACTIONALMETRICS_OFF);
    // A preceding put of VALUE_TEXT_ANTIALIAS_LCD_HRGB under the same key was
    // always overwritten by this entry, so only this one is kept.
    hints.put(RenderingHints.KEY_TEXT_ANTIALIASING, RenderingHints.VALUE_TEXT_ANTIALIAS_ON);
    //hints.put(RenderingHints.KEY_TEXT_ANTIALIASING, RenderingHints.VALUE_TEXT_ANTIALIAS_OFF);
    hints.put(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_SPEED);
    hints.put(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
    //hints.put(RenderingHints.KEY_ANTIALIASING , RenderingHints.VALUE_ANTIALIAS_OFF);
    hints.put(RenderingHints.KEY_ALPHA_INTERPOLATION, RenderingHints.VALUE_ALPHA_INTERPOLATION_SPEED);
    hints.put(RenderingHints.KEY_COLOR_RENDERING, RenderingHints.VALUE_COLOR_RENDER_SPEED);
    hints.put(RenderingHints.KEY_DITHERING, RenderingHints.VALUE_DITHER_DISABLE);
}
// for robert demo
//public final static boolean standardLookAndFeel = true;
//public final static boolean horizToolbar = true;
//public final static boolean showWrapperPanel = true;
/**
 * Returns the program's root directory, taken to be the canonical path of the
 * current working directory (".").
 *
 * @return canonical path of "."
 * @throws IOException if the canonical path cannot be resolved
 */
public static String getRootDir() throws IOException
{
    final File workingDirectory = new File(".");
    final String canonicalRoot = workingDirectory.getCanonicalPath();
    return canonicalRoot;
}
/**
 * Evaluates the Gaussian (normal) density with the given mean and standard
 * deviation at x.
 *
 * @param x    point at which to evaluate
 * @param mean centre of the curve
 * @param sd   standard deviation; 0 leads to a division by zero (non-finite result)
 * @return density value, narrowed to float
 */
public static float gaussian(float x, float mean, float sd)
{
    // 1 / (sd * sqrt(2*pi)): evaluated in double precision, then narrowed
    float norm = (float) (1.0f / (sd * Math.sqrt(2 * Math.PI)));
    // -1 / (2 * sd^2): evaluated in float precision
    float expScale = -1 / (2 * (sd * sd));
    float offset = x - mean;
    return (float) (norm * Math.exp(offset * offset * expScale));
}
/**
 * Gaussian scaled by its own peak: the value of gaussian() at x divided by
 * the value of gaussian() at the mean.
 *
 * @param x    point at which to evaluate
 * @param mean centre of the curve
 * @param sd   standard deviation
 * @return gaussian(x, mean, sd) / gaussian(mean, mean, sd)
 */
public static float normgaussian(float x, float mean, float sd)
{
    float valueAtX = gaussian(x, mean, sd);
    float valueAtPeak = gaussian(mean, mean, sd);
    return valueAtX / valueAtPeak;
}
/**
 * Returns the midpoint (arithmetic mean) of two numbers.
 *
 * @param num1 first number
 * @param num2 second number
 * @return (num1 + num2) / 2
 */
public static float avg(float num1, float num2)
{
    float sum = num1 + num2;
    return sum / 2.0f;
}
// 23.08.10: The following methods from http://snippets.dzone.com/posts/show/2936
/**
 * Returns a copy of the image turned by a quarter turn ("-90deg" in the
 * original notes). The source pixel (x, y) is written to (y, width-1-x), so
 * the top-left source pixel ends up in the bottom-left corner of the result.
 *
 * The result is always TYPE_INT_RGB (changed 14.12.10), regardless of the
 * type of the source image.
 *
 * @param bi source image
 * @return new image of size height x width
 */
public static BufferedImage rotate90CW(BufferedImage bi) // -90deg
{
    final int srcWidth = bi.getWidth();
    final int srcHeight = bi.getHeight();
    final BufferedImage rotated =
        new BufferedImage(srcHeight, srcWidth, BufferedImage.TYPE_INT_RGB);
    for (int y = 0; y < srcHeight; y++)
    {
        for (int x = 0; x < srcWidth; x++)
        {
            // (height-1-y, width-1-x) here would be a flip, not a rotation
            rotated.setRGB(y, srcWidth - 1 - x, bi.getRGB(x, y));
        }
    }
    return rotated;
}
/**
 * Returns a copy of the image turned by a quarter turn in the opposite
 * direction to rotate90CW ("90deg" in the original notes). The source pixel
 * (x, y) is written to (height-1-y, x), so the top-left source pixel ends up
 * in the top-right corner of the result.
 *
 * The result is always TYPE_INT_RGB, regardless of the type of the source image.
 *
 * @param bi source image
 * @return new image of size height x width
 */
public static BufferedImage rotate90ACW(BufferedImage bi) // 90deg
{
    final int srcWidth = bi.getWidth();
    final int srcHeight = bi.getHeight();
    final BufferedImage rotated =
        new BufferedImage(srcHeight, srcWidth, BufferedImage.TYPE_INT_RGB);
    for (int y = 0; y < srcHeight; y++)
    {
        for (int x = 0; x < srcWidth; x++)
        {
            rotated.setRGB(srcHeight - 1 - y, x, bi.getRGB(x, y));
        }
    }
    return rotated;
}
/**
 * Finds the number with the highest frequency (occurrence).
 *
 * Values are grouped into bins: a value joins the first existing bin whose
 * representative lies strictly within +/- tolerance of it, otherwise it opens
 * a new bin and becomes that bin's representative. The representative of the
 * fullest bin is returned; on a tie the bin that was opened first wins.
 *
 * pre: only FLOATS in the list! (i.e. Float objects)
 *
 * @param floats    list of Float objects
 * @param tolerance half-width of the (open) interval used for grouping
 * @return representative (first value seen) of the most populated bin
 * @throws IndexOutOfBoundsException if the list is empty
 * @throws ClassCastException if an element is not a Float
 * @throws NullPointerException if an element is null
 */
public static float findMode(List floats, float tolerance)
{
    List<Float> binValues = new ArrayList<Float>();
    List<Integer> binCounts = tallyBins(floats, tolerance, binValues);
    int modalBin = 0;
    for (int n = 1; n < binCounts.size(); n++)
    {
        // strict comparison: the earliest bin wins a tie
        if (binCounts.get(n).intValue() > binCounts.get(modalBin).intValue())
        {
            modalBin = n;
        }
    }
    return binValues.get(modalBin).floatValue();
}

/**
 * Returns how many values fall into the most populated bin, using the same
 * grouping as findMode().
 *
 * @param floats    list of Float objects
 * @param tolerance half-width of the (open) interval used for grouping
 * @return size of the fullest bin, or -1 if the list is empty
 * @throws ClassCastException if an element is not a Float
 * @throws NullPointerException if an element is null
 */
public static int findModalFreq(List floats, float tolerance)
{
    List<Integer> binCounts = tallyBins(floats, tolerance, new ArrayList<Float>());
    int highestCount = -1;
    for (Integer count : binCounts)
    {
        highestCount = Math.max(highestCount, count.intValue());
    }
    return highestCount;
}

// Groups the values into tolerance bins: fills binValues with one representative
// per bin and returns the parallel list of bin sizes.
private static List<Integer> tallyBins(List floats, float tolerance, List<Float> binValues)
{
    List<Integer> binCounts = new ArrayList<Integer>();
    for (Object item : floats)
    {
        Float boxed = (Float) item;
        float value = boxed.floatValue();
        int bin = -1;
        for (int n = 0; n < binValues.size(); n++)
        {
            float representative = binValues.get(n).floatValue();
            // same open-interval test as within(value, representative, tolerance)
            if (representative > value - tolerance && representative < value + tolerance)
            {
                bin = n;
                break;
            }
        }
        if (bin == -1)
        {
            binValues.add(boxed);
            binCounts.add(Integer.valueOf(1));
        }
        else
        {
            binCounts.set(bin, Integer.valueOf(binCounts.get(bin).intValue() + 1));
        }
    }
    return binCounts;
}
// is used at all?
/**
 * Finds the most frequent font size among the given text segments, counting
 * sizes by their integer part (rounding down).
 *
 * Null entries and empty segments are ignored, as are font sizes outside the
 * range (0, 96]. If several sizes are equally frequent the lowest is returned;
 * if nothing was counted the result is 0.
 *
 * pre: all items in textBlocks must be TextSegment objects
 *
 * @param textBlocks collection of TextSegment objects
 * @return the modal font size, rounded down to an integer
 * @throws Exception if the collection contains an object that is not a TextSegment
 */
private int findModalFontSize(Collection textBlocks) throws Exception
{
    // TODO: create a specific exception here

    // counts font sizes 0..96pt inclusive, i.e. 97 slots; the previous array of
    // 96 slots overflowed for a font size of exactly 96
    int[] count = new int[97];

    for (Iterator textIter = textBlocks.iterator(); textIter.hasNext();)
    {
        Object item = textIter.next();
        if (item == null)
        {
            continue;
        }
        if (!(item instanceof TextSegment))
        {
            throw new Exception("Objects in the collection must be of type TextSegment.");
        }
        TextSegment thisBlock = (TextSegment) item;
        // empty text blocks must not interfere with processing
        if (thisBlock.isEmpty())
        {
            continue;
        }
        float fontSize = thisBlock.getFontSize();
        if (fontSize > 0 && fontSize <= 96)
        {
            count[(int) fontSize]++;
        }
    }

    // loop through and find the highest
    // if more than one mode, return the lowest
    int highest = 0;
    for (int n = 1; n < count.length; n++)
    {
        if (count[n] > count[highest])
        {
            highest = n;
        }
    }
    return highest;
}
/**
 * Returns the index of the greatest element of the list according to its
 * natural ordering (Comparable). If the greatest value occurs more than once,
 * the index of its first occurrence is returned.
 *
 * Null elements are skipped. Previously a leading null caused index 0 to be
 * reported for a maximum found later, and a null after the first non-null
 * element caused a NullPointerException.
 *
 * @param l list of mutually comparable objects
 * @return index of the greatest non-null element, or -1 (with a message on
 *         System.err) if the list has no non-null element
 * @throws ClassCastException if the elements are not mutually comparable
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public static int findIndexOfHighestValuedObject(List l)
{
    int bestIndex = -1;
    Object best = null;
    for (int n = 0; n < l.size(); n++)
    {
        Object candidate = l.get(n);
        if (candidate == null)
        {
            continue;
        }
        // strict comparison keeps the first occurrence of the maximum
        if (best == null || ((Comparable) candidate).compareTo(best) > 0)
        {
            best = candidate;
            bestIndex = n;
        }
    }
    if (bestIndex == -1) System.err.println("returning -1 with " + l.size() + " items");
    return bestIndex;
}
/**
 * Decides whether two segments have (approximately) the same font size.
 *
 * The sizes must lie within a tolerance of each other: 10% of the average of
 * the two sizes normally, 50% when IS_OCR is set.
 *
 * TODO: maybe make it 10% of the smallest of the two?
 *
 * @param seg1 first segment
 * @param seg2 second segment
 * @return true if the font sizes are within the tolerance of each other
 */
public static boolean sameFontSize(TextSegment seg1, TextSegment seg2)
{
    float averageSize = (seg1.getFontSize() + seg2.getFontSize()) / 2.0f;
    float toleranceFactor = IS_OCR ? 0.50f : 0.1f;
    return within(seg1.getFontSize(), seg2.getFontSize(), averageSize * toleranceFactor);
}
// TODO: MOVE to another (utility) method!
/**
 * Tests whether second lies strictly inside the open interval
 * (first - variance, first + variance).
 *
 * @param first    centre of the interval
 * @param second   value to test
 * @param variance half-width of the interval; 0 or a negative value never matches
 * @return true if second is strictly between the two bounds
 */
public static boolean within(float first, float second, float variance)
{
    float lowerBound = first - variance;
    float upperBound = first + variance;
    return lowerBound < second && second < upperBound;
}
/**
 * Tests whether number lies between the two boundaries (inclusive); the
 * boundaries may be given in either order.
 *
 * @param number    value to test
 * @param boundary1 one end of the range
 * @param boundary2 the other end of the range
 * @return true if number is inside the closed range spanned by the boundaries
 */
public static boolean between(float number, float boundary1, float boundary2)
{
    if (boundary1 <= boundary2)
    {
        return boundary1 <= number && number <= boundary2;
    }
    return boundary2 <= number && number <= boundary1;
}
/**
 * Returns the smaller of two numbers.
 *
 * Unlike Math.min, the result is simply second whenever first < second does
 * not hold (this includes the cases where either argument is NaN).
 *
 * @param first  first number
 * @param second second number
 * @return first if first < second, otherwise second
 */
public static float minimum(float first, float second)
{
    return (first < second) ? first : second;
}
/**
 * Returns the larger of two numbers.
 *
 * Unlike Math.max, the result is simply first whenever first < second does
 * not hold (this includes the cases where either argument is NaN).
 *
 * @param first  first number
 * @param second second number
 * @return second if first < second, otherwise first
 */
public static float maximum(float first, float second)
{
    return (first < second) ? second : first;
}
/**
 * Calculates a threshold as a multiple of the smaller of the two segments'
 * font sizes.
 *
 * @param seg1     first segment
 * @param seg2     second segment
 * @param multiple factor applied to the smaller font size
 * @return minimum(font size of seg1, font size of seg2) * multiple
 */
public static float calculateThreshold(TextSegment seg1, TextSegment seg2, float multiple)
{
    float smallerFontSize = minimum(seg1.getFontSize(), seg2.getFontSize());
    return smallerFontSize * multiple;
}
/**
 * Strips the package prefix from a qualified name, i.e. returns everything
 * after the last '.'.
 *
 * Replaces the former character-by-character concatenation (quadratic in the
 * length of the name) with a single substring; the results are the same.
 *
 * @param fullName qualified name, e.g. "java.util.List"
 * @return the part after the last '.'; the whole string if it contains no
 *         '.'; the empty string if it ends with '.'
 * @throws NullPointerException if fullName is null
 */
public static String stripClassName(String fullName)
{
    // lastIndexOf yields -1 when there is no dot, so the whole name is returned
    return fullName.substring(fullName.lastIndexOf('.') + 1);
}
/**
 * Replaces every backslash in the input with a forward slash.
 *
 * Uses String.replace instead of the former character-by-character
 * concatenation (quadratic in the length of the input); the results are the same.
 *
 * @param inputString text possibly containing backslashes, e.g. a Windows path
 * @return the text with each '\' replaced by '/'
 * @throws NullPointerException if inputString is null
 */
public static String replaceBackslashes(String inputString)
{
    return inputString.replace('\\', '/');
}
// adapted from http://www.javadb.com/check-if-string-contains-another-string
/**
 * Tests whether one string occurs inside another.
 *
 * @param test      string to search in
 * @param substring string to search for; the empty string is always found
 * @return true if substring occurs somewhere in test
 * @throws NullPointerException if either argument is null
 */
public static boolean containsSubstring(String test, String substring)
{
    return test.contains(substring);
}
// the following taken from: http://cse-mjmcl.cse.bris.ac.uk/blog/2007/02/14/1171465494443.html
/**
 * This method ensures that the output String has only valid XML unicode characters as specified by the
 * XML 1.0 standard (section 2.2, production "Char": #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD and
 * #x10000-#x10FFFF). This method will return an empty String if the input is null or empty.
 *
 * The input is processed code point by code point, so characters outside the Basic Multilingual Plane
 * (stored as a surrogate pair) are kept intact, while unpaired surrogates are dropped.
 *
 * @author Donoiu Cristian, GPL
 * @param s The String whose non-valid characters we want to remove.
 * @return The input String, stripped of non-valid characters.
 */
public static String removeInvalidXMLCharacters(String s) {
    // honour the documented contract: null used to cause a NullPointerException
    if (s == null || s.length() == 0) {
        return "";
    }
    StringBuilder out = new StringBuilder(s.length()); // Used to hold the output.
    int i = 0;
    while (i < s.length()) {
        int codePoint = s.codePointAt(i); // This is the unicode code of the character.
        if ((codePoint == 0x9) ||
            (codePoint == 0xA) ||
            (codePoint == 0xD) ||
            ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) ||
            ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) ||
            ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) {
            out.appendCodePoint(codePoint);
        }
        // Increment with the number of code units (java chars) needed to represent this code point.
        i += Character.charCount(codePoint);
    }
    return out.toString();
}
// http://java.itags.org/java-tech/36144/
// 12.11.10 DOES NOT WORK!
/**
 * Attempts to convert an image to a two-level (binary) image with JAI: the
 * bands are combined into one, a histogram of the result is built, and the
 * image is binarized at the histogram's 0.5 p-tile (median) threshold.
 *
 * According to the note above, this method was found not to work (12.11.10).
 *
 * @param sourceImg image to convert
 * @return the binarized image
 */
public static BufferedImage convertToBinary(BufferedImage sourceImg){
    // NOTE(review): presumably luminance weights for R, G, B plus a constant
    // term -- confirm against the JAI "BandCombine" documentation
    final double[][] bandWeights = {{ 0.3D, 0.59D, 0.11D, 0D }};

    ParameterBlock combineParams = new ParameterBlock();
    combineParams.addSource(sourceImg);
    combineParams.add(bandWeights);
    PlanarImage combined = JAI.create("BandCombine", combineParams, null);

    // Generate a histogram and take the threshold equal to the median.
    Histogram combinedHistogram =
        (Histogram) JAI.create("histogram", combined).getProperty("histogram");
    double medianThreshold = combinedHistogram.getPTileThreshold(0.5)[0];

    // Binarize the image.
    PlanarImage binarized = JAI.create("binarize", combined, Double.valueOf(medianThreshold));
    return binarized.getAsBufferedImage();
}
/**
 * Maps an AdjacencyEdge direction to its opposite: above <-> below and
 * left <-> right.
 *
 * @param direction one of the AdjacencyEdge.REL_* direction constants
 * @return the opposite direction, or -1 if the direction is none of the four
 */
public static int oppositeDirection(int direction)
{
    if (direction == AdjacencyEdge.REL_ABOVE)
    {
        return AdjacencyEdge.REL_BELOW;
    }
    if (direction == AdjacencyEdge.REL_BELOW)
    {
        return AdjacencyEdge.REL_ABOVE;
    }
    if (direction == AdjacencyEdge.REL_LEFT)
    {
        return AdjacencyEdge.REL_RIGHT;
    }
    if (direction == AdjacencyEdge.REL_RIGHT)
    {
        return AdjacencyEdge.REL_LEFT;
    }
    return -1;
}
/**
 * Runs an external command and waits for it to finish, draining its standard
 * output and standard error on two StreamGobbler threads.
 *
 * NOTE: the stdout and stderr parameters cannot return anything to the
 * caller. Java passes references by value, so the former assignments to them
 * (which were also crossed over) were never visible outside this method. The
 * parameters are kept only so that existing callers still compile; the
 * captured output is discarded.
 *
 * If the calling thread is interrupted while waiting for the process, the
 * streams are still drained and the thread's interrupt status is restored
 * before returning.
 *
 * @param s      command line to execute, as accepted by Runtime.exec(String)
 * @param stdout ignored (see note)
 * @param stderr ignored (see note)
 * @throws IOException if the process cannot be started
 */
public static void executeCommand(String s, String stdout, String stderr) throws IOException
{
    Process p = Runtime.getRuntime().exec(s);

    // this code from diffpdf...
    // any error message?
    StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream());
    // any output?
    StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream());
    // kick them off
    errorGobbler.start();
    outputGobbler.start();

    boolean interrupted = false;
    try {
        p.waitFor();
    } catch (InterruptedException ex) {
        // remember the interrupt; it is restored once the streams are drained
        interrupted = true;
    }

    // 15.03.07: getData() must be called on both gobblers -- it waits until the
    // stream is exhausted and then lets the gobbler thread terminate; without
    // these calls a thread was left running and kept the processor busy
    errorGobbler.getData();
    outputGobbler.getData();

    if (interrupted) {
        Thread.currentThread().interrupt();
    }
}
// 22.10.10 -- copied from above method, only with sa argument...
/**
 * Runs an external command given as an argument array, waits for it to
 * finish, and echoes what it wrote: the process's standard output is printed
 * to System.out and its standard error to System.err. (Previously the two
 * were crossed over.)
 *
 * NOTE: the stdout and stderr parameters cannot return anything to the
 * caller. Java passes references by value, so assignments to them are never
 * visible outside this method. The parameters are kept only so that existing
 * callers still compile.
 *
 * If the calling thread is interrupted while waiting for the process, the
 * streams are still drained and the thread's interrupt status is restored
 * before returning.
 *
 * @param sa     command and its arguments, as accepted by Runtime.exec(String[])
 * @param stdout ignored (see note)
 * @param stderr ignored (see note)
 * @throws IOException if the process cannot be started
 */
public static void executeCommand(String[] sa, String stdout, String stderr) throws IOException
{
    Process p = Runtime.getRuntime().exec(sa);

    // this code from diffpdf...
    // any error message?
    StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream());
    // any output?
    StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream());
    // kick them off
    errorGobbler.start();
    outputGobbler.start();

    boolean interrupted = false;
    try {
        p.waitFor();
    } catch (InterruptedException ex) {
        // remember the interrupt; it is restored once the streams are drained
        interrupted = true;
    }

    // 15.03.07: getData() must be called on both gobblers -- it waits until the
    // stream is exhausted and then lets the gobbler thread terminate
    String capturedError = errorGobbler.getData();
    String capturedOutput = outputGobbler.getData();
    System.out.println(capturedOutput);
    System.err.println(capturedError);

    if (interrupted) {
        Thread.currentThread().interrupt();
    }
}
}
/**
 * Thread that reads an input stream line by line until end-of-stream and
 * keeps the text, so that an external process cannot block on a full pipe.
 *
 * credit to http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html
 * for the original code
 *
 * Usage: construct with the stream, call start(), then call getData() to wait
 * for the end of the stream and obtain the text.
 *
 * Changes from the earlier version: the reader thread now ends by itself at
 * end-of-stream instead of spinning until getData() is called; getData()
 * waits with join() instead of busy-waiting on a flag that was not volatile;
 * an I/O error while reading no longer leaves getData() waiting forever; the
 * stream is closed on the error path as well.
 */
class StreamGobbler extends Thread
{
    InputStream is;
    // unused; kept for compatibility
    String type;
    // text read so far; published when the reader thread finishes
    volatile String data;
    // true once the reader thread has stopped reading (end-of-stream or I/O error)
    volatile boolean noMoreData = false;
    // true once getData() has returned; kept for compatibility
    volatile boolean finished = false;

    StreamGobbler(InputStream is) {
        this.is = is;
        this.data = "";
    }

    /**
     * Reads the stream to its end. Each line is stored preceded by a newline
     * character, so non-empty output always starts with "\n". On an I/O error
     * the stack trace is printed and the lines read up to that point are kept.
     */
    @Override
    public void run() {
        StringBuilder collected = new StringBuilder();
        // platform default charset, as before
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                collected.append('\n').append(line);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            data = collected.toString();
            noMoreData = true;
            try {
                br.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing fails; the data is already stored
            }
        }
    }

    /**
     * Waits until the reader thread has consumed the whole stream and returns
     * the collected text. If the thread was never started, the empty string
     * is returned immediately.
     *
     * pre: process that we are gobbling has now finished (otherwise this call
     * blocks until its stream is closed)
     *
     * @return the collected text, each line preceded by "\n"
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    public String getData() throws IOException {
        boolean interrupted = false;
        while (isAlive()) {
            try {
                join();
            } catch (InterruptedException ie) {
                // keep waiting for the remaining data; restore the flag afterwards
                interrupted = true;
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
        finished = true;
        return data;
    }
}