/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap; import at.ac.tuwien.dbai.pdfwrap.analysis.PageProcessor; import at.ac.tuwien.dbai.pdfwrap.comparators.XComparator; import at.ac.tuwien.dbai.pdfwrap.model.document.GenericSegment; import at.ac.tuwien.dbai.pdfwrap.model.document.Page; import at.ac.tuwien.dbai.pdfwrap.model.graph.*; import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFObjectExtractor; import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils; import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils; import org.w3c.dom.*; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.*; /** * performs the graph matching to obtain wrapping instances * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class GraphMatcher { // pre: edgesToReturn is a blank EdgeList protected static List<DocNode> getNeighboursFrom(DocNode node, List<DocEdge> docEdges, List<DocEdge> edgesToReturn) { List<DocNode> retVal = new ArrayList<DocNode>(); for (DocEdge e : docEdges) { if (e.getFrom() == node) { retVal.add(e.getTo()); edgesToReturn.add(e); } } return retVal; } // pre: edgesToReturn is a blank EdgeList protected static List<DocNode> getNeighboursTo(DocNode node, List<DocEdge> docEdges, List<DocEdge> edgesToReturn) { List<DocNode> retVal = new ArrayList<DocNode>(); for (DocEdge e : docEdges) { if (e.getTo() == node) { retVal.add(e.getFrom()); edgesToReturn.add(e); } } return retVal; } protected static boolean corresponds(DocNode insN, DocNode docN, boolean[][] M, List<DocNode> instanceNodes, List<DocNode> documentNodes) { // find index of insN int insNIndex = -1; int currIndex = -1; Iterator insNodesIter = instanceNodes.iterator(); //faster than for loop while(insNodesIter.hasNext() && insNIndex == -1) { currIndex ++; Object 
nextObj = insNodesIter.next(); if (nextObj == insN) insNIndex = currIndex; } // find index of docN int docNIndex = -1; currIndex = -1; Iterator docNodesIter = documentNodes.iterator(); //faster than for loop while(docNodesIter.hasNext() && docNIndex == -1) { currIndex ++; Object nextObj = docNodesIter.next(); if (nextObj == docN) docNIndex = currIndex; } return M[insNIndex][docNIndex]; } protected static DocNode getCorrespondingNode(DocNode insN, boolean[][] M, List<DocNode> instanceNodes, List<DocNode> documentNodes) { // find index of insN int insNIndex = -1; int currIndex = -1; Iterator insNodesIter = instanceNodes.iterator(); //faster than for loop while(insNodesIter.hasNext() && insNIndex == -1) { currIndex ++; Object nextObj = insNodesIter.next(); if (nextObj == insN) insNIndex = currIndex; } // find x where M[insNIndex][x] == 1 int x = -1; for (int n = 0; n < M[insNIndex].length; n ++) { if (M[insNIndex][n]) { x = n; n = M[insNIndex].length; // break out of loop } } return documentNodes.get(x); } protected static List<DocNode> getCorrespondingNodes(DocNode insN, boolean[][] M, List<DocNode> instanceNodes, List<DocNode> documentNodes) { List<DocNode> retVal = new ArrayList<DocNode>(); // find index of insN int insNIndex = -1; int currIndex = -1; Iterator insNodesIter = instanceNodes.iterator(); //faster than for loop while(insNodesIter.hasNext() && insNIndex == -1) { currIndex ++; Object nextObj = insNodesIter.next(); if (nextObj == insN) insNIndex = currIndex; } // find x where M[insNIndex][x] == 1 //int x = -1; for (int n = 0; n < M[insNIndex].length; n ++) { if (M[insNIndex][n]) { retVal.add(documentNodes.get(n)); //x = n; //n = M[insNIndex].length; // break out of loop } } //return (GenericSegment)documentNodes.get(x); return retVal; } // we need the original nodeFrom and nodeTo, as they contain the matching details protected static boolean existsMatchNPath(DocNode insNodeFrom, DocNode insNodeTo, DocNode docNodeFrom, DocNode docNodeTo, DocEdge matchNEdge, 
List<DocEdge> documentEdges)
    {
        /*
         * Walks the document graph starting at docNodeFrom. At every step the
         * outgoing edges of the current node are tried in list order; the first
         * one that passes compareEdges() against matchNEdge and the node check
         * below is followed. The result depends on matchNEdge.getMultipleMatch():
         *  - MATCH_N_ANY:       true as soon as docNodeTo is reached;
         *  - MATCH_N_TIL_FIRST: true when docNodeTo is reached, false if one of
         *                       the extra checks made while walking fails first;
         *  - any other value (handled as MATCH_N_TIL_LAST): once docNodeTo is
         *    reached, false if a matching edge leads on from docNodeTo to a node
         *    matching insNodeTo, or leads into docNodeFrom from a node matching
         *    insNodeFrom; true otherwise.
         * false is returned when the walk runs out of edges to follow.
         *
         * NOTE(review): no visited-node bookkeeping is done, so a cycle of
         * matching edges that never reaches docNodeTo looks like it would keep
         * the outer loop running forever -- confirm that this cannot occur.
         */
        DocNode currentNode = docNodeFrom;
        // MATCH_N_TIL_LAST only: set as soon as a candidate neighbour no longer
        // matches insNodeFrom; never reset afterwards
        boolean switched = false;
        boolean loop = true;
        while(loop)
        {
            List<DocEdge> edgeList = new ArrayList<DocEdge>();
            // 28.02.09 SegmentList neighbours = getNeighboursFrom(currentNode, documentEdges, edgeList);
            // List<DocNode> neighbours = getNeighboursFromHash(currentNode, null, edgeList);
            List<DocNode> neighbours = getNeighboursFrom(currentNode, documentEdges, edgeList);
            boolean foundNeighbour = false;
            for (int n = 0; n < edgeList.size(); n ++)
            {
                DocEdge edge = edgeList.get(n);
                if (compareEdges(matchNEdge, edge))
                {
                    DocNode neighbour = neighbours.get(n);

                    // this line added on the night before cebit 2.03.09
                    // THIS IS IMPORTANT: it means that we have to match either node...
                    // this refers to the if statement (previously if(true))
                    boolean nodeOK = true;
                    if (matchNEdge.getMultipleMatch() == DocEdge.MATCH_N_TIL_LAST)
                    //nodeOK = compareNodes(insNodeFrom, neighbour) ||
                    //compareNodes(insNodeTo, neighbour);
                    // 24.08.09 above 2 lines (nodeOK...) commented out and replaced by this if statement
                    {
                        if (!compareNodes(insNodeFrom, neighbour)) switched = true;
                        // non-short-circuit '&': compareNodes is evaluated even while switched is false
                        if (switched & !compareNodes(insNodeTo, neighbour)) nodeOK = false;
                    }

                    if (nodeOK)
                    //if (compareNodes(insNodeFrom, neighbour) ||
                    //    compareNodes(insNodeTo, neighbour))
                    {
                        if (matchNEdge.getMultipleMatch() == DocEdge.MATCH_N_TIL_FIRST)
                        {
                            // an intermediate node must not itself match insNodeFrom
                            if (currentNode != docNodeFrom && compareNodes(insNodeFrom, currentNode))
                                return false;
                            // NOTE(review): this tests currentNode although the guard is about
                            // neighbour -- possibly 'neighbour' was intended; confirm before changing
                            if (neighbour != docNodeTo && compareNodes(insNodeTo, currentNode))
                                return false;
                        }

                        if (neighbour == docNodeTo)
                        {
                            // TODO: decide whether we want to implement matchNFirst and matchNLast here...
                            // instead of just "return true"...
                            // 21.02.09

                            // a path exists from nodeFrom to nodeTo
                            if (matchNEdge.getMultipleMatch() == DocEdge.MATCH_N_ANY)
                            {
                                return true;
                            }
                            else if (matchNEdge.getMultipleMatch() == DocEdge.MATCH_N_TIL_FIRST)
                            {
                                return true;
                                // check, if an intermediate node, that it doesn't match
                                // insNodeFrom or insNodeTo .. but we do that above, don't we?
                            }
                            else // if (matchNEdge.getMultipleMatch() == Edge.MATCH_N_TIL_LAST) or MATCH_N_TIL_FIRST
                            {
                                // start at nodeTo and look forwards
                                DocNode currentNode2 = docNodeTo;
                                boolean loop2 = true;
                                while(loop2)
                                {
                                    List<DocEdge> edgeList2 = new ArrayList<DocEdge>();
                                    // 28.02.09 SegmentList neighbours2 = getNeighboursFrom(currentNode2, documentEdges, edgeList2);
                                    // List<DocNode> neighbours2 = getNeighboursFromHash(currentNode2, null, edgeList2);
                                    List<DocNode> neighbours2 = getNeighboursFrom(currentNode2, documentEdges, edgeList2);
                                    boolean foundNeighbour2 = false;
                                    for (int n2 = 0; n2 < edgeList2.size(); n2 ++)
                                    {
                                        DocEdge edge2 = edgeList2.get(n2);
                                        // 3.02.09 if (compareEdges(matchNEdge, edge2))
                                        if (compareEdges(matchNEdge, edge2) &&
                                            compareNodes(insNodeTo, neighbours2.get(n2)))
                                        {
                                            DocNode neighbour2 = neighbours2.get(n2);
                                            // docNodeTo is followed by another node matching
                                            // insNodeTo, so it is not the last one
                                            if (compareNodes(insNodeTo, neighbour2)) return false;

                                            // 3.02.09 the following lines will never be run
                                            // break out of loop (we only take the first valid edge atm)
                                            n2 = edgeList2.size();
                                            currentNode2 = neighbour2;
                                            foundNeighbour2 = true;
                                        }
                                    }
                                    if (!foundNeighbour2)
                                    {
                                        // intentionally empty (only held debug output)
                                    }
                                    if (!foundNeighbour2) loop2 = false;
                                } // until no more (matching) edges

                                // start at nodeFrom and look backwards
                                currentNode2 = docNodeFrom;
                                loop2 = true;
                                while(loop2)
                                {
                                    List<DocEdge> edgeList2 = new ArrayList<DocEdge>();
                                    // 28.02.09 SegmentList neighbours2 = getNeighboursTo(currentNode2, documentEdges, edgeList2);
                                    // List<DocNode> neighbours2 = getNeighboursToHash(currentNode2, null, edgeList2);
                                    List<DocNode> neighbours2 = getNeighboursTo(currentNode2, documentEdges, edgeList2);
                                    boolean foundNeighbour2 = false;
                                    for (int n2 = 0; n2 < edgeList2.size(); n2 ++)
                                    {
                                        DocEdge edge2 = edgeList2.get(n2);
                                        // 3.02.09 if (compareEdges(matchNEdge, edge2))
                                        if (compareEdges(matchNEdge, edge2) &&
                                            compareNodes(insNodeFrom, neighbours2.get(n2)))
                                        {
                                            DocNode neighbour2 = neighbours2.get(n2);
                                            // docNodeFrom is preceded by another node matching
                                            // insNodeFrom, so the match is rejected
                                            if (compareNodes(insNodeFrom, neighbour2)) return false;

                                            // break out of loop (we only take the first valid edge atm)
                                            n2 = edgeList2.size();
                                            currentNode2 = neighbour2;
                                            foundNeighbour2 = true;
                                        }
                                    }
                                    if (!foundNeighbour2) loop2 = false;
                                } // until no more (matching) edges

                                return true; // if nothing found which breaks the conditions
                            }
                        }
                        // else look further

                        // break out of loop (we only take the first valid edge atm)
                        n = edgeList.size();
                        currentNode = neighbour;
                        foundNeighbour = true;
                    }
                }
            }
            if (!foundNeighbour) loop = false;
        }
        return false;
    }

    /**
     * Refinement step of the matching: repeatedly clears entries of the
     * candidate matrix {@code M} that cannot be part of a match, until a full
     * pass over the matrix changes nothing. {@code M} is modified in place.
     * <p>
     * An entry {@code M[a][b]} is kept only if every outgoing edge of instance
     * node {@code a} is supported in the document graph: for a
     * {@code MATCH_ONE} edge, document node {@code b} needs an outgoing edge
     * that passes {@link #compareEdges} and leads to a node still marked as a
     * candidate for the edge's target; for any other edge type a path accepted
     * by {@link #existsMatchNPath} to one of the target's candidates is needed.
     *
     * @param M             candidate matrix; rows follow {@code instanceNodes},
     *                      columns follow {@code documentNodes}
     * @param instanceNodes nodes of the instance (wrapper) graph
     * @param instanceEdges edges of the instance (wrapper) graph
     * @param documentNodes nodes of the document graph
     * @param documentEdges edges of the document graph
     * @return {@code false} as soon as a row is found to contain no set entry,
     *         {@code true} once a pass completes without changes
     */
    protected static boolean refineM(boolean[][] M, List<DocNode> instanceNodes,
        List<DocEdge> instanceEdges, List<DocNode> documentNodes,
        List<DocEdge> documentEdges)
    {
        boolean loop = true;
        while(loop)
        {
            boolean changeMade = false;
            for (int a = 0; a < M.length; a ++) // i
            {
                // stays true if row a holds no candidate at all
                boolean noOne = true;
                for (int b = 0; b < M[a].length; b ++) // j
                {
                    if (M[a][b]) // if vai corresponds to vbj in any iso. under M
                    {
                        noOne = false;
                        boolean forAllX = true;
                        DocNode insNode = instanceNodes.get(a);
                        DocNode docNode = documentNodes.get(b);

                        List<DocEdge> insNEdges = new ArrayList<DocEdge>();
                        List<DocEdge> docNEdges = new ArrayList<DocEdge>();

                        List<DocNode> insNeighbours = getNeighboursFrom
                            (insNode, instanceEdges, insNEdges);

                        // 28.02.09 SegmentList docNeighbours = getNeighboursFrom
                        //     (docNode, documentEdges, docNEdges);
                        // List<DocNode> docNeighbours = getNeighboursFromHash
                        //     (docNode, null, docNEdges);
                        List<DocNode> docNeighbours = getNeighboursFrom
                            (docNode, documentEdges, docNEdges);

                        // check all insNeighbours
                        for (int n = 0; n < insNeighbours.size(); n ++)
                        {
                            DocNode insN = insNeighbours.get(n);
                            DocEdge insE = insNEdges.get(n);

                            boolean existsY = false;

                            if (insE.getMultipleMatch() == DocEdge.MATCH_ONE)
                            {
                                // check that there is a resp. node as per Ullmann
                                // and that the edge matches

                                // go through all docNEdges
                                // if edge matches thisE
                                // AND if node corresponds to thisN
                                // then set existsY to true
                                for (int p = 0; p < docNeighbours.size(); p ++)
                                {
                                    DocNode docN = docNeighbours.get(p);
                                    DocEdge docE = docNEdges.get(p);

                                    if (compareEdges(insE, docE) &&
                                        corresponds(insN, docN, M, instanceNodes, documentNodes))
                                        existsY = true;
                                }
                            }
                            else // multiple match
                            {
                                // matchN, 0plus, etc.
                                // TBD
                                // here we
                                // docN and insN mean docNeighbour and insNeighbour...
// we need to start at the corresponding node of insN List<DocNode> correspondingNodes = getCorrespondingNodes(insN, M, instanceNodes, documentNodes); for (DocNode docN : correspondingNodes) { //System.out.println("corresponding node: " + docN); // now we look for a path from docNode to docN // in the document using only edges which "compare" // to insE // to be done here!! if (existsMatchNPath(insNode, insN, docNode, docN, insE, documentEdges)) { //System.out.println("exists matchN path"); existsY = true; } // ZEROPLUS: this necessitates a major change // in the Ullmann enumeration algorithm to allow // finished isomorphisms to not include all matched // source nodes... // if not zeroplus // check if // if zeroplus, allow the node to match to itself... } } if (!existsY) { //System.out.println("implication broken"); // implication broken; forAllX = false; } } if (!forAllX) { M[a][b] = false; //System.out.println("matrix changed to:"); //printBinaryMatrix(M); changeMade = true; } } } //if (noOne) System.out.println("noOne ... 
returning false"); if (noOne) return false; } if (!changeMade) loop = false; } //System.out.println("**********************************"); return true; } protected static void printBinaryMatrix(boolean[][] M) { for (int a = 0; a < M.length; a ++) { for (int b = 0; b < M[a].length; b ++) { if (M[a][b]) System.out.print(" 1"); else System.out.print(" 0"); //System.out.print(M[a][b]); } System.out.println(); } } protected static void mirrorBinaryMatrix(boolean[][] M) { //System.out.println("in mirror with width: " + M.length); //System.out.println("in mirror with height: " + M[0].length); for (int a = 0; a < M.length; a ++) { for (int b = 0; b < M[a].length; b ++) { if (M[a][b]) M[b][a] = true; } } } public static boolean[][] copyBinaryMatrix(boolean[][] M) { boolean[][] R = new boolean[M.length][M[0].length]; for (int a = 0; a < M.length; a ++) { for (int b = 0; b < M[a].length; b ++) { R[a][b] = M[a][b]; } } return R; } protected static boolean compareNodes(DocNode insNode, DocNode docNode) { if (!insNode.isTextSegment()) return false; // TextSegment ts = (TextSegment)insNode; if (!docNode.isTextSegment()) return false; // TextSegment docTs = (TextSegment)docNode; boolean typographyMatch = true; boolean contentMatch = true; boolean minLengthMatch = true; boolean maxLengthMatch = true; if (insNode.isMatchFont()) if (!insNode.getSegFontName().equals(docNode.getSegFontName())) typographyMatch = false; if (insNode.isMatchFontSize()) if (insNode.getSegFontSize() != docNode.getSegFontSize()) typographyMatch = false; if (insNode.isMatchBold()) if (insNode.isBold() != docNode.isBold()) typographyMatch = false; if (insNode.isMatchItalic()) if (insNode.isItalic() != docNode.isItalic()) typographyMatch = false; if (insNode.getMatchContent() == DocNode.MATCH_CONTENT_STRING) if (!docNode.getSegText().trim().equals (insNode.getMatchContentString().trim())) contentMatch = false; if (insNode.getMatchContent() == DocNode.MATCH_CONTENT_SUBSTRING) if (!docNode.getSegText().contains 
(insNode.getMatchContentString())) contentMatch = false; if (insNode.getMatchContent() == DocNode.MATCH_CONTENT_REGEXP) if (!docNode.getSegText().trim().matches (insNode.getMatchContentString())) contentMatch = false; if (insNode.getMatchMinLength() >= 0) if (docNode.getSegText().length() < insNode.getMatchMinLength()) minLengthMatch = false; if (insNode.getMatchMaxLength() >= 0) if (docNode.getSegText().length() > insNode.getMatchMaxLength()) maxLengthMatch = false; /* System.out.println(ts.getSegText() + " compare " + docTs.getSegText() + " nodeCompare returning " + (typographyMatch && contentMatch && minLengthMatch && maxLengthMatch)); System.out.println("ts.font " + ts.getFontName() + " dts.font " + docTs.getFontName() + " ts.fontsize " + ts.getSegFontSize() + " dts.fontsize " + docTs.getSegFontSize()); System.out.println("typ: " + typographyMatch + " con: " + contentMatch + " minL: " + minLengthMatch + " maxL: " + maxLengthMatch); */ return (typographyMatch && contentMatch && minLengthMatch && maxLengthMatch); } protected static boolean compareEdgesAndNodes(DocEdge insEdge, DocEdge docEdge) { if (compareEdges(insEdge, docEdge)) { return (compareNodes(insEdge.getFrom(), docEdge.getFrom()) && compareNodes(insEdge.getTo(), docEdge.getTo())); } else return false; } protected static boolean compareEdges(DocEdge insEdge, DocEdge docEdge) { /* System.out.println("in compareEdges with: " + insEdge + " and: " + docEdge); System.out.println("insEdge.getWeight: " + insEdge.getWeight()); System.out.println("docEdge.getWeight: " + docEdge.getWeight()); System.out.println("insEdge.getMatchMaxLength: " + insEdge.getMatchMaxLength()); */ //System.out.println("insEdge class: " + insEdge.getNodeFrom().getClass()); //System.out.println("docEdge class: " + docEdge.getNodeFrom().getClass()); boolean objects = true; //if (!insEdge.getNodeFrom().getClass().equals(docEdge.getNodeFrom().getClass())) if (insEdge.getFrom().isTextSegment() != docEdge.getFrom().isTextSegment()) objects = 
false; //if (!insEdge.getNodeTo().getClass().equals(docEdge.getNodeTo().getClass())) if (insEdge.getTo().isTextSegment() != docEdge.getTo().isTextSegment()) objects = false; boolean relation = false; if (insEdge.getRelation().equals( docEdge.getRelation())) relation = true; // TODO: for reverse relations (not yet implemented/required) boolean length = true; if (insEdge.getMatchLength() == DocEdge.LENGTH_BLOCK) if (docEdge.getLogicalLength() != DocEdge.LENGTH_BLOCK) length = false; if (insEdge.getMatchLength() == DocEdge.LENGTH_COLUMN) if (docEdge.getLogicalLength() != DocEdge.LENGTH_COLUMN) length = false; if (insEdge.getMatchLength() == DocEdge.LENGTH_GREATER) if (docEdge.getLogicalLength() != DocEdge.LENGTH_GREATER) length = false; if (insEdge.getMatchMinLength() != 0.0f && docEdge.getWeight() < insEdge.getMatchMinLength()) length = false; if (insEdge.getMatchMaxLength() != 0.0f && docEdge.getWeight() > insEdge.getMatchMaxLength()) length = false; /* System.out.println("insEdge.getMatchMaxLength() " + insEdge.getMatchMaxLength()); System.out.println("insEdge.getMatchMinLength() " + insEdge.getMatchMinLength()); System.out.println("insEdge.getLength() " + insEdge.getLength()); */ // 19.01.09 'match alignment' changed to 'require alignment' //boolean alignTopLeft = true, alignCentre = true, // alignBottomRight = true; boolean alignment = true; //if (insEdge.isMAlignTopLeft() && !docEdge.isAlignTopLeft()) if (insEdge.isMAlignTopLeft() && (docEdge.isAlignTopLeft() != insEdge.isAlignTopLeft())) alignment = false; //if (insEdge.isMAlignCentre() && !docEdge.isAlignCentre()) if (insEdge.isMAlignCentre() && (docEdge.isAlignCentre() != insEdge.isAlignCentre())) alignment = false; if (insEdge.isMAlignBottomRight() && (docEdge.isAlignBottomRight() != insEdge.isAlignBottomRight())) //if (insEdge.isMAlignBottomRight() && !docEdge.isAlignBottomRight()) alignment = false; boolean crossesRulingLine = true; if (insEdge.isMatchCrossesRulingLine() && (insEdge.isCrossesRulingLine() 
!= docEdge.isCrossesRulingLine())) crossesRulingLine = false; boolean readingOrder = true; if (insEdge.isMatchReadingOrder() && (insEdge.getReadingOrder() != docEdge.getReadingOrder())) readingOrder = false; boolean superiorInferior = true; if (insEdge.isMatchSuperiorInferior() && (insEdge.getSuperiorInferior() != docEdge.getSuperiorInferior())) superiorInferior = false; /* System.out.println("rel " + relation + " len " + length + " align " + alignment + " cRL " + crossesRulingLine + " rO " + readingOrder + " sI " + superiorInferior + "obj " + objects); */ /* System.out.println("edgeCompare returning " + (relation && length && alignment && crossesRulingLine && readingOrder && superiorInferior && objects)); */ return (relation && length && alignment && crossesRulingLine && readingOrder && superiorInferior && objects); } protected static boolean[][] generateStartMatrix (List<DocNode> instanceNodes, List<DocNode> documentNodes)//, EdgeList instanceEdges, //EdgeList documentEdges)//, SegmentList matchNNodes) { boolean[][] startMatrix = new boolean[instanceNodes.size()][documentNodes.size()]; int i1count = -1; for (DocNode insN : instanceNodes) { i1count ++; // quicker to use iterators than for-next loops // Object insObj = i1.next(); // ith point // we are counting edgesFrom and edgesTo as part of the 'degree' // calculation as the opposite relation conveys a meaning // EdgeList insEdges = dg.getEdgesFrom(insSeg); // insEdges.addAll(dg.getEdgesTo(insSeg)); int i2count = -1; for (DocNode docN : documentNodes) { i2count ++; // Object docObj = i2.next(); // jth point // if edge was here; we are only working with nodes though if (true) { // if not an edge, then must be some kind of segment //if (insObj.getClass() == docObj.getClass()) // 29.12.08 changed (don't know if for good) // due to CTSs etc. 
being saved as TextSegments only if (insN.isTextSegment() && docN.isTextSegment()) { // added 14.11.08 (!matchNNodes.contains(insObj) && condition) //if (!matchNNodes.contains(insObj) && if (insN.isTextSegment()) { // we can assume docTs is also a TS // as it must be the same class as insObj... startMatrix[i1count][i2count] = compareNodes(insN, docN); } else { startMatrix[i1count][i2count] = true; } } else startMatrix[i1count][i2count] = false; } } } return startMatrix; } // think not! // pre: DG hashmap already run (dg.indexEdges) public static boolean checkForConnectedness(DocumentGraph dg) { // check if there are any 'lone nodes' // (except if that's the only node in the graph' int enabledNodes = 0; for (DocNode n : dg.getNodes()) if (!n.isRemoveFromInstance()) enabledNodes ++; //System.out.println("enabledNodes size: " + enabledNodes); if (enabledNodes > 1) //if (dg.getVertList().size() > 1) { for (DocNode n : dg.getNodes()) { if (!n.isRemoveFromInstance()) { //System.out.println("checking for loneness: " + gs); //TODO: change to hash map lookup! 
//EdgeList edges = dg.getEdges(gs); boolean loneNode = true; for (DocEdge ae : dg.edgesFromTo(n)) { if (!ae.isRemoveFromInstance() && (ae.getFrom() == n || ae.getTo() == n)) { //System.out.println("londNode false with " + ae); loneNode = false; } } if (loneNode) { //System.out.println("found lone node: " + gs); //System.out.println("with edges: " + edges); return false; } } } } HashMap<DocNode, List<DocNode>> groupHash = new HashMap<DocNode, List<DocNode>>(); List<List<DocNode>> groups = new ArrayList<List<DocNode>>(); for (DocEdge ae : dg.getEdges()) { if (!ae.isRemoveFromInstance()) { DocNode nodeFrom = ae.getFrom(); DocNode nodeTo = ae.getTo(); if (groupHash.containsKey(nodeFrom) && groupHash.containsKey(nodeTo)) { List<DocNode> nodeFromGroup = groupHash.get(nodeFrom); List<DocNode> nodeToGroup = groupHash.get(nodeTo); if (nodeFromGroup == nodeToGroup) { // groups ok; do nothing } else // merge the group; put all nodeToGroup's nodes // into nodeFromGroup { for (DocNode n : nodeToGroup) { // should overwrite existing hash table entry! 
groupHash.put(n, nodeFromGroup); nodeFromGroup.add(n); } //TEST //if (nodeToGroup.size() > 0) // System.err.println("checkForConnectedness: group not empty!"); groups.remove(nodeToGroup); } } else if (groupHash.containsKey(nodeFrom)) { // put nodeTo in nodeFrom's group List<DocNode> nodeFromGroup = groupHash.get(nodeFrom); groupHash.put(nodeTo, nodeFromGroup); nodeFromGroup.add(nodeTo); } else if (groupHash.containsKey(nodeTo)) { // put nodeFrom in nodeTo's group List<DocNode> nodeToGroup = groupHash.get(nodeTo); groupHash.put(nodeFrom, nodeToGroup); nodeToGroup.add(nodeFrom); } else // neither node has a group { List<DocNode> newGroup = new ArrayList<DocNode>(); newGroup.add(nodeFrom); newGroup.add(nodeTo); groupHash.put(nodeFrom, newGroup); groupHash.put(nodeTo, newGroup); groups.add(newGroup); } } } /* for each edge check if nodeFrom has a group check if nodeTo has a group if neither has a group create new group and put both nodes in it if both have the same group put both nodes in it if both have different groups merge the groups update hashes add to the merged group if one has a group put the other into that group */ //System.out.println("Groups.size: " + groups.size()); // if we have one group return true else false // actually, if we have no groups, technically we also // have a connected graph? (i.e. 
lone node) return (groups.size() <= 1); }

    /**
     * Matches a wrapper graph against a document graph and wraps every
     * match in a {@link WrappingInstance}.
     *
     * @param dg the document graph to search in
     * @param wrapperGraph the wrapper (pattern) graph to search for
     * @param resultDocument passed through to {@link #performExtraction}
     * @param returnFieldNames if not null, receives one list of field names per match
     * @param returnExtractedData if not null, receives one list of extracted texts per match
     * @return one wrapping instance per match found
     */
    public static List<WrappingInstance> findInstances(DocumentGraph dg,
        DocumentGraph wrapperGraph, Document resultDocument,
        List<List<String>> returnFieldNames, List<List<String>> returnExtractedData)
    {
        List<List<DocNode>> matches = performExtraction(dg, wrapperGraph,
            resultDocument, returnFieldNames, returnExtractedData);
        return toWrappingInstances(matches);
    }

    /**
     * Converts each list of matched nodes into a {@link WrappingInstance}.
     *
     * @param result the matches, one node list per match
     * @return the wrapping instances, in the same order as {@code result}
     */
    public static List<WrappingInstance> toWrappingInstances
        (List<List<DocNode>> result)
    {
        List<WrappingInstance> retVal = new ArrayList<WrappingInstance>(result.size());
        for (List<DocNode> match : result)
        {
            // according to the original note, this constructor finds the bounding box too
            retVal.add(new WrappingInstance(match));
        }
        return retVal;
    }

    /**
     * Prepares the wrapper graph (clones its edges, drops elements flagged as
     * removed from the instance, joins chains of multiple-match edges) and runs
     * the graph matching against the document graph.
     *
     * The edges of the wrapper graph are cloned before they are modified; the
     * document graph is only read.
     *
     * @param dg the document graph to search in
     * @param wrapperGraph the wrapper (pattern) graph to search for
     * @param resultDocument currently unused by this method
     * @param returnFieldNames if not null, one list is appended per match holding
     *        the segment types of the matched wrapper nodes marked for extraction
     * @param returnExtractedData if not null, one list is appended per match holding
     *        the texts of the corresponding document nodes
     * @return one list of matched document nodes per match
     */
    public static List<List<DocNode>> performExtraction(DocumentGraph dg,
        DocumentGraph wrapperGraph, Document resultDocument,
        List<List<String>> returnFieldNames, List<List<String>> returnExtractedData)
    {
        // the edges are altered below (setFrom/setTo), so work on clones
        List<DocEdge> instanceEdges = new ArrayList<DocEdge>();
        for (DocEdge wrapperEdge : wrapperGraph.getEdges())
        {
            instanceEdges.add((DocEdge) wrapperEdge.clone());
        }
        List<DocNode> instanceNodes = new ArrayList<DocNode>(wrapperGraph.getNodes());

        List<DocNode> documentNodes = dg.getNodes();
        List<DocEdge> documentEdges = dg.getEdges();

        // remove from the lists anything that is flagged as removed from the match
        List<DocEdge> edgesToRemove = new ArrayList<DocEdge>();
        for (DocEdge edge : instanceEdges)
        {
            if (edge.isRemoveFromInstance())
            {
                edgesToRemove.add(edge);
            }
        }
        instanceEdges.removeAll(edgesToRemove);

        List<DocNode> nodesToRemove = new ArrayList<DocNode>();
        for (DocNode node : instanceNodes)
        {
            if (node.isRemoveFromInstance())
            {
                nodesToRemove.add(node);
            }
        }
        instanceNodes.removeAll(nodesToRemove);

        // now join any neighbouring multiple-match edges
        collapseMultiMatchChains(instanceEdges, instanceNodes);

        boolean[][] startMatrix = generateStartMatrix(instanceNodes, documentNodes);
        List<boolean[][]> graphMatchResult = ullmannAlgorithm(instanceNodes,
            instanceEdges, documentNodes, documentEdges, startMatrix);

        List<List<DocNode>> retVal = new ArrayList<List<DocNode>>();
        for (boolean[][] M : graphMatchResult)
        {
            List<DocNode> match = new ArrayList<DocNode>();
            List<String> fieldNames = new ArrayList<String>();
            List<String> extractedData = new ArrayList<String>();

            // M[a][b] set means: instance node a was matched to document node b
            for (int a = 0; a < M.length; a ++)
            {
                for (int b = 0; b < M[a].length; b ++)
                {
                    if (M[a][b])
                    {
                        DocNode docNode = documentNodes.get(b);
                        match.add(docNode);

                        if (docNode.isTextSegment())
                        {
                            DocNode insNode = instanceNodes.get(a);
                            if (insNode.isExtractContent())
                            {
                                fieldNames.add(insNode.getSegType());
                                extractedData.add(docNode.getSegText());
                            }
                        }
                    }
                }
            }

            retVal.add(match);

            // null may be passed for either of the two output lists
            if (returnFieldNames != null)
            {
                returnFieldNames.add(fieldNames);
            }
            if (returnExtractedData != null)
            {
                returnExtractedData.add(extractedData);
            }
        }

        return retVal;
    }

    /**
     * Joins chains of neighbouring multiple-match edges into a single edge.
     * Every edge whose multiple-match setting is not {@code DocEdge.MATCH_ONE}
     * is extended first in its own direction and then in the opposite
     * direction; the edges and nodes swallowed on the way are removed from the
     * two lists afterwards.
     *
     * @param instanceEdges the (cloned) wrapper edges; altered in place
     * @param instanceNodes the wrapper nodes; altered in place
     */
    private static void collapseMultiMatchChains(List<DocEdge> instanceEdges,
        List<DocNode> instanceNodes)
    {
        List<DocNode> nodesToRemove = new ArrayList<DocNode>();
        List<DocEdge> edgesToRemove = new ArrayList<DocEdge>();

        for (DocEdge edge : instanceEdges)
        {
            // skip edges that were already swallowed by an earlier chain
            if (edge.getMultipleMatch() != DocEdge.MATCH_ONE &&
                !edgesToRemove.contains(edge))
            {
                // first, look in direction of edge ...
                extendChain(edge, true, instanceEdges, nodesToRemove, edgesToRemove);
                // ... and now, look in the opposite direction
                extendChain(edge, false, instanceEdges, nodesToRemove, edgesToRemove);
            }
        }

        // removal is deferred so that instanceEdges is not modified while iterating
        instanceEdges.removeAll(edgesToRemove);
        instanceNodes.removeAll(nodesToRemove);
    }

    /**
     * Extends {@code rootEdge} along a chain of multiple-match edges in one
     * direction, moving its end point (forward) or start point (backward)
     * past every swallowed edge.
     *
     * @param rootEdge the edge being extended; its to/from node is updated
     * @param forward true to extend at the "to" end, false at the "from" end
     * @param instanceEdges all edges of the wrapper graph
     * @param nodesToRemove receives the intermediate nodes that were swallowed
     * @param edgesToRemove receives the edges that were swallowed
     */
    private static void extendChain(DocEdge rootEdge, boolean forward,
        List<DocEdge> instanceEdges, List<DocNode> nodesToRemove,
        List<DocEdge> edgesToRemove)
    {
        DocNode currentNode = forward ? rootEdge.getTo() : rootEdge.getFrom();
        DocEdge currentEdge = rootEdge;

        while (true)
        {
            DocEdge nextEdge = findChainContinuation(rootEdge, currentEdge,
                currentNode, instanceEdges, forward);

            // only continue across edges that are themselves multiple-match edges
            if (nextEdge == null || nextEdge.getMultipleMatch() == DocEdge.MATCH_ONE)
            {
                return;
            }

            nodesToRemove.add(currentNode);
            edgesToRemove.add(nextEdge);

            if (forward)
            {
                rootEdge.setTo(nextEdge.getTo());
                currentNode = nextEdge.getTo();
            }
            else
            {
                rootEdge.setFrom(nextEdge.getFrom());
                currentNode = nextEdge.getFrom();
            }
            currentEdge = nextEdge;
        }
    }

    /**
     * Looks for the one edge that continues a chain at {@code currentNode}.
     *
     * An edge continues the chain if it leaves (forward) or enters (backward)
     * {@code currentNode} with the same relation as {@code currentEdge}, or if
     * it touches {@code currentNode} with its other end and is the inverse of
     * {@code currentEdge}. Any other edge touching the node is a "sidewards"
     * edge and prevents the expansion, as does a second continuing edge.
     *
     * @return the continuing edge, or null if there is none or the chain must
     *         not be expanded at this node
     */
    private static DocEdge findChainContinuation(DocEdge rootEdge,
        DocEdge currentEdge, DocNode currentNode, List<DocEdge> instanceEdges,
        boolean forward)
    {
        boolean expand = true;
        DocEdge nextEdge = null;

        for (DocEdge candidate : instanceEdges)
        {
            if (candidate == currentEdge || candidate == rootEdge)
            {
                continue;
            }

            boolean continues;
            DocNode nearEnd = forward ? candidate.getFrom() : candidate.getTo();
            if (nearEnd == currentNode)
            {
                continues = candidate.getRelation().equals(currentEdge.getRelation());
            }
            else
            {
                DocNode farEnd = forward ? candidate.getTo() : candidate.getFrom();
                if (farEnd != currentNode)
                {
                    // edge does not touch currentNode at all
                    continue;
                }
                // such an edge will probably never exist in the graph
                continues = DocEdge.isInverse(candidate, currentEdge);
            }

            if (continues && nextEdge == null)
            {
                nextEdge = candidate;
            }
            else
            {
                // sidewards edge, or nextEdge already assigned: don't expand
                expand = false;
            }
        }

        return expand ? nextEdge : null;
    }

    /**
     * Enumerates the assignments of instance nodes to document nodes by
     * backtracking over the candidate matrix, calling {@code refineM} after
     * every tentative assignment. The step numbers in the comments are those
     * of the original implementation (presumably the steps of Ullmann's
     * subgraph isomorphism algorithm).
     *
     * @param instanceNodes the nodes of the wrapper (pattern) graph
     * @param instanceEdges the edges of the wrapper (pattern) graph
     * @param documentNodes the nodes of the document graph
     * @param documentEdges the edges of the document graph
     * @param M the start matrix; {@code M[a][b]} is true if instance node
     *        {@code a} may be assigned to document node {@code b}.
     *        The matrix is modified in place.
     * @return a copy of the matrix for every complete assignment found;
     *         empty if there are no instance nodes
     */
    public static List<boolean[][]> ullmannAlgorithm
        (List<DocNode> instanceNodes, List<DocEdge> instanceEdges,
        List<DocNode> documentNodes, List<DocEdge> documentEdges, boolean[][] M)
    {
        int noInsNodes = instanceNodes.size();
        int noDocNodes = documentNodes.size();

        List<boolean[][]> retVal = new ArrayList<boolean[][]>();

        // an empty pattern has no row 0 to start from (this used to fail
        // with an ArrayIndexOutOfBoundsException)
        if (noInsNodes == 0)
        {
            return retVal;
        }

        // Md[d] holds the matrix as it was when depth d was entered
        boolean[][][] Md = new boolean[noDocNodes][noInsNodes][noDocNodes];
        // F[j] is true while document node (column) j is assigned
        boolean[] F = new boolean[noDocNodes];
        // H[d] is the column that was chosen at depth d
        int[] H = new int[noInsNodes];
        H[0] = -1;

        int d = 0;   // current depth, i.e. the row (instance node) being assigned
        int k = -1;  // current column (document node)

        boolean loop = true;
        int nextStep = 2;
        int iteration = -1;

        if (!refineM(M, instanceNodes, instanceEdges, documentNodes, documentEdges))
        {
            System.out.println("terminating algorithm at first step!");
            loop = false;
        }

        while (loop)
        {
            iteration ++;
            if (iteration % 10000 == 0)
            {
                System.out.println("Iteration: " + iteration);
            }

            switch (nextStep)
            {
                case 2:
                    // is there a value of j such that M[d][j] is set and F[j] is not?
                    if (hasUnusedCandidate(M[d], F, 0, noDocNodes))
                    {
                        copyMatrixInto(M, Md[d], noInsNodes, noDocNodes);
                        k = (d == 0) ? H[0] : -1;
                        nextStep = 3;
                    }
                    else
                    {
                        nextStep = 7;
                    }
                    break;

                case 3:
                    // advance to the next column; stay in step 3 until a usable one is found
                    k ++;
                    if (M[d][k] && !F[k])
                    {
                        // tentatively assign row d to column k only
                        for (int j = 0; j < noDocNodes; j ++)
                        {
                            if (j != k)
                            {
                                M[d][j] = false;
                            }
                        }
                        boolean refined = refineM(M, instanceNodes, instanceEdges,
                            documentNodes, documentEdges);
                        nextStep = refined ? 4 : 5;
                    }
                    break;

                case 4:
                    if (d < (noInsNodes - 1))
                    {
                        nextStep = 6;
                    }
                    else
                    {
                        // every row is assigned: record the match
                        retVal.add(copyBinaryMatrix(M));
                        nextStep = 5;
                    }
                    break;

                case 5:
                    // restore the matrix of this depth and look for a further column
                    copyMatrixInto(Md[d], M, noInsNodes, noDocNodes);
                    nextStep = hasUnusedCandidate(M[d], F, k + 1, noDocNodes) ? 3 : 7;
                    break;

                case 6:
                    // descend one level
                    H[d] = k;
                    F[k] = true;
                    d ++;
                    nextStep = 2;
                    break;

                case 7:
                    // backtrack one level, or stop at the top level
                    if (d == 0)
                    {
                        loop = false;
                    }
                    else
                    {
                        d --;
                        copyMatrixInto(Md[d], M, noInsNodes, noDocNodes);
                        k = H[d];
                        F[k] = false;
                        nextStep = 5;
                    }
                    break;

                default:
                    throw new IllegalStateException("unknown step: " + nextStep);
            }
        }

        System.out.println("Number of iterations: " + iteration);
        return retVal;
    }

    /**
     * Returns true if there is a column j with {@code from <= j < limit}
     * for which {@code row[j]} is set and {@code used[j]} is not.
     */
    private static boolean hasUnusedCandidate(boolean[] row, boolean[] used,
        int from, int limit)
    {
        for (int j = from; j < limit; j ++)
        {
            if (row[j] && !used[j])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Copies the first {@code rows} x {@code cols} elements of {@code src}
     * into {@code dst}.
     */
    private static void copyMatrixInto(boolean[][] src, boolean[][] dst,
        int rows, int cols)
    {
        for (int a = 0; a < rows; a ++)
        {
            System.arraycopy(src[a], 0, dst[a], 0, cols);
        }
    }

    /**
     * Returns the boolean value of the given attribute of an element, or
     * {@code defaultValue} if the element does not have that attribute.
     */
    private static boolean readBooleanAttribute(Element element, String name,
        boolean defaultValue)
    {
        Node attribute = element.getAttributes().getNamedItem(name);
        if (attribute == null)
        {
            return defaultValue;
        }
        return Boolean.parseBoolean(attribute.getNodeValue());
    }

    /**
     * Creates a new wrapping instance from all of the given items that
     * intersect the bounding box of {@code instance}.
     */
    private static WrappingInstance swallowIntersectingItems(
        List<GenericSegment> pageItems, WrappingInstance instance)
    {
        List<GenericSegment> items =
            ListUtils.findElementsIntersectingBBox(pageItems, instance);
        WrappingInstance swallowed = new WrappingInstance();
        swallowed.setItems(items);
        swallowed.findBoundingBox();
        return swallowed;
    }

    /**
     * Applies a wrapper element, and recursively its nested
     * {@code pdf-wrapper} elements, to the graph of a page.
     *
     * The wrapper element may carry the boolean attributes {@code output}
     * (default true), {@code area-based} (default true) and
     * {@code whole-page} (default false).
     *
     * @param resultDocument the document used to create result elements
     * @param resultElement the element to which results are appended
     * @param pageDg the graph of the page (or of the part of it) to search in
     * @param wrapperElement the wrapper element to apply
     * @return the instances found for this wrapper (not for nested wrappers)
     */
    public static List<WrappingInstance> wrap(Document resultDocument,
        Element resultElement, DocumentGraph pageDg, Element wrapperElement)
    {
        boolean output = readBooleanAttribute(wrapperElement, "output", true);
        boolean areaBased = readBooleanAttribute(wrapperElement, "area-based", true);
        // this used to read the value of "area-based" by mistake
        boolean wholePage = readBooleanAttribute(wrapperElement, "whole-page", false);

        NodeList listOfItems = wrapperElement.getChildNodes();

        // this ignores any non-node or edge elements
        DocumentGraph wrapperDg = new DocumentGraph(listOfItems);

        // page items, necessary for swallowing later
        // NOTE(review): these are taken from the wrapper graph, not from
        // pageDg -- confirm that this is intended
        List<GenericSegment> pageItems = new ArrayList<GenericSegment>();
        for (DocNode dn : wrapperDg.getNodes())
        {
            pageItems.add(dn.toGenericSegment());
        }

        List<List<String>> returnFieldNames = new ArrayList<List<String>>();
        List<List<String>> returnExtractedData = new ArrayList<List<String>>();

        List<List<DocNode>> matchList = performExtraction(pageDg, wrapperDg,
            resultDocument, returnFieldNames, returnExtractedData);
        List<WrappingInstance> result = toWrappingInstances(matchList);

        if (wholePage)
        {
            List<List<WrappingInstance>> subResults =
                new ArrayList<List<WrappingInstance>>();

            // first, process subwrappers on whole page
            for (int s = 0; s < listOfItems.getLength(); s ++)
            {
                Node itemNode = listOfItems.item(s);
                if (itemNode.getNodeType() == Node.ELEMENT_NODE &&
                    itemNode.getNodeName().equals("pdf-wrapper"))
                {
                    List<List<String>> dummyRFN = new ArrayList<List<String>>();
                    List<List<String>> dummyRED = new ArrayList<List<String>>();

                    // this ignores any non-node or edge elements
                    DocumentGraph subWrapperDg =
                        new DocumentGraph(itemNode.getChildNodes());

                    List<WrappingInstance> subResult = findInstances(pageDg,
                        subWrapperDg, null, dummyRFN, dummyRED);

                    if (areaBased)
                    {
                        for (int i = 0; i < subResult.size(); i ++)
                        {
                            subResult.set(i,
                                swallowIntersectingItems(pageItems, subResult.get(i)));
                        }
                    }

                    subResults.add(subResult);
                }
            }

            // never incremented at present; the counting code is disabled
            int cellCount = 0;

            // for each found instance
            for (int n = 0; n < result.size(); n ++)
            {
                WrappingInstance thisResult = result.get(n);

                if (areaBased)
                {
                    thisResult = swallowIntersectingItems(pageItems, thisResult);
                }

                System.out.println("******************");
                System.out.println("Top-level result: ");

                Collections.sort(thisResult.getItems(), new XComparator());

                if (output)
                {
                    // intersect this instance with the instances of every subwrapper
                    // NOTE(review): the intersections are computed but not yet
                    // written to the result
                    for (List<WrappingInstance> subResult : subResults)
                    {
                        List<GenericSegment> intersections =
                            new ArrayList<GenericSegment>();
                        for (WrappingInstance subInstance : subResult)
                        {
                            intersections.addAll(ListUtils.intersection(
                                thisResult.getItems(), subInstance.getItems()));
                        }
                    }
                }
            }
            System.out.println("cells on page: " + cellCount);
        }
        else
        {
            // for each found instance
            for (int n = 0; n < matchList.size(); n ++)
            {
                // if output==true, add to result
                Element subResultElement = resultElement;
                if (output)
                {
                    subResultElement = resultDocument.createElement("wrapper-result");
                    resultElement.appendChild(subResultElement);

                    List<String> resultFieldNames = returnFieldNames.get(n);
                    List<String> resultExtractedData = returnExtractedData.get(n);

                    for (int p = 0; p < resultExtractedData.size(); p ++)
                    {
                        Element newFieldElement =
                            resultDocument.createElement(resultFieldNames.get(p));
                        subResultElement.appendChild(newFieldElement);
                        newFieldElement.appendChild(
                            resultDocument.createTextNode(resultExtractedData.get(p)));
                    }
                }

                // run for sub-graph
                WrappingInstance instance = result.get(n);
                List<DocNode> match = matchList.get(n);

                if (areaBased)
                {
                    // swallow on node level: take every node of the page
                    // that intersects the instance
                    match.clear();
                    for (DocNode dn : pageDg.getNodes())
                    {
                        GenericSegment testSeg = dn.toGenericSegment();
                        if (SegmentUtils.intersects(testSeg, instance))
                        {
                            match.add(dn);
                        }
                    }
                }

                DocumentGraph subPageDg = pageDg.subGraph(match);

                // call the nested wrappers on the sub-graph of this instance
                for (int s = 0; s < listOfItems.getLength(); s ++)
                {
                    Node itemNode = listOfItems.item(s);
                    if (itemNode.getNodeType() == Node.ELEMENT_NODE &&
                        itemNode.getNodeName().equals("pdf-wrapper"))
                    {
                        wrap(resultDocument, subResultElement, subPageDg,
                            (Element) itemNode);
                    }
                }
            }
        }

        return result;
    }

    /**
     * Command line entry point: processes a PDF file, applies the wrapper
     * to every processed page and writes the result as XML.
     *
     * @param args command line arguments: options followed by the PDF file,
     *        the wrapper file and, optionally, the output file
     *
     * @throws Exception if there is an error parsing or processing the documents
     */
    public static void main(String[] args) throws Exception
    {
        boolean toConsole = false;
        int processType = -1; // selected according to input wrapper
        // 1 = switched on, -1 = switched off, 0 = not given on the command line
        int processSpacesCommandLine = 0;
        int rulingLinesCommandLine = 0;
        String password = "";
        String encoding = ProcessFile.DEFAULT_ENCODING;
        // NOTE(review): this object is never used; kept in case its
        // construction is relied upon -- confirm and remove
        PDFObjectExtractor extractor = new PDFObjectExtractor();
        String inDocFile = null;
        String inWrapperFile = null;
        String outFile = null;
        boolean processSpaces = false;
        boolean rulingLines = true;
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;

        for (int i = 0; i < args.length; i++)
        {
            if (args[i].equals("-test"))
            {
                System.err.println("This function is not available in this version.");
                System.exit(0);
            }

            if (args[i].equals(ProcessFile.PASSWORD))
            {
                i++;
                if (i >= args.length)
                {
                    usage();
                }
                password = args[i];
            }
            else if (args[i].equals(ProcessFile.ENCODING))
            {
                i++;
                if (i >= args.length)
                {
                    usage();
                }
                encoding = args[i];
            }
            else if (args[i].equals(ProcessFile.START_PAGE))
            {
                i++;
                if (i >= args.length)
                {
                    usage();
                }
                startPage = Integer.parseInt(args[i]);
            }
            else if (args[i].equals(ProcessFile.END_PAGE))
            {
                i++;
                if (i >= args.length)
                {
                    usage();
                }
                endPage = Integer.parseInt(args[i]);
            }
            else if (args[i].equals(ProcessFile.CONSOLE))
            {
                toConsole = true;
            }
            else if (args[i].equals("-rulinglines"))
            {
                rulingLinesCommandLine = 1;
            }
            else if (args[i].equals("-norulinglines"))
            {
                rulingLinesCommandLine = -1;
            }
            else if (args[i].equals("-blocks"))
            {
                processType = PageProcessor.PP_BLOCK;
            }
            else if (args[i].equals("-mergedlines"))
            {
                processType = PageProcessor.PP_MERGED_LINES;
            }
            else if (args[i].equals("-lines"))
            {
                processType = PageProcessor.PP_LINE;
            }
            else if (args[i].equals("-spaces"))
            {
                processSpacesCommandLine = 1;
            }
            else if (args[i].equals("-nospaces"))
            {
                processSpacesCommandLine = -1;
            }
            else if (inDocFile == null)
            {
                inDocFile = args[i];
            }
            else if (inWrapperFile == null)
            {
                inWrapperFile = args[i];
            }
            else
            {
                outFile = args[i];
            }
        }

        // both input files are required
        if (inDocFile == null || inWrapperFile == null)
        {
            usage();
        }

        if (outFile == null)
        {
            if (inDocFile.length() > 4)
            {
                // replace the (assumed) three-letter extension
                outFile = inDocFile.substring(0, inDocFile.length() - 4) + ".txt";
            }
            else
            {
                outFile = inDocFile + ".txt";
            }
        }

        long docStart = System.currentTimeMillis();

        // load the input files
        byte[] inputDoc = ProcessFile.getBytesFromFile(new File(inDocFile));

        DocumentBuilder docBuilder =
            DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document wrapperDocument = docBuilder.parse(new File(inWrapperFile));

        // normalize text representation
        wrapperDocument.getDocumentElement().normalize();

        Element rootWrapper =
            (Element) wrapperDocument.getElementsByTagName("pdf-wrapper").item(0);
        if (rootWrapper == null)
        {
            throw new IllegalArgumentException(
                "Wrapper file contains no pdf-wrapper element: " + inWrapperFile);
        }

        // processing settings given in the wrapper
        Node granularityAttribute =
            rootWrapper.getAttributes().getNamedItem("granularity");
        String granularity = (granularityAttribute == null)
            ? null : granularityAttribute.getNodeValue();
        processSpaces = readBooleanAttribute(rootWrapper, "process-spaces", processSpaces);
        rulingLines = readBooleanAttribute(rootWrapper, "process-ruling-lines", rulingLines);

        // if overridden on the command line, don't alter the value here
        if (processType == -1 && granularity != null)
        {
            if (granularity.equals("raw-line"))
            {
                processType = PageProcessor.PP_LINE;
            }
            else if (granularity.equals("line"))
            {
                processType = PageProcessor.PP_MERGED_LINES;
            }
            else if (granularity.equals("block"))
            {
                processType = PageProcessor.PP_BLOCK;
            }
        }

        // command line overrides; 0 means "as given in the wrapper"
        if (processSpacesCommandLine == 1)
        {
            processSpaces = true;
        }
        else if (processSpacesCommandLine == -1)
        {
            processSpaces = false;
        }

        if (rulingLinesCommandLine == 1)
        {
            rulingLines = true;
        }
        else if (rulingLinesCommandLine == -1)
        {
            rulingLines = false;
        }

        // load and process the document
        List<AdjacencyGraph<GenericSegment>> theAdjGraphs =
            new ArrayList<AdjacencyGraph<GenericSegment>>();

        // set up page processor object
        PageProcessor pp = new PageProcessor();
        pp.setProcessType(processType);
        pp.setRulingLines(rulingLines);
        pp.setProcessSpaces(processSpaces);

        List<Page> theResult = ProcessFile.processPDF(inputDoc, pp,
            startPage, endPage, encoding, password, theAdjGraphs, false);

        // set up the result document (copied from ProcessFile.setUpXML)
        Document resultDocument;
        try
        {
            DocumentBuilderFactory myFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder myDocBuilder = myFactory.newDocumentBuilder();
            DOMImplementation myDOMImpl = myDocBuilder.getDOMImplementation();
            resultDocument = myDOMImpl.createDocument(
                "at.ac.tuwien.dbai.pdfwrap", "pdf-result", null);
        }
        catch (ParserConfigurationException e)
        {
            e.printStackTrace();
            // TODO: System.exit
            System.out.println("error");
            return;
        }

        Element resultElement = resultDocument.getDocumentElement();

        for (int p = 0; p < theResult.size(); p ++)
        {
            System.out.println("Page: " + (p + 1));
            long pageStart = System.currentTimeMillis();

            DocumentGraph pageDg = new DocumentGraph(theAdjGraphs.get(p));

            Element pageResultElement = resultDocument.createElement("page");
            pageResultElement.setAttribute("page-number", Integer.toString(p + 1));
            resultElement.appendChild(pageResultElement);

            wrap(resultDocument, pageResultElement, pageDg,
                wrapperDocument.getDocumentElement());

            System.out.println("processing time for page: " +
                (System.currentTimeMillis() - pageStart));
        }

        System.out.println("processing time for document: " +
            (System.currentTimeMillis() - docStart));

        // now output the XML document by serializing it to output
        FileOutputStream fileStream = null;
        Writer output = null;
        try
        {
            if (toConsole)
            {
                output = new OutputStreamWriter(System.out);
            }
            else
            {
                fileStream = new FileOutputStream(outFile);
                if (encoding != null)
                {
                    output = new OutputStreamWriter(fileStream, encoding);
                }
                else
                {
                    // use default encoding
                    output = new OutputStreamWriter(fileStream);
                }
            }
            ProcessFile.serializeXML(resultDocument, output);
        }
        finally
        {
            // close the output even if the serialization fails
            if (output != null)
            {
                output.close();
            }
            else if (fileStream != null)
            {
                fileStream.close();
            }
        }
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        // TODO: add noborders to this printout
        System.err.println( "Usage: graphwrap [OPTIONS] <PDF file> <Wrapper file> [Output file]\n" +
            " -password <password> Password to decrypt document\n" +
            " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
            " -spaces | -nospaces Override processing settings in wrapper\n" +
            " -blocks | -lines | -mergedlines\n" +
            " -rulinglines | -norulinglines\n" +
            " -console Send text to console instead of file\n" +
            " -startPage <number> The first page to start extraction (1 based)\n" +
            " -endPage <number> The last page to extract (inclusive)\n" +
            " <PDF file> The PDF document to use\n" +
            " <Wrapper file> The XML wrapper file to use\n" +
            " [Output File] The output XML file name\n"
            );
        System.exit( 1 );
    }
}