PageProcessor.java example

Explorer

pdfxtk-master
- graphwrap
  - src
    - main
      - java
        at
        ac
        tuwien
        dbai
        pdfwrap
        GraphMatcher.java
        gui
        GUI.java
        displayable
        Block2.java
        Image2.java
        TextArea.java
        TextArea2.java
        graphbrowser
        DocGBPanel.java
        DocNavigateUI.java
        DocWrapperUI.java
        model
        graph
        DocEdge.java
        DocGraphEltSet.java
        DocNode.java
        DocumentGraph.java
        WrappingInstance.java
        package-info.java
        com
        touchgraph
        graphlayout
        Edge.java
        Node.java
        TGPanel.java
        interaction
        DragAddUI.java
        DragMultiselectUI.java
        DragNodeUI.java
        GLEditUI.java
        GLNavigateUI.java
        HVRotateDragUI.java
        HVScroll.java
        HyperScroll.java
        LocalityScroll.java
        RotateScroll.java
        TGAbstractClickUI.java
        TGAbstractDragUI.java
        TGAbstractMouseMotionUI.java
        TGAbstractMousePausedUI.java
        TGSelfDeactivatingUI.java
        TGUIManager.java
        TGUserInterface.java
        ZoomScroll.java
        iiuf
        awt
        Applet.java
        AppletFrame.java
        Awt.java
        BorderLayout.java
        DOM.java
        DateChooser.java
        Dialog.java
        Find.java
        FindListener.java
        Frame.java
        HLine.java
        HNav.java
        InfoList.java
        LoginRequester.java
        MultiLineLabel.java
        ProgressBar.java
        Requester.java
        SplashScreen.java
        StringRequester.java
        TableView.java
        VNav.java
        WaitBar.java
        dom
        Comparison.java
        DOM.java
        DOMContext.java
        DOMElementFactory.java
        DOMHandler.java
        DOMManager.java
        DOMUtils.java
        DOMUtilsNS.java
        DOMable.java
        DefaultElement.java
        ElementList.java
        ProjectX.java
        Xerces.java
        test
        DOM.java
        jai
        BinarizeDescriptor.java
        BinarizeOpImage.java
        BlackOrDescriptor.java
        BlackOrOpImage.java
        CCDescriptor.java
        CCOpImage.java
        DirectRasterAccessor.java
        DisplayImage.java
        DisplayImageLayer.java
        DisplayImagePanel.java
        DisplayRect.java
        FolderImageStorage.java
        HilightImagePanel.java
        HistogramPanel.java
        ImageStorage.java
        ImageStorageException.java
        ImageViewer.java
        JAITest.java
        LUV_ColorSpace.java
        PowerDescriptor.java
        PowerOpImage.java
        ProjectionProfile.java
        ProjectionProfileDescriptor.java
        ProjectionProfileOpImage.java
        RLSADescriptor.java
        RLSAOpImage.java
        RandomizeDescriptor.java
        RandomizeOpImage.java
        RectDisplayImageLayer.java
        SkeletonDescriptor.java
        SkeletonOpImage.java
        Util.java
        test
        ConnectedComponents.java
        ImageStorageTest.java
        ProjectionProfiles.java
        Scale.java
        log
        Client.java
        Const.java
        List.java
        Log.java
        LogListener.java
        LogMessage.java
        LogMessageListener.java
        LogTableModel.java
        Overview.java
        Server.java
        Stdout.java
        swing
        AbstractPreview.java
        AudioPreview.java
        AutoExpandingJTree.java
        ButtonTreePathView.java
        ChartPanel.java
        CheckBoxList.java
        ChooserTreeView.java
        ContextMenu.java
        ContextMenuEnabled.java
        ContextMenuManager.java
        DOM.java
        HexagonalBorder.java
        ImagePreview.java
        JInternalFrame.java
        JNumberField.java
        JTreeView.java
        JWindowToolBarUI.java
        ListTreeModel.java
        LocatedIcon.java
        MemoryMonitor.java
        MultiColumnListUI.java
        MultiLineToolTip.java
        Preferences.java
        PreviewFileChooser.java
        ProgressMonitor.java
        Resource.java
        SetSelectionModel.java
        SimpleTreeModel.java
        SplitPaneTreeView.java
        SubTreeModel.java
        Swing.java
        TableMap.java
        TableSorter.java
        TreeView.java
        TreeViewTest.java
        VerticalLabelUI.java
        graph
        AbstractGraphEdgeFactory.java
        AbstractNodeComponentFactory.java
        AbstractPortFactory.java
        AbstractPropertiesFactory.java
        ConnectingNode.java
        DOM.java
        DefaultNL.java
        DistanceOrderedNL.java
        EdgeEditor.java
        EdgeMarker.java
        EquilateralTriangleMarker.java
        ForceDirectedNL.java
        GeometryEditor.java
        GraphEdge.java
        GraphEdgeUtils.java
        GraphNodeComponent.java
        GraphNodePort.java
        GraphPanel.java
        GraphPanelEditor.java
        GraphRouter.java
        LabelMarker.java
        NodeLayouter.java
        OrthogonalRouter.java
        StraightLineRouter.java
        StyleEditor.java
        TreeNL.java
        propertiespanel
        ComboBox.java
        Group.java
        List.java
        ListAccess.java
        NumberField.java
        PropertiesPanel.java
        Property.java
        StringCheckbox.java
        TextField.java
        util
        AppletPreferences.java
        AsyncAccelerator.java
        AsyncInvocation.java
        Attributable.java
        AttributeFactory.java
        Base64Decoder.java
        Base64Encoder.java
        Base64FormatException.java
        BinaryTree.java
        BinaryTreeNode.java
        CacheArray.java
        CacheArrayBackEnd.java
        ClusteringQueue.java
        CopyThread.java
        Crypt.java
        DBPreferences.java
        DOM.java
        DefaultAttributable.java
        DirTree.java
        EventListenerList.java
        FilePreferences.java
        ListParser.java
        NestedException.java
        NotImplementedException.java
        PrefNamer.java
        PrefPropertyWatcher.java
        PrefReanimator.java
        PrefWatcher.java
        Preferences.java
        PreferencesHandler.java
        PreferencesStore.java
        ProgressListener.java
        ProgressWatcher.java
        Queue.java
        RectMap.java
        RectMapFilter.java
        RedBlackNode.java
        RedBlackTree.java
        StopWatch.java
        Strings.java
        SysPreferences.java
        Timer.java
        Trans.java
        Tree.java
        TreeNode.java
        TreeWalk.java
        UTHTML.java
        UTMacOS.java
        UTTeX.java
        Unicode.java
        UnicodeTrans.java
        UnicodeTranslator.java
        Util.java
        graph
        AbstractGraphModel.java
        DOM.java
        DefaultGraphEdge.java
        DefaultGraphModel.java
        DefaultGraphNode.java
        DefaultGraphPort.java
        GraphEdge.java
        GraphException.java
        GraphModel.java
        GraphModelListener.java
        GraphNode.java
        GraphPort.java
        GraphPortListener.java
        GraphWalk.java
        Path.java
        Utilities.java
        xmillum
        ActionHandler.java
        ActionHandlerFactory.java
        ActionHandlerParam.java
        AllFlagListener.java
        BrowserContext.java
        BrowserPanel.java
        Displayable.java
        DisplayableAppearance.java
        DisplayableClass.java
        DisplayableFactory.java
        DocumentChangeEvent.java
        DocumentChangeListener.java
        ElementTagger.java
        FlagAccess.java
        FlagListener.java
        FlagManager.java
        IllumDocument.java
        IllumException.java
        IllumSource.java
        Image.java
        ImageFactory.java
        ImageListener.java
        JAIImageFactory.java
        JavaImageFactory.java
        Parameter.java
        ParameterException.java
        StatusListener.java
        Style.java
        StyleRegistry.java
        Tool.java
        ToolFactory.java
        Window.java
        WindowCreator.java
        XMIllumDesktop.java
        XMIllumFrame.java
        displayable
        Block.java
        Image.java
        Polygon.java
        Root.java
        TextArea.java
        handlers
        Info.java
        Invalidate.java
        LabelAttribute.java
        Output.java
        PopupFlagger.java
        Select.java
        Split.java
        TextUpdate.java
        tool
        FlagOutput.java
        Hottable.java
        InfoWindow.java
        LabelWizard.java
        TextUpdate.java
        XMLTree.java
- pdfXtk
  - src
    - main
      - java
        at
        ac
        tuwien
        dbai
        pdfwrap
        ProcessFile.java
        analysis
        AbstractPageSegmenter.java
        CandidateCluster.java
        LineProcessor.java
        PageProcessor.java
        RulingObjectProcessor.java
        SegmentationResult.java
        TextBlockPageSegmenter.java
        comparators
        EdgeAttributeComparator.java
        EdgeLengthComparator.java
        XComparator.java
        XYTextComparator.java
        YComparator.java
        exceptions
        DocumentProcessingException.java
        gui
        EdgeSegment.java
        elements
        CheckBoxList.java
        MainFrame.java
        PDFPanel.java
        PageSpinner.java
        SelectionPanel.java
        exceptions
        UnknownShapeException.java
        layer
        Shapes.java
        Style.java
        StyledSegment.java
        tools
        MultiLineTooltip.java
        OpenDocFileFilter.java
        PDF_XMLSerializer.java
        XMLLayerLoader.java
        model
        document
        AttributeTuple.java
        CharSegment.java
        CompositeSegment.java
        GenericSegment.java
        IBlankSegment.java
        IXHTMLSegment.java
        IXmillumSegment.java
        ImageSegment.java
        LineFragment.java
        LineSegment.java
        OpTuple.java
        Page.java
        RectSegment.java
        TextBlock.java
        TextFragment.java
        TextLine.java
        TextSegment.java
        graph
        AdjacencyEdge.java
        AdjacencyGraph.java
        operator
        AppendRectangleToPath.java
        BeginInlineImage.java
        ClosePath.java
        CurveTo.java
        CurveToReplicateFinalPoint.java
        CurveToReplicateInitialPoint.java
        EndPath.java
        FillEvenOddRule.java
        FillNonZeroAndStrokePath.java
        FillNonZeroRule.java
        GRestore.java
        GSave.java
        Invoke.java
        LineTo.java
        ModifyClippingPath.java
        MoveTo.java
        SetLineWidth.java
        SetNonStrokingCMYKColor.java
        SetNonStrokingColorSpace.java
        SetNonStrokingRGBColor.java
        SetStrokingCMYKColor.java
        SetStrokingColorSpace.java
        SetStrokingRGBColor.java
        ShowText.java
        ShowTextGlyph.java
        StrokePath.java
        pdfread
        PDFObjectExtractor.java
        PDFPage.java
        utils
        ListUtils.java
        SegmentUtils.java
        SerializationUtil.java
        Utils.java
        com
        javazoid
        functions
        FileFunctions.java
    - test
      - java
        at
        ac
        tuwien
        dbai
        pdfwrap
        comparators
        EdgeAttributeComparatorTest.java
        utils
        SegmentUtilsTest.java

/**
 * pdfXtk - PDF Extraction Toolkit
 * Copyright (c) by the authors/contributors.  All rights reserved.
 * This project includes code from PDFBox and TouchGraph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://pdfxtk.sourceforge.net
 *
 */
package at.ac.tuwien.dbai.pdfwrap.analysis;

import at.ac.tuwien.dbai.pdfwrap.comparators.YComparator;
import at.ac.tuwien.dbai.pdfwrap.gui.EdgeSegment;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyEdge;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyGraph;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFObjectExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFPage;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.*;

/**
 * General class to take a PDFPage and return a processed Page object,
 * according to the given processType
 *
 * @author Tamir Hassan, pdfanalyser@tamirhassan.com
 * @author Ben Litchfield (ben@csh.rit.edu)
 * @version PDF Analyser 0.9
 */
public class PageProcessor {
    // class-wide logger; timing messages are emitted at debug level
    private static final Log log = LogFactory.getLog( PageProcessor.class );

	// Process types: select what postProcessing() places into processingResult.
	// (types of clustering -- while clustering is still being perfected...)
//	public final static int PP_MONOSPACE = 0;	//seems to be catered for by PP_BMW
	// postProcessing() adds nothing to processingResult and builds no adjacency graph
	public final static int PP_INSTRUCTION = 1;
	// every TextFragment is wrapped in its own TextBlock; line/block finding is skipped
	public final static int PP_FRAGMENT = 2;
	// every CharSegment is wrapped in its own TextBlock; line/block finding is skipped
	public final static int PP_CHAR = 3;
	// every TextLine is wrapped in its own TextBlock
	public final static int PP_LINE = 4;
	// the clustered text blocks are used directly (this is the default process type)
	public final static int PP_BLOCK = 5;

	// the merged lines found inside the text blocks are used
	public final static int PP_MERGED_LINES = 16;
	
//	public final static int PP_DEFAULT = 100;
	
//	public static Page STR_CURR_PAGE = new Page();
	
    // Working state of the page currently being processed. All of the following
    // are (re)assigned by doProcessPage() and read by postProcessing().
    // (added so that the table understander could be called via a separate method)
	protected Page retVal;                     // page object created by doProcessPage()
	protected List<CharSegment> charList;      // characters selected from the PDF page's items
	protected List<TextFragment> fragList;     // text fragments selected from the PDF page's items
	protected List<ImageSegment> imageList;    // image segments selected from the PDF page's items
	protected List<LineSegment> lineList;      // line segments; replaced by the ruling lines when rulingLines is set
	protected List<RectSegment> rectList;      // rectangle segments; emptied when rulingLines is set
    // the chosen granularity for graph matching
	protected List<GenericSegment> processingResult;
    
	// the edges that will be finally displayed
	// NOTE(review): created empty and added to the page, but nothing in this class fills it
	protected List<EdgeSegment> edgeSegmentList;
	protected List<TextLine> textLines;        // lines found by LineProcessor
	protected List<TextBlock> mergedLines;     // one block per line found inside each text block
	protected List<TextBlock> textBlocks;      // blocks returned by TextBlockPageSegmenter
	protected RulingObjectProcessor rop;       // fresh instance per processed page
    
	// adjacency graph over processingResult, built in postProcessing()
	protected AdjacencyGraph<GenericSegment> adjGraph;
	
    // not referenced anywhere else in this class
    float currentX = 0.0f;
    float currentY = 0.0f;
	// end of addition
	
    // Configuration, adjustable through the setters of this class.
    protected int processType = PP_BLOCK;      // one of the PP_* constants above
    protected boolean rulingLines = true;      // run lines and rectangles through the RulingObjectProcessor
    protected boolean processSpaces = false;   // true: drop " " characters and find lines from characters;
                                               // false: find lines from text fragments
    // passed to TextBlockPageSegmenter.setMaxIterations() when processType is PP_BLOCK
    // (0 is passed otherwise); presumably -1 means "no limit" -- TODO confirm in the segmenter
    protected int noIterations = -1;
    
	// added by TH
//	private Document resultDocument;
	// end of addition

//    private static Logger log = Logger.getLogger(PDFTextStripper.class);

	/*
    private int currentPageNo = 0;
    private int startPage = 1;
    private int endPage = Integer.MAX_VALUE;
    private PDOutlineItem startBookmark = null;
    private int startBookmarkPageNumber = -1;
    private PDOutlineItem endBookmark = null;
    private int endBookmarkPageNumber = -1;
    private boolean suppressDuplicateOverlappingText = true;
    private PDDocument document;
    private boolean shouldSeparateByBeads = true;
    
    private List pageArticles = null;
    */
    /**
     * The charactersByArticle is used to extract text by article divisions.  For example,
     * for a PDF that has two columns like a newspaper, we want to extract the first column and
     * then the second column.  In this example the PDF would have 2 beads (or articles), one for
     * each column.  The size of the charactersByArticle would be 5, because not all text on the
     * screen will fall into one of the articles.  The five divisions are shown below:
     *
     * Text before first article
     * first article text
     * text between first article and second article
     * second article text
     * text after second article
     *
     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
     */
    
    // currently not used! (none of the private fields below is referenced in this class)
    private Vector charactersByArticle = new Vector();
    
    // not referenced in this class
    private Map characterListMapping = new HashMap();
    
    // separator strings; both line and page separator are initialised from the
    // "line.separator" system property, and none of the three is referenced in this class
    private String lineSeparator = System.getProperty("line.separator");
    private String pageSeparator = System.getProperty("line.separator");
    private String wordSeparator = " ";
    
//    private DocumentGraph documentGraph;
    
    /**
     * Creates a processor that works at the default granularity,
     * {@link #PP_BLOCK} (the same value the {@code processType} field is
     * initialised with).
     */
    public PageProcessor()
    {
        this(PP_BLOCK);
    }

    /**
     * Creates a processor that works at the given granularity.
     *
     * @param processType the process type to use; the {@code PP_*} constants of
     *                    this class are the values that {@code postProcessing} recognises
     */
    public PageProcessor(int processType)
    {
        setProcessType(processType);
    }
    
    /**
     * Hook for steps that have to run after all pages of a document have been
     * processed. The current implementation performs no work.
     *
     * @param thePages  the already processed pages
     * @param pageImage not used by the current implementation
     * @return the very list instance that was passed in as {@code thePages}
     */
    public static List<Page> processDocPages(List<Page> thePages, BufferedImage pageImage)
    {
        // no document-level steps are implemented yet, so the input is handed back as-is
        final List<Page> processedPages = thePages;
        return processedPages;
    }
    
    /**
     * Processes the contents of one PDF page.
     * <p>
     * The page is first analysed by {@link #doProcessPage(PDFPage)}, then
     * {@link #postProcessing(int, Page)} is run with the configured process
     * type, and finally the last-op index of the source page is copied onto
     * the result.
     *
     * @param thisPage the PDF page to process
     * @return the processed page
     */
    public Page processPage(PDFPage thisPage)
    {
        final Page processedPage = doProcessPage(thisPage);

        // fills the page with the segments for the configured process type
        postProcessing(processType, processedPage);

        processedPage.setLastOpIndex(thisPage.getLastOpIndex());
        return processedPage;
    }
    
    /**
     * Creates a new {@link Page} for the given PDF page and (re)fills the
     * working lists of this processor.
     * <p>
     * The bounding box and rotation are copied from {@code thisPage}, and the
     * characters, text fragments, images, lines and rectangles are selected from
     * its items. Unless the process type is {@link #PP_CHAR} or
     * {@link #PP_FRAGMENT}, the method additionally
     * <ol>
     *   <li>finds the text lines (from characters if {@code processSpaces} is
     *       set, from text fragments otherwise),</li>
     *   <li>builds an adjacency graph over those lines,</li>
     *   <li>runs lines and rectangles through the ruling object processor if
     *       {@code rulingLines} is set,</li>
     *   <li>clusters the lines into text blocks, and</li>
     *   <li>splits every text block into its merged lines.</li>
     * </ol>
     * No items are added to the returned page here; that is done by
     * {@link #postProcessing(int, Page)}. All results are stored in instance
     * fields, which are overwritten on every call.
     *
     * @param thisPage the PDF page to analyse
     * @return the newly created page (also stored in the {@code retVal} field)
     */
    protected Page doProcessPage(PDFPage thisPage)
    {
        final long startProcess = System.currentTimeMillis();

        retVal = new Page();
        retVal.setBoundingBox(thisPage.getBoundingBox());
        retVal.setRotation(thisPage.getRotation());

        charList = ListUtils.selectCharacters(thisPage.getItems());
        fragList = ListUtils.selectTextFragments(thisPage.getItems());
        imageList = ListUtils.selectImageSegments(thisPage.getItems());
        lineList = ListUtils.selectLineSegments(thisPage.getItems());
        rectList = ListUtils.selectRectSegments(thisPage.getItems());

        // the chosen granularity for graph matching
        processingResult = new ArrayList<GenericSegment>();
        // the edges that will be finally displayed
        edgeSegmentList = new ArrayList<EdgeSegment>();
        textLines = new ArrayList<TextLine>();
        mergedLines = new ArrayList<TextBlock>();
        textBlocks = new ArrayList<TextBlock>();
        rop = new RulingObjectProcessor();

        logElapsed("time A: ", startProcess);

        PDFObjectExtractor.removeLeadingTrailingSpaces(fragList);

        // character and fragment granularity need no line or block finding
        if (processType == PP_CHAR || processType == PP_FRAGMENT)
        {
            return retVal;
        }

        logElapsed("time E: ", startProcess);

        if (processSpaces)
        {
            // added 2011-11-04 to REMOVE space characters (test)
            dropSpaceCharacters();
            // good - 0.3 or 0.4
            textLines = LineProcessor.findLinesFromCharacters(
                charList, 0.3f, false, false);
        }
        else
        {
            // 19.10.10: tolerance changed after PDF-TREX comparison
            textLines = LineProcessor.findLinesFromTextFragments(
                fragList, 0.80f, false, false);
        }

        AdjacencyGraph<TextLine> lineAG = new AdjacencyGraph<TextLine>();
        lineAG.addList(textLines);

        // these two messages are not tied to Utils.DISPLAY_TIMINGS
        if (log.isDebugEnabled())
        {
            log.debug( "number of items pageFromLines: " + textLines.size() );
            log.debug("Time for preprocessing: " + (System.currentTimeMillis() - startProcess));
        }

        // Generate NG
        long before = System.currentTimeMillis();
        lineAG.generateEdgesSingle();
        logElapsed("Time for AG generation: ", before);

        // the timer restarted here covers ruling object processing AND clustering
        before = System.currentTimeMillis();

        // RULING OBJECT PROCESSING
        if (rulingLines)
        {
            consolidateRulingObjects();
        }

        // BEST FIRST CLUSTERING (BLOCK FINDING)
        // return only the blocks -- second level clustering takes place separately.
        // noIterations only applies when whole blocks are requested; every
        // other process type passes 0 to the segmenter.
        final int blockIterations = (processType == PP_BLOCK) ? noIterations : 0;

        TextBlockPageSegmenter tbps = new TextBlockPageSegmenter();
        tbps.setMaxIterations(blockIterations);
        textBlocks = tbps.clusterLinesIntoTextBlocks(lineAG);

        logElapsed("Time for ordered edge cluster: ", before);

        // FIND ATOMIC LINES
        splitBlocksIntoMergedLines();

        logElapsed("total pp time: ", startProcess);

        return retVal;
    }

    /**
     * Writes {@code label} followed by the milliseconds elapsed since
     * {@code since} to the debug log, provided that timings are switched on
     * via {@code Utils.DISPLAY_TIMINGS} and debug logging is enabled.
     */
    private static void logElapsed(String label, long since)
    {
        if (Utils.DISPLAY_TIMINGS && log.isDebugEnabled())
        {
            log.debug(label + (System.currentTimeMillis() - since));
        }
    }

    /**
     * Removes every character whose text is exactly one space from
     * {@code charList}. Characters without text are left in place.
     */
    private void dropSpaceCharacters()
    {
        List<CharSegment> spaces = new ArrayList<CharSegment>();
        for (CharSegment candidate : charList)
        {
            // literal first, so that a character with null text cannot throw
            if (" ".equals(candidate.getText()))
            {
                spaces.add(candidate);
            }
        }
        charList.removeAll(spaces);
    }

    /**
     * Feeds the line and rectangle segments to the ruling object processor,
     * replaces {@code lineList} by the resulting ruling lines and empties
     * {@code rectList}.
     */
    private void consolidateRulingObjects()
    {
        rop.addRulingObjects(lineList);
        // here they will be automatically processed into their constituent lines
        rop.addRulingObjects(rectList);
        // this removes duplicate lines and 'joins' touching lines
        rop.removeDuplicateLines();

        lineList = rop.getRulingLines();
        rectList = new ArrayList<RectSegment>(); // empty rectList
    }

    /**
     * Looks for lines inside every text block and appends one new
     * {@link TextBlock} per found line to {@code mergedLines}.
     */
    private void splitBlocksIntoMergedLines()
    {
        for (TextBlock block : textBlocks)
        {
            CandidateCluster cluster = new CandidateCluster();
            // generic problems -- cannot simply add list
            for (TextSegment member : block.getItems())
            {
                cluster.getItems().add(member);
            }
            cluster.findLinesWidth();

            for (CompositeSegment<? extends TextSegment> foundLine : cluster.getFoundLines())
            {
                TextBlock lineBlock = new TextBlock();
                lineBlock.setCalculatedFields(foundLine);

                for (TextSegment item : foundLine.getItems())
                {
                    TextLine promoted = promoteToTextLine(item);
                    if (promoted != null)
                    {
                        lineBlock.getItems().add(promoted);
                    }
                }
                mergedLines.add(lineBlock);
            }
        }
    }

    /**
     * Turns a text segment into a {@link TextLine}. A {@code TextLine} is
     * returned unchanged; a {@code LineFragment}, {@code TextFragment} or
     * {@code CharSegment} is wrapped in as many new container segments as
     * are needed to reach line level.
     *
     * @return the line, or {@code null} if the segment is of any other class
     *         (such segments are skipped by the caller)
     */
    private static TextLine promoteToTextLine(TextSegment item)
    {
        // exact class comparison on purpose: subclasses are not promoted
        if (item.getClass() == TextLine.class)
        {
            return (TextLine) item;
        }
        else if (item.getClass() == LineFragment.class)
        {
            TextLine line = new TextLine();
            line.getItems().add((LineFragment) item);
            line.setCalculatedFields(item);
            return line;
        }
        else if (item.getClass() == TextFragment.class)
        {
            LineFragment lineFragment = new LineFragment();
            lineFragment.getItems().add((TextFragment) item);
            lineFragment.setCalculatedFields(item);

            TextLine line = new TextLine();
            line.getItems().add(lineFragment);
            line.setCalculatedFields(lineFragment);
            return line;
        }
        else if (item.getClass() == CharSegment.class)
        {
            TextFragment fragment = new TextFragment();
            fragment.getItems().add((CharSegment) item);
            fragment.setCalculatedFields(item);

            LineFragment lineFragment = new LineFragment();
            lineFragment.getItems().add(fragment);
            lineFragment.setCalculatedFields(fragment);

            TextLine line = new TextLine();
            line.getItems().add(lineFragment);
            line.setCalculatedFields(lineFragment);
            return line;
        }

        // TODO: TextBlock: add its items, not the block itself?
        return null;
    }

	/**
	 * Fills {@code processingResult} according to the given process type,
	 * builds the adjacency graph where applicable and adds all segments to
	 * the page.
	 * <p>
	 * For {@link #PP_CHAR}, {@link #PP_FRAGMENT} and {@link #PP_LINE} every
	 * character, text fragment or text line is wrapped in a {@link TextBlock}
	 * of its own; for {@link #PP_MERGED_LINES} and {@link #PP_BLOCK} the
	 * merged lines or text blocks are used as they are. For every process type
	 * other than {@link #PP_INSTRUCTION}, {@link #PP_FRAGMENT} and
	 * {@link #PP_CHAR} a new adjacency graph over {@code processingResult} is
	 * stored in {@code adjGraph}. Finally the processing result, text lines,
	 * fragments, characters, images, lines, rectangles and edge segments are
	 * appended to the items of {@code retVal}, which are then sorted with a
	 * {@link YComparator}.
	 * <p>
	 * The method reads the working lists that {@link #doProcessPage(PDFPage)}
	 * assigns.
	 *
	 * @param processType the process type that selects the result granularity
	 * @param retVal      the page that receives the segments
	 */
	public void postProcessing(int processType, Page retVal)
	{
		switch (processType)
		{
			case PP_CHAR:
				// This processing mode just for debugging view ...
				for (CharSegment character : charList)
				{
					TextFragment fragment = new TextFragment();
					fragment.getItems().add(character);
					fragment.setCalculatedFields(character);

					processingResult.add(singleFragmentBlock(fragment));
				}
				break;

			case PP_FRAGMENT:
				// This processing mode just for debugging view ...
				for (TextFragment fragment : fragList)
				{
					processingResult.add(singleFragmentBlock(fragment));
				}
				break;

			case PP_LINE:
				// This processing mode also for debugging view;
				// for wrapping generally PP_MERGED_LINES should be used ...
				for (TextLine line : textLines)
				{
					processingResult.add(singleLineBlock(line));
				}
				break;

			case PP_MERGED_LINES:
				processingResult.addAll(mergedLines);
				break;

			case PP_BLOCK:
				processingResult.addAll(textBlocks);
				break;

			default:
				// any other process type contributes no processing result
				break;
		}

		final boolean withoutGraph = processType == PP_INSTRUCTION
			|| processType == PP_FRAGMENT
			|| processType == PP_CHAR;

		if (!withoutGraph)
		{
			adjGraph = new AdjacencyGraph<GenericSegment>();
			adjGraph.addList(processingResult);
			adjGraph.generateEdgesSingle();

			if (log.isDebugEnabled())
			{
				log.debug("PP.edges: " + adjGraph.getEdges().size());
			}

			// NOTE(review): this list is filled but never read afterwards; in
			// particular its contents do not reach edgeSegmentList -- confirm
			// whether the edges were meant to be added to the page.
			List<EdgeSegment> edgeList = new ArrayList<EdgeSegment>();
			for (AdjacencyEdge<GenericSegment> edge : adjGraph.getEdges())
			{
				edgeList.add(edge.toDisplayableSegment());
			}

			// 2011-01-27 TEMPORARILY COMMENTED OUT
			// rop.labelEdges(clusterNG.getEdges(), rop.getRulingLines());
		}

		// add the text clusters (segments) to the page object
		retVal.getItems().addAll(processingResult);
		retVal.getItems().addAll(textLines);
		retVal.getItems().addAll(fragList);
		retVal.getItems().addAll(charList);

		// followed by the graphical segments and the edges
		retVal.getItems().addAll(imageList);
		retVal.getItems().addAll(lineList);
		retVal.getItems().addAll(rectList);
		retVal.getItems().addAll(edgeSegmentList);

		Collections.sort(retVal.getItems(), new YComparator());
	}

	/**
	 * Wraps a text fragment in a new line fragment, that in a new text line
	 * and that in a new text block; each container takes its calculated
	 * fields from the segment it wraps.
	 */
	private static TextBlock singleFragmentBlock(TextFragment fragment)
	{
		LineFragment lineFragment = new LineFragment();
		lineFragment.getItems().add(fragment);
		lineFragment.setCalculatedFields(fragment);

		TextLine line = new TextLine();
		line.getItems().add(lineFragment);
		line.setCalculatedFields(lineFragment);

		return singleLineBlock(line);
	}

	/**
	 * Wraps a text line in a new text block that takes its calculated fields
	 * from that line.
	 */
	private static TextBlock singleLineBlock(TextLine line)
	{
		TextBlock block = new TextBlock();
		block.getItems().add(line);
		block.setCalculatedFields(line);
		return block;
	}
    
    /**
     * Extension point for custom processing. The implementation in this class
     * is empty, and no method of this class calls it.
     *
     * @param processType a process type; presumably one of the {@code PP_*}
     *                    constants -- to be confirmed against overriding classes
     */
    public void customProcessing(int processType)
    {
        // intentionally empty: override this method to add custom processing
        return;
    }
    
    /**
     * @return the adjacency graph stored in this processor; {@code null} until
     *         one has been built by {@code postProcessing} or set explicitly
     */
    public AdjacencyGraph<GenericSegment> getAdjGraph()
    {
        return this.adjGraph;
    }

    /**
     * @param adjGraph the adjacency graph to store in this processor
     */
    public void setAdjGraph(AdjacencyGraph<GenericSegment> adjGraph)
    {
        this.adjGraph = adjGraph;
    }

    /**
     * @return the configured process type
     */
    public int getProcessType()
    {
        return this.processType;
    }

    /**
     * @param processType the process type to use for subsequent pages
     */
    public void setProcessType(int processType)
    {
        this.processType = processType;
    }

    /**
     * @return whether lines and rectangles are run through the ruling object processor
     */
    public boolean isRulingLines()
    {
        return this.rulingLines;
    }

    /**
     * @param rulingLines whether lines and rectangles are to be run through
     *                    the ruling object processor
     */
    public void setRulingLines(boolean rulingLines)
    {
        this.rulingLines = rulingLines;
    }

    /**
     * @return whether space characters are dropped and lines are found from
     *         characters instead of text fragments
     */
    public boolean isProcessSpaces()
    {
        return this.processSpaces;
    }

    /**
     * @param processSpaces whether space characters are to be dropped and lines
     *                      found from characters instead of text fragments
     */
    public void setProcessSpaces(boolean processSpaces)
    {
        this.processSpaces = processSpaces;
    }

    /**
     * @return the iteration count handed to the text block segmenter when the
     *         process type is {@code PP_BLOCK}
     */
    public int getNoIterations()
    {
        return this.noIterations;
    }

    /**
     * @param noIterations the iteration count to hand to the text block
     *                     segmenter when the process type is {@code PP_BLOCK}
     */
    public void setNoIterations(int noIterations)
    {
        this.noIterations = noIterations;
    }
    
}