package technology.tabula; import static org.junit.Assert.*; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import junit.framework.Assert; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.junit.Test; import technology.tabula.Page; import technology.tabula.Ruling; import technology.tabula.Table; import technology.tabula.extractors.BasicExtractionAlgorithm; import technology.tabula.writers.CSVWriter; import technology.tabula.UtilsForTesting; public class TestBasicExtractor { private static final String[][] EXPECTED_CORRECT_COLUMNS = { {"", "", "Involvement of pupils in", ""}, {"", "Preperation and", "Production of", "Presentation an"}, {"", "planing", "materials", "evaluation"}, {"Knowledge and awareness of different cultures", "0,2885", "0,3974", "0,3904"}, {"Foreign language competence", "0,3057", "0,4184", "0,3899"}, {"Social skills and abilities", "0,3416", "0,3369", "0,4303"}, {"Acquaintance of special knowledge", "0,2569", "0,2909", "0,3557"}, {"Self competence", "0,3791", "0,3320", "0,4617"}}; private static final String[][] EXPECTED_COLUMN_RECOGNITION = { {"ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"}, {"ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"}, {"ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"}, {"ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"}, {"ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"}, {"BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"}, {"BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"}, {"BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"}, {"BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"}, {"BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"}, {"BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"}, {"BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"}, {"BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"}, {"BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"}, {"BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"}, {"CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"}, {"CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"}, {"CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"}, {"CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"}, {"CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"}, {"CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"}, {"COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"}, {"CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"}, {"CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"}, {"CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"}}; private static final String[][] EXPECTED_COLUMN_EXTRACTION2 = { {"", "Austria", "77", "1", "78"}, {"", "Belgium", "159", "2", "161"}, {"", "Bulgaria", "52", "0", "52"}, {"", "Croatia", "144", "0", "144"}, {"", "Cyprus", "43", "2", "45"}, {"", "Czech Republic", "78", "0", "78"}, {"", "Denmark", "151", "2", "153"}, {"", "Estonia", "46", "0", "46"}, {"", "Finland", "201", "1", "202"}, {"", "France", "428", "7", "435"}, {"", "Germany", "646", "21", "667"}, {"", "Greece", "113", "2", "115"}, {"", "Hungary", "187", "0", "187"}, {"", "Iceland", "18", "0", "18"}, {"", "Ireland", "213", "4", "217"}, {"", "Israel", "25", "0", "25"}, {"", "Italy", "627", "12", "639"}, {"", "Latvia", "7", "0", "7"}, {"", "Lithuania", "94", "1", "95"}, {"", "Luxembourg", "22", "0", "22"}, {"", "Malta", "18", "0", "18"}, {"", "Netherlands", "104", "1", "105"}, {"", "Norway", "195", "0", "195"}, {"", "Poland", "120", "1", "121"}, {"", "Portugal", "532", "3", "535"}, {"", "Romania", "110", "0", "110"}, {"", "Slovakia", "176", "0", "176"}, {"", "Slovenia", "56", "0", "56"}, {"", "Spain", "614", "3", "617"}, {"", "Sweden", "122", "3", "125"}, {"", "Switzerland", "64", "0", "64"}, {"", "Turkey", "96", "0", "96"}, {"", "United Kingdom", "572", "14", "586"} }; private static final String[][] EXPECTED_TABLE_EXTRACTION = { {"AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"}, {"TOTAL", "", "", "", "$85.00"}, {"AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"}, {"AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"}, {"TOTAL", "", "", "", "$471.25"}, {"AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"}, {"TOTAL", "", "", "", "$20.39"}, {"AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"}, {"", "REGIONAL PULMONARY & SLEEP", "", "", ""}, {"AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"}, {"", "MEDICINE", "", "", ""}, {"TOTAL", "", "", "", "$5,010.33"}, {"AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"}, {"TOTAL", "", "", "", "$193.67"}, {"AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"}, {"TOTAL", "", "", "", "$19.50"}, {"AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"} }; private static final String[][] EXPECTED_EMPTY_TABLE = { {""} }; @Test public void testRemoveSequentialSpaces() throws IOException { Page page = UtilsForTesting.getAreaFromFirstPage( "src/test/resources/technology/tabula/m27.pdf", 79.2f, 28.28f, 103.04f, 732.6f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); List<RectangularTextContainer> firstRow = table.getRows().get(0); assertTrue(firstRow.get(1).getText().equals("ALLEGIANT AIR")); assertTrue(firstRow.get(2).getText().equals("ALLEGIANT AIR LLC")); } @Test public void testColumnRecognition() throws IOException { Page page = UtilsForTesting .getAreaFromFirstPage( "src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf", 269.875f, 12.75f, 790.5f, 561f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); assertArrayEquals(EXPECTED_COLUMN_RECOGNITION, UtilsForTesting.tableToArrayOfRows(table)); } @Test public void testVerticalRulingsPreventMergingOfColumns() throws IOException { List<Ruling> rulings = new ArrayList<Ruling>(); Float[] rulingsVerticalPositions = {147f, 256f, 310f, 375f, 431f, 504f}; for (int i = 0; i < 6; i++) { rulings.add(new Ruling(255.57f, rulingsVerticalPositions[i], 0, 398.76f - 255.57f)); } Page page = UtilsForTesting.getAreaFromFirstPage( "src/test/resources/technology/tabula/campaign_donors.pdf", 255.57f, 40.43f, 398.76f, 557.35f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(rulings); Table table = bea.extract(page).get(0); List<RectangularTextContainer> sixthRow = table.getRows().get(5); assertTrue(sixthRow.get(0).getText().equals("VALSANGIACOMO BLANC")); assertTrue(sixthRow.get(1).getText().equals("OFERNANDO JORGE")); } @Test public void testExtractColumnsCorrectly() throws IOException { Page page = UtilsForTesting.getAreaFromPage( "src/test/resources/technology/tabula/eu-002.pdf", 1, 115.0f, 70.0f, 233.0f, 510.0f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); assertArrayEquals(EXPECTED_CORRECT_COLUMNS, UtilsForTesting.tableToArrayOfRows(table)); } @Test public void testExtractColumnsCorrectly2() throws IOException { Page page = UtilsForTesting.getPage("src/test/resources/technology/tabula/eu-017.pdf", 3); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(page.getVerticalRulings()); Table table = bea.extract(page.getArea(299.625f, 148.44f, 711.875f, 452.32f)).get(0); assertArrayEquals(EXPECTED_COLUMN_EXTRACTION2, UtilsForTesting.tableToArrayOfRows(table)); } @Test public void testExtractColumnsCorrectly3() throws IOException { Page page = UtilsForTesting.getAreaFromFirstPage("src/test/resources/technology/tabula/frx_2012_disclosure.pdf", 106.01f, 48.09f, 227.31f, 551.89f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); assertArrayEquals(EXPECTED_TABLE_EXTRACTION, UtilsForTesting.tableToArrayOfRows(table)); } @Test public void testCheckSqueezeDoesntBreak() throws IOException { Page page = UtilsForTesting.getAreaFromFirstPage("src/test/resources/technology/tabula/12s0324.pdf", 99.0f, 17.25f, 316.5f, 410.25f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); List<List<RectangularTextContainer>> rows = table.getRows(); List<RectangularTextContainer> firstRow = rows.get(0); List<RectangularTextContainer> lastRow = rows.get(rows.size() - 1); assertTrue(firstRow.get(0).getText().equals("Violent crime . . . . . . . . . . . . . . . . . .")); assertTrue(lastRow.get(lastRow.size() - 1).getText().equals("(X)")); } @Test public void testNaturalOrderOfRectangles() throws IOException { Page page = UtilsForTesting.getPage( "src/test/resources/technology/tabula/us-017.pdf", 2) .getArea(446.0f, 97.0f, 685.0f, 520.0f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm( page.getVerticalRulings()); Table table = bea.extract(page).get(0); List<RectangularTextContainer> cells = table.getCells(); for (RectangularTextContainer rectangularTextContainer : cells) { System.out.println(rectangularTextContainer.getText()); } //Column headers assertEquals("Project", cells.get(0).getText()); assertEquals("Agency", cells.get(1).getText()); assertEquals("Institution", cells.get(2).getText()); //First row assertEquals("Nanotechnology and its publics", cells.get(3).getText()); assertEquals("NSF", cells.get(4).getText()); assertEquals("Pennsylvania State Universit", cells.get(5).getText()); //Second row assertEquals("Public information and deliberation in nanoscience and", cells.get(6).getText()); assertEquals("North Carolina State", cells.get(7).getText()); assertEquals("Interagency", cells.get(8).getText()); assertEquals("nanotechnology policy (SGER)", cells.get(9).getText()); assertEquals("University", cells.get(10).getText()); //Third row assertEquals("Social and ethical research and education in agrifood", cells.get(11).getText()); assertEquals("NSF", cells.get(12).getText()); assertEquals("Michigan State University", cells.get(13).getText()); assertEquals("nanotechnology (NIRT)", cells.get(14).getText()); //Fourth row assertEquals("From laboratory to society: developing an informed", cells.get(15).getText()); assertEquals("NSF", cells.get(16).getText()); assertEquals("University of South Carolina", cells.get(17).getText()); assertEquals("approach to nanoscale science and engineering (NIRT)", cells.get(18).getText()); //Fifth row assertEquals("Database and innovation timeline for nanotechnology", cells.get(19).getText()); assertEquals("NSF", cells.get(20).getText()); assertEquals("UCLA", cells.get(21).getText()); //Sixth row assertEquals("Social and ethical dimensions of nanotechnology", cells.get(22).getText()); assertEquals("NSF", cells.get(23).getText()); assertEquals("University of Virginia", cells.get(24).getText()); //Seventh row assertEquals("Undergraduate exploration of nanoscience,", cells.get(25).getText()); assertEquals("Michigan Technological", cells.get(26).getText()); assertEquals("NSF", cells.get(27).getText()); assertEquals("applications and societal implications (NUE)", cells.get(28).getText()); assertEquals("University", cells.get(29).getText()); //Eighth row assertEquals("Ethics and belief inside the development of", cells.get(30).getText()); assertEquals("NSF", cells.get(31).getText()); assertEquals("University of Virginia", cells.get(32).getText()); assertEquals("nanotechnology (CAREER)", cells.get(33).getText()); //Ninth row assertEquals("All centers, NNIN and NCN have a societal", cells.get(34).getText()); assertEquals("NSF, DOE,", cells.get(35).getText()); assertEquals("All nanotechnology centers", cells.get(36).getText()); assertEquals("implications components", cells.get(37).getText()); assertEquals("DOD, and NIH", cells.get(38).getText()); assertEquals("and networks", cells.get(39).getText()); } @Test public void testNaturalOrderOfRectanglesOneMoreTime() throws IOException { CSVParser parse = org.apache.commons.csv.CSVParser.parse(new File("src/test/resources/technology/tabula/csv/TestBasicExtractor-RECTANGLE_TEST_NATURAL_ORDER.csv"), Charset.forName("utf-8"), CSVFormat.DEFAULT); List<Rectangle> rectangles = new ArrayList<Rectangle>(); for (CSVRecord record : parse) { rectangles.add(new Rectangle(Float.parseFloat(record.get(0)), Float.parseFloat(record.get(1)), Float.parseFloat(record.get(2)), Float.parseFloat(record.get(3)))); } //List<Rectangle> rectangles = Arrays.asList(RECTANGLES_TEST_NATURAL_ORDER); Utils.sort(rectangles); for (int i = 0; i < (rectangles.size() - 1); i++) { Rectangle rectangle = rectangles.get(i); Rectangle nextRectangle = rectangles.get(i + 1); assertTrue(rectangle.compareTo(nextRectangle) < 0); } } @Test public void testRealLifeRTL2() throws IOException { String expectedCsv = UtilsForTesting.loadCsv("src/test/resources/technology/tabula/csv/indictb1h_14.csv"); Page page = UtilsForTesting.getAreaFromPage("src/test/resources/technology/tabula/indictb1h_14.pdf", 1, 205.0f, 120.0f, 622.82f, 459.9f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); StringBuilder sb = new StringBuilder(); (new CSVWriter()).write(sb, table); assertEquals(expectedCsv, sb.toString()); } @Test public void testEmptyRegion() throws IOException { Page page = UtilsForTesting.getAreaFromPage("src/test/resources/technology/tabula/indictb1h_14.pdf", 1, 0.0f, 0.0f, 80.82f, 100.9f); // an empty area BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); assertArrayEquals(EXPECTED_EMPTY_TABLE, UtilsForTesting.tableToArrayOfRows(table)); } @Test public void testTableWithMultilineHeader() throws IOException { String expectedCsv = UtilsForTesting.loadCsv("src/test/resources/technology/tabula/csv/us-020.csv"); Page page = UtilsForTesting.getAreaFromPage("src/test/resources/technology/tabula/us-020.pdf", 2, 103.0f, 35.0f, 641.0f, 560.0f); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); Table table = bea.extract(page).get(0); StringBuilder sb = new StringBuilder(); (new CSVWriter()).write(sb, table); assertEquals(expectedCsv, sb.toString()); } }