/**
 *
 */
package org.voyanttools.trombone.input.expand;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.Source;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;

import edu.stanford.nlp.util.StringUtils;

/**
 * Expands a spreadsheet document source into child text documents, either one
 * per row or one per column, depending on the {@code tableDocuments} parameter.
 * When that parameter is absent (or has any other value) the spreadsheet is
 * returned unexpanded.
 *
 * @author sgs
 */
public class XlsExpander implements Expander {

	/**
	 * all parameters sent, only some of which may be relevant to some expanders
	 */
	private final FlexibleParameters parameters;

	/**
	 * the stored document storage strategy
	 */
	private final StoredDocumentSourceStorage storedDocumentSourceStorage;

	/**
	 * @param storedDocumentSourceStorage the storage used to read the spreadsheet and to store the children
	 * @param parameters the parameters that control the expansion ({@code tableDocuments},
	 *        {@code tableContent}, {@code tableTitle}, {@code tableAuthor}, {@code tableNoHeadersRow})
	 */
	public XlsExpander(StoredDocumentSourceStorage storedDocumentSourceStorage, FlexibleParameters parameters) {
		this.storedDocumentSourceStorage = storedDocumentSourceStorage;
		this.parameters = parameters;
	}

	/* (non-Javadoc)
	 * @see org.voyanttools.trombone.input.expand.Expander#getExpandedStoredDocumentSources(org.voyanttools.trombone.model.StoredDocumentSource)
	 */
	@Override
	public List<StoredDocumentSource> getExpandedStoredDocumentSources(StoredDocumentSource storedDocumentSource)
			throws IOException {

		// first try to see if we've been here already
		// NOTE(review): nothing in this class records the expansion in the storage, so
		// this lookup only hits if some other component stores it; confirm.
		String id = storedDocumentSource.getId();
		List<StoredDocumentSource> xlsStoredDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(id);
		if (xlsStoredDocumentSources != null && !xlsStoredDocumentSources.isEmpty()) {
			return xlsStoredDocumentSources;
		}

		// check to see if anything needs to be expanded (case-insensitive, locale-independent)
		String tableDocuments = parameters.getParameterValue("tableDocuments", "");
		if ("rows".equalsIgnoreCase(tableDocuments)) {
			return getDocumentsRowCells(storedDocumentSource);
		}
		if ("columns".equalsIgnoreCase(tableDocuments)) {
			return getDocumentsColumns(storedDocumentSource);
		}

		// otherwise, use the entire table
		xlsStoredDocumentSources = new ArrayList<StoredDocumentSource>();
		xlsStoredDocumentSources.add(storedDocumentSource);
		return xlsStoredDocumentSources;
	}

	/**
	 * Opens the stored document source as a workbook. The caller is responsible
	 * for closing the returned workbook.
	 *
	 * @param storedDocumentSource the spreadsheet to open
	 * @return the parsed workbook
	 * @throws IOException if the stream cannot be read or is not a valid workbook
	 */
	private Workbook getWorkBook(StoredDocumentSource storedDocumentSource) throws IOException {
		InputStream inputStream = null;
		try {
			inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
			return WorkbookFactory.create(inputStream);
		} catch (InvalidFormatException e) {
			throw new IOException(e);
		} finally {
			// the stream is still null if the storage failed to open it; closing
			// unconditionally would replace the real failure with a NullPointerException
			if (inputStream != null) {
				inputStream.close();
			}
		}
	}

	/**
	 * Creates one child document per column set, for every sheet. The column sets
	 * come from the {@code tableContent} parameter; if none are given, each column
	 * of the first available header row (row 0) becomes its own set. Only string
	 * cells contribute to the content; values are separated by a blank line.
	 *
	 * @param storedDocumentSource the spreadsheet to expand
	 * @return the child documents (possibly empty)
	 * @throws IOException if the workbook cannot be read or a child cannot be stored
	 */
	private List<StoredDocumentSource> getDocumentsColumns(StoredDocumentSource storedDocumentSource)
			throws IOException {

		DocumentMetadata metadata = storedDocumentSource.getMetadata();
		String id = storedDocumentSource.getId();
		List<StoredDocumentSource> xlsStoredDocumentSources = new ArrayList<StoredDocumentSource>();

		// parse parameters before opening the workbook so that a bad parameter can't leak it
		List<List<Integer>> columns = getInts("tableContent", true);
		int firstRow = parameters.getParameterBooleanValue("tableNoHeadersRow") ? 0 : 1;
		StringBuilder docBuffer = new StringBuilder();

		Workbook wb = getWorkBook(storedDocumentSource);
		try {
			for (int k = 0; k < wb.getNumberOfSheets(); k++) {
				Sheet sheet = wb.getSheetAt(k);
				int lastRow = sheet.getLastRowNum();
				Row headerRow = sheet.getRow(0); // null when the sheet has no physical first row

				// no columns defined, so take all, as defined by first row
				// NOTE(review): once filled in, these columns are reused for later sheets too
				if (columns.isEmpty() && headerRow != null) {
					short len = headerRow.getLastCellNum();
					for (int i = 0; i < len; i++) {
						List<Integer> cols = new ArrayList<Integer>();
						cols.add(i);
						columns.add(cols);
					}
				}

				for (List<Integer> set : columns) {
					for (int c : set) {
						for (int r = firstRow; r <= lastRow; r++) {
							String value = getValue(sheet, r, c);
							if (!value.isEmpty()) {
								if (docBuffer.length() > 0) {
									docBuffer.append("\n\n");
								}
								docBuffer.append(value);
							}
						}
					}
					if (docBuffer.length() > 0) {
						String location = (k+1)+"."+StringUtils.join(set, "+")+"."+(firstRow+1);
						// without a usable header row the location doubles as the title
						String title = (firstRow == 0 || headerRow == null) ? location : getValue(headerRow, set, " ");
						xlsStoredDocumentSources.add(getChild(metadata, id, docBuffer.toString(), location, title, null));
						docBuffer.setLength(0); // reset buffer
					}
				}
			}
		} finally {
			wb.close();
		}
		return xlsStoredDocumentSources;
	}

	/**
	 * Creates one child document per row and column set, for every sheet. The
	 * column sets come from {@code tableContent}; if none are given, all cells of
	 * the first non-null data row form a single set. Titles and authors are read
	 * from {@code tableTitle} / {@code tableAuthor} columns, but only when there is
	 * exactly one content column set.
	 *
	 * @param storedDocumentSource the spreadsheet to expand
	 * @return the child documents (possibly empty)
	 * @throws IOException if the workbook cannot be read or a child cannot be stored
	 */
	private List<StoredDocumentSource> getDocumentsRowCells(StoredDocumentSource storedDocumentSource)
			throws IOException {

		DocumentMetadata metadata = storedDocumentSource.getMetadata();
		String id = storedDocumentSource.getId();
		List<StoredDocumentSource> xlsStoredDocumentSources = new ArrayList<StoredDocumentSource>();

		// parse parameters before opening the workbook so that a bad parameter can't leak it
		List<List<Integer>> columns = getInts("tableContent", true);
		List<List<Integer>> titles = getInts("tableTitle", true);
		List<List<Integer>> authors = getInts("tableAuthor", true);
		int firstRow = parameters.getParameterBooleanValue("tableNoHeadersRow") ? 0 : 1;

		Workbook wb = getWorkBook(storedDocumentSource);
		try {
			for (int k = 0; k < wb.getNumberOfSheets(); k++) {
				Sheet sheet = wb.getSheetAt(k);
				int lastRow = sheet.getLastRowNum();
				for (int r = firstRow; r <= lastRow; r++) {
					Row row = sheet.getRow(r);
					if (row == null) {
						continue;
					}

					// no columns defined, so take every cell of this row as one set
					// NOTE(review): once filled in, this set is reused for later rows and sheets too
					if (columns.isEmpty()) {
						short len = row.getLastCellNum();
						if (len > 0) {
							List<Integer> cols = new ArrayList<Integer>();
							for (int i = 0; i < len; i++) {
								cols.add(i);
							}
							columns.add(cols);
						}
					}

					for (List<Integer> columnsSet : columns) {
						String contents = columnsSet.isEmpty() ? getValue(row, "\t") : getValue(row, columnsSet, "\t");
						if (contents.isEmpty()) {
							continue;
						}
						String location = (k+1)+"."+StringUtils.join(columnsSet, "+")+"."+(r+1);

						// the location is the title unless title columns yield something
						String title = location;
						if (!titles.isEmpty() && columns.size() == 1) {
							List<String> currentTitles = new ArrayList<String>();
							for (List<Integer> titleSet : titles) {
								String t = getValue(row, titleSet, " ");
								if (!t.isEmpty()) {
									currentTitles.add(t);
								}
							}
							if (!currentTitles.isEmpty()) {
								title = StringUtils.join(currentTitles, " ");
							}
						}

						List<String> currentAuthors = new ArrayList<String>();
						if (!authors.isEmpty() && columns.size() == 1) {
							for (List<Integer> set : authors) {
								String author = getValue(row, set, " ").trim();
								if (!author.isEmpty()) {
									currentAuthors.add(author);
								}
							}
						}

						xlsStoredDocumentSources.add(getChild(metadata, id, contents, location, title, currentAuthors));
					}
				}
			}
		} finally {
			wb.close();
		}
		return xlsStoredDocumentSources;
	}

	/**
	 * Joins the values of all cells of the row (index 0 up to the last cell).
	 *
	 * @param row the row to read
	 * @param separator the string placed between values
	 * @return the joined values, or an empty string if the row has no cells
	 */
	private String getValue(Row row, String separator) {
		short len = row.getLastCellNum();
		if (len <= 0) {
			return "";
		}
		List<Integer> cells = new ArrayList<Integer>();
		for (int i = 0; i < len; i++) {
			cells.add(i);
		}
		return getValue(row, cells, separator);
	}

	/**
	 * Collects the non-empty values of the given cells of a row.
	 *
	 * @param row the row to read; may be null, which yields an empty list
	 * @param cells the cell indices to read; negative indices are skipped
	 * @return the non-empty cell values, in the order of {@code cells}
	 */
	private List<String> getValues(Row row, List<Integer> cells) {
		List<String> strings = new ArrayList<String>();
		if (row == null) {
			return strings;
		}
		for (int i : cells) {
			// a user-supplied column "0" is decremented to -1; skip it here,
			// as getValue(Sheet, int, int) already does
			if (i < 0) {
				continue;
			}
			Cell cell = row.getCell(i);
			if (cell != null) {
				String s = getValue(cell);
				if (s != null && !s.isEmpty()) {
					strings.add(s);
				}
			}
		}
		return strings;
	}

	/**
	 * Joins the non-empty values of the given cells of a row.
	 *
	 * @param row the row to read
	 * @param cells the cell indices to read
	 * @param separator the string placed between values
	 * @return the joined values (empty if none of the cells has a value)
	 */
	private String getValue(Row row, List<Integer> cells, String separator) {
		return StringUtils.join(getValues(row, cells), separator);
	}

	/**
	 * Reads a single cell of a sheet, considering string cells only.
	 *
	 * @param sheet the sheet to read
	 * @param rowIndex the row index
	 * @param cellIndex the cell index within the row
	 * @return the trimmed string value, or an empty string if an index is negative,
	 *         the row or cell does not exist, or the cell is not a string cell
	 */
	private String getValue(Sheet sheet, int rowIndex, int cellIndex) {
		if (rowIndex < 0 || cellIndex < 0) {
			return "";
		}
		Row row = sheet.getRow(rowIndex);
		if (row == null) {
			return "";
		}
		Cell cell = row.getCell(cellIndex);
		if (cell == null || cell.getCellType() != Cell.CELL_TYPE_STRING) {
			return "";
		}
		String value = getValue(cell);
		return value == null ? "" : value.trim();
	}

	/**
	 * Converts a cell to text: the string value for string cells, the formula
	 * text for formula cells and the decimal representation for numeric cells.
	 *
	 * @param cell the cell to convert; may be null
	 * @return the trimmed text, or null for a null cell or any other cell type
	 */
	private String getValue(Cell cell) {
		if (cell != null) {
			switch (cell.getCellType()) {
				case Cell.CELL_TYPE_STRING:
					return cell.getStringCellValue().trim();
				case Cell.CELL_TYPE_FORMULA:
					return cell.getCellFormula().trim();
				case Cell.CELL_TYPE_NUMERIC:
					return String.valueOf(cell.getNumericCellValue()).trim();
				default:
					break;
			}
		}
		return null;
	}

	/**
	 * Parses the values of a parameter into groups of integers. Groups are
	 * separated by commas and the members of a group by plus signs, so
	 * {@code "1,2+3"} yields {@code [[1], [2, 3]]} before any decrement.
	 * Blank parameter values are ignored.
	 *
	 * @param key the name of the parameter to read
	 * @param decrement whether to subtract one from every number
	 * @return the parsed groups, empty if the parameter is absent or blank
	 * @throws IllegalArgumentException if a group member is not an integer
	 */
	private List<List<Integer>> getInts(String key, boolean decrement) {
		List<List<Integer>> outerList = new ArrayList<List<Integer>>();
		int offset = decrement ? -1 : 0;
		for (String string : parameters.getParameterValues(key)) {
			String trimmed = string.trim();
			// a blank value means "not specified" rather than an invalid number
			if (trimmed.isEmpty()) {
				continue;
			}
			for (String set : trimmed.split(",")) {
				List<Integer> innerList = new ArrayList<Integer>();
				for (String s : set.trim().split("\\+")) {
					try {
						innerList.add(Integer.parseInt(s.trim()) + offset);
					} catch (NumberFormatException e) {
						throw new IllegalArgumentException(key+" parameter should only contain numbers: "+string, e);
					}
				}
				if (!innerList.isEmpty()) {
					outerList.add(innerList);
				}
			}
		}
		return outerList;
	}

	/**
	 * Stores a plain-text child document derived from the spreadsheet.
	 *
	 * @param parentMetadata the metadata of the spreadsheet
	 * @param parentId the id of the spreadsheet
	 * @param string the text content of the child
	 * @param location the location of the content within the spreadsheet; combined
	 *        with the parent id to derive the child's id
	 * @param title the title to set on the child
	 * @param authors the authors to set on the child; may be null or empty
	 * @return the stored child document source
	 * @throws IOException if the child cannot be stored
	 */
	private StoredDocumentSource getChild(DocumentMetadata parentMetadata, String parentId, String string,
			String location, String title, List<String> authors) throws IOException {
		DocumentMetadata metadata = parentMetadata.asParent(parentId, DocumentMetadata.ParentType.EXPANSION);
		metadata.setModified(parentMetadata.getModified());
		metadata.setSource(Source.STRING);
		metadata.setLocation(location);
		metadata.setTitle(title);
		if (authors != null && !authors.isEmpty()) {
			metadata.setAuthors(authors.toArray(new String[0]));
		}
		metadata.setDocumentFormat(DocumentFormat.TEXT);
		String id = DigestUtils.md5Hex(parentId + location);
		InputSource inputSource = new StringInputSource(id, metadata, string);
		return storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
	}
}