/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.TimeZone;
import java.util.logging.Level;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.tools.I18N;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.ProgressListener;
import com.rapidminer.tools.Tools;
/**
* A DataResultSet for an Excel 2007 File.
*
* @author Marco Boeck
*
*/
public class Excel2007ResultSet implements DataResultSet {
private Sheet sheet = null;
private boolean[] emptyRows;
private boolean[] emptyColumns;
private int rowOffset = 0;
private int columnOffset = 0;
private int totalNumberOfRows = 0;
private int totalNumberOfColumns = 0;
private int currentRow;
private Cell[] currentRowCells;
private Workbook workbook;
private InputStream workbookInputStream;
//private ExcelResultSetConfiguration configuration;
private String[] attributeNames;
private String timeZone;
private String dateFormat;
/**
* The constructor to build an ExcelResultSet from the given configuration. The calling operator might be null. It
* is only needed for error handling.
*/
public Excel2007ResultSet(Operator callingOperator, ExcelResultSetConfiguration configuration) throws OperatorException {
// reading configuration
columnOffset = configuration.getColumnOffset();
rowOffset = configuration.getRowOffset();
currentRow = configuration.getRowOffset() - 1;
timeZone = configuration.getTimezone();
dateFormat = configuration.getDatePattern();
try {
workbookInputStream = new FileInputStream(configuration.getFile());
workbook = WorkbookFactory.create(workbookInputStream);
} catch (Exception e1) {
throw new UserError(callingOperator, "file_consumer.error_loading_file");
}
// check range
if (columnOffset > configuration.getColumnLast() || rowOffset > configuration.getRowLast() || columnOffset < 0 || rowOffset < 0)
throw new UserError(callingOperator, 223, Tools.getExcelColumnName(columnOffset) + rowOffset + ":" + Tools.getExcelColumnName(configuration.getColumnLast()) + configuration.getRowLast());
// check file presence
if (configuration.getFile() == null) {
throw new UserError(callingOperator, "file_consumer.no_file_defined");
}
try {
sheet = workbook.getSheetAt(configuration.getSheet());
} catch (IndexOutOfBoundsException e) {
throw new UserError(callingOperator, 953, configuration.getSheet());
}
Row row = sheet.getRow(sheet.getFirstRowNum());
if (row == null) {
totalNumberOfColumns = 0;
totalNumberOfRows = 0;
} else {
totalNumberOfColumns = Math.min(configuration.getColumnLast(), sheet.getRow(sheet.getFirstRowNum()).getLastCellNum() - 1) - columnOffset + 1;
totalNumberOfRows = Math.min(configuration.getRowLast(), sheet.getLastRowNum()) - rowOffset + 1;
}
if (totalNumberOfColumns < 0 || totalNumberOfRows < 0)
throw new UserError(callingOperator, 404);
emptyColumns = new boolean[totalNumberOfColumns];
emptyRows = new boolean[totalNumberOfRows];
// filling offsets
Arrays.fill(emptyColumns, true);
Arrays.fill(emptyRows, true);
// determine offsets and emptiness
boolean foundAny = false;
for (int r = 0; r < totalNumberOfRows; r++) {
for (int c = 0; c < totalNumberOfColumns; c++) {
if (emptyRows[r] || emptyColumns[c]) {
final Row rowItem = sheet.getRow(r + rowOffset);
if (rowItem == null) {
continue;
}
final Cell cell = rowItem.getCell(c + columnOffset);
if (cell == null) {
continue;
}
boolean empty;
try {
empty = "".equals(cell.getStringCellValue().trim());
} catch (IllegalStateException e) {
empty = false;
}
if (!empty) {
foundAny = true;
emptyRows[r] = false;
emptyColumns[c] = false;
}
}
}
}
if (!foundAny) {
throw new UserError(callingOperator, 302, configuration.getFile().getPath(), "spreadsheet seems to be empty");
}
// retrieve attribute names: first count columns
int numberOfAttributes = 0;
List<Integer> nonEmptyColumnsList = new LinkedList<Integer>();
for (int i = 0; i < totalNumberOfColumns; i++) {
if (!emptyColumns[i]) {
numberOfAttributes++;
nonEmptyColumnsList.add(i);
}
}
// retrieve or generate attribute names
attributeNames = new String[nonEmptyColumnsList.size()];
if (!configuration.isEmulatingOldNames()) {
for (int i = 0; i < numberOfAttributes; i++) {
attributeNames[i] = Tools.getExcelColumnName(nonEmptyColumnsList.get(i));
}
} else {
// emulate old 5.0.x style
for (int i = 0; i < numberOfAttributes; i++) {
if (!emptyColumns[i]) {
attributeNames[i] = "attribute_" + i;
}
}
}
}
@Override
public void reset(ProgressListener listener) {
currentRow = rowOffset - 1;
if (listener != null) {
listener.setTotal(totalNumberOfRows);
listener.setCompleted(0);
}
}
@Override
public boolean hasNext() {
int nextRow = currentRow + 1;
while (nextRow < totalNumberOfRows + rowOffset && emptyRows[nextRow - rowOffset])
nextRow++;
return nextRow < totalNumberOfRows + rowOffset;
}
@Override
public void next(ProgressListener listener) {
currentRow++;
while (currentRow < totalNumberOfRows + rowOffset && emptyRows[currentRow - rowOffset]) {
currentRow++;
}
if (currentRow >= totalNumberOfRows + rowOffset) {
throw new NoSuchElementException("No further row in excel sheet.");
}
currentRowCells = new Cell[attributeNames.length];
int columnCounter = 0;
for (int c = 0; c < totalNumberOfColumns; c++) {
if (!emptyColumns[c]) {
currentRowCells[columnCounter] = sheet.getRow(currentRow).getCell(c + columnOffset);
columnCounter++;
}
}
// notifying progress listener
if (listener != null) {
listener.setCompleted(currentRow);
}
}
@Override
public void close() throws OperatorException {
try {
if (workbookInputStream != null) {
workbookInputStream.close();
workbookInputStream = null;
}
} catch (IOException e) {
LogService.getRoot().log(Level.WARNING, I18N.getMessage(LogService.getRoot().getResourceBundle(), "com.rapidminer.operator.nio.model.ExcelResultSetConfiguration.close_workbook_error", e.getMessage()), e);
}
}
@Override
public int getNumberOfColumns() {
return attributeNames.length;
}
@Override
public String[] getColumnNames() {
return attributeNames;
}
@Override
public boolean isMissing(int columnIndex) {
Cell cell = getCurrentCell(columnIndex);
try {
boolean missing = cell == null || cell.getCellType() == Cell.CELL_TYPE_BLANK ||
cell.getCellType() == Cell.CELL_TYPE_ERROR ||
"".equals(cell.getStringCellValue().trim());
return missing;
} catch (IllegalStateException e) {
return false;
}
}
/**
* Gets the cell with the given index in the current row.
* @param index
* @return
*/
private Cell getCurrentCell(int index) {
// return currentRowCells[index + columnOffset];
return currentRowCells[index];
}
@Override
public Number getNumber(int columnIndex) throws ParseException {
final Cell cell = getCurrentCell(columnIndex);
if (cell == null) {
return Double.NaN;
}
if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC || cell.getCellType() == Cell.CELL_TYPE_FORMULA) {
final double value = cell.getNumericCellValue();
return Double.valueOf(value);
} else {
String valueString = "";
try {
valueString = cell.getStringCellValue();
return Double.valueOf(valueString);
} catch (NumberFormatException e) {
throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_REAL, valueString));
} catch (IllegalStateException e) {
throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_REAL, "CELL_NOT_NUMERIC"));
}
}
}
@Override
public Date getDate(int columnIndex) throws ParseException {
final Cell cell = getCurrentCell(columnIndex);
if (cell == null) {
return null;
}
if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC) {
Date dateCellValue = cell.getDateCellValue();
return dateCellValue;
} else {
try {
String valueString = cell.getStringCellValue();
try {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(dateFormat);
simpleDateFormat.setTimeZone(TimeZone.getTimeZone(this.timeZone));
return simpleDateFormat.parse(valueString);
} catch (java.text.ParseException e) {
throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, valueString));
}
} catch (IllegalStateException e) {
throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, "CELL_NEITHER_NUMERIC_NOR_NOMINAL"));
}
}
}
@Override
public String getString(int columnIndex) {
final Cell cell = getCurrentCell(columnIndex);
if (cell == null) {
return "";
}
if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC) {
String value = String.valueOf(cell.getNumericCellValue());
return value;
} else if (cell.getCellType() == Cell.CELL_TYPE_FORMULA) {
String value = String.valueOf(cell.getNumericCellValue());
return value;
} else {
try {
String value = cell.getStringCellValue();
return value;
} catch (IllegalStateException e) {
return "";
}
}
}
@Override
public int[] getValueTypes() {
return new int[this.attributeNames.length];
}
@Override
public ValueType getNativeValueType(int columnIndex) throws ParseException {
Cell cell = getCurrentCell(columnIndex);
final int type = cell.getCellType();
if (type == Cell.CELL_TYPE_BLANK) {
return ValueType.EMPTY;
} else if (type == Cell.CELL_TYPE_STRING) {
return ValueType.STRING;
} else if (type == Cell.CELL_TYPE_NUMERIC) {
if (HSSFDateUtil.isCellDateFormatted(cell)) {
return ValueType.DATE;
} else {
return ValueType.NUMBER;
}
} else if (type == Cell.CELL_TYPE_FORMULA) {
return ValueType.NUMBER;
} else {
return ValueType.STRING;
}
}
@Override
public int getCurrentRow() {
return currentRow;
}
}