ExcelExampleSource.java example

Explorer
rapidminer-5-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2014 by RapidMiner and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapidminer.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.io;

import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeSet;

import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;

import com.rapidminer.gui.tools.dialogs.wizards.dataimport.excel.ExcelImportWizard;
import com.rapidminer.operator.Annotations;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeConfiguration;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.UndefinedParameterError;

/**
 * 
 * <p>
 * This operator can be used to load data from Microsoft Excel spreadsheets.
 * This operator is able to reads data from Excel 95, 97, 2000, XP, and 2003.
 * The user has to define which of the spreadsheets in the workbook should be
 * used as data table. The table must have a format so that each line is an
 * example and each column represents an attribute. Please note that the first
 * line might be used for attribute names which can be indicated by a parameter.
 * </p>
 * 
 * <p>
 * The data table can be placed anywhere on the sheet and is allowed to contain
 * arbitrary formatting instructions, empty rows, and empty columns. Missing
 * data values are indicated by empty cells or by cells containing only
 * "?".
 * </p>
 * 
 * @author Ingo Mierswa, Tobias Malbrecht, Sebastian Loh
 */
public class ExcelExampleSource extends AbstractDataReader {

	/** Pseudo-annotation to be used for attribute names. */
	public static final String ANNOTATION_NAME = Annotations.ANNOTATION_NAME;

	/**
	 * Pseudo-annotation to be used for original attribute names which are read
	 * from the ANNOTATION_NAME line and later were changed by the user
	 */
	public static final String ANNOTATION_NAMES_FROM_DOCUMENT = "Name from document";

	/**
	 * The parameter name for "The Excel spreadsheet file which should be
	 * loaded."
	 */
	public static final String PARAMETER_EXCEL_FILE = "excel_file";

//	static {
//		AbstractReader.registerReaderDescription(new ReaderDescription("xls", ExcelExampleSource.class, PARAMETER_EXCEL_FILE));
//	}

	/**
	 * The parameter name for "The number of the sheet which should be
	 * imported."
	 */
	public static final String PARAMETER_SHEET_NUMBER = "sheet_number";

	/**
	 * The parameter name for "Indicates if the first row should be used
	 * for the attribute names."
	 */
	public static final String PARAMETER_FIRST_ROW_AS_NAMES = "first_row_as_names";

	/**
	 * The parameter name for "Indicates which column should be used for
	 * the label attribute (0: no label)"
	 */
	public static final String PARAMETER_LABEL_COLUMN = "label_column";

	/**
	 * The parameter name for "Indicates which column should be used for
	 * the Id attribute (0: no id)"
	 */
	public static final String PARAMETER_ID_COLUMN = "id_column";

	/**
	 * The parameter name for "Determines, how the data is represented
	 * internally."
	 */
	public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";

	public static final String PARAMETER_COLUMN_OFFSET = "column_offset";

	public static final String PARAMETER_ROW_OFFSET = "row_offset";

	public static final String PARAMETER_CREATE_LABEL = "create_label";

	public static final String PARAMETER_CREATE_ID = "create_id";

	public static final String PARAMETER_ANNOTATIONS = "annotations";

	private Workbook workbook = null;

	private boolean keepWorkbookOpen = false;

	private ExcelDataSet cacheDataSet = null;

	private boolean skipAnnotationRows = false;

	private int nameRowF = -1;

	private Map<Integer, String> annotationsMap;

	private Set<Integer> annotationRows;

	public ExcelExampleSource(OperatorDescription description) {
		super(description);
		getParameters().addObserver(new CacheResetParameterObserver(PARAMETER_EXCEL_FILE), false);
	}

//	@Override
//	public ExampleSet createExampleSet() throws OperatorException {
////		if (getParameterAsFile(PARAMETER_CACHED_EXCEL_FILE).equals(getParameterAsFile(PARAMETER_EXCEL_FILE)) && attributeNamesDefinedByUser()) {
//		if (attributeNamesDefinedByUser()){
//			loadMetaDataFromParameters();
//		}
//		return super.createExampleSet();
//	}

	@Override
	public void writeMetaDataInParameter() {
		try {
			// clean up annotations in case attribute names are defined manually
			List<String[]> annotations = getParameterList(PARAMETER_ANNOTATIONS);
			List<String[]> cleanedAnnotations = new LinkedList<String[]>();
			// annotation name needs to be changed into
			// ANNOTATION_NAMES_FROM_DOCUMENT since the reader shall
			// not overwrite the user defined attribute name by names which
			// might occur in the name row.
			for (String[] pair : annotations) {
				if (!pair[1].equals(Annotations.ANNOTATION_NAME)) {
					cleanedAnnotations.add(pair);
				} else {
					pair[1] = ExcelExampleSource.ANNOTATION_NAMES_FROM_DOCUMENT;
					cleanedAnnotations.add(pair);
				}
			}
			setParameter(PARAMETER_ANNOTATIONS, ParameterTypeList.transformList2String(cleanedAnnotations));
//			setParameter(ExcelExampleSource.PARAMETER_CACHED_EXCEL_FILE, getParameter(PARAMETER_EXCEL_FILE));
		} catch (UndefinedParameterError e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		super.writeMetaDataInParameter();
	}

	/**
	 * inform the excel reader to keep the workbook open due to performance
	 * reasons. The workbook has to be closed then manually.
	 * 
	 * @see ExcelExampleSource#closeWorkbook()
	 */
	public void keepWorkbookOpen() {
		keepWorkbookOpen = true;
	}

	/**
	 * closes the the excel workbook
	 */
	public void closeWorkbook() {
		if (workbook != null) {
			workbook.close();
		}
	}

	public void resetWorkbook() {
		closeWorkbook();
		workbook = null;
		cacheDataSet = null;
	}

	public void skipNameAnnotationRow(boolean flag) {
		skipAnnotationRows = flag;
	}

	@Override
	protected ExcelDataSet getDataSet() throws OperatorException {

		if (cacheDataSet != null) {
			cacheDataSet.setCurrentRow(0);
			return cacheDataSet;
		}

		List<String[]> allAnnotations = getParameterList(PARAMETER_ANNOTATIONS);
		annotationRows = new HashSet<Integer>();
		annotationsMap = new HashMap<Integer, String>();
		boolean nameFound = false;
		int lastAnnotatedRow = -1;
		int nameRow = -1;
		for (String[] pair : allAnnotations) {
			try {
				final int row = Integer.parseInt(pair[0]);
				if (row > lastAnnotatedRow) {
					lastAnnotatedRow = row;
				}
				annotationsMap.put(row, pair[1]);
				if (Annotations.ANNOTATION_NAME.equals(pair[1])) {
					nameFound = true;
					nameRow = row;
				}
				annotationRows.add(row);
			} catch (NumberFormatException e) {
				throw new OperatorException("row_number entries in parameter list " + PARAMETER_ANNOTATIONS + " must be integers.", e);
			}
		}
		if (nameFound && getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES)) {
			throw new OperatorException("If " + PARAMETER_FIRST_ROW_AS_NAMES + " is set to true, you cannot use " + Annotations.ANNOTATION_NAME
					+ " entries in parameter list " + PARAMETER_ANNOTATIONS + ".");
		}
		if (getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES)) {
			annotationsMap.put(0, Annotations.ANNOTATION_NAME);
			annotationRows.add(0);
			nameRow = 0;
		}
		// final int lastAnnotatedRowF = lastAnnotatedRow + 1; //+1 since the
		// last annotated row itself must be ignored
		nameRowF = nameRow;

		return new ExcelDataSet();
	}

	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = new LinkedList<ParameterType>();
		ParameterType type = new ParameterTypeConfiguration(ExcelImportWizard.ExcelExampleSourceConfigurationWizardCreator.class, this);
		type.setExpert(false);
		types.add(type);
		types.add(new ParameterTypeFile(PARAMETER_EXCEL_FILE, "Name of the excel file to read the data from.", "xls", false));
		types.add(new ParameterTypeInt(PARAMETER_SHEET_NUMBER, "The number of the sheet which should be imported.", 1, Integer.MAX_VALUE, 1, false));
		types.add(new ParameterTypeInt(PARAMETER_ROW_OFFSET, "The number of rows to skip at top of sheet as they contain no usable data.", 0, 65535,
				0, true));
		types.add(new ParameterTypeInt(PARAMETER_COLUMN_OFFSET,
				"The number of columns to skip at left side of sheet as they contain no usable data.", 0, 255, 0, true));
		types.add(new ParameterTypeBoolean(PARAMETER_FIRST_ROW_AS_NAMES, "Indicates if the first row should be used for the attribute names.", true,
				true));
		List<String> annotations = new LinkedList<String>();
		annotations.add(Annotations.ANNOTATION_NAME);
		annotations.addAll(Arrays.asList(Annotations.ALL_KEYS_ATTRIBUTE));
		types.add(new ParameterTypeList(PARAMETER_ANNOTATIONS, "Maps row numbers to annotation names.", new ParameterTypeInt("row_number",
				"Row number which contains an annotation", 0, Integer.MAX_VALUE), new ParameterTypeCategory("annotation",
				"Name of the annotation to assign this row.", annotations.toArray(new String[annotations.size()]), 0)));
		types.addAll(super.getParameterTypes());
		return types;
	}

	/**
	 * 
	 * Dedicated DataSet class for ExcelDataSets. It is need too cache the the
	 * DataSet in the {@link ExcelExampleSource#getDataSet()} method.
	 * 
	 * @author Sebastian Loh (22.06.2010)
	 * 
	 */
	private class ExcelDataSet extends DataSet {

		private Sheet sheet = null;

		private Cell[] cells = null;

		private SortedSet<Integer> emptyRows = new TreeSet<Integer>();

		private SortedSet<Integer> emptyColumns = new TreeSet<Integer>();

		private int rowOffset = 0;
		private int columnOffset = 0;
		private int numberOfRows = 0;
		private int numberOfColumns = 0;
		private int currentRow;

		public ExcelDataSet() throws OperatorException {

			rowOffset = getParameterAsInt(PARAMETER_ROW_OFFSET);
			columnOffset = getParameterAsInt(PARAMETER_COLUMN_OFFSET);
			currentRow = rowOffset;
			
			// load the excelWorkbook if it is not set
			if (workbook == null || true) {
				try {
					workbook = Workbook.getWorkbook(getParameterAsInputStream(PARAMETER_EXCEL_FILE));
				} catch (IOException e) {
					throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), e.getMessage());
				} catch (BiffException e) {
					throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), e.getMessage());
				}
			}
			try{
				sheet = workbook.getSheet(getParameterAsInt(PARAMETER_SHEET_NUMBER) - 1);
			} catch (IndexOutOfBoundsException e) {
				throw new UserError(ExcelExampleSource.this, 953, getParameter(PARAMETER_SHEET_NUMBER));
			}
			numberOfColumns = sheet.getColumns();
			numberOfRows = sheet.getRows();

			// TODO unifiy offset and emptiness checks in one loop
			// determine offsets
			boolean contentFound = false;
			for (int r = rowOffset; r < numberOfRows; r++) {
				for (int c = columnOffset; c < numberOfColumns; c++) {
					if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) {
						columnOffset = c;
						contentFound = true;
						break;
					}
				}
				if (contentFound) {
					rowOffset = r;
					break;
				}
			}
			if (!contentFound) {
				throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), "spreadsheet seems to be empty");
			}

			// determine empty rows
			for (int r = rowOffset; r < numberOfRows; r++) {
				boolean rowEmpty = true;
				for (int c = columnOffset; c < numberOfColumns; c++) {
					if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) {
						rowEmpty = false;
						break;
					}
				}
				if (rowEmpty) {
					emptyRows.add(r);
				}
			}

			// determine empty columns
			for (int c = columnOffset; c < numberOfColumns; c++) {
				boolean columnEmpty = true;
				for (int r = rowOffset; r < numberOfRows; r++) {
					if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) {
						columnEmpty = false;
						break;
					}
				}
				if (columnEmpty) {
					emptyColumns.add(c);
				}
			}

			// set attribute names
			if (nameRowF != -1) {
				String[] attributeNames = new String[numberOfColumns - columnOffset - emptyColumns.size()];
				int columnCounter = 0;
				for (int c = columnOffset; c < numberOfColumns; c++) {
					// skip empty columns
					if (!emptyColumns.contains(c)) {
						Cell cell = sheet.getCell(c, rowOffset + nameRowF);
						attributeNames[columnCounter++] = cell.getContents();
					}
				}
				setAttributeNames(attributeNames);
				currentRow++;
			}

			// Annotations
			int columnCounter = 0;
			Annotations[] annotations = new Annotations[numberOfColumns - columnOffset - emptyColumns.size()];
			boolean foundAnnotations = false;
			for (int c = columnOffset; c < numberOfColumns; c++) {
				// skip empty columns
				if (emptyColumns.contains(c))
					continue;

				annotations[columnCounter] = new Annotations();
				for (Map.Entry<Integer, String> entry : annotationsMap.entrySet()) {
					if (Annotations.ANNOTATION_NAME.equals(entry.getValue())) {
						continue;
					} else {
						Cell cell = sheet.getCell(c, rowOffset + entry.getKey());
						annotations[columnCounter].put(entry.getValue(), cell.getContents());
						foundAnnotations = true;
					}
				}
				columnCounter++;
			}
			if (foundAnnotations)
				setAnnotations(annotations);
		}

		@Override
		public int getNumberOfColumnsInCurrentRow() {
			return numberOfColumns - columnOffset - emptyColumns.size();
		}

		@Override
		public boolean isMissing(int columnIndex) {
			return cells[columnIndex].getType() == CellType.EMPTY || cells[columnIndex].getType() == CellType.ERROR
					|| cells[columnIndex].getType() == CellType.FORMULA_ERROR || cells[columnIndex].getContents() == null
					|| "".equals(cells[columnIndex].getContents().trim());
		}

		@Override
		public Number getNumber(int columnIndex) {
			try {
				if (cells[columnIndex].getType() == CellType.NUMBER)
					return Double.valueOf(((NumberCell) cells[columnIndex]).getValue());
				else
					return Double.valueOf(cells[columnIndex].getContents());
			} catch (ClassCastException e) {
			} catch (NumberFormatException e) {}
			return null;
		}

		@Override
		public Date getDate(int columnIndex) {
			try {
				Date date = ((DateCell) cells[columnIndex]).getDate();
				if (date == null) {
					return null;
				}
				int offset = TimeZone.getDefault().getOffset(date.getTime());
				return new Date(date.getTime() - offset);
			} catch (ClassCastException e) {
			}
			return null;
		}

		@Override
		public String getString(int columnIndex) {
			return cells[columnIndex].getContents();
		}

		@Override
		public boolean next() {
			while ((emptyRows.contains(currentRow) || (skipAnnotationRows && annotationRows.contains(currentRow))) && currentRow < numberOfRows) {
				currentRow++;
			}
			if (currentRow >= numberOfRows) {
				return false;
			}
			cells = new Cell[numberOfColumns - columnOffset - emptyColumns.size()];
			int columnCounter = 0;
			for (int c = columnOffset; c < numberOfColumns; c++) {
				if (emptyColumns.contains(c)) {
					continue;
				}
				// sheet.getRow(currentRow);
				cells[columnCounter] = sheet.getCell(c, currentRow);
				columnCounter++;
			}
			currentRow++;
			return true;
		}

		@Override
		public void close() throws OperatorException {
			if (!keepWorkbookOpen) {
				workbook.close();
			}
		}

		public void setCurrentRow(int currentRow) {
			this.currentRow = currentRow;
		}
	}
}