FileDataRowReader.java example

Explorer
rapidminer-5-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2014 by RapidMiner and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapidminer.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.example.table;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;

import com.rapidminer.example.Attribute;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.att.AttributeDataSource;


/**
 * <p>
 * FileDataRowReader implements a DataRowReader that reads DataRows from a file.
 * This is the main data reader for many file formats (including csv) and is
 * used by the ExampleSource operator and the attribute editor.
 * </p>
 * 
 * <p>
 * This class supports the reading of data from multiple source files. Each
 * attribute (including special attributes like labels, weights, ...) might be
 * read from another file. Please note that only the minimum number of lines of
 * all files will be read, i.e. if one of the data source files has fewer lines
 * than the others, only this number of data rows will be read.
 * </p>
 * 
 * <p>
 * The split points can be defined with regular expressions (please refer to the
 * Java API). Quoting is possible but not suggested since the runtime is higher.
 * The user should ensure that the split characters are not included in the data
 * columns. Please refer to {@link RapidMinerLineReader} for further information.
 * </p>
 * 
 * <p>
 * Unknown attribute values can be marked with empty strings or "?".
 * </p>
 * 
 * <p>
 * All underlying file readers are closed as soon as the end of any source file
 * is reached. They are also closed if opening one of the source files fails
 * during construction.
 * </p>
 * 
 * @author Ingo Mierswa
 */
public class FileDataRowReader extends AbstractDataRowReader {

	/** Position of the file reader index inside an entry of {@link #dataSourceIndex}. */
	private static final int FILE_NR = 0;

	/** Position of the column index inside an entry of {@link #dataSourceIndex}. */
	private static final int COLUMN_NR = 1;

	/** The file readers, one per distinct source file. */
	private BufferedReader[] fileReader;

	/** The attribute descriptions. */
	private final Attribute[] attributes;

	/** Remember if an end of file has occurred. */
	private boolean eof;

	/** Remember if a line has already been read but not yet been delivered by {@link #next()}. */
	private boolean lineRead;

	/** The sample ratio. Only consulted if {@link #maxNumber} is -1. */
	private final double sampleRatio;

	/** The maximum number of examples to read (sampling), -1 for no limit. */
	private final int maxNumber;

	/** The number of lines read so far (i.e. the number of examples). */
	private int linesRead = 0;

	/**
	 * This array holds the current data. The first dimension is used for
	 * distinguishing different sources and the second for data read from the
	 * corresponding source.
	 */
	private String[][] currentData;

	/**
	 * For each data source, the number of attributes that are read from it.
	 * The value is handed to the {@link RapidMinerLineReader} with every line
	 * request; presumably the line reader uses it to validate the number of
	 * columns (see {@link RapidMinerLineReader} for the exact behavior).
	 */
	private int[] expectedNumberOfColumns;

	/** This reader maps lines read from a file to RapidMiner columns. */
	private final RapidMinerLineReader rapidMinerLineReader;

	/** The random generator used for sampling. */
	private final RandomGenerator random;

	/**
	 * Array of size [number of attributes][2]. For each attribute i the value
	 * of dataSourceIndex[i][FILE_NR] is used as an index to {@link #fileReader}
	 * and the value of dataSourceIndex[i][COLUMN_NR] specifies the index of the
	 * column to use for attribute i. A column index of -1 means that no value
	 * is read for the attribute (it is passed on as <code>null</code>).
	 */
	private final int[][] dataSourceIndex;

	/**
	 * Constructs a new FileDataRowReader.
	 * 
	 * @param factory
	 *            Factory used to create data rows.
	 * @param attributeDataSources
	 *            List of {@link AttributeDataSource}s.
	 * @param sampleRatio
	 *            the ratio of examples which will be read. Only used if
	 *            sampleSize is -1.
	 * @param sampleSize
	 *            Limit sample to the first sampleSize lines read from files. -1
	 *            for no limit, then the sampleRatio will be used.
	 * @param separatorsRegExpr
	 *            a regular expression describing the separator characters for
	 *            the columns of each line
	 * @param commentChars
	 *            defines which characters are used to comment the rest of a
	 *            line
	 * @param useQuotes
	 *            indicates if quotes should be used and parsed. Slows down
	 *            reading and should be avoided if possible
	 * @param quoteChar
	 *            the quote character, forwarded to the {@link RapidMinerLineReader}
	 * @param escapeChar
	 *            the escape character, forwarded to the {@link RapidMinerLineReader}
	 * @param trimLines
	 *            forwarded to the {@link RapidMinerLineReader}
	 * @param skipErrorLines
	 *            forwarded to the {@link RapidMinerLineReader}
	 * @param encoding
	 *            the character encoding used to open the source files
	 * @param random
	 *            the random generator used for sampling
	 * @throws IOException
	 *             if one of the source files cannot be opened. Readers which
	 *             were already opened are closed before the exception is
	 *             propagated.
	 */
	public FileDataRowReader(DataRowFactory factory, List<AttributeDataSource> attributeDataSources, double sampleRatio, int sampleSize, String separatorsRegExpr, char[] commentChars, boolean useQuotes, char quoteChar, char escapeChar, boolean trimLines, boolean skipErrorLines, Charset encoding, RandomGenerator random) throws IOException {
		super(factory);
		this.sampleRatio = sampleRatio;
		this.maxNumber = sampleSize;
		this.attributes = new Attribute[attributeDataSources.size()];
		this.dataSourceIndex = new int[attributeDataSources.size()][2];
		this.rapidMinerLineReader = new RapidMinerLineReader(separatorsRegExpr, commentChars, useQuotes, quoteChar, escapeChar, trimLines, skipErrorLines);
		this.random = random;
		initReader(attributeDataSources, encoding);
	}

	/**
	 * Opens one reader per distinct source file and records for every
	 * attribute which reader and which column it is read from.
	 * 
	 * @throws IOException
	 *             if a source file cannot be opened; all readers opened so far
	 *             are closed in that case.
	 */
	private void initReader(List<AttributeDataSource> attributeDataSources, Charset encoding) throws IOException {
		List<BufferedReader> openedReaders = new ArrayList<BufferedReader>();
		Map<File, Integer> fileMap = new HashMap<File, Integer>();
		// there can never be more distinct files than attributes
		int[] columnCounts = new int[attributeDataSources.size()];

		boolean initialized = false;
		try {
			int attribute = 0;
			for (AttributeDataSource ads : attributeDataSources) {
				attributes[attribute] = ads.getAttribute();
				File file = ads.getFile();
				Integer fileIndex = fileMap.get(file);
				// new file found? -> create reader and map to index number
				if (fileIndex == null) {
					openedReaders.add(Tools.getReader(file, encoding));
					fileIndex = Integer.valueOf(openedReaders.size() - 1);
					fileMap.put(file, fileIndex);
				}
				columnCounts[fileIndex.intValue()]++;
				dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue();
				dataSourceIndex[attribute][COLUMN_NR] = ads.getColumn();
				attribute++;
			}
			initialized = true;
		} finally {
			if (!initialized) {
				// do not leak the readers opened before the failure; a close
				// problem is ignored here so the original exception is not masked
				closeAll(openedReaders.toArray(new BufferedReader[openedReaders.size()]));
			}
		}

		this.fileReader = openedReaders.toArray(new BufferedReader[openedReaders.size()]);
		this.currentData = new String[this.fileReader.length][];
		this.expectedNumberOfColumns = new int[this.fileReader.length];
		System.arraycopy(columnCounts, 0, this.expectedNumberOfColumns, 0, this.fileReader.length);
	}

	/**
	 * Tries to close every given reader, even if closing one of them fails.
	 * 
	 * @return the first exception raised while closing, or <code>null</code>
	 *         if all readers were closed without problems
	 */
	private static IOException closeAll(BufferedReader[] readers) {
		IOException firstFailure = null;
		for (BufferedReader reader : readers) {
			if (reader == null) {
				continue;
			}
			try {
				reader.close();
			} catch (IOException e) {
				if (firstFailure == null) {
					firstFailure = e;
				}
			}
		}
		return firstFailure;
	}

	/**
	 * Skips the next line, if present. Problems during reading are logged as
	 * a warning and otherwise ignored. If the end of the input is reached
	 * while skipping, this is remembered so that {@link #hasNext()} returns
	 * false and {@link #next()} returns <code>null</code> afterwards.
	 */
	public void skipLine() {
		if (eof) {
			// the readers have already been closed, there is nothing left to skip
			return;
		}
		try {
			if (!readLine()) {
				// record the end of input; otherwise a later hasNext() would read
				// from the closed readers and next() could hit a null data row
				eof = true;
				lineRead = true;
			}
		} catch (Exception e) {
			//LogService.getGlobal().log("Problem during skipping of line: " + e.getMessage(), LogService.WARNING);
			LogService.getRoot().log(Level.WARNING, "com.rapidminer.example.table.FileDataRowReader.problem_during_skipping_of_line", e.getMessage());
		}
	}

	/**
	 * Reads a line of data from all file readers. Returns true if the line was
	 * readable, i.e. the end of the source files was not yet reached. A
	 * <code>null</code> result of the line reader for any source is treated as
	 * end of file; in that case all readers are closed.
	 * 
	 * If no sample size is set and the sample ratio differs from 1, lines are
	 * read and dropped until the random generator accepts one.
	 */
	private boolean readLine() throws IOException {
		boolean eofReached = false;
		boolean accepted = false;
		while (!accepted) {
			for (int i = 0; i < fileReader.length; i++) {
				currentData[i] = rapidMinerLineReader.readLine(fileReader[i], expectedNumberOfColumns[i]);
				if (currentData[i] == null) {
					eofReached = true;
					break;
				}
			}
			// keep the evaluation order: the random generator must only be
			// consulted when ratio based sampling is actually active
			accepted = eofReached || (maxNumber != -1) || (sampleRatio == 1.0d) || (random.nextDouble() < sampleRatio);
		}
		if (eofReached) {
			// attempt to close every reader before reporting a close failure
			IOException closeFailure = closeAll(fileReader);
			if (closeFailure != null) {
				throw closeFailure;
			}
			return false;
		}
		return true;
	}

	/**
	 * Checks if another line exists and reads. The next line is only read once
	 * even if this method is invoked more than once. Returns false as soon as
	 * the maximum number of examples has been delivered. If reading fails with
	 * an IOException, the error is logged and false is returned.
	 */
	public boolean hasNext() {
		if ((maxNumber > -1) && (linesRead >= maxNumber))
			return false;

		if (lineRead)
			return !eof;

		try {
			eof = !readLine();
		} catch (IOException e) {
			LogService.getGlobal().log(e.getMessage(), LogService.ERROR);
			return false;
		}
		lineRead = true;

		return (!eof);
	}

	/**
	 * Returns the next Example, or <code>null</code> if no further line is
	 * available.
	 */
	public DataRow next() {
		if (eof)
			return null;
		if (!lineRead && !hasNext())
			return null;

		String[] data = new String[attributes.length];
		for (int i = 0; i < attributes.length; i++) {
			int column = dataSourceIndex[i][COLUMN_NR];
			if (column == -1) {
				// no column assigned to this attribute
				data[i] = null;
			} else {
				data[i] = currentData[dataSourceIndex[i][FILE_NR]][column];
			}
		}

		DataRow dataRow = getFactory().create(data, attributes);
		linesRead++;
		lineRead = false;
		return dataRow;
	}
}