/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.example.table;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import com.rapidminer.example.Attribute;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.att.AttributeDataSource;
/**
 * <p>
 * FileDataRowReader implements a DataRowReader that reads DataRows from a file.
 * This is the main data reader for many file formats (including csv) and is
 * used by the ExampleSource operator and the attribute editor.
 * </p>
 *
 * <p>
 * This class supports the reading of data from multiple source files. Each
 * attribute (including special attributes like labels, weights, ...) might be
 * read from another file. Please note that only the minimum number of lines of
 * all files will be read, i.e. if one of the data source files has fewer lines
 * than the others, only this number of data rows will be read.
 * </p>
 *
 * <p>
 * The split points can be defined with regular expressions (please refer to the
 * Java API). Quoting is possible but not suggested since the runtime is higher.
 * The user should ensure that the split characters are not included in the data
 * columns. Please refer to {@link RapidMinerLineReader} for further information.
 * </p>
 *
 * <p>
 * Unknown attribute values can be marked with empty strings or "?".
 * </p>
 *
 * <p>
 * The underlying file readers are closed as soon as the end of one of the
 * source files is reached, the sample size limit is hit, or reading fails
 * inside {@link #hasNext()}.
 * </p>
 *
 * @author Ingo Mierswa
 */
public class FileDataRowReader extends AbstractDataRowReader {

    /** Position of the file reader index inside an entry of {@link #dataSourceIndex}. */
    private static final int FILE_NR = 0;

    /** Position of the column index inside an entry of {@link #dataSourceIndex}. */
    private static final int COLUMN_NR = 1;

    /** The file readers. */
    private BufferedReader[] fileReader;

    /** The attribute descriptions. */
    private Attribute[] attributes;

    /**
     * Remember if an end of file has occurred. Also set once the readers have
     * been closed for any other reason (sample limit reached, read error in
     * {@link #hasNext()}), so that closed readers are never read again.
     */
    private boolean eof;

    /** Remember if a line has already been read. */
    private boolean lineRead;

    /** The sample ratio. */
    private double sampleRatio = 1.0d;

    /** The maximum number of examples to read (sampling). */
    private int maxNumber = -1;

    /** The number of lines read so far (i.e. the number of examples). */
    private int linesRead = 0;

    /**
     * This array holds the current data. The first dimension is used for
     * distinguishing different sources and the second for data read from the
     * corresponding source.
     */
    private String[][] currentData;

    /**
     * This array holds the information how many columns each data source should
     * provide. Otherwise an IOException will be thrown. This information is
     * only used for checks and error improvement.
     */
    private int[] expectedNumberOfColumns;

    /** This reader maps lines read from a file to RapidMiner columns. */
    private RapidMinerLineReader rapidMinerLineReader;

    /** The random generator used for sampling. */
    private RandomGenerator random;

    /**
     * Array of size [number of attributes][2]. For each attribute i the value
     * of dataSourceIndex[i][FILE_NR] is used as an index to {@link #fileReader}
     * and the value of dataSourceIndex[i][COLUMN_NR] specifies the index of the
     * column to use for attribute i.
     */
    private int[][] dataSourceIndex;

    /**
     * Constructs a new FileDataRowReader and opens one reader per distinct
     * source file.
     *
     * @param factory
     *            Factory used to create data rows.
     * @param attributeDataSources
     *            List of {@link AttributeDataSource}s.
     * @param sampleRatio
     *            the ratio of examples which will be read. Only used if
     *            sampleSize is -1.
     * @param sampleSize
     *            Limit sample to the first sampleSize lines read from files. -1
     *            for no limit, then the sampleRatio will be used.
     * @param separatorsRegExpr
     *            a regular expression describing the separator characters for
     *            the columns of each line
     * @param commentChars
     *            defines which characters are used to comment the rest of a
     *            line
     * @param useQuotes
     *            indicates if quotes should be used and parsed. Slows down
     *            reading and should be avoided if possible
     * @param quoteChar
     *            passed through to the {@link RapidMinerLineReader}
     * @param escapeChar
     *            passed through to the {@link RapidMinerLineReader}
     * @param trimLines
     *            passed through to the {@link RapidMinerLineReader}
     * @param skipErrorLines
     *            passed through to the {@link RapidMinerLineReader}
     * @param encoding
     *            the encoding used when opening the source files
     * @param random
     *            the random generator used for sampling
     * @throws IOException
     *             if opening a reader for one of the source files fails; readers
     *             opened before the failure are closed again
     */
    public FileDataRowReader(DataRowFactory factory, List<AttributeDataSource> attributeDataSources, double sampleRatio, int sampleSize, String separatorsRegExpr, char[] commentChars, boolean useQuotes, char quoteChar, char escapeChar, boolean trimLines, boolean skipErrorLines, Charset encoding, RandomGenerator random) throws IOException {
        super(factory);
        this.sampleRatio = sampleRatio;
        this.maxNumber = sampleSize;
        this.attributes = new Attribute[attributeDataSources.size()];
        this.dataSourceIndex = new int[attributeDataSources.size()][2];
        this.rapidMinerLineReader = new RapidMinerLineReader(separatorsRegExpr, commentChars, useQuotes, quoteChar, escapeChar, trimLines, skipErrorLines);
        this.random = random;
        initReader(attributeDataSources, encoding);
    }

    /**
     * Opens one reader per distinct source file and records, for every
     * attribute, which reader and which column it is read from.
     */
    private void initReader(List<AttributeDataSource> attributeDataSources, Charset encoding) throws IOException {
        // map all files used to indices
        List<BufferedReader> readerList = new ArrayList<BufferedReader>();
        Map<File, Integer> fileMap = new HashMap<File, Integer>();
        List<AtomicInteger> columnCounters = new ArrayList<AtomicInteger>();
        int attribute = 0;
        boolean success = false;
        try {
            for (AttributeDataSource ads : attributeDataSources) {
                attributes[attribute] = ads.getAttribute();
                File file = ads.getFile();
                Integer fileIndex = fileMap.get(file);
                if (fileIndex == null) {
                    // new file found -> create reader and map to index number
                    BufferedReader reader = Tools.getReader(file, encoding);
                    readerList.add(reader);
                    fileIndex = Integer.valueOf(readerList.size() - 1);
                    fileMap.put(file, fileIndex);
                    columnCounters.add(new AtomicInteger(1));
                } else {
                    columnCounters.get(fileIndex.intValue()).incrementAndGet();
                }
                dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue();
                dataSourceIndex[attribute][COLUMN_NR] = ads.getColumn();
                attribute++;
            }
            success = true;
        } finally {
            if (!success) {
                // do not leak the readers that were opened before the failure
                closeAllQuietly(readerList.toArray(new BufferedReader[readerList.size()]));
            }
        }

        this.fileReader = readerList.toArray(new BufferedReader[readerList.size()]);
        currentData = new String[this.fileReader.length][];

        // number of attributes read from each file
        expectedNumberOfColumns = new int[columnCounters.size()];
        for (int k = 0; k < expectedNumberOfColumns.length; k++) {
            expectedNumberOfColumns[k] = columnCounters.get(k).intValue();
        }
    }

    /**
     * Skips the next line, if present. Does nothing once the end of the input
     * has been reached. Problems during reading are logged as a warning and
     * otherwise ignored, so reading can continue with the following line.
     * Note that the skipped line is read via the same sampling logic as
     * regular lines.
     */
    public void skipLine() {
        if (eof) {
            // readers are already closed, there is nothing left to skip
            return;
        }
        try {
            if (!readLine()) {
                // readLine() closed the readers; remember that, so that neither
                // hasNext() nor next() touch the closed readers or the
                // incomplete current data afterwards
                markExhausted();
            }
        } catch (Exception e) {
            //LogService.getGlobal().log("Problem during skipping of line: " + e.getMessage(), LogService.WARNING);
            LogService.getRoot().log(Level.WARNING, "com.rapidminer.example.table.FileDataRowReader.problem_during_skipping_of_line", e.getMessage());
        }
    }

    /**
     * Reads a line of data from all file readers. Returns true if the line was
     * readable, i.e. the end of the source files was not yet reached. If no
     * sample size limit is set and the sample ratio is not 1.0, lines are
     * randomly skipped according to the sample ratio. When the end of one of
     * the files is reached, all readers are closed.
     */
    private boolean readLine() throws IOException {
        boolean eofReached = false;
        boolean accepted = false;
        while (!accepted) {
            for (int i = 0; i < fileReader.length; i++) {
                currentData[i] = rapidMinerLineReader.readLine(fileReader[i], expectedNumberOfColumns[i]);
                if (currentData[i] == null) {
                    eofReached = true;
                    break;
                }
            }
            if ((eofReached) || (maxNumber != -1) || (sampleRatio == 1.0d) || (random.nextDouble() < sampleRatio)) {
                accepted = true;
            }
        }
        if (eofReached) {
            closeAll(fileReader);
            return false;
        }
        return true;
    }

    /**
     * Checks if another line exists and reads. The next line is only read once
     * even if this method is invoked more than once. Once this method has
     * returned false because the sample size limit was reached or because
     * reading failed, the readers are closed and it keeps returning false.
     */
    public boolean hasNext() {
        if ((maxNumber > -1) && (linesRead >= maxNumber)) {
            if (!eof) {
                // the limit is reached, nothing more will be read: release the files
                closeAllQuietly(fileReader);
                markExhausted();
            }
            return false;
        }
        if (lineRead) {
            return !eof;
        }
        try {
            eof = !readLine();
        } catch (IOException e) {
            LogService.getGlobal().log(e.getMessage(), LogService.ERROR);
            // release the files and stay at "no more data" from now on
            closeAllQuietly(fileReader);
            markExhausted();
            return false;
        }
        lineRead = true;
        return !eof;
    }

    /**
     * Returns the next Example, or null if there is no further line. Attributes
     * whose column index is -1 get a null value.
     */
    public DataRow next() {
        if (eof) {
            return null;
        }
        if (!lineRead && !hasNext()) {
            return null;
        }
        String[] data = new String[attributes.length];
        for (int i = 0; i < attributes.length; i++) {
            int column = dataSourceIndex[i][COLUMN_NR];
            if (column == -1) {
                data[i] = null;
            } else {
                data[i] = currentData[dataSourceIndex[i][FILE_NR]][column];
            }
        }
        DataRow dataRow = getFactory().create(data, attributes);
        linesRead++;
        lineRead = false;
        return dataRow;
    }

    /** Records that no further line is available and that the readers must not be used again. */
    private void markExhausted() {
        eof = true;
        lineRead = true;
    }

    /** Closes every given reader, even if closing one of them fails, and rethrows the first failure. */
    private static void closeAll(BufferedReader[] readers) throws IOException {
        IOException firstFailure = null;
        for (BufferedReader reader : readers) {
            if (reader == null) {
                continue;
            }
            try {
                reader.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /** Closes every given reader on a path where a failure to close must not hide the actual outcome. */
    private static void closeAllQuietly(BufferedReader[] readers) {
        try {
            closeAll(readers);
        } catch (IOException ignored) {
            // deliberately ignored: the caller is already finishing or reporting a more relevant problem
        }
    }
}