/* Copyright (C) 2010 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify.tui;
import java.util.ArrayList;
import java.util.logging.*;
import java.io.*;
import java.nio.charset.Charset;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
/**
* Command line import tool for loading a sequence of
* instances from an SVMLight feature-value pair file, with one instance
* per line of the input file.
* <p>
*
* The expected format is
*
* target feature:value feature:value ...
*
* targets and features can be indices, as in
* SVMLight, or Strings.
*
* Note that if targets and features are indices,
* their indices in the data and target Alphabets
* may be different, though the data will be
* equivalent.
*
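* Note that the input and output args can each take multiple files; the
* number of input files must equal the number of output files, and the
* i-th input is written to the i-th output.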
*
* @author Gregory Druck
*/
public class SvmLight2Vectors {

    private static final Logger logger = MalletLogger.getLogger(SvmLight2Vectors.class.getName());

    /** Input file names, one per instance list to build; "-" is read from standard input. */
    static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings
        (SvmLight2Vectors.class, "input", "FILE", true, null,
         "The files containing data to be classified, one instance per line", null);

    /** Output file names, paired positionally with {@link #inputFiles}; "-" is written to standard output. */
    static CommandOption.SpacedStrings outputFiles = new CommandOption.SpacedStrings
        (SvmLight2Vectors.class, "output", "FILE", true, null,
         "Write the instance list to this file; Using - indicates stdout.", null);

    /** When invoked, the pipe is taken from the instance list stored in this file instead of being built fresh. */
    static CommandOption.File usePipeFromVectorsFile = new CommandOption.File
        (SvmLight2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"),
         "Use the pipe and alphabets from a previously created vectors file.\n" +
         "   Allows the creation, for example, of a test set of vectors that are\n" +
         "   compatible with a previously created set of training vectors", null);

    /** When true (and a new pipe is being built), a PrintInputAndTarget pipe is appended for debugging. */
    static CommandOption.Boolean printOutput = new CommandOption.Boolean
        (SvmLight2Vectors.class, "print-output", "[TRUE|FALSE]", false, false,
         "If true, print a representation of the processed data\n" +
         "   to standard output. This option is intended for debugging.", null);

    /** Name of the character encoding used to decode every input, defaulting to the platform charset. */
    static CommandOption.String encoding = new CommandOption.String
        (SvmLight2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(),
         "Character encoding for input file", null);

    /**
     * Converts each input file into an instance list and writes it to the
     * output at the same position on the command line.
     * <p>
     * All inputs are pushed through one shared pipe before anything is
     * written, so every saved instance list sees the same final alphabets.
     * If <code>--use-pipe-from</code> was given, the instance list it was
     * loaded from is written back afterwards.
     *
     * @param args command-line arguments; with none, usage is printed and the JVM exits with status -1
     * @throws FileNotFoundException if an input file cannot be opened
     * @throws IOException if reading an input or writing an output fails
     * @throws IllegalArgumentException if no input files or no output files were specified
     * @throws RuntimeException if the numbers of input and output files differ
     */
    public static void main (String[] args) throws FileNotFoundException, IOException
    {
        // Process the command-line options
        CommandOption.setSummary (SvmLight2Vectors.class,
            "A tool for creating instance lists of feature vectors from SVMLight-format feature:value files");
        CommandOption.process (SvmLight2Vectors.class, args);

        // Print some helpful messages for error cases
        if (args.length == 0) {
            CommandOption.getList(SvmLight2Vectors.class).printUsage(false);
            System.exit (-1);
        }
        // The option objects themselves are never null; what is missing when
        // the user omits an option is its value (the declared default is null).
        if (inputFiles.value == null) {
            throw new IllegalArgumentException ("You must include `--input FILE FILE ...' in order to specify "+
                "files containing the instances, one per line.");
        }
        if (outputFiles.value == null) {
            throw new IllegalArgumentException ("You must include `--output FILE FILE ...' in order to specify "+
                "where the instance lists are written, one output per input file.");
        }
        if (inputFiles.value.length != outputFiles.value.length) {
            throw new RuntimeException("Number of input and output files must be the same.");
        }

        Pipe instancePipe;
        InstanceList previousInstanceList = null;

        if (usePipeFromVectorsFile.wasInvoked()) {
            // Ignore all options, use a previously created pipe
            previousInstanceList = InstanceList.load (usePipeFromVectorsFile.value);
            instancePipe = previousInstanceList.getPipe();
        }
        else {
            // Build a new pipe
            ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
            pipeList.add(new SvmLight2FeatureVectorAndLabel());
            if (printOutput.value) {
                pipeList.add(new PrintInputAndTarget());
            }
            instancePipe = new SerialPipes(pipeList);
        }

        InstanceList[] instances = new InstanceList[inputFiles.value.length];
        for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) {
            // Create the instance list and open the input file
            instances[fileIndex] = new InstanceList (instancePipe);

            boolean fromStdin = inputFiles.value[fileIndex].equals ("-");
            Reader fileReader = openInput (inputFiles.value[fileIndex]);
            try {
                // Read instances from the file, skipping lines that match the '#' comment pattern
                instances[fileIndex].addThruPipe (new SelectiveFileLineIterator (fileReader, "^\\s*#.+"));
            }
            finally {
                // Release file handles promptly; standard input is left open
                // because this tool did not open it.
                if (!fromStdin) {
                    fileReader.close();
                }
            }
        }

        // gdruck@cs.umass.edu
        // If we have multiple files, the data or target alphabet may have new
        // elements added to it with each new file. If we save each InstanceList
        // immediately after processing each file, then Alphabets won't be the
        // same. Instead, process all files before writing the InstanceLists.
        for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) {
            // Save instances to output file
            writeInstanceList (instances[fileIndex], outputFiles.value[fileIndex]);
        }

        // If we are reusing a pipe from an instance list
        // created earlier, we may have extended the label
        // or feature alphabets. To maintain compatibility,
        // we now save that original instance list back to disk
        // with the new alphabet.
        if (usePipeFromVectorsFile.wasInvoked()) {
            logger.info(" Rewriting extended pipe from " + usePipeFromVectorsFile.value);
            logger.info(" Instance ID = " + previousInstanceList.getPipe().getInstanceId());

            ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(usePipeFromVectorsFile.value));
            try {
                oos.writeObject(previousInstanceList);
            }
            finally {
                // Close even when serialization fails so the file handle is not leaked
                oos.close();
            }
        }
    }

    /**
     * Opens one input source as a character stream decoded with the
     * <code>--encoding</code> option; the name "-" selects standard input.
     */
    private static Reader openInput (String name) throws IOException
    {
        if (name.equals ("-")) {
            return new InputStreamReader (System.in, encoding.value);
        }
        return new InputStreamReader (new FileInputStream (name), encoding.value);
    }

    /**
     * Writes one instance list to the named destination; the name "-" selects
     * standard output, as promised by the <code>--output</code> help text.
     */
    private static void writeInstanceList (InstanceList list, String destination) throws IOException
    {
        if (destination.equals ("-")) {
            // NOTE(review): assumes InstanceList.save(File) does not itself treat
            // "-" specially, in which case it would create a file literally named "-".
            ObjectOutputStream oos = new ObjectOutputStream (System.out);
            oos.writeObject (list);
            // Flush rather than close: standard output is not ours to close.
            oos.flush();
        }
        else {
            list.save (new File (destination));
        }
    }
}