/* Copyright (C) 2003 University of Pennsylvania.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** @author Aron Culotta */
package cc.mallet.pipe;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.types.*;
* This extends {@link SimpleTaggerSentence2TokenSequence} to use
* {Slink StringTokenizations} for use with the extract package.
public class SimpleTaggerSentence2StringTokenization
extends SimpleTaggerSentence2TokenSequence{
* Creates a new
* <code>SimpleTaggerSentence2StringTokenization</code> instance.
* By default we include tokens as features.
public SimpleTaggerSentence2StringTokenization ()
super ();
* creates a new <code>SimpleTaggerSentence2StringTokenization</code> instance
* which includes tokens as features iff the supplied argument is true.
public SimpleTaggerSentence2StringTokenization (boolean inc)
super (inc);
* Takes an instance with data of type String or String[][] and creates
* an Instance of type StringTokenization. Each Token in the sequence is
* gets the test of the line preceding it and once feature of value 1
* for each "Feature" in the line. For example, if the String[][] is
* {{a,b},{c,d,e}} (and target processing is off) then the text would be
* "a b" for the first token and "c d e" for the second. Also, the
* features "a" and "b" would be set for the first token and "c", "d" and
* "e" for the second. The last element in the String[] for the current
* token is taken as the target (label), so in the previous example "b"
* would have been the label of the first sequence.
public Instance pipe(Instance carrier) {
Object inputData = carrier.getData();
LabelAlphabet labels;
LabelSequence target = null;
String[][] tokens;
StringBuffer source = new StringBuffer();
StringTokenization ts = new StringTokenization(source);
if (inputData instanceof String)
tokens = parseSentence((String) inputData);
else if (inputData instanceof String[][])
tokens = (String[][]) inputData;
throw new IllegalArgumentException("Not a String; got " + inputData);
if (isTargetProcessing()) {
labels = (LabelAlphabet) getTargetAlphabet();
target = new LabelSequence(labels, tokens.length);
for (int l = 0; l < tokens.length; l++) {
int nFeatures;
if (isTargetProcessing()) {
if (tokens[l].length < 1)
throw new IllegalStateException("Missing label at line "
+ l + " instance " + carrier.getName());
nFeatures = tokens[l].length - 1;
} else
nFeatures = tokens[l].length;
int start = source.length();
String word = makeText(tokens[l]);
source.append(word + " ");
Token tok = new StringSpan(source, start, source.length() - 1);
if (setTokensAsFeatures) {
for (int f = 0; f < nFeatures; f++)
tok.setFeatureValue(tokens[l][f], 1.0);
} else {
for (int f = 1; f < nFeatures; f++)
tok.setFeatureValue(tokens[l][f], 1.0);
if (isTargetProcessing())
return carrier;
// Serialization garbage
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 1;
private void writeObject (ObjectOutputStream out) throws IOException
out.defaultWriteObject ();
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
in.defaultReadObject ();
int version = in.readInt ();