/* Copyright (C) 2003 University of Pennsylvania.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Aron Culotta
 */

package cc.mallet.pipe;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.types.*;

/**
 * Extension of {@link SimpleTaggerSentence2TokenSequence} that emits a
 * {@link StringTokenization} made of {@link StringSpan}s, for use with the
 * extract package.
 */
public class SimpleTaggerSentence2StringTokenization
    extends SimpleTaggerSentence2TokenSequence {

  // Serialization
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;

  /**
   * Creates a new <code>SimpleTaggerSentence2StringTokenization</code> by
   * delegating to the superclass no-argument constructor. According to the
   * original documentation of this class, tokens are included as features
   * by default.
   */
  public SimpleTaggerSentence2StringTokenization() {
    super();
  }

  /**
   * Creates a new <code>SimpleTaggerSentence2StringTokenization</code>,
   * forwarding the flag to the superclass constructor. According to the
   * original documentation of this class, tokens are included as features
   * iff the argument is true.
   *
   * @param inc whether tokens should be included as features
   */
  public SimpleTaggerSentence2StringTokenization(boolean inc) {
    super(inc);
  }

  /**
   * Converts an instance whose data is a <code>String</code> or a
   * <code>String[][]</code> into one whose data is a
   * {@link StringTokenization}.
   *
   * <p>A <code>String</code> is first turned into rows with
   * <code>parseSentence</code>; a <code>String[][]</code> is used as is.
   * Every row yields one {@link StringSpan}. The span's text is the value
   * <code>makeText</code> returns for the row; it is appended to a shared
   * document buffer followed by a single space, and the span covers the
   * text without that trailing space.
   *
   * <p>When target processing is on, the last element of each row is added
   * to the target {@link LabelSequence} as that row's label and is not used
   * as a feature. The remaining elements each become a feature of value 1.0
   * on the span: all of them when <code>setTokensAsFeatures</code> is true,
   * otherwise all except the first. For example, with target processing off
   * and <code>setTokensAsFeatures</code> true, the input
   * <code>{{a,b},{c,d,e}}</code> sets features "a" and "b" on the first
   * span and "c", "d" and "e" on the second.
   *
   * @param carrier the instance to convert; its data (and, with target
   *        processing on, its target) are replaced
   * @return the same <code>carrier</code> object
   * @throws IllegalArgumentException if the data is neither a
   *         <code>String</code> nor a <code>String[][]</code>
   * @throws IllegalStateException if target processing is on and a row is
   *         empty, so that it has no label
   */
  public Instance pipe(Instance carrier) {
    Object data = carrier.getData();

    final String[][] rows;
    if (data instanceof String[][]) {
      rows = (String[][]) data;
    } else if (data instanceof String) {
      rows = parseSentence((String) data);
    } else {
      throw new IllegalArgumentException("Not a String; got " + data);
    }

    // The buffer is shared: the tokenization and every span refer to it.
    StringBuffer document = new StringBuffer();
    StringTokenization tokenization = new StringTokenization(document);

    LabelSequence target = null;
    if (isTargetProcessing()) {
      LabelAlphabet labelAlphabet = (LabelAlphabet) getTargetAlphabet();
      target = new LabelSequence(labelAlphabet, rows.length);
    }

    for (int line = 0; line < rows.length; line++) {
      String[] columns = rows[line];

      int nFeatures = columns.length;
      if (isTargetProcessing()) {
        if (columns.length < 1) {
          throw new IllegalStateException("Missing label at line " + line
              + " instance " + carrier.getName());
        }
        // The last column is the label, not a feature.
        nFeatures--;
        target.add(columns[nFeatures]);
      }

      int spanStart = document.length();
      document.append(makeText(columns)).append(' ');
      // The end offset excludes the separator space appended above.
      Token span = new StringSpan(document, spanStart, document.length() - 1);

      // Column 0 is skipped when tokens are not to be used as features.
      int firstFeature = setTokensAsFeatures ? 0 : 1;
      for (int f = firstFeature; f < nFeatures; f++) {
        span.setFeatureValue(columns[f], 1.0);
      }

      tokenization.add(span);
    }

    carrier.setData(tokenization);
    if (isTargetProcessing()) {
      carrier.setTarget(target);
    }
    return carrier;
  }

  /**
   * Writes the default serialized form followed by the serial version tag.
   */
  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    out.writeInt(CURRENT_SERIAL_VERSION);
  }

  /**
   * Reads the default serialized form and then consumes the serial version
   * tag written by <code>writeObject</code>.
   */
  private void readObject(ObjectInputStream in)
      throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    // The version tag must be read to keep the stream aligned; its value is
    // not otherwise used.
    in.readInt();
  }
}