/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */
package cc.mallet.share.mccallum.ner;
import junit.framework.*;
import java.util.Iterator;
import java.util.Random;
import java.util.regex.*;
import java.io.*;
import cc.mallet.fst.*;
import cc.mallet.optimize.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.pipe.tsf.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
public class TUI
/**
 * Option {@code gaussian-variance} (default 10.0). Its value is handed to the
 * trainer's {@code setGaussianPriorVariance} when the hyperbolic prior is not selected.
 */
static CommandOption.Double gaussianVarianceOption =
    new CommandOption.Double (
        TUI.class, "gaussian-variance", "DECIMAL", true, 10.0,
        "The gaussian prior variance used for training.", null);

/**
 * Option {@code hyperbolic-slope} (default 0.2). Its value is handed to the
 * trainer's {@code setHyperbolicPriorSlope} when the hyperbolic prior is selected.
 */
static CommandOption.Double hyperbolicSlopeOption =
    new CommandOption.Double (
        TUI.class, "hyperbolic-slope", "DECIMAL", true, 0.2,
        "The hyperbolic prior slope used for training.", null);

/**
 * Option {@code hyperbolic-sharpness} (default 10.0). Its value is handed to the
 * trainer's {@code setHyperbolicPriorSharpness} when the hyperbolic prior is selected.
 */
static CommandOption.Double hyperbolicSharpnessOption =
    new CommandOption.Double (
        TUI.class, "hyperbolic-sharpness", "DECIMAL", true, 10.0,
        "The hyperbolic prior sharpness used for training.", null);

/**
 * Option {@code crf-input-file} (default {@code null}).
 * NOTE(review): the help text speaks of writing the CRF although the option is
 * named "input"; no use of this option is visible nearby -- confirm the intent.
 */
static CommandOption.File crfInputFileOption =
    new CommandOption.File (
        TUI.class, "crf-input-file", "FILENAME", true, null,
        "The name of the file to write the CRF after training.", null);

/**
 * Option {@code random-seed} (default 0).
 * NOTE(review): the data split in {@code main} appears to use a fixed
 * {@code new Random (1)} rather than this value -- confirm whether that is intended.
 */
static CommandOption.Integer randomSeedOption =
    new CommandOption.Integer (
        TUI.class, "random-seed", "INTEGER", true, 0,
        "The random seed for randomly selecting a proportion of the instance list for training", null);

/**
 * Option {@code label-gram} (default 1). {@code main} uses it to choose between
 * label states (1) and bi-label states (2) when building the CRF.
 */
static CommandOption.Integer labelGramOption =
    new CommandOption.Integer (
        TUI.class, "label-gram", "INTEGER", true, 1,
        "Markov order of labels: 1, 2, 3", null);

/**
 * Option {@code word-window-size} (default 0). A value greater than zero makes
 * {@code main} add a {@code FeaturesInWindow} pipe spanning that many positions
 * on each side; otherwise a {@code Noop} pipe is used.
 */
static CommandOption.Integer wordWindowFeatureOption =
    new CommandOption.Integer (
        TUI.class, "word-window-size", "INTEGER", true, 0,
        "Size of window of words as features: 0=none, 10, 20...", null);
/**
 * Option {@code use-testb} (default false). {@code main} consults it when
 * choosing which data files to load.
 */
static CommandOption.Boolean useTestbOption =
    new CommandOption.Boolean (
        TUI.class, "use-testb", "true|false", true, false,
        "Use testb, final test set", null);

/**
 * Option {@code use-hyperbolic-prior} (default false). When true the trainer is
 * configured with the hyperbolic slope/sharpness options; otherwise with the
 * Gaussian variance option.
 */
static CommandOption.Boolean useHyperbolicPriorOption =
    new CommandOption.Boolean (
        TUI.class, "use-hyperbolic-prior", "true|false", true, false,
        "Use hyperbolic prior", null);

/**
 * Option {@code use-feature-induction} (default false). When true {@code main}
 * trains through {@code trainWithFeatureInduction}.
 */
static CommandOption.Boolean useFeatureInductionOption =
    new CommandOption.Boolean (
        TUI.class, "use-feature-induction", "true|false", true, false,
        "Not use or use feature induction", null);

/**
 * Option {@code cluster-feature-induction} (default false). Only consulted when
 * feature induction is enabled.
 */
static CommandOption.Boolean clusterFeatureInductionOption =
    new CommandOption.Boolean (
        TUI.class, "cluster-feature-induction", "true|false", true, false,
        "Cluster in feature induction", null);
/**
 * Option {@code use-firstmention-feature} (default false). Selects between a
 * {@code Noop} pipe and a {@code FeaturesOfFirstMention} pipe in {@code main}.
 * The help text previously read "Don't use first-mention feature", which
 * contradicted the option name; it now states the positive meaning.
 */
static CommandOption.Boolean useFirstMentionFeatureOption =
    new CommandOption.Boolean (
        TUI.class, "use-firstmention-feature", "true|false", true, false,
        "Use first-mention feature", null);

/**
 * Option {@code use-docheader-feature} (default false). When true {@code main}
 * adds a {@code TokenSequenceDocHeader} pipe; otherwise a {@code Noop} pipe.
 * The help text was previously empty.
 */
static CommandOption.Boolean useDocHeaderFeatureOption =
    new CommandOption.Boolean (
        TUI.class, "use-docheader-feature", "true|false", true, false,
        "Use document-header feature", null);

/**
 * Option {@code include-conll-lexicons} (default false). When true {@code main}
 * builds {@code TrieLexiconMembership} pipes over the CoNLL lexicon files and
 * adds them to the feature pipeline. The help text was previously empty.
 */
static CommandOption.Boolean includeConllLexiconsOption =
    new CommandOption.Boolean (
        TUI.class, "include-conll-lexicons", "true|false", true, false,
        "Include CoNLL lexicon membership features", null);

/**
 * Option {@code char-ngrams} (default false). Presumably gates the
 * {@code TokenTextCharNGrams} pipe built in {@code main} -- confirm against the
 * pipeline. The help text was previously empty.
 */
static CommandOption.Boolean charNGramsOption =
    new CommandOption.Boolean (
        TUI.class, "char-ngrams", "true|false", true, false,
        "Use character n-gram features", null);
/**
 * Option {@code offsets} (default {@code "[[-2],[-1],[1],[2]]"}). {@code main}
 * replaces the square brackets with braces, evaluates the result as an
 * {@code int[][]} initializer through the option interpreter, and passes the
 * array to an {@code OffsetConjunctions} pipe.
 */
static CommandOption.String offsetsOption =
    new CommandOption.String (
        TUI.class, "offsets", "e.g. [[0,0],[1]]", true, "[[-2],[-1],[1],[2]]",
        "Offset conjunctions", null);

/**
 * Option {@code cap-offsets} (default empty). A non-empty value is evaluated the
 * same way as {@code offsets} and yields a second {@code OffsetConjunctions}
 * pipe; an empty value yields a {@code Noop} pipe.
 */
static CommandOption.String capOffsetsOption =
    new CommandOption.String (
        TUI.class, "cap-offsets", "e.g. [[0,0],[0,1]]", true, "",
        "Offset conjunctions applied to features that are [A-Z]*", null);

/**
 * Option {@code viterbi-file} (default {@code "TUI"}). Its value is passed as
 * the first argument of the {@code ViterbiWriter} created in {@code main}.
 */
static CommandOption.String viterbiFilePrefixOption =
    new CommandOption.String (
        TUI.class, "viterbi-file", "FILE", true, "TUI",
        "Filename in which to store most recent Viterbi output", null);
static final CommandOption.List commandOptions =
new CommandOption.List (
"Training, testing and running a Chinese word segmenter.",
new CommandOption[] {
// Per-instance counter, starting at zero. No use is visible in this part of the
// file -- presumably it counts evaluation rounds; verify before relying on it.
int numEvaluations = 0;
// Class-wide settings. None of the three is read in the visible part of the file.
// NOTE(review): the feature-induction flags look superseded by the
// use-feature-induction / cluster-feature-induction command-line options -- confirm.
static int iterationsBetweenEvals = 16;
static boolean doingFeatureInduction = true;
static boolean doingClusteredFeatureInduction = false;
// Regex fragments (single character classes) that main concatenates into the
// token-shape patterns given to RegexMatches. Declared final because they are
// constants and are never meant to be reassigned.

/** One Unicode uppercase letter. */
private static final String CAPS = "[\\p{Lu}]";
/** One Unicode lowercase letter. */
private static final String LOW = "[\\p{Ll}]";
/** One Unicode uppercase letter or decimal digit. */
private static final String CAPSNUM = "[\\p{Lu}\\p{Nd}]";
/** One Unicode uppercase or lowercase letter. */
private static final String ALPHA = "[\\p{Lu}\\p{Ll}]";
/** One Unicode uppercase letter, lowercase letter, or decimal digit. */
private static final String ALPHANUM = "[\\p{Lu}\\p{Ll}\\p{Nd}]";
/** One punctuation character out of {@code , . ; : ? ! ( )}. */
private static final String PUNT = "[,\\.;:?!()]";
/** One quote character: double quote, backquote, or apostrophe. */
private static final String QUOTE = "[\"`']";
public static void main (String[] args) throws FileNotFoundException, Exception
commandOptions.process (args);
String homedir = System.getProperty ("HOME");
String lexdir = homedir+"/research/data/resources/";
String offsetsString = offsetsOption.value.replace('[','{').replace(']','}');
int[][] offsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+offsetsString);
String capOffsetsString = capOffsetsOption.value.replace('[','{').replace(']','}');
int[][] capOffsets = null;
if (capOffsetsString.length() > 0)
capOffsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+capOffsetsString);
Pipe conllLexiconsPipe = null;
if (includeConllLexiconsOption.value)
conllLexiconsPipe = new SerialPipes (new Pipe[] {
new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOPER")),
new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOLOC")),
new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOORG")),
new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOMISC")),
Pipe p = new SerialPipes (new Pipe[] {
new ConllNer2003Sentence2TokenSequence (),
new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
//new RegexMatches ("ALPHNUMERIC", Pattern.compile ("[A-Za-z0-9]+")),
//new RegexMatches ("ROMAN", Pattern.compile ("[ivxdlcm]+|[IVXDLCM]+")),
new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
new RegexMatches ("PUNC", Pattern.compile (PUNT)),
new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
//new RegexMatches ("LOWER", Pattern.compile (LOW+"+")),
//new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z]+[a-z]+[A-Z]+[a-z]*")),
(includeConllLexiconsOption.value ? conllLexiconsPipe : new Noop ()),
// Note that the word has not been lowercased, so INITCAP etc. are redundant
//new TokenSequenceLowercase (),
new TokenText ("W="),
//new TokenSequenceFirstSentenceAllCaps (),
new OffsetConjunctions (offsets),
(capOffsets != null ? (Pipe) new OffsetConjunctions (capOffsets) : (Pipe) new Noop ()),
//// Don't lowercase the W= if you want to use this.
? (Pipe) new Noop ()
: (Pipe) new FeaturesOfFirstMention ("FIRSTMENTION=", Pattern.compile (CAPS+".*"),
// Exclude singleton W=foo features b/c redundant
Pattern.compile ("W=[^@&]+"), false)),
(!useDocHeaderFeatureOption.value ? (Pipe) new Noop () : (Pipe) new TokenSequenceDocHeader ()),
(wordWindowFeatureOption.value > 0
? (Pipe) new FeaturesInWindow ("WINDOW=", -wordWindowFeatureOption.value,
wordWindowFeatureOption.value, Pattern.compile ("WORD=.*"), true)
: (Pipe) new Noop()),
? (Pipe) new TokenTextCharNGrams ("CHARNGRAM=", new int[] {2,3,4})
: (Pipe) new Noop()),
new PrintTokenSequenceFeatures(),
new TokenSequence2FeatureVectorSequence (true, true)
// Set up training and testing data
//args = new String[] {homedir+"/research/data/ie/ner2003/eng.testa"};
if (useTestbOption.value)
args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
InstanceList trainingData = new InstanceList (p);
trainingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[0])),
Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
System.out.println ("Read "+trainingData.size()+" training instances");
InstanceList testingData = null;
if (args.length > 1) {
testingData = new InstanceList (p);
testingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[1])),
Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
if (testingData == null) {
// For now, just train on a small fraction of the data
Random r = new Random (1);
// Proportions below are: {training, testing, ignore}
InstanceList[] trainingLists = trainingData.split (r, new double[] {.2, .1, .7});
trainingData = trainingLists[0];
// and test on just 50% of the data
if (testingData != null) {
InstanceList[] testingLists = testingData.split (r, new double[] {.5, .5});
testingData = testingLists[0];
testingLists = null;
} else {
testingData = trainingLists[1];
trainingLists = null;
assert (testingData != null);
// Print out all the target names
Alphabet targets = p.getTargetAlphabet();
System.out.print ("State labels:");
for (int i = 0; i < targets.size(); i++)
System.out.print (" " + targets.lookupObject(i));
System.out.println ("");
// Print out some feature information
System.out.println ("Number of features = "+p.getDataAlphabet().size());
CRF crf = new CRF (p, null);
if (labelGramOption.value == 1)
crf.addStatesForLabelsConnectedAsIn (trainingData);
else if (labelGramOption.value == 2)
crf.addStatesForBiLabelsConnectedAsIn (trainingData);
//else if (labelGramOption.value == 3)
//crf.addStatesForTriLabelsConnectedAsIn (trainingData);
throw new IllegalStateException ("label-gram must be 1, 2, or 3, not "+ labelGramOption.value);
CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood (crf);
if (useHyperbolicPriorOption.value) {
crft.setUseHyperbolicPrior (true);
crft.setHyperbolicPriorSlope (hyperbolicSlopeOption.value);
crft.setHyperbolicPriorSharpness (hyperbolicSharpnessOption.value);
} else {
crft.setGaussianPriorVariance (gaussianVarianceOption.value);
for (int i = 0; i < crf.numStates(); i++) {
Transducer.State s = crf.getState (i);
if (s.getName().charAt(0) == 'I')
s.setInitialWeight (Double.POSITIVE_INFINITY);
System.out.println("Training on "+trainingData.size()+" training instances, "+
testingData.size()+" testing instances...");
MultiSegmentationEvaluator eval =
new MultiSegmentationEvaluator (new InstanceList[] {trainingData, testingData},
new String[] {"Training", "Testing"},
new String[] {"B-PER", "B-LOC", "B-ORG", "B-MISC"},
new String[] {"I-PER", "I-LOC", "I-ORG", "I-MISC"});
ViterbiWriter vw = new ViterbiWriter (viterbiFilePrefixOption.value,
new InstanceList[] {trainingData, testingData}, new String[] {"Training", "Testing"});
if (useFeatureInductionOption.value) {
if (clusterFeatureInductionOption.value)
crft.trainWithFeatureInduction (trainingData, null, testingData,
eval, 99999,
10, 99, 200, 0.5, true,
new double[] {.1, .2, .5, .7});
crft.trainWithFeatureInduction (trainingData, null, testingData,
eval, 99999,
10, 99, 1000, 0.5, false,
new double[] {.1, .2, .5, .7});
else {
double[] trainingProportions = new double[] {.1, .2, .5, .7};
for (int i = 0; i < trainingProportions.length; i++) {
crft.train(trainingData, 3, new double[] {trainingProportions[i]});
while (crft.train(trainingData, 3)) {