/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org. For further
   information, see the file `LICENSE' included with this distribution. */

/** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */

package cc.mallet.share.mccallum.ner;

import junit.framework.*;

import java.util.Iterator;
import java.util.Random;
import java.util.regex.*;
import java.io.*;

import cc.mallet.fst.*;
import cc.mallet.optimize.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.pipe.tsf.*;
import cc.mallet.types.*;
import cc.mallet.util.*;

/**
 * Command-line tool that trains and evaluates a CRF for CoNLL-2003
 * English named-entity recognition (PER/LOC/ORG/MISC, BIO tagging).
 *
 * Reads eng.train and eng.testa (or eng.testb with --use-testb) from
 * $HOME/research/data/ie/ner2003/, builds a feature-extraction pipe,
 * trains a first- or second-order CRF, and reports per-segment
 * precision/recall via {@link MultiSegmentationEvaluator}.
 */
public class TUI {

	static CommandOption.Double gaussianVarianceOption = new CommandOption.Double
		(TUI.class, "gaussian-variance", "DECIMAL", true, 10.0,
		 "The gaussian prior variance used for training.", null);

	static CommandOption.Double hyperbolicSlopeOption = new CommandOption.Double
		(TUI.class, "hyperbolic-slope", "DECIMAL", true, 0.2,
		 "The hyperbolic prior slope used for training.", null);

	static CommandOption.Double hyperbolicSharpnessOption = new CommandOption.Double
		(TUI.class, "hyperbolic-sharpness", "DECIMAL", true, 10.0,
		 "The hyperbolic prior sharpness used for training.", null);

	// NOTE(review): option name says "input" but the description says it is the
	// file the CRF is *written* to after training; the option is currently not
	// read anywhere in main() — confirm intended use before relying on it.
	static CommandOption.File crfInputFileOption = new CommandOption.File
		(TUI.class, "crf-input-file", "FILENAME", true, null,
		 "The name of the file to write the CRF after training.", null);

	static CommandOption.Integer randomSeedOption = new CommandOption.Integer
		(TUI.class, "random-seed", "INTEGER", true, 0,
		 "The random seed for randomly selecting a proportion of the instance list for training", null);

	static CommandOption.Integer labelGramOption = new CommandOption.Integer
		(TUI.class, "label-gram", "INTEGER", true, 1,
		 "Markov order of labels: 1, 2, 3", null);

	static CommandOption.Integer wordWindowFeatureOption = new CommandOption.Integer
		(TUI.class, "word-window-size", "INTEGER", true, 0,
		 "Size of window of words as features: 0=none, 10, 20...", null);

	static CommandOption.Boolean useTestbOption = new CommandOption.Boolean
		(TUI.class, "use-testb", "true|false", true, false,
		 "Use testb, final test set", null);

	static CommandOption.Boolean useHyperbolicPriorOption = new CommandOption.Boolean
		(TUI.class, "use-hyperbolic-prior", "true|false", true, false,
		 "Use hyperbolic prior", null);

	static CommandOption.Boolean useFeatureInductionOption = new CommandOption.Boolean
		(TUI.class, "use-feature-induction", "true|false", true, false,
		 "Not use or use feature induction", null);

	static CommandOption.Boolean clusterFeatureInductionOption = new CommandOption.Boolean
		(TUI.class, "cluster-feature-induction", "true|false", true, false,
		 "Cluster in feature induction", null);

	static CommandOption.Boolean useFirstMentionFeatureOption = new CommandOption.Boolean
		(TUI.class, "use-firstmention-feature", "true|false", true, false,
		 "Don't use first-mention feature", null);

	static CommandOption.Boolean useDocHeaderFeatureOption = new CommandOption.Boolean
		(TUI.class, "use-docheader-feature", "true|false", true, false,
		 "", null);

	static CommandOption.Boolean includeConllLexiconsOption = new CommandOption.Boolean
		(TUI.class, "include-conll-lexicons", "true|false", true, false,
		 "", null);

	static CommandOption.Boolean charNGramsOption = new CommandOption.Boolean
		(TUI.class, "char-ngrams", "true|false", true, false,
		 "", null);

	static CommandOption.String offsetsOption = new CommandOption.String
		(TUI.class, "offsets", "e.g. [[0,0],[1]]", true, "[[-2],[-1],[1],[2]]",
		 "Offset conjunctions", null);

	static CommandOption.String capOffsetsOption = new CommandOption.String
		(TUI.class, "cap-offsets", "e.g. [[0,0],[0,1]]", true, "",
		 "Offset conjunctions applied to features that are [A-Z]*", null);

	static CommandOption.String viterbiFilePrefixOption = new CommandOption.String
		(TUI.class, "viterbi-file", "FILE", true, "TUI",
		 "Filename in which to store most recent Viterbi output", null);

	// All options that commandOptions.process() should recognize.
	// BUGFIX: crfInputFileOption and charNGramsOption were declared above but
	// missing from this list, so --crf-input-file and --char-ngrams could never
	// be set from the command line even though charNGramsOption.value is read
	// below when the pipe is built.
	// BUGFIX: the description previously said "Chinese word segmenter" — a
	// copy-paste error; this tool is a CoNLL-2003 named-entity recognizer.
	static final CommandOption.List commandOptions =
		new CommandOption.List (
			"Training, testing and running a CoNLL-2003 named-entity recognizer.",
			new CommandOption[] {
				gaussianVarianceOption,
				hyperbolicSlopeOption,
				hyperbolicSharpnessOption,
				crfInputFileOption,
				randomSeedOption,
				labelGramOption,
				wordWindowFeatureOption,
				useHyperbolicPriorOption,
				useFeatureInductionOption,
				clusterFeatureInductionOption,
				useFirstMentionFeatureOption,
				useDocHeaderFeatureOption,
				includeConllLexiconsOption,
				charNGramsOption,
				offsetsOption,
				capOffsetsOption,
				viterbiFilePrefixOption,
				useTestbOption,
			});

	int numEvaluations = 0;                               // currently unused
	static int iterationsBetweenEvals = 16;               // currently unused
	static boolean doingFeatureInduction = true;          // currently unused
	static boolean doingClusteredFeatureInduction = false; // currently unused

	// Character-class fragments used to build the token-shape regex features.
	private static String CAPS = "[\\p{Lu}]";
	private static String LOW = "[\\p{Ll}]";
	private static String CAPSNUM = "[\\p{Lu}\\p{Nd}]";
	private static String ALPHA = "[\\p{Lu}\\p{Ll}]";
	private static String ALPHANUM = "[\\p{Lu}\\p{Ll}\\p{Nd}]";
	private static String PUNT = "[,\\.;:?!()]";
	private static String QUOTE = "[\"`']";

	/**
	 * Trains and evaluates the NER CRF.  See class comment for data layout.
	 *
	 * @param args command-line options; see {@link #commandOptions}.  Note that
	 *             args is overwritten below with the train/test file paths.
	 * @throws Exception on any I/O or training failure.
	 */
	public static void main (String[] args) throws FileNotFoundException, Exception
	{
		commandOptions.process (args);

		// NOTE(review): System.getProperty("HOME") is null unless the JVM is
		// started with -DHOME=...; presumably System.getenv("HOME") or the
		// "user.home" property was intended — confirm before changing, since
		// existing invocations may pass -DHOME.
		String homedir = System.getProperty ("HOME");
		String lexdir = homedir+"/research/data/resources/";

		// Parse the offset-conjunction specs (e.g. "[[-2],[-1],[1],[2]]") into
		// int[][] by rewriting brackets to braces and evaluating as a Java
		// array literal with the CommandOption interpreter.
		String offsetsString = offsetsOption.value.replace('[','{').replace(']','}');
		int[][] offsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+offsetsString);
		String capOffsetsString = capOffsetsOption.value.replace('[','{').replace(']','}');
		int[][] capOffsets = null;
		if (capOffsetsString.length() > 0)
			capOffsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+capOffsetsString);

		// Optional gazetteer-membership features from the CoNLL lexicons.
		Pipe conllLexiconsPipe = null;
		if (includeConllLexiconsOption.value)
			conllLexiconsPipe = new SerialPipes (new Pipe[] {
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOPER")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOLOC")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOORG")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOMISC")),
			});

		// Feature-extraction pipe: token-shape regexes, lexicons, the word
		// itself, offset conjunctions, and several optional feature families.
		Pipe p = new SerialPipes (new Pipe[] {
			new ConllNer2003Sentence2TokenSequence (),
			new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
			new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
			new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
			new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
			new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
			new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
			new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
			//new RegexMatches ("ALPHNUMERIC", Pattern.compile ("[A-Za-z0-9]+")),
			//new RegexMatches ("ROMAN", Pattern.compile ("[ivxdlcm]+|[IVXDLCM]+")),
			new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
			new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
			new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
			new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
			new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
			new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
			new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
			new RegexMatches ("PUNC", Pattern.compile (PUNT)),
			new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
			//new RegexMatches ("LOWER", Pattern.compile (LOW+"+")),
			//new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z]+[a-z]+[A-Z]+[a-z]*")),
			(includeConllLexiconsOption.value ? conllLexiconsPipe : new Noop ()),
			// Note that the word has not been lowercased!  so INITCAP, etc, is redundant
			//new TokenSequenceLowercase (),
			new TokenText ("W="),
			//new TokenSequenceFirstSentenceAllCaps (),
			new OffsetConjunctions (offsets),
			(capOffsets != null
			 ? (Pipe) new OffsetConjunctions (capOffsets)
			 : (Pipe) new Noop ()),
			//// Don't lowercase the W= if you want to use this.
			(!useFirstMentionFeatureOption.value
			 ? (Pipe) new Noop ()
			 : (Pipe) new FeaturesOfFirstMention ("FIRSTMENTION=", Pattern.compile (CAPS+".*"),
					// Exclude singleton W=foo features b/c redundant
					Pattern.compile ("W=[^@&]+"), false)),
			(!useDocHeaderFeatureOption.value
			 ? (Pipe) new Noop ()
			 : (Pipe) new TokenSequenceDocHeader ()),
			(wordWindowFeatureOption.value > 0
			 ? (Pipe) new FeaturesInWindow ("WINDOW=", -wordWindowFeatureOption.value,
																			wordWindowFeatureOption.value,
																			Pattern.compile ("WORD=.*"), true)
			 : (Pipe) new Noop()),
			(charNGramsOption.value
			 ? (Pipe) new TokenTextCharNGrams ("CHARNGRAM=", new int[] {2,3,4})
			 : (Pipe) new Noop()),
			new PrintTokenSequenceFeatures(),
			new TokenSequence2FeatureVectorSequence (true, true)
		});

		// Set up training and testing data
		//args = new String[] {homedir+"/research/data/ie/ner2003/eng.testa"};
		if (useTestbOption.value)
			args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
													 homedir+"/research/data/ie/ner2003/eng.testb"};
		else
			args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
													 homedir+"/research/data/ie/ner2003/eng.testa"};

		// Instances are -DOCSTART--delimited documents.
		InstanceList trainingData = new InstanceList (p);
		trainingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[0])),
																										 Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
		System.out.println ("Read "+trainingData.size()+" training instances");

		InstanceList testingData = null;
		if (args.length > 1) {
			testingData = new InstanceList (p);
			testingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[1])),
																											Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
		}

		if (testingData == null) {
			// No held-out file given: carve a train/test split out of the
			// training data.  Proportions below are {training, testing, ignore}.
			// BUGFIX: the seed was hard-coded to 1, silently ignoring the
			// documented --random-seed option; it is honored now.
			Random r = new Random (randomSeedOption.value);
			InstanceList[] trainingLists =
				trainingData.split (r, new double[] {.2, .1, .7});
			trainingData = trainingLists[0];
			// (A dead "testingData != null" branch was removed here: this block
			// is only entered when testingData is null and nothing reassigns it
			// before this point.)
			testingData = trainingLists[1];
			trainingLists = null;
			assert (testingData != null);
		}

		// Print out all the target names
		Alphabet targets = p.getTargetAlphabet();
		System.out.print ("State labels:");
		for (int i = 0; i < targets.size(); i++)
			System.out.print (" " + targets.lookupObject(i));
		System.out.println ("");

		// Print out some feature information
		System.out.println ("Number of features = "+p.getDataAlphabet().size());

		CRF crf = new CRF (p, null);
		if (labelGramOption.value == 1)
			crf.addStatesForLabelsConnectedAsIn (trainingData);
		else if (labelGramOption.value == 2)
			crf.addStatesForBiLabelsConnectedAsIn (trainingData);
		//else if (labelGramOption.value == 3)
		//crf.addStatesForTriLabelsConnectedAsIn (trainingData);
		else
			throw new IllegalStateException ("label-gram must be 1, 2, or 3, not "+
																			 labelGramOption.value);

		CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood (crf);
		if (useHyperbolicPriorOption.value) {
			crft.setUseHyperbolicPrior (true);
			crft.setHyperbolicPriorSlope (hyperbolicSlopeOption.value);
			crft.setHyperbolicPriorSharpness (hyperbolicSharpnessOption.value);
		} else {
			crft.setGaussianPriorVariance (gaussianVarianceOption.value);
		}

		// Forbid starting a sequence inside a segment (an I- state).
		// BUGFIX: this previously set Double.POSITIVE_INFINITY, which in
		// MALLET-2 weight (log-score) semantics is Transducer.CERTAIN_WEIGHT —
		// the opposite of the intent; "impossible" is NEGATIVE_INFINITY
		// (a leftover from the old cost-based API, where +inf cost meant
		// impossible).
		for (int i = 0; i < crf.numStates(); i++) {
			Transducer.State s = crf.getState (i);
			if (s.getName().charAt(0) == 'I')
				s.setInitialWeight (Double.NEGATIVE_INFINITY);
		}

		System.out.println("Training on "+trainingData.size()+" training instances, "+
											 testingData.size()+" testing instances...");

		// Per-segment P/R/F over the four CoNLL entity types.
		MultiSegmentationEvaluator eval =
			new MultiSegmentationEvaluator (new InstanceList[] {trainingData, testingData},
																			new String[] {"Training", "Testing"},
																			new String[] {"B-PER", "B-LOC", "B-ORG", "B-MISC"},
																			new String[] {"I-PER", "I-LOC", "I-ORG", "I-MISC"});
		ViterbiWriter vw = new ViterbiWriter (viterbiFilePrefixOption.value,
																					new InstanceList[] {trainingData, testingData},
																					new String[] {"Training", "Testing"});

		if (useFeatureInductionOption.value) {
			if (clusterFeatureInductionOption.value)
				crft.trainWithFeatureInduction (trainingData, null, testingData,
																				eval, 99999,
																				10, 99, 200, 0.5, true,
																				new double[] {.1, .2, .5, .7});
			else
				crft.trainWithFeatureInduction (trainingData, null, testingData,
																				eval, 99999,
																				10, 99, 1000, 0.5, false,
																				new double[] {.1, .2, .5, .7});
		} else {
			// Anneal on growing fractions of the data, then train to convergence.
			double[] trainingProportions = new double[] {.1, .2, .5, .7};
			for (int i = 0; i < trainingProportions.length; i++) {
				crft.train(trainingData, 3, new double[] {trainingProportions[i]});
				eval.evaluate(crft);
				vw.evaluate(crft);
			}
			while (crft.train(trainingData, 3)) {
				eval.evaluate(crft);
				vw.evaluate(crft);
			}
			eval.evaluate(crft);
			vw.evaluate(crft);
		}
	}
}