/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
/*
An error? CoNLLTrue MalletTrue MalletPred
O O O
I-MISC B-MISC B-MISC
B-MISC B-MISC I-MISC
I-MISC B-MISC I-MISC
O O O
O O O
O O O
*/
package cc.mallet.share.mccallum.ner;
import java.util.regex.*;
import cc.mallet.pipe.*;
import cc.mallet.types.*;
public class ConllNer2003Sentence2TokenSequence extends Pipe
{
static final String[] endings = new String[]
{"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};
static Pattern[] endingPatterns = new Pattern[endings.length];
// Indexed by {forward,backward} {0,1,2 offset} {ending char ngram index}
static final String[][][] endingNames = new String[2][3][endings.length];
{
for (int i = 0; i < endings.length; i++) {
endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$");
for (int j = 0; j < 3; j++) {
for (int k = 0; k < 2; k++)
endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">";
}
}
}
boolean saveSource = false;
boolean doConjunctions = false;
boolean doTags = true;
boolean doPhrases = true;
boolean doSpelling = false;
boolean doDigitCollapses = true;
boolean doDowncasing = false;
public ConllNer2003Sentence2TokenSequence ()
{
super (null, new LabelAlphabet());
}
public ConllNer2003Sentence2TokenSequence (boolean extraFeatures)
{
super (null, new LabelAlphabet());
if (!extraFeatures) {
doDigitCollapses = doConjunctions = doSpelling = doPhrases = doTags = false;
doDowncasing = true;
}
}
/* Lines look like this:
-DOCSTART- -X- -X- O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O
Peter NNP I-NP I-PER
Blackburn NNP I-NP I-PER
BRUSSELS NNP I-NP I-LOC
1996-08-22 CD I-NP O
The DT I-NP O
European NNP I-NP I-ORG
Commission NNP I-NP I-ORG
said VBD I-VP O
on IN I-PP O
...
*/
public Instance pipe (Instance carrier)
{
String sentenceLines = (String) carrier.getData();
String[] tokens = sentenceLines.split ("\n");
TokenSequence data = new TokenSequence (tokens.length);
LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);
boolean [][] ending = new boolean[3][endings.length];
boolean [][] endingp1 = new boolean[3][endings.length];
boolean [][] endingp2 = new boolean[3][endings.length];
StringBuffer source = saveSource ? new StringBuffer() : null;
String prevLabel = "NOLABEL";
Pattern ipattern = Pattern.compile ("I-.*");
String word, tag, phrase, label;
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].length() != 0) {
String[] features = tokens[i].split (" ");
if (features.length != 4)
throw new IllegalStateException ("Line \""+tokens[i]+"\" doesn't have four elements");
word = features[0]; // .toLowerCase();
tag = features[1];
phrase = features[2];
label = features[3];
} else {
word = "-<S>-";
tag = "-<S>-";
phrase = "-<S>-";
label = "O";
}
// Transformations
if (doDigitCollapses) {
if (word.matches ("19\\d\\d"))
word = "<YEAR>";
else if (word.matches ("19\\d\\ds"))
word = "<YEARDECADE>";
else if (word.matches ("19\\d\\d-\\d+"))
word = "<YEARSPAN>";
else if (word.matches ("\\d+\\\\/\\d"))
word = "<FRACTION>";
else if (word.matches ("\\d[\\d,\\.]*"))
word = "<DIGITS>";
else if (word.matches ("19\\d\\d-\\d\\d-\\d--d"))
word = "<DATELINEDATE>";
else if (word.matches ("19\\d\\d-\\d\\d-\\d\\d"))
word = "<DATELINEDATE>";
else if (word.matches (".*-led"))
word = "<LED>";
else if (word.matches (".*-sponsored"))
word = "<LED>";
}
if (doDowncasing)
word = word.toLowerCase();
Token token = new Token (word);
// Word and tag unigram at current time
if (doSpelling) {
for (int j = 0; j < endings.length; j++) {
ending[2][j] = ending[1][j];
ending[1][j] = ending[0][j];
ending[0][j] = endingPatterns[j].matcher(word).matches();
if (ending[0][j]) token.setFeatureValue (endingNames[0][0][j], 1);
}
}
if (doTags) {
token.setFeatureValue ("T="+tag, 1);
}
if (doPhrases) {
token.setFeatureValue ("P="+phrase, 1);
}
if (true) {
// Change so each segment always begins with a "B-",
// even if previous token did not have this label.
String oldLabel = label;
if (ipattern.matcher(label).matches ()
&& (prevLabel.length() < 3 // prevLabel is "O"
|| !prevLabel.substring(2).equals (label.substring(2)))) {
label = "B" + oldLabel.substring(1);
}
prevLabel = oldLabel;
}
// Append
data.add (token);
//target.add (bigramLabel);
target.add (label);
//System.out.print (label + ' ');
if (saveSource) {
source.append (word); source.append (" ");
//source.append (bigramLabel); source.append ("\n");
source.append (label); source.append ("\n");
}
}
//System.out.println ("");
carrier.setData(data);
carrier.setTarget(target);
if (saveSource)
carrier.setSource(source);
return carrier;
}
}