package net.sf.hfst;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
import net.sf.hfst.Transducer;
import net.sf.hfst.NoTokenizationException;
/**
* Weighted transducer. Takes a header and an alphabet, reads the index table
* and the transition table from the given stream, and provides lookup
* (analysis of input strings) over them.
*/
public class WeightedTransducer extends Transducer
{
/**
 * A single index-table entry: an input symbol number paired with a long
 * value. The value is exposed both as a target index ({@link #target()})
 * and, reinterpreted bitwise, as a final weight ({@link #getFinalWeight()}).
 */
public class TransitionIndex
{
    protected int inputSymbol;
    protected long firstTransitionIndex;

    /**
     * @param input           input symbol number stored in this entry
     * @param firstTransition value stored alongside the input symbol
     */
    public TransitionIndex(int input, long firstTransition)
    {
        this.inputSymbol = input;
        this.firstTransitionIndex = firstTransition;
    }

    /**
     * Tells whether this entry matches the symbol {@code s}.
     *
     * @return false if this entry carries no input symbol; otherwise true
     *         when {@code s} is NO_SYMBOL_NUMBER or equals the stored symbol
     */
    public Boolean matches(int s)
    {
        final int none = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
        if (inputSymbol == none)
        {
            return false;
        }
        return s == none || s == inputSymbol;
    }

    /**
     * @return true when the entry has no input symbol but its value differs
     *         from NO_TABLE_INDEX
     */
    public Boolean isFinal()
    {
        boolean withoutInput = (inputSymbol == HfstOptimizedLookup.NO_SYMBOL_NUMBER);
        boolean carriesValue = (firstTransitionIndex != HfstOptimizedLookup.NO_TABLE_INDEX);
        return withoutInput && carriesValue;
    }

    /** @return the low 32 bits of the stored value reinterpreted as a float */
    public float getFinalWeight()
    {
        int rawBits = (int) firstTransitionIndex;
        return Float.intBitsToFloat(rawBits);
    }

    /** @return the stored value, unchanged */
    public long target()
    {
        return firstTransitionIndex;
    }

    /** @return the stored input symbol number */
    public int getInput()
    {
        return inputSymbol;
    }
}
/**
 * On instantiation reads the transducer's index table from a stream and
 * provides an interface to it.
 */
public class IndexTable
{
    private TransitionIndex[] indices;

    /**
     * Reads {@code indicesCount} index entries from the stream's current
     * position.
     *
     * @param filestream   stream positioned at the start of the index table
     * @param indicesCount number of entries to read
     * @throws java.io.IOException if reading fails; in particular a
     *         java.io.EOFException when the stream ends before all entries
     *         have been read
     */
    public IndexTable(FileInputStream filestream,
                      Integer indicesCount) throws java.io.IOException
    {
        final int entryCount = indicesCount;
        // each index entry is an unsigned short followed by an unsigned int,
        // i.e. 6 bytes
        ByteArray b = new ByteArray(entryCount * 6);
        // A single read() may deliver fewer bytes than requested, which would
        // leave the tail of the table zero-filled without any error.
        // readFully() keeps reading until the buffer is full or fails.
        // The wrapper is intentionally not closed: closing it would also
        // close the caller's stream.
        new DataInputStream(filestream).readFully(b.getBytes());
        indices = new TransitionIndex[entryCount];
        for (int i = 0; i < entryCount; i++)
        {
            indices[i] = new TransitionIndex(b.getUShort(), b.getUInt());
        }
    }

    /** @return whether the entry at {@code index} is final */
    public Boolean isFinal(Integer index)
    {
        return indices[index].isFinal();
    }

    /** @return the entry at {@code index} */
    public TransitionIndex at(Integer index)
    {
        return indices[index];
    }
}
/**
 * A single transition-table entry: input symbol, output symbol, target
 * index and weight.
 */
public class Transition
{
    protected int inputSymbol;
    protected int outputSymbol;
    protected long targetIndex;
    protected float weight;

    /**
     * @param input  input symbol number
     * @param output output symbol number
     * @param target target index of the transition
     * @param w      weight of the transition
     */
    public Transition(int input, int output, long target, float w)
    {
        this.inputSymbol = input;
        this.outputSymbol = output;
        this.targetIndex = target;
        this.weight = w;
    }

    /**
     * Creates an entry with no input symbol, no output symbol, target
     * {@code Long.MAX_VALUE} and infinite weight.
     */
    public Transition()
    {
        this(HfstOptimizedLookup.NO_SYMBOL_NUMBER,
             HfstOptimizedLookup.NO_SYMBOL_NUMBER,
             Long.MAX_VALUE,
             HfstOptimizedLookup.INFINITE_WEIGHT);
    }

    /**
     * Tells whether this transition matches {@code symbol}.
     *
     * @return false if this transition carries no input symbol; otherwise
     *         true when {@code symbol} is NO_SYMBOL_NUMBER or equals the
     *         stored input symbol
     */
    public Boolean matches(int symbol)
    {
        final int none = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
        if (inputSymbol == none)
        {
            return false;
        }
        return symbol == none || inputSymbol == symbol;
    }

    /** @return the target index */
    public long target()
    {
        return targetIndex;
    }

    /** @return the output symbol number */
    public int getOutput()
    {
        return outputSymbol;
    }

    /** @return the input symbol number */
    public int getInput()
    {
        return inputSymbol;
    }

    /**
     * @return true when both symbols are NO_SYMBOL_NUMBER and the target
     *         index equals 1
     */
    public Boolean isFinal()
    {
        final int none = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
        boolean noSymbols = (inputSymbol == none) && (outputSymbol == none);
        return noSymbols && targetIndex == 1;
    }

    /** @return the weight */
    public float getWeight()
    {
        return weight;
    }
}
/**
 * On instantiation reads the transducer's transition table from a stream and
 * provides an interface to it.
 */
public class TransitionTable
{
    private Transition[] transitions;

    /**
     * Reads {@code transitionCount} transitions from the stream's current
     * position.
     *
     * @param filestream      stream positioned at the start of the
     *                        transition table
     * @param transitionCount number of transitions to read
     * @throws java.io.IOException if reading fails; in particular a
     *         java.io.EOFException when the stream ends before all
     *         transitions have been read
     */
    public TransitionTable(FileInputStream filestream,
                           Integer transitionCount) throws java.io.IOException
    {
        final int entryCount = transitionCount;
        // 12 bytes per transition: each entry is two unsigned shorts,
        // an unsigned int and a float
        ByteArray b = new ByteArray(entryCount * 12);
        // A single read() may deliver fewer bytes than requested, which would
        // leave the tail of the table zero-filled without any error.
        // readFully() keeps reading until the buffer is full or fails.
        // The wrapper is intentionally not closed: closing it would also
        // close the caller's stream.
        new DataInputStream(filestream).readFully(b.getBytes());
        transitions = new Transition[entryCount];
        for (int i = 0; i < entryCount; i++)
        {
            transitions[i] = new Transition(b.getUShort(), b.getUShort(), b.getUInt(), b.getFloat());
        }
    }

    /** @return the transition at position {@code pos} */
    public Transition at(Integer pos)
    {
        return transitions[pos];
    }

    /** @return the number of transitions in the table */
    public Integer size()
    {
        return transitions.length;
    }
}
// Header handed to the constructor; its input symbol count and table sizes
// are read during construction.
protected TransducerHeader header;
// Alphabet handed to the constructor; supplies the symbol key table, the flag
// diacritic operations and the number of flag features.
protected TransducerAlphabet alphabet;
// Stack of flag-feature value arrays (one int per feature). The top is the
// state in effect on the path currently being explored.
protected Stack<int[]> stateStack;
// Flag diacritic operations keyed by symbol number (taken from the alphabet).
protected Hashtable<Integer, FlagDiacriticOperation> operations;
// Trie over the input symbols, used to tokenize input text into symbol numbers.
protected LetterTrie letterTrie;
protected IndexTable indexTable;
protected TransitionTable transitionTable;
// Analyses collected by the current lookup, one string per analysis.
protected Vector<String> displayVector;
// Output symbols of the path being explored; NO_SYMBOL_NUMBER marks the end.
protected int[] outputString;
// Tokenized input symbols, terminated by NO_SYMBOL_NUMBER.
protected Vector<Integer> inputString;
// Next free position in outputString.
protected int outputPointer;
// Position in inputString of the next symbol to consume.
protected int inputPointer;
// Sum of the weights along the path currently being explored.
protected float current_weight;
/**
 * Sets up a transducer from a header and alphabet and reads the index table
 * followed by the transition table from {@code file}.
 *
 * @param file stream positioned at the start of the index table
 * @param h    header giving the input symbol count and both table sizes
 * @param a    alphabet giving key table, flag operations and feature count
 * @throws java.io.IOException if reading either table fails
 */
public WeightedTransducer(FileInputStream file, TransducerHeader h, TransducerAlphabet a) throws java.io.IOException
{
    this.header = h;
    this.alphabet = a;
    this.operations = alphabet.operations;

    // The bottom of the flag-state stack is an all-zero array
    // (a freshly allocated int[] is already zero-filled).
    this.stateStack = new Stack< int[] >();
    this.stateStack.push(new int[alphabet.features]);

    // Register every input symbol in the trie used for tokenization.
    this.letterTrie = new LetterTrie();
    for (int symbol = 0; symbol < header.getInputSymbolCount(); symbol++)
    {
        letterTrie.addString(alphabet.keyTable.get(symbol), symbol);
    }

    // The tables are stored back to back: index table first.
    this.indexTable = new IndexTable(file, header.getIndexTableSize());
    this.transitionTable = new TransitionTable(file, header.getTargetTableSize());

    this.displayVector = new Vector<String>();
    this.inputString = new Vector<Integer>();

    this.outputString = new int[1000];
    Arrays.fill(this.outputString, HfstOptimizedLookup.NO_SYMBOL_NUMBER);

    this.outputPointer = 0;
    this.inputPointer = 0;
    this.current_weight = 0.0f;
}
/**
 * Converts a table index into an array position: values at or above
 * TRANSITION_TARGET_TABLE_START have that offset subtracted, smaller values
 * are used as they are.
 *
 * @param i table index
 * @return the corresponding array position, narrowed to int
 */
private int pivot(long i)
{
    final long tableStart = HfstOptimizedLookup.TRANSITION_TARGET_TABLE_START;
    long position = (i >= tableStart) ? (i - tableStart) : i;
    return (int) position;
}
/**
 * If the index-table entry at {@code index} has input symbol 0, follows the
 * epsilon transitions starting at that entry's target.
 *
 * @param index position in the index table
 */
private void tryEpsilonIndices(int index)
{
    TransitionIndex entry = indexTable.at(index);
    if (entry.getInput() != 0)
    {
        return;
    }
    tryEpsilonTransitions(pivot(entry.target()));
}
/**
 * Walks consecutive transitions starting at {@code index} for as long as
 * their input symbol is either a flag diacritic or 0, and continues the
 * search from the target of each one that may be taken. Stops at the first
 * transition that is neither.
 *
 * @param index position in the transition table to start from
 */
private void tryEpsilonTransitions(int index)
{
    for (int pos = index; ; ++pos)
    {
        Transition candidate = transitionTable.at(pos);
        int symbol = candidate.getInput();

        // flags are tested before plain epsilons
        boolean isFlag = operations.containsKey(symbol);
        if (!isFlag && symbol != 0)
        {
            return;
        }
        if (isFlag && !pushState(operations.get(symbol)))
        {
            // the flag operation is not allowed in the current state
            continue;
        }

        // take the transition, search onwards, then undo its effects
        outputString[outputPointer] = candidate.getOutput();
        ++outputPointer;
        current_weight += candidate.getWeight();
        getAnalyses(candidate.target());
        current_weight -= candidate.getWeight();
        --outputPointer;
        if (isFlag)
        {
            // drop the flag state that pushState() put on the stack
            stateStack.pop();
        }
    }
}
/**
 * Looks up the index-table entry at {@code index} offset by the most
 * recently consumed input symbol; if the entry is for that symbol, searches
 * the transitions it points to.
 *
 * @param index base position in the index table
 */
private void findIndex(int index)
{
    int consumed = inputString.get(inputPointer - 1);
    TransitionIndex entry = indexTable.at(index + consumed);
    if (entry.getInput() == consumed)
    {
        findTransitions(pivot(entry.target()));
    }
}
/**
 * Walks consecutive transitions starting at {@code index} whose input symbol
 * equals the most recently consumed input symbol, continuing the search from
 * the target of each. Stops at the first transition that has no input symbol
 * or a different one.
 *
 * @param index position in the transition table to start from
 */
private void findTransitions(int index)
{
    for (int pos = index; ; ++pos)
    {
        Transition candidate = transitionTable.at(pos);
        int label = candidate.getInput();
        if (label == HfstOptimizedLookup.NO_SYMBOL_NUMBER
            || label != inputString.get(inputPointer - 1))
        {
            return;
        }

        // take the transition, search onwards, then undo its effects
        outputString[outputPointer] = candidate.getOutput();
        ++outputPointer;
        current_weight += candidate.getWeight();
        getAnalyses(candidate.target());
        current_weight -= candidate.getWeight();
        --outputPointer;
    }
}
/**
 * Continues the search from the state addressed by {@code idx}. Indices at
 * or above TRANSITION_TARGET_TABLE_START address the transition table, all
 * others the index table. Epsilon moves are tried first; then, if the input
 * is exhausted, an analysis is recorded when the state is final, otherwise
 * one input symbol is consumed and matching transitions are followed.
 *
 * @param idx table index of the state to continue from
 */
private void getAnalyses(long idx)
{
    final boolean inTransitionTable =
        idx >= HfstOptimizedLookup.TRANSITION_TARGET_TABLE_START;
    final int index = pivot(idx);

    // epsilon (and flag) moves first; they consume no input
    if (inTransitionTable)
    {
        tryEpsilonTransitions(pivot(index) + 1);
    }
    else
    {
        tryEpsilonIndices(index + 1);
    }

    if (inputString.get(inputPointer) == HfstOptimizedLookup.NO_SYMBOL_NUMBER)
    { // end of input string
        outputString[outputPointer] = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
        if (inTransitionTable)
        {
            if (index < transitionTable.size() && transitionTable.at(index).isFinal())
            {
                current_weight += transitionTable.at(index).getWeight();
                noteAnalysis();
                current_weight -= transitionTable.at(index).getWeight();
            }
        }
        else if (indexTable.isFinal(index))
        {
            current_weight += indexTable.at(index).getFinalWeight();
            noteAnalysis();
            current_weight -= indexTable.at(index).getFinalWeight();
        }
        return;
    }

    // consume one input symbol, follow it, then give it back
    ++inputPointer;
    if (inTransitionTable)
    {
        findTransitions(index + 1);
    }
    else
    {
        findIndex(index + 1);
    }
    --inputPointer;
    outputString[outputPointer] = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
}
/**
 * Records the current output path as one analysis: the strings of all output
 * symbols up to the NO_SYMBOL_NUMBER terminator, followed by a tab and the
 * current weight, appended to {@code displayVector}.
 */
private void noteAnalysis()
{
    // Build the text in one buffer and add it once, instead of repeatedly
    // concatenating onto the last vector element.
    StringBuilder analysis = new StringBuilder();
    for (int i = 0; outputString[i] != HfstOptimizedLookup.NO_SYMBOL_NUMBER; ++i)
    {
        analysis.append(alphabet.keyTable.get(outputString[i]));
    }
    analysis.append("\t").append(current_weight);
    displayVector.add(analysis.toString());
}
/**
 * Tokenizes {@code input} into symbols and runs it through the transducer.
 *
 * @param input text to analyze
 * @return the analyses found, each formatted as the output string, a tab and
 *         the weight; empty if there are none
 * @throws NoTokenizationException if {@code input} is empty or some part of
 *         it cannot be mapped to a symbol
 */
public Collection<String> analyze(String input)
    throws NoTokenizationException
{
    // Start every lookup from a clean search state. Resetting the weight and
    // the flag stack here keeps float rounding residue, or a lookup that was
    // aborted by an exception, from leaking into later lookups.
    inputString.clear();
    displayVector.clear();
    outputPointer = 0;
    outputString[0] = HfstOptimizedLookup.NO_SYMBOL_NUMBER;
    inputPointer = 0;
    current_weight = 0.0f;
    stateStack.clear();
    stateStack.push(new int[alphabet.features]);

    IndexString inputLine = new IndexString(input);
    while (inputLine.index < input.length())
    {
        int symbol = letterTrie.findKey(inputLine);
        if (symbol == HfstOptimizedLookup.NO_SYMBOL_NUMBER)
        {
            // no symbol matches at this position
            throw new NoTokenizationException(input);
        }
        inputString.add(symbol);
    }
    if (inputString.isEmpty())
    {
        throw new NoTokenizationException(input);
    }

    // terminator marking the end of the input for the search
    inputString.add(HfstOptimizedLookup.NO_SYMBOL_NUMBER);
    getAnalyses(0);
    return new ArrayList<String>(displayVector);
}
/**
 * Applies a flag diacritic operation to the current flag state. When the
 * operation is allowed, a copy of the current state (updated where the
 * operation sets a value) is pushed onto {@code stateStack} and true is
 * returned; the caller is responsible for popping it again. When the
 * operation is not allowed, the stack is left untouched and false is
 * returned.
 *
 * Stored values: 0 means unset, a positive number is a value set with P or
 * U, and a negative number {@code -v} is value {@code v} set with N.
 *
 * @param flag operation to apply
 * @return whether the operation is allowed in the current state
 */
private Boolean pushState(FlagDiacriticOperation flag)
{
    final int current = stateStack.peek()[flag.feature];
    int updated = current;
    boolean allowed;

    if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.P)
    { // positive set
        allowed = true;
        updated = flag.value;
    }
    else if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.N)
    { // negative set
        allowed = true;
        updated = -1 * flag.value;
    }
    else if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.R)
    { // require: an empty require needs any value, otherwise exactly this one
        allowed = (flag.value == 0) ? (current != 0) : (current == flag.value);
    }
    else if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.D)
    { // disallow: an empty disallow needs no value, otherwise any but this one
        allowed = (flag.value == 0) ? (current == 0) : (current != flag.value);
    }
    else if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.C)
    { // clear
        allowed = true;
        updated = 0;
    }
    else if (flag.op == HfstOptimizedLookup.FlagDiacriticOperator.U)
    { // unification
        // Allowed when the feature is unset, already holds this value, or is
        // negatively set to a DIFFERENT value. A feature negatively set to
        // this very value (current == -flag.value) must not unify with it.
        allowed = (current == 0)
            || (current == flag.value)
            || (current < 0 && -current != flag.value);
        updated = flag.value;
    }
    else
    {
        allowed = false; // unknown operator
    }

    if (!allowed)
    {
        return false;
    }

    // Copy the current state only once the operation is known to be allowed.
    int[] top = new int[alphabet.features];
    System.arraycopy(stateStack.peek(), 0, top, 0, alphabet.features);
    top[flag.feature] = updated;
    stateStack.push(top);
    return true;
}
}