package won.matcher.utils.tensor;
import won.matcher.utils.preprocessing.OpenNlpTokenExtraction;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
/**
* Used for tokenization of {@link TensorEntry} objects.
*
* Created by hfriedrich on 21.04.2017.
*/
public class TensorEntryTokenizer implements TensorEntryGenerator {
private OpenNlpTokenExtraction tokenizer;
private Collection<TensorEntry> tensorEntries;
public TensorEntryTokenizer(Collection<TensorEntry> tensorEntries) throws IOException {
this.tensorEntries = tensorEntries;
tokenizer = new OpenNlpTokenExtraction();
}
@Override
public Collection<TensorEntry> generateTensorEntries() throws IOException {
Collection<TensorEntry> tokenEntries = new LinkedList<>();
for (TensorEntry entry : tensorEntries) {
String tokens[] = tokenizer.extractWordTokens(entry.getValue());
for (String token : tokens) {
TensorEntry newEntry = new TensorEntry(entry.getSliceName(), entry.getNeedUri(), token);
tokenEntries.add(newEntry);
}
}
return tokenEntries;
}
}