package org.juxtasoftware.service;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import org.json.simple.JSONObject;
import org.juxtasoftware.Constants;
import org.juxtasoftware.dao.ComparisonSetDao;
import org.juxtasoftware.dao.JuxtaAnnotationDao;
import org.juxtasoftware.model.CollatorConfig;
import org.juxtasoftware.model.CollatorConfig.HyphenationFilter;
import org.juxtasoftware.model.ComparisonSet;
import org.juxtasoftware.model.JuxtaAnnotation;
import org.juxtasoftware.model.Witness;
import org.juxtasoftware.util.BackgroundTaskSegment;
import org.juxtasoftware.util.BackgroundTaskStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;
import com.google.common.collect.Lists;
import eu.interedition.text.Name;
import eu.interedition.text.NameRepository;
import eu.interedition.text.Range;
import eu.interedition.text.Text;
import eu.interedition.text.TextConsumer;
import eu.interedition.text.TextRepository;
@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class Tokenizer {
private static final Logger LOG = LoggerFactory.getLogger(Constants.WS_LOGGER_NAME);
private enum HyphenState {NONE, FOUND_HYPHEN, LINEBREAK_HYPHEN, IN_HYPHENATED_PART};
private enum RunType {NONE, TOKEN, NON_TOKEN, WHITESPACE};
@Autowired private Integer tokenizationBatchSize;
@Autowired private JuxtaAnnotationDao annotationDao;
@Autowired private TextRepository textRepository;
@Autowired private ComparisonSetDao comparisonSetDao;
@Autowired private NameRepository qnameRepo;
private ComparisonSet set;
private Name tokenQName;
/**
* Break up the text of the all witnesses in the comparison set on whitespace boundaries.
* If the configuration specifies that punctuation also be ignored,
* tokenize on punctuation as well.
*
* @param comparisonSet
* @param config
* @param taskStatus
*
* @throws IOException
*/
public void tokenize(ComparisonSet comparisonSet, CollatorConfig config, BackgroundTaskStatus taskStatus) throws IOException {
final List<Witness> witnesses = comparisonSetDao.getWitnesses(comparisonSet);
final BackgroundTaskSegment ts = taskStatus.add(1, new BackgroundTaskSegment(witnesses.size()));
this.set = comparisonSet;
LOG.info("Token batch size: " + this.tokenizationBatchSize );
taskStatus.setNote("Clearing old tokens");
LOG.info("Cleanup collation data ");
this.comparisonSetDao.clearCollationData(comparisonSet);
this.set.setStatus(ComparisonSet.Status.TOKENIZING );
this.comparisonSetDao.update(this.set);
this.tokenQName = this.qnameRepo.get(Constants.TOKEN_NAME);
taskStatus.setNote("Tokenizing " + JSONObject.escape(comparisonSet.getName()));
for (Witness witness : witnesses) {
taskStatus.setNote("Tokenizing '" + witness.getJsonName() + "'");
LOG.info("Tokenizing " + witness.getName());
long totalTokenLen = tokenize(config, witness);
comparisonSetDao.setTokenzedLength(comparisonSet, witness, totalTokenLen);
ts.incrementValue();
}
this.set.setStatus(ComparisonSet.Status.TOKENIZED );
this.comparisonSetDao.update(this.set);
}
private long tokenize(final CollatorConfig config, final Witness witness) throws IOException {
final Text text = witness.getText();
TokenizingConsumer tc = new TokenizingConsumer(config, witness);
this.textRepository.read(text, tc);
return tc.getTokenizedLength();
}
/**
* Text consumer that splits the text stream into tokens based
* on configuration settings
*
* @author loufoster
*
*/
private class TokenizingConsumer implements TextConsumer {
private List<JuxtaAnnotation> tokens = Lists.newArrayListWithExpectedSize(tokenizationBatchSize);
private final Witness witness;
private final HyphenationFilter hyphenFilter;
private long tokenizedLength;
public TokenizingConsumer(CollatorConfig cfg, Witness w) {
this.hyphenFilter = cfg.getHyphenationFilter();
this.witness = w;
}
public long getTokenizedLength() {
return this.tokenizedLength;
}
private boolean isTokenChar(int c) {
if (Character.isLetter(c) || Character.isDigit(c) || c == '-') {
return true;
}
return false;
}
public void read(Reader tokenText, long contentLength) throws IOException {
int offset = 0;
int start = -1;
HyphenState hyphenState = HyphenState.NONE;
RunType runType = RunType.NONE;
final boolean filterLinebreak =
(this.hyphenFilter.equals(HyphenationFilter.FILTER_LINEBREAK) ||
this.hyphenFilter.equals(HyphenationFilter.FILTER_ALL) );
StringBuilder tokenTxt = new StringBuilder();
do {
final int read = tokenText.read();
if (read < 0) {
if (start != -1) {
createToken( start, offset);
tokenTxt = new StringBuilder();
}
break;
}
tokenTxt.append((char)read);
// Token char (alphanumeric or hyphen)?
if ( isTokenChar(read)) {
// create a token with prior run of non-token characters
if ( runType.equals(RunType.NON_TOKEN) ) {
createToken(start, offset);
start = -1;
tokenTxt = new StringBuilder();
}
runType = RunType.TOKEN;
if (start == -1 ) {
start = offset;
}
// Special case handling for linebreak hyphen filtering
if ( filterLinebreak ) {
// If we have found a hyphen before (and possibly identified this as a linebreak),
// the next text encountered is the continuation of the hyphenated word.
if ( hyphenState.equals(HyphenState.FOUND_HYPHEN) || hyphenState.equals(HyphenState.LINEBREAK_HYPHEN) ) {
hyphenState = HyphenState.IN_HYPHENATED_PART;
}
else if ( read == '-' ) {
if ( tokenTxt.toString().contains("substi")) {
System.err.println("fb");
}
hyphenState = HyphenState.FOUND_HYPHEN;
}
}
} else {
if ( filterLinebreak ) {
if ( hyphenState.equals( HyphenState.IN_HYPHENATED_PART) ) {
createToken( start, offset );
start = -1;
runType = RunType.NONE;
hyphenState = HyphenState.NONE;
tokenTxt = new StringBuilder();
} else if ( hyphenState.equals(HyphenState.FOUND_HYPHEN) || hyphenState.equals(HyphenState.LINEBREAK_HYPHEN)) {
// Special case for text that is a candidate for being a linebreak
// hyphenated word. We have a hyphen. Do nothing but wait if more whitespace
// is encountered. If the whitespace is a linefeed, flag
// this as a line break. In either case, do no more processing.
if ( Character.isWhitespace(read) ) {
if ( read == 13 || read == 10 ) {
hyphenState = HyphenState.LINEBREAK_HYPHEN;
}
offset++;
continue;
}
}
}
// if this non-token char breaks up a prior token
// run, create a new token with it
if ( runType.equals(RunType.TOKEN) ) {
createToken( start, offset);
start = -1;
runType = RunType.NONE;
hyphenState = HyphenState.NONE;
tokenTxt = new StringBuilder();
}
// Start or continue a run of non-token characters?
if ( Character.isWhitespace(read) == false ) {
runType = RunType.NON_TOKEN;
if (start == -1 ) {
start = offset;
}
} else {
// This is whitespace. See if we need to end a non-token run.
// other than that, do not track the whitespace
if ( runType.equals(RunType.NON_TOKEN) ) {
createToken(start, offset);
runType = RunType.NONE;
hyphenState = HyphenState.NONE;
start = -1;
tokenTxt = new StringBuilder();
}
}
}
offset++;
} while (true);
if (!this.tokens.isEmpty()) {
write();
}
}
private void createToken(int start, int end) {
this.tokenizedLength += (end - start);
this.tokens.add( new JuxtaAnnotation(set.getId(), this.witness, tokenQName, new Range(start, end)) );
if ((this.tokens.size() % tokenizationBatchSize ) == 0) {
write();
}
}
private void write() {
LOG.info("Writing "+this.tokens.size()+" token annotations");
int cnt = Tokenizer.this.annotationDao.create(this.tokens);
if ( cnt != this.tokens.size() ) {
LOG.error("Not all tokens writtens: "+cnt+" of "+this.tokens.size());
}
this.tokens.clear();
}
}
}