// Tokenizer.java example
//
// Explorer
// juxta-service-master
package org.juxtasoftware.service;

import java.io.IOException;
import java.io.Reader;
import java.util.List;

import org.json.simple.JSONObject;
import org.juxtasoftware.Constants;
import org.juxtasoftware.dao.ComparisonSetDao;
import org.juxtasoftware.dao.JuxtaAnnotationDao;
import org.juxtasoftware.model.CollatorConfig;
import org.juxtasoftware.model.CollatorConfig.HyphenationFilter;
import org.juxtasoftware.model.ComparisonSet;
import org.juxtasoftware.model.JuxtaAnnotation;
import org.juxtasoftware.model.Witness;
import org.juxtasoftware.util.BackgroundTaskSegment;
import org.juxtasoftware.util.BackgroundTaskStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;

import com.google.common.collect.Lists;

import eu.interedition.text.Name;
import eu.interedition.text.NameRepository;
import eu.interedition.text.Range;
import eu.interedition.text.Text;
import eu.interedition.text.TextConsumer;
import eu.interedition.text.TextRepository;

@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class Tokenizer {
    private static final Logger LOG = LoggerFactory.getLogger(Constants.WS_LOGGER_NAME);
    private enum HyphenState {NONE, FOUND_HYPHEN, LINEBREAK_HYPHEN, IN_HYPHENATED_PART};
    private enum RunType {NONE, TOKEN, NON_TOKEN, WHITESPACE};

    @Autowired private Integer tokenizationBatchSize;
    @Autowired private JuxtaAnnotationDao annotationDao;
    @Autowired private TextRepository textRepository;
    @Autowired private ComparisonSetDao comparisonSetDao;
    @Autowired private NameRepository qnameRepo;
    private ComparisonSet set;
    private Name tokenQName;

    /**
     * Break up the text of the all witnesses in the comparison set on whitespace boundaries. 
     * If the configuration specifies that punctuation also be ignored, 
     * tokenize on punctuation as well.
     * 
     * @param comparisonSet
     * @param config
     * @param taskStatus
     * 
     * @throws IOException
     */
    public void tokenize(ComparisonSet comparisonSet, CollatorConfig config, BackgroundTaskStatus taskStatus) throws IOException {
        final List<Witness> witnesses = comparisonSetDao.getWitnesses(comparisonSet);
        final BackgroundTaskSegment ts = taskStatus.add(1, new BackgroundTaskSegment(witnesses.size()));
        
        this.set = comparisonSet;
                 
        LOG.info("Token batch size: " + this.tokenizationBatchSize );
        taskStatus.setNote("Clearing old tokens");
        LOG.info("Cleanup collation data ");
        this.comparisonSetDao.clearCollationData(comparisonSet);
        
        this.set.setStatus(ComparisonSet.Status.TOKENIZING );
        this.comparisonSetDao.update(this.set);
        
        this.tokenQName = this.qnameRepo.get(Constants.TOKEN_NAME);
        
        taskStatus.setNote("Tokenizing " + JSONObject.escape(comparisonSet.getName()));
        for (Witness witness : witnesses) {
            taskStatus.setNote("Tokenizing '" + witness.getJsonName() + "'");
            LOG.info("Tokenizing " + witness.getName());
            long totalTokenLen = tokenize(config, witness);
            comparisonSetDao.setTokenzedLength(comparisonSet, witness, totalTokenLen);
            ts.incrementValue();            
        }
        
        this.set.setStatus(ComparisonSet.Status.TOKENIZED );
        this.comparisonSetDao.update(this.set);
    }
    
    private long tokenize(final CollatorConfig config, final Witness witness) throws IOException {
        final Text text = witness.getText();
        TokenizingConsumer tc = new TokenizingConsumer(config, witness);
        this.textRepository.read(text, tc);
        return tc.getTokenizedLength();
    }
    
    /**
     * Text consumer that splits the text stream into tokens based
     * on configuration settings
     * 
     * @author loufoster
     *
     */
    private class TokenizingConsumer implements TextConsumer {
        private List<JuxtaAnnotation> tokens = Lists.newArrayListWithExpectedSize(tokenizationBatchSize);
        private final Witness witness;
        private final HyphenationFilter hyphenFilter;
        private long tokenizedLength;
        
        public TokenizingConsumer(CollatorConfig cfg, Witness w) {
            this.hyphenFilter = cfg.getHyphenationFilter();
            this.witness = w;
        }
        
        public long getTokenizedLength() {
            return this.tokenizedLength;
        }
        
        private boolean isTokenChar(int c) {
            if (Character.isLetter(c) || Character.isDigit(c) || c == '-') {
                return true;
            }
            return false;
        }
        
        public void read(Reader tokenText, long contentLength) throws IOException {
            
            int offset = 0;
            int start = -1;
            HyphenState hyphenState = HyphenState.NONE;
            RunType runType = RunType.NONE;
            final boolean filterLinebreak = 
                (this.hyphenFilter.equals(HyphenationFilter.FILTER_LINEBREAK) || 
                 this.hyphenFilter.equals(HyphenationFilter.FILTER_ALL) );
            
            StringBuilder tokenTxt = new StringBuilder();
            do {
                final int read = tokenText.read();
                if (read < 0) {
                    if (start != -1) {
                        createToken( start, offset);
                        tokenTxt = new StringBuilder();
                    }
                    break;
                }
                
                tokenTxt.append((char)read);
                    
                // Token char (alphanumeric or hyphen)?
                if ( isTokenChar(read)) {
                    // create a token with prior run of non-token characters
                    if ( runType.equals(RunType.NON_TOKEN) ) {
                        createToken(start, offset);
                        start = -1;
                        tokenTxt = new StringBuilder();
                    }
                    
                    runType = RunType.TOKEN;
                    if (start == -1 ) {
                        start = offset;
                    }
                    
                    // Special case handling for linebreak hyphen filtering
                    if ( filterLinebreak ) {
                        // If we have found a hyphen before (and possibly identified this as a linebreak),
                        // the next text encountered is the continuation of the hyphenated word.
                        if ( hyphenState.equals(HyphenState.FOUND_HYPHEN) || hyphenState.equals(HyphenState.LINEBREAK_HYPHEN) ) {
                            hyphenState = HyphenState.IN_HYPHENATED_PART;
                        }
                        else if ( read == '-' ) {
                            if ( tokenTxt.toString().contains("substi")) {
                                System.err.println("fb");
                            }
                            hyphenState = HyphenState.FOUND_HYPHEN;
                        }
                    }
                } else {
                    
                    if ( filterLinebreak ) {
                        if ( hyphenState.equals( HyphenState.IN_HYPHENATED_PART) ) {
                            createToken( start, offset );
                            start = -1;
                            runType = RunType.NONE;
                            hyphenState = HyphenState.NONE;
                            tokenTxt = new StringBuilder();
                        } else if ( hyphenState.equals(HyphenState.FOUND_HYPHEN) || hyphenState.equals(HyphenState.LINEBREAK_HYPHEN)) {
                            // Special case for text that is a candidate for being a linebreak 
                            // hyphenated word. We have a hyphen. Do nothing but wait if more whitespace
                            // is encountered. If the whitespace is a linefeed, flag
                            // this as a line break. In either case, do no more processing.
                            if ( Character.isWhitespace(read) ) {
                                if (  read == 13 || read == 10 ) {
                                    hyphenState = HyphenState.LINEBREAK_HYPHEN;
                                }
                                offset++;
                                continue;
                            }
                        }
                    }

                    // if this non-token char breaks up a prior token
                    // run, create a new token with it
                    if ( runType.equals(RunType.TOKEN) ) {
                        createToken( start, offset);
                        start = -1;
                        runType = RunType.NONE;
                        hyphenState = HyphenState.NONE;
                        tokenTxt = new StringBuilder();
                    } 
                    
                    // Start or continue a run of non-token characters?
                    if ( Character.isWhitespace(read) == false ) {
                        runType = RunType.NON_TOKEN;
                        if (start == -1 ) {
                            start = offset;
                        }
                    } else {
                        // This is whitespace. See if we need to end a non-token run.
                        // other than that, do not track the whitespace
                        if ( runType.equals(RunType.NON_TOKEN) ) {
                            createToken(start, offset);
                            runType = RunType.NONE;
                            hyphenState = HyphenState.NONE;
                            start = -1;
                            tokenTxt = new StringBuilder();
                        }
                    }
                }

                offset++;
            } while (true);

            if (!this.tokens.isEmpty()) {
                write();
            }
        }

        private void createToken(int start, int end) {
            this.tokenizedLength += (end - start);
            this.tokens.add(  new JuxtaAnnotation(set.getId(), this.witness, tokenQName, new Range(start, end)) );
            if ((this.tokens.size() % tokenizationBatchSize ) == 0) {
                write();
            }
        }

        private void write() {
            LOG.info("Writing "+this.tokens.size()+" token annotations");
            int cnt = Tokenizer.this.annotationDao.create(this.tokens);
            if ( cnt != this.tokens.size() ) {
                LOG.error("Not all tokens writtens: "+cnt+" of "+this.tokens.size());
            }
            this.tokens.clear();
        }
    }
}