RegexTokenizer.java example

Explorer
cachewolf-master
- lib
- src
  - CacheWolf
  - build
    - CabHelper.java
//
// This software is now distributed according to
// the Lesser Gnu Public License.  Please see
// http://www.gnu.org/copyleft/lesser.txt for
// the details.
//    -- Happy Computing!
//
package com.stevesoft.ewe_pat;
import ewe.util.*;
/** 
        Shareware: package pat
   <a href="copyright.html">Copyright 2001, Steven R. Brandt</a>
*/ /**
The RegexTokenizer is similar to the StringTokenizer class
provided with java, but allows one to tokenize using
regular expressions, rather than a simple list of characters.
Tokens are any strings between matches of the supplied regular
expression, as well as any backreferences (things in parentheses)
contained within the regular expression. */
public class RegexTokenizer implements Enumeration {
    /** The text being tokenized. */
    String toParse;
    /** The pattern currently used to locate delimiters. */
    Regex r;
    /** Index into <code>v</code> of the next token to hand out. */
    int count = 0;
    /** Tokens discovered so far, in the order they were found. */
    Vector v = new Vector();
    /** For each token added from a successful match, an Integer offset:
         the end of the whole match for the text preceding it, or the
         end of the group for a backreference token.  Used by
         nextToken(Regex) to decide where to resume searching. */
    Vector vi = new Vector();
    /** Offset in <code>toParse</code> at which the next search starts. */
    int pos=0;

    /** Index of the first backreference; the constructors overwrite
         this with Regex.BackRefOffset. */
    int offset = 1;

    /** Runs one more search starting at <code>pos</code> and appends
         the resulting tokens to <code>v</code>: the text between
         <code>pos</code> and the match, followed by every non-null
         backreference.  If the search fails, the value r.right() had
         before the search is appended instead (when it is not null). */
    void getMore() {
        // Must be read BEFORE searching; it is the fallback token when
        // the search below fails.
        // NOTE(review): presumably this is the text after the previous
        // match and null when there was none -- confirm against Regex.
        String s = r.right();
        if(r.searchFrom(toParse,pos)) {
            v.addElement(r.left().substring(pos));
            vi.addElement(new Integer(r.matchFrom()+
                r.charsMatched()));
            for(int i=0;i<r.numSubs();i++) {
                int group = i+offset;
                // Test the group itself, not the whole match: the whole
                // match is the same for every i and says nothing about
                // whether this particular group took part in it.
                String sub = r.substring(group);
                if(sub != null) {
                    v.addElement(sub);
                    vi.addElement(
                        new Integer(r.matchFrom(group)+
                        r.charsMatched(group)));
                }
            }
            pos = r.matchFrom()+r.charsMatched();
        } else if(s != null) v.addElement(s);
    }

    /** Initialize the tokenizer with a string of text and a pattern */
    public RegexTokenizer(String txt,String ptrn) {
        toParse = txt;
        r = new Regex(ptrn);
        offset = r.BackRefOffset;
        getMore();
    }
    /** Initialize the tokenizer with a Regex object.  The object is
         used directly, not copied. */
    public RegexTokenizer(String txt,Regex r) {
        toParse = txt;
        this.r = r;
        offset = r.BackRefOffset;
        getMore();
    }
    /** This should always be cast to a String, as in StringTokenizer,
         and as in StringTokenizer one can do this by calling
         nextToken(). */
    public Object nextElement() {
        if(count >= v.size()) getMore();
        return v.elementAt(count++);
    }
    /** This is the equivalent (String)nextElement(). */
    public String nextToken() { return (String)nextElement(); }
    /** This asks for the next token, and changes the pattern
         being used at the same time.  The new pattern is compiled
         into the Regex object this tokenizer already holds. */
    public String nextToken(String newpat) {
        try {
            r.compile(newpat);
        } catch (RegSyntax ignored) {
            // Deliberately best-effort: a malformed pattern is not
            // reported and tokenizing continues with r as compile()
            // left it.
            // NOTE(review): confirm compile() keeps the old pattern
            // intact when it throws.
        }
        return nextToken(r);
    }
    /** This asks for the next token, and changes the pattern
         being used at the same time. */
    public String nextToken(Regex nr) {
        r = nr;
        if(vi.size() > count) {
            // Tokens not yet handed out were produced by the old
            // pattern: discard them and resume from the recorded offset.
            pos = ((Integer)vi.elementAt(count)).intValue();
            v.setSize(count);
            vi.setSize(count);
        }
        getMore();
        return nextToken();
    }
    /** Tells whether there are more tokens in the text. */
    public boolean hasMoreElements() {
        if(count >= v.size()) getMore();
        return count < v.size();
    }
    /** Tells whether there are more tokens in the text, but
         in the fashion of StringTokenizer. */
    public boolean hasMoreTokens() { return hasMoreElements(); }
    /** Determines the # of remaining tokens.  The rest of the text is
         scanned to do so, but the read position is put back afterwards,
         so no token is consumed. */
    public int countTokens() {
        int saved = count;
        while(hasMoreTokens())
            nextToken();
        count = saved;
        return v.size()-count;
    }
    /** Returns all tokens in the String, including any that have
         already been handed out. */
    public String[] allTokens() {
        countTokens();
        String[] ret = new String[v.size()];
        v.copyInto(ret);
        return ret;
    }
}