// // This software is now distributed according to // the Lesser Gnu Public License. Please see // http://www.gnu.org/copyleft/lesser.txt for // the details. // -- Happy Computing! // package com.stevesoft.ewe_pat; import com.stevesoft.ewe_pat.wrap.StringWrap; /** Like Skip, but implements a <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html"> Boyer-Moore-Horspool</a> type search method that has been modified to be more like a "T-search" (see the Michael Tamm''s article in <i>C'T, magazin fuer computer und technic</i>, August 97 p 292). Yet another important source of information for me was the <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/"> Deep Magic</a> article on string searching. As of this writing, I can beat String's indexOf method in many cases. @see com.stevesoft.ewe_pat.Skip @see com.stevesoft.ewe_pat.Skip2 */ public class SkipBMH extends Skip { // This number could be 256, but I think it's // big enough. Note, it must be a power of 2. final int MAX_CHAR = 64; final char[] skip = new char[MAX_CHAR]; int sm1; int jump_ahead = 0; char uc,lc,tc,x; final boolean exact(char c) { return (ign && anyc(c))||c==x; } final boolean anyc(char c) { return c==uc||c==lc||c==tc; } public SkipBMH(String pt,boolean ign) { this(pt,ign,0); } public SkipBMH(String pt) { this(pt,false,0); } public SkipBMH(String pt,boolean ign,int offset) { super(pt,ign,offset); for(int k=0;k<MAX_CHAR;k++) skip[k] = (char)src.length(); sm1 = src.length()-1; x = src.charAt(sm1); uc=CaseMgr.toUpperCase(x); lc=CaseMgr.toLowerCase(x); tc=CaseMgr.toTitleCase(x); // We don't really want 65536 long arrays in skip[], // so we mask of the higher bits. This can be combined // with ignore case, so accounting for upper // case costs us nothing extra. for(int k=0;k<src.length()-1;k++) { char x_ = src.charAt(k); if(ign) { char uc_ = CaseMgr.toUpperCase(x_); char lc_ = CaseMgr.toLowerCase(x_); char tc_ = CaseMgr.toTitleCase(x_); skip[uc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1); skip[lc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1); skip[tc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1); } else skip[x_ & (MAX_CHAR-1)] = (char)(src.length()-k-1); } // This trick can be found in the July issue of // C-T magazine. This makes the method a type of // "T-search." jump_ahead = src.length()-1; for(int k=0;k<src.length()-1;k++) { char y=src.charAt(sm1-k-1); if(exact(y)) { jump_ahead = k; break; } } } /** Set to true if you only want to compare two of the characters in the String. */ final public int searchRegion(String s,int start,int end) { return find(s,start,end); } final public int searchFrom(String s,int start) { return find(s,start,s.length()); } final public int search(String s) { return find(s,0,s.length()); } public int find(String s,int start,int end) { start += offset+sm1; int vend = min(s.length()-1,end+sm1+offset),k; int vend1 = vend-jump_ahead; if(ign) { for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( anyc(s.charAt(k)) ) { if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; } } for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( anyc(s.charAt(k)) ) { if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; if(k > vend) return -1; } } } else { for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( x==s.charAt(k) ) { //if(src.regionMatches(0,s,k-sm1,sm1)) if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; } } for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( x==s.charAt(k) ) { //if(src.regionMatches(0,s,k-sm1,sm1)) if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; if(k > vend) return -1; } } } return -1; } public int find(StringLike s,int start,int end) { if(s instanceof StringWrap) return find(s.toString(),start,end); start += offset+sm1; int vend = min(s.length()-1,end+sm1+offset),k; int vend1 = vend-jump_ahead; if(ign) { for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( anyc(s.charAt(k)) ) { if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; } } for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( anyc(s.charAt(k)) ) { if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; if(k > vend) return -1; } } } else { for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( x==s.charAt(k) ) { //if(src.regionMatches(0,s,k-sm1,sm1)) if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; } } for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) { // table look-up is expensive, avoid it if possible if( x==s.charAt(k) ) { //if(src.regionMatches(0,s,k-sm1,sm1)) if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1)) return k-sm1-offset; k += jump_ahead; if(k > vend) return -1; } } } return -1; } }