PatternMatcher.java example

Explorer
platform_frameworks_base-master
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.os;

import android.util.Log;

import java.util.Arrays;

/**
 * A simple pattern matcher, which is safe to use on untrusted data: it does
 * not provide full reg-exp support, only simple globbing that can not be
 * used maliciously.
 */
public class PatternMatcher implements Parcelable {
    /**
     * Pattern type: the given pattern must exactly match the string it is
     * tested against.
     */
    public static final int PATTERN_LITERAL = 0;
    
    /**
     * Pattern type: the given pattern must match the
     * beginning of the string it is tested against.
     */
    public static final int PATTERN_PREFIX = 1;
    
    /**
     * Pattern type: the given pattern is interpreted with a
     * simple glob syntax for matching against the string it is tested against.
     * In this syntax, you can use the '*' character to match against zero or
     * more occurrences of the character immediately before.  If the
     * character before it is '.' it will match any character.  The character
     * '\' can be used as an escape.  This essentially provides only the '*'
     * wildcard part of a normal regexp. 
     */
    public static final int PATTERN_SIMPLE_GLOB = 2;

    /**
     * Pattern type: the given pattern is interpreted with a regular
     * expression-like syntax for matching against the string it is tested
     * against. Supported tokens include dot ({@code .}) and sets ({@code [...]})
     * with full support for character ranges and the not ({@code ^}) modifier.
     * Supported modifiers include star ({@code *}) for zero-or-more, plus ({@code +})
     * for one-or-more and full range ({@code {...}}) support. This is a simple
     * evaulation implementation in which matching is done against the pattern in
     * realtime with no backtracking support.
     *
     * {@hide} Pending approval for public API
     */
    public static final int PATTERN_ADVANCED_GLOB = 3;

    // token types for advanced matching
    private static final int TOKEN_TYPE_LITERAL = 0;
    private static final int TOKEN_TYPE_ANY = 1;
    private static final int TOKEN_TYPE_SET = 2;
    private static final int TOKEN_TYPE_INVERSE_SET = 3;

    // Return for no match
    private static final int NO_MATCH = -1;

    private static final String TAG = "PatternMatcher";

    // Parsed placeholders for advanced patterns
    private static final int PARSED_TOKEN_CHAR_SET_START = -1;
    private static final int PARSED_TOKEN_CHAR_SET_INVERSE_START = -2;
    private static final int PARSED_TOKEN_CHAR_SET_STOP = -3;
    private static final int PARSED_TOKEN_CHAR_ANY = -4;
    private static final int PARSED_MODIFIER_RANGE_START = -5;
    private static final int PARSED_MODIFIER_RANGE_STOP = -6;
    private static final int PARSED_MODIFIER_ZERO_OR_MORE = -7;
    private static final int PARSED_MODIFIER_ONE_OR_MORE = -8;

    private final String mPattern;
    private final int mType;
    private final int[] mParsedPattern;


    private static final int MAX_PATTERN_STORAGE = 2048;
    // workspace to use for building a parsed advanced pattern;
    private static final int[] sParsedPatternScratch = new int[MAX_PATTERN_STORAGE];

    public PatternMatcher(String pattern, int type) {
        mPattern = pattern;
        mType = type;
        if (mType == PATTERN_ADVANCED_GLOB) {
            mParsedPattern = parseAndVerifyAdvancedPattern(pattern);
        } else {
            mParsedPattern = null;
        }
    }

    public final String getPath() {
        return mPattern;
    }
    
    public final int getType() {
        return mType;
    }
    
    public boolean match(String str) {
        return matchPattern(str, mPattern, mParsedPattern, mType);
    }

    public String toString() {
        String type = "? ";
        switch (mType) {
            case PATTERN_LITERAL:
                type = "LITERAL: ";
                break;
            case PATTERN_PREFIX:
                type = "PREFIX: ";
                break;
            case PATTERN_SIMPLE_GLOB:
                type = "GLOB: ";
                break;
            case PATTERN_ADVANCED_GLOB:
                type = "ADVANCED: ";
                break;
        }
        return "PatternMatcher{" + type + mPattern + "}";
    }
    
    public int describeContents() {
        return 0;
    }

    public void writeToParcel(Parcel dest, int flags) {
        dest.writeString(mPattern);
        dest.writeInt(mType);
        dest.writeIntArray(mParsedPattern);
    }
    
    public PatternMatcher(Parcel src) {
        mPattern = src.readString();
        mType = src.readInt();
        mParsedPattern = src.createIntArray();
    }
    
    public static final Parcelable.Creator<PatternMatcher> CREATOR
            = new Parcelable.Creator<PatternMatcher>() {
        public PatternMatcher createFromParcel(Parcel source) {
            return new PatternMatcher(source);
        }

        public PatternMatcher[] newArray(int size) {
            return new PatternMatcher[size];
        }
    };
    
    static boolean matchPattern(String match, String pattern, int[] parsedPattern, int type) {
        if (match == null) return false;
        if (type == PATTERN_LITERAL) {
            return pattern.equals(match);
        } if (type == PATTERN_PREFIX) {
            return match.startsWith(pattern);
        } else if (type == PATTERN_SIMPLE_GLOB) {
            return matchGlobPattern(pattern, match);
        } else if (type == PATTERN_ADVANCED_GLOB) {
            return matchAdvancedPattern(parsedPattern, match);
        }
        return false;
    }

    static boolean matchGlobPattern(String pattern, String match) {
        final int NP = pattern.length();
        if (NP <= 0) {
            return match.length() <= 0;
        }
        final int NM = match.length();
        int ip = 0, im = 0;
        char nextChar = pattern.charAt(0);
        while ((ip<NP) && (im<NM)) {
            char c = nextChar;
            ip++;
            nextChar = ip < NP ? pattern.charAt(ip) : 0;
            final boolean escaped = (c == '\\');
            if (escaped) {
                c = nextChar;
                ip++;
                nextChar = ip < NP ? pattern.charAt(ip) : 0;
            }
            if (nextChar == '*') {
                if (!escaped && c == '.') {
                    if (ip >= (NP-1)) {
                        // at the end with a pattern match, so
                        // all is good without checking!
                        return true;
                    }
                    ip++;
                    nextChar = pattern.charAt(ip);
                    // Consume everything until the next character in the
                    // pattern is found.
                    if (nextChar == '\\') {
                        ip++;
                        nextChar = ip < NP ? pattern.charAt(ip) : 0;
                    }
                    do {
                        if (match.charAt(im) == nextChar) {
                            break;
                        }
                        im++;
                    } while (im < NM);
                    if (im == NM) {
                        // Whoops, the next character in the pattern didn't
                        // exist in the match.
                        return false;
                    }
                    ip++;
                    nextChar = ip < NP ? pattern.charAt(ip) : 0;
                    im++;
                } else {
                    // Consume only characters matching the one before '*'.
                    do {
                        if (match.charAt(im) != c) {
                            break;
                        }
                        im++;
                    } while (im < NM);
                    ip++;
                    nextChar = ip < NP ? pattern.charAt(ip) : 0;
                }
            } else {
                if (c != '.' && match.charAt(im) != c) return false;
                im++;
            }
        }
        
        if (ip >= NP && im >= NM) {
            // Reached the end of both strings, all is good!
            return true;
        }
        
        // One last check: we may have finished the match string, but still
        // have a '.*' at the end of the pattern, which should still count
        // as a match.
        if (ip == NP-2 && pattern.charAt(ip) == '.'
            && pattern.charAt(ip+1) == '*') {
            return true;
        }
        
        return false;
    }

    /**
     * Parses the advanced pattern and returns an integer array representation of it. The integer
     * array treats each field as a character if positive and a unique token placeholder if
     * negative. This method will throw on any pattern structure violations.
     */
    synchronized static int[] parseAndVerifyAdvancedPattern(String pattern) {
        int ip = 0;
        final int LP = pattern.length();

        int it = 0;

        boolean inSet = false;
        boolean inRange = false;
        boolean inCharClass = false;

        boolean addToParsedPattern;

        while (ip < LP) {
            if (it > MAX_PATTERN_STORAGE - 3) {
                throw new IllegalArgumentException("Pattern is too large!");
            }

            char c = pattern.charAt(ip);
            addToParsedPattern = false;

            switch (c) {
                case '[':
                    if (inSet) {
                        addToParsedPattern = true; // treat as literal or char class in set
                    } else {
                        if (pattern.charAt(ip + 1) == '^') {
                            sParsedPatternScratch[it++] = PARSED_TOKEN_CHAR_SET_INVERSE_START;
                            ip++; // skip over the '^'
                        } else {
                            sParsedPatternScratch[it++] = PARSED_TOKEN_CHAR_SET_START;
                        }
                        ip++; // move to the next pattern char
                        inSet = true;
                        continue;
                    }
                    break;
                case ']':
                    if (!inSet) {
                        addToParsedPattern = true; // treat as literal outside of set
                    } else {
                        int parsedToken = sParsedPatternScratch[it - 1];
                        if (parsedToken == PARSED_TOKEN_CHAR_SET_START ||
                            parsedToken == PARSED_TOKEN_CHAR_SET_INVERSE_START) {
                            throw new IllegalArgumentException(
                                    "You must define characters in a set.");
                        }
                        sParsedPatternScratch[it++] = PARSED_TOKEN_CHAR_SET_STOP;
                        inSet = false;
                        inCharClass = false;
                    }
                    break;
                case '{':
                    if (!inSet) {
                        if (it == 0 || isParsedModifier(sParsedPatternScratch[it - 1])) {
                            throw new IllegalArgumentException("Modifier must follow a token.");
                        }
                        sParsedPatternScratch[it++] = PARSED_MODIFIER_RANGE_START;
                        ip++;
                        inRange = true;
                    }
                    break;
                case '}':
                    if (inRange) { // only terminate the range if we're currently in one
                        sParsedPatternScratch[it++] = PARSED_MODIFIER_RANGE_STOP;
                        inRange = false;
                    }
                    break;
                case '*':
                    if (!inSet) {
                        if (it == 0 || isParsedModifier(sParsedPatternScratch[it - 1])) {
                            throw new IllegalArgumentException("Modifier must follow a token.");
                        }
                        sParsedPatternScratch[it++] = PARSED_MODIFIER_ZERO_OR_MORE;
                    }
                    break;
                case '+':
                    if (!inSet) {
                        if (it == 0 || isParsedModifier(sParsedPatternScratch[it - 1])) {
                            throw new IllegalArgumentException("Modifier must follow a token.");
                        }
                        sParsedPatternScratch[it++] = PARSED_MODIFIER_ONE_OR_MORE;
                    }
                    break;
                case '.':
                    if (!inSet) {
                        sParsedPatternScratch[it++] = PARSED_TOKEN_CHAR_ANY;
                    }
                    break;
                case '\\': // escape
                    if (ip + 1 >= LP) {
                        throw new IllegalArgumentException("Escape found at end of pattern!");
                    }
                    c = pattern.charAt(++ip);
                    addToParsedPattern = true;
                    break;
                default:
                    addToParsedPattern = true;
                    break;
            }
            if (inSet) {
                if (inCharClass) {
                    sParsedPatternScratch[it++] = c;
                    inCharClass = false;
                } else {
                    // look forward for character class
                    if (ip + 2 < LP
                            && pattern.charAt(ip + 1) == '-'
                            && pattern.charAt(ip + 2) != ']') {
                        inCharClass = true;
                        sParsedPatternScratch[it++] = c; // set first token as lower end of range
                        ip++; // advance past dash
                    } else { // literal
                        sParsedPatternScratch[it++] = c; // set first token as literal
                        sParsedPatternScratch[it++] = c; // set second set as literal
                    }
                }
            } else if (inRange) {
                int endOfSet = pattern.indexOf('}', ip);
                if (endOfSet < 0) {
                    throw new IllegalArgumentException("Range not ended with '}'");
                }
                String rangeString = pattern.substring(ip, endOfSet);
                int commaIndex = rangeString.indexOf(',');
                try {
                    final int rangeMin;
                    final int rangeMax;
                    if (commaIndex < 0) {
                        int parsedRange = Integer.parseInt(rangeString);
                        rangeMin = rangeMax = parsedRange;
                    } else {
                        rangeMin = Integer.parseInt(rangeString.substring(0, commaIndex));
                        if (commaIndex == rangeString.length() - 1) { // e.g. {n,} (n or more)
                            rangeMax = Integer.MAX_VALUE;
                        } else {
                            rangeMax = Integer.parseInt(rangeString.substring(commaIndex + 1));
                        }
                    }
                    if (rangeMin > rangeMax) {
                        throw new IllegalArgumentException(
                            "Range quantifier minimum is greater than maximum");
                    }
                    sParsedPatternScratch[it++] = rangeMin;
                    sParsedPatternScratch[it++] = rangeMax;
                } catch (NumberFormatException e) {
                    throw new IllegalArgumentException("Range number format incorrect", e);
                }
                ip = endOfSet;
                continue; // don't increment ip
            } else if (addToParsedPattern) {
                sParsedPatternScratch[it++] = c;
            }
            ip++;
        }
        if (inSet) {
            throw new IllegalArgumentException("Set was not terminated!");
        }
        return Arrays.copyOf(sParsedPatternScratch, it);
    }

    private static boolean isParsedModifier(int parsedChar) {
        return parsedChar == PARSED_MODIFIER_ONE_OR_MORE ||
                parsedChar == PARSED_MODIFIER_ZERO_OR_MORE ||
                parsedChar == PARSED_MODIFIER_RANGE_STOP ||
                parsedChar == PARSED_MODIFIER_RANGE_START;
    }

    static boolean matchAdvancedPattern(int[] parsedPattern, String match) {

        // create indexes
        int ip = 0, im = 0;

        // one-time length check
        final int LP = parsedPattern.length, LM = match.length();

        // The current character being analyzed in the pattern
        int patternChar;

        int tokenType;

        int charSetStart = 0, charSetEnd = 0;

        while (ip < LP) { // we still have content in the pattern

            patternChar = parsedPattern[ip];
            // get the match type of the next verb

            switch (patternChar) {
                case PARSED_TOKEN_CHAR_ANY:
                    tokenType = TOKEN_TYPE_ANY;
                    ip++;
                    break;
                case PARSED_TOKEN_CHAR_SET_START:
                case PARSED_TOKEN_CHAR_SET_INVERSE_START:
                    tokenType = patternChar == PARSED_TOKEN_CHAR_SET_START
                            ? TOKEN_TYPE_SET
                            : TOKEN_TYPE_INVERSE_SET;
                    charSetStart = ip + 1; // start from the char after the set start
                    while (++ip < LP && parsedPattern[ip] != PARSED_TOKEN_CHAR_SET_STOP);
                    charSetEnd = ip - 1; // we're on the set stop, end is the previous
                    ip++; // move the pointer to the next pattern entry
                    break;
                default:
                    charSetStart = ip;
                    tokenType = TOKEN_TYPE_LITERAL;
                    ip++;
                    break;
            }

            final int minRepetition;
            final int maxRepetition;

            // look for a match length modifier
            if (ip >= LP) {
                minRepetition = maxRepetition = 1;
            } else {
                patternChar = parsedPattern[ip];
                switch (patternChar) {
                    case PARSED_MODIFIER_ZERO_OR_MORE:
                        minRepetition = 0;
                        maxRepetition = Integer.MAX_VALUE;
                        ip++;
                        break;
                    case PARSED_MODIFIER_ONE_OR_MORE:
                        minRepetition = 1;
                        maxRepetition = Integer.MAX_VALUE;
                        ip++;
                        break;
                    case PARSED_MODIFIER_RANGE_START:
                        minRepetition = parsedPattern[++ip];
                        maxRepetition = parsedPattern[++ip];
                        ip += 2; // step over PARSED_MODIFIER_RANGE_STOP and on to the next token
                        break;
                    default:
                        minRepetition = maxRepetition = 1; // implied literal
                        break;
                }
            }
            if (minRepetition > maxRepetition) {
                return false;
            }

            // attempt to match as many characters as possible
            int matched = matchChars(match, im, LM, tokenType, minRepetition, maxRepetition,
                    parsedPattern, charSetStart, charSetEnd);

            // if we found a conflict, return false immediately
            if (matched == NO_MATCH) {
                return false;
            }

            // move the match pointer the number of characters matched
            im += matched;
        }
        return ip >= LP && im >= LM; // have parsed entire string and regex
    }

    private static int matchChars(String match, int im, final int lm, int tokenType,
            int minRepetition, int maxRepetition, int[] parsedPattern,
            int tokenStart, int tokenEnd) {
        int matched = 0;

        while(matched < maxRepetition
                && matchChar(match, im + matched, lm, tokenType, parsedPattern, tokenStart,
                    tokenEnd)) {
            matched++;
        }

        return matched < minRepetition ? NO_MATCH : matched;
    }

    private static boolean matchChar(String match, int im, final int lm, int tokenType,
            int[] parsedPattern, int tokenStart, int tokenEnd) {
        if (im >= lm) { // we've overrun the string, no match
            return false;
        }
        switch (tokenType) {
            case TOKEN_TYPE_ANY:
                return true;
            case TOKEN_TYPE_SET:
                for (int i = tokenStart; i < tokenEnd; i += 2) {
                    char matchChar = match.charAt(im);
                    if (matchChar >= parsedPattern[i] && matchChar <= parsedPattern[i + 1]) {
                        return true;
                    }
                }
                return false;
            case TOKEN_TYPE_INVERSE_SET:
                for (int i = tokenStart; i < tokenEnd; i += 2) {
                    char matchChar = match.charAt(im);
                    if (matchChar >= parsedPattern[i] && matchChar <= parsedPattern[i + 1]) {
                        return false;
                    }
                }
                return true;
            case TOKEN_TYPE_LITERAL:
                return match.charAt(im) == parsedPattern[tokenStart];
            default:
                return false;
        }
    }
}