AddressParser.java example

Explorer
someluigis-peripherals-master
- slp_common
/*
 * AddressParser.java February 2001
 *
 * Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 */

package org.simpleframework.http.parse;

import org.simpleframework.http.Address;
import org.simpleframework.http.Path;
import org.simpleframework.http.Query;
import org.simpleframework.util.KeyMap;
import org.simpleframework.util.parse.Parser;

/**
 * This parser is used to parse uniform resource identifiers. The uniform
 * resource identifier syntax is given in RFC 2396. This parser can parse
 * relative and absolute URI's. The uniform resource identifier syntax that this
 * parser will parse are based on the generic web based URL similar to the
 * syntax represented in RFC 2616 section 3.2.2. The syntax used to parse this
 * URI is a modified version of RFC 2396
 * 
 * <pre>
 * 
 *    URI         = (absoluteURI | relativeURI)
 *    absoluteURI = scheme ":" ("//" netpath | relativeURI)
 *    relativeURI = path ["?" querypart]
 *    netpath     = domain [":" port] relativeURI
 *    path        = *("/" segment)
 *    segment     = *pchar *( ";" param )
 * 
 * </pre>
 * 
 * This implements the <code>Address</code> interface and provides methods that
 * access the various parts of the URI. The parameters in the path segments of
 * the uniform resource identifier are stored in name value pairs. If parameter
 * names are not unique across the path segments then only the deepest parameter
 * will be stored from the path segment. For example if the URI represented was
 * <code>http://domain/path1;x=y/path2;x=z</code> the value for the parameter
 * named <code>x</code> would be <code>z</code>.
 * <p>
 * This will normalize the path part of the uniform resource identifier. A
 * normalized path is one that contains no back references like "./" and "../".
 * The normalized path will not contain the path parameters.
 * <p>
 * The <code>setPath</code> method is used to reset the path this uniform
 * resource identifier has, it also resets the parameters. The parameters are
 * extracted from the new path given.
 * 
 * @author Niall Gallagher
 */
public class AddressParser extends Parser implements Address {

    /**
     * Parameters are stored so that they can be viewed. The map is cleared by
     * <code>init</code> and by <code>setPath(String)</code> before a path is
     * parsed again.
     */
    private ParameterMap param;

    /**
     * This is the path used to represent the address path. It is created on
     * demand by <code>getPath</code>, can be assigned with
     * <code>setPath(Path)</code>, and is reset to <code>null</code> whenever
     * a new URI or path is parsed.
     */
    private Path normal;

    /**
     * This contains the query parameters for the address. It is created on
     * demand by <code>getQuery</code> and is reset to <code>null</code> by
     * <code>init</code> and <code>setQuery(String)</code>.
     */
    private Query data;

    /**
     * Used to track the characters that form the path.
     */
    private Token path;

    /**
     * Used to track the characters that form the domain.
     */
    private Token domain;

    /**
     * Used to track the characters that form the query.
     */
    private Token query;

    /**
     * Used to track the name characters of a parameter. This is a scratch
     * token that is cleared after each parameter has been inserted.
     */
    private Token name;

    /**
     * Used to track the value characters of a parameter. This is a scratch
     * token that is cleared after each parameter has been inserted.
     */
    private Token value;

    /**
     * References the scheme that this URI contains.
     */
    private Token scheme;

    /**
     * Contains the port number if it was specified. It is reset to zero
     * before each parse, and <code>getPort</code> reports any value of zero
     * or less as <code>-1</code>.
     */
    private int port;

    /**
     * Default constructor will create a <code>AddressParser</code> that
     * contains no specifics. The instance will return <code>null</code> for all
     * the get methods. The parsers get methods are populated by using the
     * <code>parse</code> method.
     */
    public AddressParser() {
        // tokens that track the parts of the URI
        scheme = new Token();
        domain = new Token();
        path = new Token();
        query = new Token();
        // scratch tokens that are reused for every path parameter
        name = new Token();
        value = new Token();
        // storage for the parameters extracted from the path
        param = new ParameterMap();
    }

    /**
     * This is primarily a convenience constructor. This will parse the
     * <code>String</code> given to extract the specifics. This could be
     * achieved by calling the default no-arg constructor and then using the
     * instance to invoke the <code>parse</code> method on that
     * <code>String</code> to extract the parts.
     * 
     * @param text
     *            a <code>String</code> containing a URI value
     */
    public AddressParser(String text) {
        this(); // create the tokens and the parameter map first
        parse(text); // then populate them from the given URI
    }

    /**
     * This allows the scheme of the URL given to be returned. If the URI does
     * not contain a scheme then this will return null. The scheme of the URI is
     * the part that specifies the type of protocol that the URI is used for, an
     * example <code>gopher://domain/path</code> is a URI that is intended for
     * the gopher protocol. The scheme is the string <code>gopher</code>.
     * 
     * @return this returns the scheme tag for the URI if there is one specified
     *         for it
     */
    @Override
    public String getScheme() {
        // the token supplies the scheme text
        final String text = scheme.toString();

        return text;
    }

    /**
     * This is used to retrieve the domain of this URI. The domain part in the
     * URI is an optional part, an example
     * <code>http://domain/path?querypart</code>. This will return the value of
     * the domain part. If there is no domain part then this will return null
     * otherwise the domain value found in the uniform resource identifier.
     * 
     * @return the domain part of this uniform resource identifier this
     *         represents
     */
    @Override
    public String getDomain() {
        // the token supplies the domain text
        final String text = domain.toString();

        return text;
    }

    /**
     * This is used to retrieve the path of this URI. The path part is the most
     * fundamental part of the URI. This will return the value of the path. If
     * there is no path part then this will return <code>/</code> to indicate
     * the root.
     * <p>
     * The <code>Path</code> object returned by this will contain no path
     * parameters. The path parameters are available using the
     * <code>Address</code> methods. The reason that this does not contain any
     * of the path parameters is so that if the path is needed to be converted
     * into an OS specific path then the path parameters will not need to be
     * separately parsed out.
     * <p>
     * The path object is created on first use and then reused until the path
     * changes. A path that was assigned with <code>setPath(Path)</code> is
     * returned as it was given, even when no path text has been parsed.
     * 
     * @return the path that this URI contains, this value will not contain any
     *         back references such as "./" and "../" or any path parameters
     */
    @Override
    public Path getPath() {
        /* resolved on every call, as before, so the token text stays cached */
        String text = this.path.toString();

        if (this.normal == null) {
            /* no path text means the root path */
            this.normal = new PathParser(text == null ? "/" : text);
        }
        return this.normal;
    }

    /**
     * This is used to retrieve the query of this URI. The query part in the URI
     * is an optional part. This will return the value of the query part. If
     * there is no query part then this will return an empty <code>Query</code>
     * object. The query is an optional member of a URI and comes after the path
     * part, it is preceded by a question mark, <code>?</code> character. For
     * example the following URI contains <code>query</code> for its query part,
     * <code>http://host:port/path?query</code>.
     * <p>
     * This returns a <code>org.simpleframework.http.Query</code> object that
     * can be used to interact directly with the query values. The
     * <code>Query</code> object is a read-only interface to the query
     * parameters, and so will not affect the URI.
     * <p>
     * The query object is created on first use and then reused until the
     * query changes. A query that was assigned with
     * <code>setQuery(Query)</code> is returned as it was given, even when no
     * query text has been parsed.
     * 
     * @return a <code>Query</code> object for the query part
     */
    @Override
    public Query getQuery() {
        /* resolved on every call, as before, so the token text stays cached */
        String text = this.query.toString();

        if (this.data == null) {
            if (text == null) {
                this.data = new QueryParser(); /* empty query */
            } else {
                this.data = new QueryParser(text);
            }
        }
        return this.data;
    }

    /**
     * This is used to retrieve the port of the uniform resource identifier. The
     * port part in this is an optional part, an example
     * <code>http://host:port/path?querypart</code>. This will return the value
     * of the port. If there is no port then this will return <code>-1</code>
     * because this represents an impossible uniform resource identifier port.
     * The port is an optional part.
     * 
     * @return this returns the port of the uniform resource identifier
     */
    @Override
    public int getPort() {
        if (port > 0) {
            return port;
        }
        return -1; // zero or less means that no port was specified
    }

    /**
     * This extracts the parameter values from the uniform resource identifier
     * represented by this object. The parameters that a uniform resource
     * identifier contains are embedded in the path part of the URI. If the path
     * contains no parameters then this will return an empty <code>Map</code>
     * instance.
     * <p>
     * This will produce unique name and value parameters. Thus if the URI
     * contains several path segments with similar parameter names this will
     * return the deepest parameter. For example if the URI represented was
     * <code>http://domain/path1;x=y/path2;x=z</code> the value for the
     * parameter named <code>x</code> would be <code>z</code>.
     * 
     * @return this will return the parameter names found in the URI
     */
    @Override
    public KeyMap<String> getParameters() {
        // the live map is handed out, not a copy
        final KeyMap<String> map = param;

        return map;
    }

    /**
     * This allows the scheme for the URI to be specified. If the URI does not
     * contain a scheme then this will attach the scheme and the
     * <code>://</code> identifier to ensure that the
     * <code>Address.toString</code> will produce the correct syntax.
     * <p>
     * Caution must be taken to ensure that the port and the scheme are
     * consistent. So if the original URI was <code>http://domain:80/path</code>
     * and the scheme was changed to <code>ftp</code> the port number that
     * remains is the standard HTTP port not the FTP port.
     * 
     * @param value
     *            this specifies the protocol this URI is intended for
     */
    public void setScheme(String value) {
        final Token token = scheme;

        token.value = value; // replaces the text held by the scheme token
    }

    /**
     * This will set the domain to whatever value is in the string parameter. If
     * the string is null then this URI objects <code>toString</code> method
     * will not contain the domain. The result of the <code>toString</code>
     * method will be <code>/path/path?query</code>. If the path is non-null
     * this URI will contain the path.
     * 
     * @param value
     *            this will be the new domain of this uniform resource
     *            identifier, if it is not null
     */
    public void setDomain(String value) {
        final Token token = domain;

        token.value = value; // replaces the text held by the domain token
    }

    /**
     * This will set the port to whatever value it is given. If the value is 0
     * or less then the <code>toString</code> will not contain the optional
     * port. If port number is above 0 then the <code>toString</code> method
     * will produce a URI like <code>http://host:123/path</code> but only if
     * there is a valid domain.
     * 
     * @param port
     *            the port value that this URI is to have
     */
    public void setPort(int port) {
        // stored as given; getPort reports zero or negative values as -1
        this.port = port;
    }

    /**
     * This will set the path to whatever value it is given. If the value is
     * null then it will be interpreted as <code>/</code>. A value that does
     * not begin with <code>/</code> has one prepended.
     * <p>
     * This will reset the parameters this URI has. If the value given to this
     * method has embedded parameters these will form the parameters of this
     * URI. The value given may not be the same value that the
     * <code>getPath</code> produces. The path will have all back references and
     * parameters stripped.
     * 
     * @param text
     *            the path that this URI is to be set with, may be null
     */
    public void setPath(String text) {
        if (text == null) {
            text = "/"; /* a null path means the root */
        } else if (!text.startsWith("/")) {
            text = "/" + text;
        }
        /*
         * Resolve the other tokens first so that their text is cached; the
         * parse below overwrites the buffer with the new path.
         */
        this.domain.toString();
        this.query.toString();
        this.scheme.toString();
        this.param.clear();
        this.path.clear();
        this.parsePath(text); /* extract params */
    }

    /**
     * This will set the path to whatever value it is given. If the value is
     * null then this <code>Address.toString</code> method will not contain the
     * path, that is if path is null then it will be interpreted as
     * <code>/</code>.
     * <p>
     * This will reset the parameters this URI has. If the value given to this
     * method has embedded parameters these will form the parameters of this
     * URI. The value given may not be the same value that the
     * <code>getPath</code> produces. The path will have all back references and
     * parameters stripped.
     * 
     * @param path
     *            the path that this URI is to be set with
     */
    public void setPath(Path path) {
        if (path == null) {
            setPath("/"); // no path object means the root path
            return;
        }
        normal = path;
    }

    /**
     * This is used to parse the path given with the <code>setPath</code>
     * method. The path contains name and value pairs. These parameters are
     * embedded into the path segments using a semicolon character, ';'. Since
     * the parameters do not form part of the actual path mapping they are
     * removed from the path and stored. Each parameter can then be extracted
     * from this parser using the methods provided by the <code>Address</code>
     * interface.
     * 
     * @param path
     *            this is the path that is to be parsed and have the parameter
     *            values extracted
     */
    private void parsePath(String path) {
        final int size = path.length();

        // load the text into the parse buffer
        this.count = size;
        this.ensureCapacity(size);
        path.getChars(0, size, this.buf, 0);

        // start reading from the beginning and drop the old path object
        this.off = 0;
        this.normal = null;
        this.path();
    }

    /**
     * This will set the query to whatever value it is given. If the value is
     * null then this <code>Address.toString</code> method will not contain the
     * query. If the query was <code>abc</code> then the <code>toString</code>
     * method would produce a string like <code>http://host:port/path?abc</code>
     * . If the query is null this URI would have no query part. The query must
     * not contain the <code>?</code> character.
     * 
     * @param value
     *            the query that this uniform resource identifier is to be set
     *            to if it is non-null
     */
    public void setQuery(String value) {
        data = null; // drop the query object built from the old text
        query.value = value;
    }

    /**
     * This will set the query to the object it is given. The object is stored
     * as the query of this address. If the object given is null then this
     * behaves like <code>setQuery(String)</code> called with an empty string,
     * so that the query will be empty.
     * 
     * @param query
     *            a <code>Query</code> object that contains the name value
     *            parameters for the query, may be null
     */
    public void setQuery(Query query) {
        /* test the argument, not the unrelated value token of this parser */
        if (query != null) {
            this.data = query;
        } else {
            this.setQuery("");
        }
    }

    /**
     * This will check to see what type of URI this is if it is an
     * <code>absoluteURI</code> or a <code>relativeURI</code>. To see the
     * definition of a URI see RFC 2616 for the definition of a URL and for more
     * specifics see RFC 2396 for the expressions.
     */
    @Override
    protected void parse() {
        if (count <= 0) {
            return; // there is nothing to parse
        }
        // a leading slash means there is no scheme or domain to read
        if (buf[0] != '/') {
            absoluteURI();
        } else {
            relativeURI();
        }
    }

    /**
     * This will empty each tokens cache. A tokens cache is used to represent a
     * token once the token's <code>toString</code> method has been called. Thus
     * when the <code>toString</code> method is called then the token depends on
     * the value of the cache alone in further calls to <code>toString</code>.
     * However if a URI has just been parsed and that method has not been
     * invoked then the cache is created from the buf if its length is greater
     * than zero.
     */
    @Override
    protected void init() {
        // forget the tokens of the previous URI
        scheme.clear();
        domain.clear();
        path.clear();
        query.clear();
        param.clear();

        // drop the objects that were built from the previous URI
        normal = null;
        data = null;

        // no port yet, and reading starts at the beginning of the buffer
        port = 0;
        off = 0;
    }

    /**
     * This is a specific definition of a type of URI. An absolute URI is a URI
     * that contains a host and port. It is the most frequently used type of
     * URI. This will define the host and the optional port part. As well as the
     * relative URI part. This uses a simpler syntax than the one specified in
     * RFC 2396 <code><pre>
     * 
     *    absoluteURI = scheme ":" ("//" netpath | relativeURI)
     *    relativeURI = path ["?" querypart]
     *    netpath     = domain [":" port] relativeURI
     *    path        = *("/" segment)
     *    segment     = *pchar *( ";" param )
     * 
     * </pre></code> This syntax is sufficient to handle HTTP style URI's as well
     * as GOPHER and FTP and various other 'simple' schemes. See RFC 2396 for
     * the syntax of an <code>absoluteURI</code>.
     */
    private void absoluteURI() {
        scheme(); // optional scheme tag
        netPath(); // domain, optional port, then the relative part
    }

    /**
     * This will check to see if there is a scheme in the URI. If a scheme is
     * found it is recorded in the scheme token and the read offset is moved
     * past the scheme tag, for example past "ftp://" or "http://".
     * <p>
     * The syntax for the scheme is given in RFC 2396 as follows <code><pre>
     * 
     *    scheme = alpha *( alpha | digit | "+" | "-" | "." )
     * 
     * </pre></code> This also skips the "://" that follows the tag, so if
     * the URI was <code>gopher://domain/path</code> then what remains to be
     * read would be <code>domain/path</code> afterwards.
     * <p>
     * A scheme is only accepted when it is terminated by "://". If a
     * character that is not a scheme character is found, if the ":" is not
     * followed by "//", or if the buffer ends before a ":" is seen, then the
     * read offset is restored and the scheme is left empty. This means text
     * such as <code>localhost</code> is left to be read as a domain.
     */
    private void scheme() {
        int mark = this.off;
        int pos = this.off;
        boolean found = false;

        if (this.off < this.count && this.alpha(this.buf[this.off])) {
            while (this.off < this.count) {
                char next = this.buf[this.off++];

                if (this.schemeChar(next)) {
                    pos++;
                    continue;
                }
                /* only a ":" followed by "//" terminates a scheme */
                found = next == ':' && this.skip("//");
                break;
            }
            if (!found) {
                /* not a scheme, rewind so the text is read as a domain */
                this.off = mark;
                pos = mark;
            }
            this.scheme.len = pos - mark;
            this.scheme.off = mark;
        }
    }

    /**
     * This method is used to assist the scheme method. This will check to see
     * if the type of the character is the same as those described in RFC 2396
     * for a scheme character. The scheme tag can contain an alphanumeric of the
     * following <code>"+", "-", "."</code>.
     * 
     * @param c
     *            this is the character that is being checked
     * 
     * @return this returns true if the character is a valid scheme character
     */
    private boolean schemeChar(char c) {
        // the three punctuation characters that a scheme may contain
        if (c == '+' || c == '-' || c == '.') {
            return true;
        }
        return alphanum(c);
    }

    /**
     * The network path is the path that contains the network address of the
     * host that this URI is targeted at. This will parse the domain name of the
     * host and also a port number before parsing a relativeURI <code><pre>
     * 
     *    netpath     = domain [":" port] relativeURI
     * 
     * </pre></code> This syntax is modified from the URI specification on RFC
     * 2396.
     */
    private void netPath() {
        domain();

        // a colon after the domain introduces the port number
        final boolean hasPort = skip(":");

        if (hasPort) {
            port();
        }
        relativeURI();
    }

    /**
     * This is a specific definition of a type of URI. A relative URI is a URI
     * that contains no host or port. It is basically the resource within the
     * host. This will extract the path and the optional query part of the URI.
     * Rfc2396 has the proper definition of a <code>relativeURI</code>.
     */
    private void relativeURI() {
        path();

        // a question mark after the path introduces the query
        final boolean hasQuery = skip("?");

        if (hasQuery) {
            query();
        }
    }

    /**
     * This is used to extract the optional port from a given URI. This will
     * read a sequence of digit characters and convert the <code>String</code>
     * of digit characters into a decimal number. The digits will be added to
     * the port variable. If there is no port number this will not update the
     * read offset.
     */
    private void port() {
        // accumulate decimal digits until a non digit or the end is reached
        for (; this.off < this.count && this.digit(this.buf[this.off]); this.off++) {
            this.port = (this.port * 10) + (this.buf[this.off] - '0');
        }
    }

    /**
     * This is used to extract the domain from the given URI. This will firstly
     * initialize the token object that represents the domain. This allows the
     * token's <code>toString</code> method to return the extracted value of the
     * token rather than getting confused with previous values set by a previous
     * parse method.
     * <p>
     * This uses the following delimiters to determine the end of the domain
     * <code>?</code>,<code>:</code> and <code>/</code>. This
     * ensures that the read offset does not go out of bounds and
     * consequently throw an <code>IndexOutOfBoundsException</code>.
     */
    private void domain() {
        final int start = this.off;

        while (this.off < this.count) {
            final char next = this.buf[this.off];

            // any of these delimiters marks the end of the domain
            if (next == '/' || next == ':' || next == '?') {
                break;
            }
            this.off++;
        }
        this.domain.off = start;
        this.domain.len = this.off - start;
    }

    /**
     * This is used to extract the segments from the given URI. This will
     * firstly initialize the token object that represents the path. This allows
     * the token's <code>toString</code> method to return the extracted value of
     * the token rather than getting confused with previous values set by a
     * previous parse method.
     * <p>
     * This is slightly different from RFC 2396 in that it defines a pchar as
     * the RFC 2396 definition of a pchar without the escaped chars. So this
     * method has to ensure that no escaped chars go unchecked. This ensures
     * that the read offset does not go out of bounds and throw an
     * <code>IndexOutOfBoundsException</code>.
     */
    private void path() {
        int mark = this.off; // where the path begins in the buffer
        int pos = this.off; // write position for the path text that is kept

        // each pass of this loop consumes one "/" and the segment after it
        while (this.skip("/")) {
            this.buf[pos++] = '/';

            while (this.off < this.count) {
                if (this.buf[this.off] == ';') {
                    // parameters end the segment text; each one is parsed
                    // and stored, and none of it is copied to the path
                    while (this.skip(";")) {
                        this.param();
                        this.insert();
                    }
                    break;
                }
                if (this.buf[this.off] == '%') {
                    // give the escape sequence a chance to be decoded
                    // before the character at the read offset is copied
                    this.escape();
                } else if (!this.pchar(this.buf[this.off])) {
                    break; // not a path character, the segment ends here
                }
                // copy from the read offset down to the write position
                this.buf[pos++] = this.buf[this.off++];
            }
        }
        // the token covers only the text that was written, which can be
        // shorter than the text that was read
        this.path.len = pos - mark;
        this.path.off = mark;
    }

    /**
     * This is used to extract the query from the given URI. This will firstly
     * initialize the token object that represents the query. This allows the
     * token's <code>toString</code> method to return the extracted value of the
     * token rather than getting confused with previous values set by a previous
     * parse method. The calculation of the query part of a URI is basically the
     * end of the URI.
     */
    private void query() {
        // everything from the read offset to the end of the URI is the query
        final int start = this.off;

        this.query.off = start;
        this.query.len = this.count - start;
    }

    /**
     * This is an expression that is defined by RFC 2396 it is used in the
     * definition of a segment expression. This is basically a list of pchars.
     * <p>
     * This method has to ensure that no escaped chars go unchecked. This
     * ensures that the read offset does not go out of bounds and consequently
     * throw an out of bounds exception.
     */
    private void param() {
        this.name();

        if (!this.skip("=")) {
            return; // malformed parameter, there is no value to read
        }
        this.value();
    }

    /**
     * This extracts the name of the parameter from the character buffer. The
     * name of a parameter is defined as a set of pchars including escape
     * sequences. This will extract the parameter name and buffer the chars. The
     * name ends when a equals character, "=", is encountered or in the case of
     * a malformed parameter when the next character is not a pchar.
     */
    private void name() {
        final int start = this.off;
        int write = start;

        while (this.off < this.count) {
            final char next = this.buf[this.off];

            if (next == '%') {
                this.escape(); // try to decode the escaped sequence
            } else if (next == '=' || !this.pchar(next)) {
                break; // the name stops at "=" or at the first non pchar
            }
            // the buffer is read again as escape may have changed it
            this.buf[write++] = this.buf[this.off++];
        }
        this.name.off = start;
        this.name.len = write - start;
    }

    /**
     * This extracts a parameter value from a path segment. The parameter value
     * consists of a sequence of pchars and some escape sequences. The parameter
     * value is buffered so that the name and values can be paired. The end of
     * the value is determined as the end of the buffer or the last pchar.
     */
    private void value() {
        final int start = this.off;
        int write = start;

        // both positions advance together once a character has been kept
        for (; this.off < this.count; write++, this.off++) {
            final char next = this.buf[this.off];

            if (next == '%') {
                this.escape(); // try to decode the escaped sequence
            } else if (!this.pchar(next)) {
                break; // the value stops at the first non pchar
            }
            // the buffer is read again as escape may have changed it
            this.buf[write] = this.buf[this.off];
        }
        this.value.off = start;
        this.value.len = write - start;
    }

    /**
     * This method adds the name and value to a map so that the next name and
     * value can be collected. The name and value are added to the map as string
     * objects. Once added to the map the <code>Token</code> objects are set to
     * have zero length so they can be reused to collect further values. This
     * will add the values to the map as an array of type string. This is done
     * so that if there are multiple values that they can be stored.
     */
    private void insert() {
        // a parameter is only stored when both parts are present
        final boolean complete = value.length() > 0 && name.length() > 0;

        if (complete) {
            insert(name, value);
        }
        // reset the scratch tokens for the next parameter
        name.clear();
        value.clear();
    }

    /**
     * This will add the given name and value to the parameters map. This will
     * only store a single value per parameter name, so only the parameter that
     * was latest encountered will be saved. The <code>getQuery</code> method
     * can be used to collect the parameter values using the parameter name.
     * 
     * @param name
     *            this is the name of the value to be inserted
     * @param value
     *            this is the value of a that is to be inserted
     */
    private void insert(Token name, Token value) {
        final String key = name.toString();
        final String text = value.toString();

        insert(key, text);
    }

    /**
     * This will add the given name and value to the parameters map. This will
     * only store a single value per parameter name, so only the parameter that
     * was latest encountered will be saved. The <code>getQuery</code> method
     * can be used to collect the parameter values using the parameter name.
     * 
     * @param name
     *            this is the name of the value to be inserted
     * @param value
     *            this is the value of a that is to be inserted
     */
    private void insert(String name, String value) {
        // stored in the parameter map under the given name
        param.put(name, value);
    }

    /**
     * This converts an encountered escaped sequence, that is all embedded
     * hexadecimal characters into a native UCS character value. This does not
     * take any characters from the stream it just prepares the buffer with the
     * correct byte. The escaped sequence within the URI will be interpreted as
     * UTF-8.
     * <p>
     * This will leave the next character to read from the buffer as the
     * character encoded from the URI. If there is a fully valid escaped
     * sequence, that is <code>"%" HEX HEX</code>. This decodes the escaped
     * sequence using UTF-8 encoding, all encoded sequences should be in UCS-2
     * to fit in a Java char.
     */
    private void escape() {
        final int octet = peek(off);

        // try UTF-8 first and fall back to a plain binary octet
        if (unicode(octet)) {
            return;
        }
        binary(octet);
    }

    /**
     * This method determines, using a peek character, whether the sequence of
     * escaped characters within the URI is binary data. If the data within the
     * escaped sequence is binary then this will ensure that the next character
     * read from the URI is the binary octet. This is used strictly for backward
     * compatible parsing of URI strings, binary data should never appear.
     * 
     * @param peek
     *            this is the first escaped character from the URI
     * 
     * @return currently this implementation always returns true
     */
    private boolean binary(int peek) {
        final int last = this.off + 2;

        // only when the two characters after the percent sign are in range
        if (last < this.count) {
            this.off = last;
            this.buf[last] = this.bits(peek);
        }
        return true;
    }

    /**
     * This method determines, using a peek character, whether the sequence of
     * escaped characters within the URI is in UTF-8. If a UTF-8 character can
     * be successfully decoded from the URI it will be the next character read
     * from the buffer. This can check for both UCS-2 and UCS-4 characters.
     * However, because the Java <code>char</code> can only hold UCS-2, the
     * UCS-4 characters will have only the low order octets stored.
     * <p>
     * The WWW Consortium provides a reference implementation of a UTF-8
     * decoding for Java, in this the low order octets in the UCS-4 sequence are
     * used for the character. So, in the absence of a defined behaviour, the
     * W3C behaviour is assumed.
     * 
     * @param peek
     *            this is the first escaped character from the URI
     * 
     * @return this returns true if a UTF-8 character is decoded
     */
    private boolean unicode(int peek) {
        // the high bits of the first octet select how many octets follow
        if ((peek & 0x80) == 0x00) {
            return unicode(peek, 0);
        } else if ((peek & 0xe0) == 0xc0) {
            return unicode(peek & 0x1f, 1);
        } else if ((peek & 0xf0) == 0xe0) {
            return unicode(peek & 0x0f, 2);
        } else if ((peek & 0xf8) == 0xf0) {
            return unicode(peek & 0x07, 3);
        } else if ((peek & 0xfc) == 0xf8) {
            return unicode(peek & 0x03, 4);
        } else if ((peek & 0xfe) == 0xfc) {
            return unicode(peek & 0x01, 5);
        }
        return false; // the first octet matches none of the patterns
    }

    /**
     * This method will decode the specified amount of escaped characters from
     * the URI and convert them into a single Java UCS-2 character. If there are
     * not enough characters within the URI then this will return false and
     * leave the URI alone.
     * <p>
     * The number of characters left is determined from the first UTF-8 octet,
     * as specified in RFC 2279, and because this is a URI there must be that
     * number of <code>"%" HEX HEX</code> sequences left. If successful the next
     * character read is the UTF-8 sequence decoded into a native UCS-2
     * character.
     * 
     * @param peek
     *            contains the bits read from the first UTF octet
     * @param more
     *            this specifies the number of UTF octets left
     * 
     * @return this returns true if a UTF-8 character is decoded
     */
    private boolean unicode(int peek, int more) {
        // Each trailing octet is a three character "%XX" escape; the start of
        // the last one must lie inside the buffer for decoding to be tried.
        final int last = this.off + (more * 3);

        return (last < this.count) && this.unicode(peek, more, this.off);
    }

    /**
     * This will decode the specified amount of trailing UTF-8 bits from the
     * URI. The trailing bits are those following the first UTF-8 octet, which
     * specifies the length, in octets, of the sequence. The trailing octets are
     * of the form 10xxxxxx, for each of these octets only the last six bits are
     * valid UCS bits. So a conversion is basically an accumulation of these.
     * <p>
     * If at any point during the accumulation of the UTF-8 bits there is a
     * parsing error, then parsing is aborted and false is returned; as a
     * result the URI is left unchanged.
     * 
     * @param peek
     *            bytes that have been accumulated from the URI
     * @param more
     *            this specifies the number of UTF octets left
     * @param pos
     *            this specifies the position the parsing begins
     * 
     * @return this returns true if a UTF-8 character is decoded
     */
    private boolean unicode(int peek, int more, int pos) {
        // Remember the sequence length; the loop below consumes "more".
        final int trailing = more;

        while (more-- > 0) {
            // The octet at pos must itself be an escape before looking ahead.
            if (this.buf[pos] != '%') {
                return false;
            }
            int next = pos + 3;
            int hex = this.peek(next);

            // Trailing octets must be 10xxxxxx. An invalid escape yields -1
            // from peek, which also fails this test.
            if ((hex & 0xc0) != 0x80) {
                return false;
            }
            peek = (peek << 6) | (hex & 0x3f);
            pos = next;
        }
        if (trailing > 0) {
            // Reject overlong forms (RFC 3629): a value that fits a shorter
            // encoding must not be accepted. Otherwise "%C0%AF" would decode
            // to '/' and "%C0%AE" to '.', letting reserved characters slip
            // past checks made on the raw URI. The smallest legal values are
            // 0x80, 0x800, 0x10000, 0x200000 and 0x4000000 for one to five
            // trailing octets.
            int smallest = (trailing == 1) ? 0x80 : (1 << ((5 * trailing) + 1));

            if (peek < smallest) {
                return false;
            }
            // NOTE(review): values above 0xFFFF are still truncated to their
            // low sixteen bits by bits(); that documented behaviour is kept.
        }
        // Nothing has been modified up to here, so every false return above
        // leaves the URI unchanged.
        if ((pos + 2) < this.count) {
            // Store the decoded character in the last slot of the final
            // escape and make it the next character read.
            this.off = pos + 2;
            this.buf[this.off] = this.bits(peek);
        }
        return true;
    }

    /**
     * Defines behaviour for UCS-2 versus UCS-4 conversion from four octets. The
     * UTF-8 encoding scheme enables UCS-4 characters to be encoded and
     * decoded. However, Java supports the 16-bit UCS-2 character set, and so
     * the 32-bit UCS-4 character set is not compatible. This basically decides
     * what to do with UCS-4.
     * 
     * @param data
     *            up to four octets to be converted to UCS-2 format
     * 
     * @return this returns a native UCS-2 character from the int
     */
    private char bits(int data) {
        // Only the low sixteen bits survive; higher UCS-4 bits are discarded.
        final int low = data & 0xffff;

        return (char) low;
    }

    /**
     * This will return the escape expression specified from the URI as an
     * integer value of the hexadecimal sequence. This does not make any changes
     * to the buffer; it simply checks to see if the characters at the position
     * specified are an escaped set of characters of the form
     * <code>"%" HEX HEX</code>. If so, then it will convert that hexadecimal
     * string into an integer value, or -1 if the expression is not
     * hexadecimal.
     * 
     * @param pos
     *            this is the position the expression starts from
     * 
     * @return the integer value of the hexadecimal expression
     */
    private int peek(int pos) {
        // An escape has to begin with a percent sign.
        if (this.buf[pos] != '%') {
            return -1;
        }
        // Both hexadecimal digits must lie before the end of the data.
        if ((pos + 2) >= this.count) {
            return -1;
        }
        return this.convert(this.buf[pos + 1], this.buf[pos + 2]);
    }

    /**
     * This will convert the two hexadecimal characters to a real integer value,
     * which is returned. This requires characters within the range of 'A' to
     * 'F' and 'a' to 'f', and also the digits '0' to '9'. The characters are
     * encoded using the ISO-8859-1 encoding scheme; if the characters are not
     * within the range specified then this returns -1.
     * 
     * @param high
     *            this is the high four bits within the integer
     * @param low
     *            this is the low four bits within the integer
     * 
     * @return this returns the integer value of the conversion
     */
    private int convert(char high, char low) {
        // Both characters must be ASCII hexadecimal digits.
        if (!this.hex(high) || !this.hex(low)) {
            return -1;
        }
        // After the guard above, radix 16 conversion gives 0..15 for each.
        int upper = Character.digit(high, 16);
        int lower = Character.digit(low, 16);

        return (upper << 4) | lower;
    }

    /**
     * This is used to determine whether a char is a hexadecimal
     * <code>char</code> or not. A hexadecimal character is considered to be a
     * character within the range of <code>0 - 9</code> and between
     * <code>a - f</code> and <code>A - F</code>. This will return
     * <code>true</code> if the character is in this range.
     * 
     * @param ch
     *            this is the character which is to be determined here
     * 
     * @return true if the character given has a hexadecimal value
     */
    private boolean hex(char ch) {
        boolean digit = ('0' <= ch) && (ch <= '9');
        boolean lower = ('a' <= ch) && (ch <= 'f');
        boolean upper = ('A' <= ch) && (ch <= 'F');

        return digit || lower || upper;
    }

    /**
     * This is a character set defined by RFC 2396 it is used to determine the
     * validity of certain <code>chars</code> within a Uniform Resource
     * Identifier. RFC 2396 defines an unreserved char as
     * <code>alphanum | mark</code>.
     * 
     * @param c
     *            the character value that is being checked
     * 
     * @return true if the character has an unreserved value
     */
    private boolean unreserved(char c) {
        // Mark characters are tested first, then letters and digits.
        if (this.mark(c)) {
            return true;
        }
        return this.alphanum(c);
    }

    /**
     * This is used to determine whether or not a given unicode character is an
     * alphabetic character or a digit character. That is within the range
     * <code>0 - 9</code> and between <code>a - z</code> or
     * <code>A - Z</code>; it uses <code>iso-8859-1</code> to compare the
     * character.
     * 
     * @param c
     *            the character value that is being checked
     * 
     * @return true if the character has an alphanumeric value
     */
    private boolean alphanum(char c) {
        // Digits are tested first, then alphabetic characters.
        if (this.digit(c)) {
            return true;
        }
        return this.alpha(c);
    }

    /**
     * This is used to determine whether or not a given unicode character is an
     * alphabetic character. This uses encoding <code>iso-8859-1</code> to
     * compare the characters.
     * 
     * @param c
     *            the character value that is being checked
     * 
     * @return true if the character has an alphabetic value
     */
    private boolean alpha(char c) {
        boolean lower = ('a' <= c) && (c <= 'z');
        boolean upper = ('A' <= c) && (c <= 'Z');

        return lower || upper;
    }

    /**
     * This is a character set defined by RFC 2396 it checks the validity of
     * certain chars within a uniform resource identifier. The RFC 2396 defines a
     * mark char as <code>"-",
     * "_", ".", "!", "~", "*", "'", "(", ")"</code>.
     * 
     * @param c
     *            the character value that is being checked
     * 
     * @return true if the character is a mark character
     */
    private boolean mark(char c) {
        // The nine mark characters; membership is a simple lookup.
        return "-_.!~*'()".indexOf(c) >= 0;
    }

    /**
     * This is a character set defined by RFC 2396 it is used to check the
     * validity of certain chars within a generic uniform resource identifier.
     * The RFC 2396 defines a pchar char as unreserved or escaped or one of the
     * following characters <code>":", "@", "=",
     * "&", "+", "$", ","</code> this will not check to see if the char is
     * an escaped char, that is <code>% HEX HEX</code>, because this takes 3
     * chars.
     * 
     * @param c
     *            the character value that is being checked
     * 
     * @return true if the character is a pchar character
     */
    private boolean pchar(char c) {
        // The seven extra pchar characters are accepted outright; anything
        // else must be an unreserved character.
        if ("@&=+$,:".indexOf(c) >= 0) {
            return true;
        }
        return this.unreserved(c);
    }

    /**
     * This is used to convert this URI object into a <code>String</code>
     * object. This will only convert the parts of the URI that exist, so the
     * URI may not contain the domain or the query part. Any path parameters
     * that are present are appended directly after the path. If the URI
     * contains all these parts then it will return something like
     * 
     * <pre>
     * scheme://host:port/path/path?querypart
     * </pre>
     * <p>
     * It can return <code>/path/path?querypart</code> style relative URI's. If
     * any of the parts are set to null then that part will be missing, for
     * example if <code>setDomain</code> method is invoked with a null parameter
     * then the domain and port will be missing from the resulting URI. If the
     * path part is set to null using the <code>setPath</code> then the path
     * will be <code>/</code>. An example URI with the path part of null would
     * be
     * 
     * <pre>
     * scheme://host:port/?querypart
     * </pre>
     * 
     * @return the URI with only the path part and the non-null optional parts
     *         of the uniform resource identifier
     */
    @Override
    public String toString() {
        // The scheme prefix is written only when a scheme is present.
        String text = "";

        if (this.scheme.length() > 0) {
            text = this.scheme + "://";
        }
        // The domain, plus its port when positive, is written only when a
        // domain is present.
        if (this.domain.length() > 0) {
            text += this.domain;

            if (this.port > 0) {
                text += ":" + this.port;
            }
        }
        // The path is always written.
        text += this.getPath();

        // Path parameters follow the path when there are any.
        if (this.param.size() > 0) {
            text += this.param;
        }
        // The query is written with its leading question mark when present.
        if (this.query.length() > 0) {
            text += "?" + this.query;
        }
        return text;
    }

    /**
     * The <code>ParameterMap</code> is used to store the parameters that are to
     * be encoded in to the address. This will append all of the parameters to
     * the end of the path. These can later be extracted by parsing the address.
     * 
     * @author Niall Gallagher
     */
    private class ParameterMap extends KeyMap<String> {

        /**
         * This will return the parameters encoded in such a way that it can be
         * appended to the end of the path. These parameters can be added to the
         * address such that they do not form a query parameter. Values such as
         * session identifiers are often added as the path parameters to the
         * address.
         * 
         * @return this returns the representation of the parameters
         */
        private String encode() {
            StringBuilder text = new StringBuilder();

            for (String name : AddressParser.this.param) {
                String value = AddressParser.this.param.get(name);

                text.append(";");
                text.append(name);

                if (value != null) {
                    text.append("=");
                    text.append(value);
                    ;
                }
            }
            return text.toString();
        }

        /**
         * This will return the parameters encoded in such a way that it can be
         * appended to the end of the path. These parameters can be added to the
         * address such that they do not form a query parameter. Values such as
         * session identifiers are often added as the path parameters to the
         * address.
         * 
         * @return this returns the representation of the parameters
         */
        @Override
        public String toString() {
            return this.encode();
        }
    }

    /**
     * This is used as an alternative to the <code>ParseBuffer</code> for
     * extracting tokens from the URI without allocating memory. This will
     * basically mark out regions within the buffer which are used to represent
     * the token. When the token value is required the region is used to create
     * a <code>String</code> object.
     */
    private class Token {

        /**
         * An explicit value for this token. When it is not null it takes
         * precedence over the region marked out in the buffer.
         */
        public String value;

        /**
         * The offset within the parser buffer at which the region starts.
         */
        public int off;

        /**
         * The number of characters in the region.
         */
        public int len;

        /**
         * This resets the token so that it can be reused for a new URI. The
         * explicit value is dropped and the region length is set to zero, so
         * the string form of the token becomes null. The offset is left as
         * it is.
         */
        public void clear() {
            this.len = 0;
            this.value = null;
        }

        /**
         * This provides the number of characters in the token. The length of
         * the explicit value is used when one is set, otherwise the length of
         * the buffer region is used.
         * 
         * @return this returns the number of characters this uses
         */
        public int length() {
            return (this.value != null) ? this.value.length() : this.len;
        }

        /**
         * This provides the token as a <code>String</code>. If no value is
         * set and the region is not empty, the region is copied out of the
         * parser buffer and kept as the value, so later calls return the same
         * string. A token with no value and an empty region gives null.
         * 
         * @return this returns a value representing the token
         */
        @Override
        public String toString() {
            if ((this.value == null) && (this.len > 0)) {
                this.value = new String(AddressParser.this.buf, this.off,
                        this.len);
            }
            return this.value;
        }
    }
}