/* ************************************************************************ # # DivConq # # http://divconq.com/ # # Copyright: # Copyright 2014 eTimeline, LLC. All rights reserved. # # License: # See the license.txt file in the project's top-level directory for details. # # Authors: # * Andy White # ************************************************************************ */ /* * AddressParser.java February 2001 * * Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package divconq.www.http.parse; import divconq.www.http.Address; import divconq.www.http.Path; import divconq.www.http.Query; import divconq.www.util.KeyMap; import divconq.www.util.parse.Parser; /** * This parser is used to parse uniform resource identifiers. * The uniform resource identifier syntax is given in RFC 2396. * This parser can parse relative and absolute URI's. The * uniform resource identifier syntax that this parser will * parse are based on the generic web based URL similar to * the syntax represented in RFC 2616 section 3.2.2. The syntax * used to parse this URI is a modified version of RFC 2396 * <pre> * * URI = (absoluteURI | relativeURI) * absoluteURI = scheme ":" ("//" netpath | relativeURI) * relativeURI = path ["?" 
querypart]
 * netpath = domain [":" port] relativeURI
 * path = *("/" segment)
 * segment = *pchar *( ";" param )
 *
 * </pre>
 * This implements the <code>Address</code> interface and provides
 * methods that access the various parts of the URI. The parameters
 * in the path segments of the uniform resource identifier are
 * stored in name value pairs. If parameter names are not unique
 * across the path segments then only the deepest parameter will be
 * stored from the path segment. For example if the URI represented
 * was <code>http://domain/path1;x=y/path2;x=z</code> the value for
 * the parameter named <code>x</code> would be <code>z</code>.
 * <p>
 * This will normalize the path part of the uniform resource
 * identifier. A normalized path is one that contains no back
 * references like "./" and "../". The normalized path will not
 * contain the path parameters.
 * <p>
 * The <code>setPath</code> method is used to reset the path this
 * uniform resource identifier has, it also resets the parameters.
 * The parameters are extracted from the new path given.
 *
 * @author Niall Gallagher
 */
public class AddressParser extends Parser implements Address {

    /**
     * Parameters are stored so that they can be viewed. This is the
     * map handed out by <code>getParameters</code>.
     */
    private ParameterMap param;

    /**
     * This is the path used to represent the address path. It is
     * created on demand by <code>getPath</code> and is discarded
     * whenever the path is parsed again.
     */
    private Path normal;

    /**
     * This contains the query parameters for the address. It is
     * created on demand by <code>getQuery</code> and is discarded
     * when the query text is replaced.
     */
    private Query data;

    /**
     * Used to track the characters that form the path.
     */
    private Token path;

    /**
     * Used to track the characters that form the domain.
     */
    private Token domain;

    /**
     * Used to track the characters that form the query.
     */
    private Token query;

    /**
     * Used to track the name characters of a parameter.
     */
    private Token name;

    /**
     * Used to track the value characters of a parameter.
     */
    private Token value;

    /**
     * References the scheme that this URI contains.
     */
    private Token scheme;

    /**
     * Contains the port number if it was specified.
     * A value of zero or less is reported as <code>-1</code> by
     * the <code>getPort</code> method.
*/ private int port; /** * Default constructor will create a <code>AddressParser</code> * that contains no specifics. The instance will return * <code>null</code> for all the get methods. The parsers * get methods are populated by using the <code>parse</code> * method. */ public AddressParser(){ this.param = new ParameterMap(); this.path = new Token(); this.domain = new Token(); this.query = new Token(); this.scheme = new Token(); this.name = new Token(); this.value = new Token(); } /** * This is primarily a convenience constructor. This will parse * the <code>String</code> given to extract the specifics. This * could be achieved by calling the default no-arg constructor * and then using the instance to invoke the <code>parse</code> * method on that <code>String</code> to extract the parts. * * @param text a <code>String</code> containing a URI value */ public AddressParser(String text){ this(); parse(text); } /** * This allows the scheme of the URL given to be returned. * If the URI does not contain a scheme then this will * return null. The scheme of the URI is the part that * specifies the type of protocol that the URI is used * for, an example <code>gopher://domain/path</code> is * a URI that is intended for the gopher protocol. The * scheme is the string <code>gopher</code>. * * @return this returns the scheme tag for the URI if * there is one specified for it */ public String getScheme(){ return scheme.toString(); } /** * This is used to retrieve the domain of this URI. The * domain part in the URI is an optional part, an example * <code>http://domain/path?querypart</code>. This will * return the value of the domain part. If there is no * domain part then this will return null otherwise the * domain value found in the uniform resource identifier. * * @return the domain part of this uniform resource * identifier this represents */ public String getDomain(){ return domain.toString(); } /** * This is used to retrieve the path of this URI. 
The path part * is the most fundamental part of the URI. This will return * the value of the path. If there is no path part then this * will return <code>/</code> to indicate the root. * <p> * The <code>Path</code> object returned by this will contain * no path parameters. The path parameters are available using * the <code>Address</code> methods. The reason that this does not * contain any of the path parameters is so that if the path is * needed to be converted into an OS specific path then the path * parameters will not need to be separately parsed out. * * @return the path that this URI contains, this value will not * contain any back references such as "./" and "../" or any * path parameters */ public Path getPath(){ if(normal == null) { String text = path.toString(); if(text == null) { normal = new PathParser("/"); } if(normal == null){ normal = new PathParser(text); } } return normal; } /** * This is used to retrieve the query of this URI. The query part * in the URI is an optional part. This will return the value * of the query part. If there is no query part then this will * return an empty <code>Query</code> object. The query is * an optional member of a URI and comes after the path part, it * is preceded by a question mark, <code>?</code> character. * For example the following URI contains <code>query</code> for * its query part, <code>http://host:port/path?query</code>. * <p> * This returns a <code>org.simpleframework.http.Query</code> * object that can be used to interact directly with the query * values. The <code>Query</code> object is a read-only interface * to the query parameters, and so will not affect the URI. * * @return a <code>Query</code> object for the query part */ public Query getQuery(){ if(data == null) { String text = query.toString(); if(text == null) { data = new QueryParser(); } if(data == null){ data = new QueryParser(text); } } return data; } /** * This is used to retrieve the port of the uniform resource * identifier. 
The port part in this is an optional part, an * example <code>http://host:port/path?querypart</code>. This * will return the value of the port. If there is no port then * this will return <code>-1</code> because this represents * an impossible uniform resource identifier port. The port * is an optional part. * * @return this returns the port of the uniform resource * identifier */ public int getPort(){ return port <= 0? -1 : port; } /** * This extracts the parameter values from the uniform resource * identifier represented by this object. The parameters that a * uniform resource identifier contains are embedded in the path * part of the URI. If the path contains no parameters then this * will return an empty <code>Map</code> instance. * <p> * This will produce unique name and value parameters. Thus if the * URI contains several path segments with similar parameter names * this will return the deepest parameter. For example if the URI * represented was <code>http://domain/path1;x=y/path2;x=z</code> * the value for the parameter named <code>x</code> would be * <code>z</code>. * * @return this will return the parameter names found in the URI */ public KeyMap<String> getParameters(){ return param; } /** * This allows the scheme for the URI to be specified. * If the URI does not contain a scheme then this will * attach the scheme and the <code>://</code> identifier * to ensure that the <code>Address.toString</code> will * produce the correct syntax. * <p> * Caution must be taken to ensure that the port and * the scheme are consistent. So if the original URI * was <code>http://domain:80/path</code> and the scheme * was changed to <code>ftp</code> the port number that * remains is the standard HTTP port not the FTP port. * * @param value this specifies the protocol this URI * is intended for */ public void setScheme(String value){ scheme.value = value; } /** * This will set the domain to whatever value is in the * string parameter. 
If the string is null then this URI * objects <code>toString</code> method will not contain * the domain. The result of the <code>toString</code> * method will be <code>/path/path?query</code>. If the * path is non-null this URI will contain the path. * * @param value this will be the new domain of this * uniform resource identifier, if it is not null */ public void setDomain(String value){ domain.value = value; } /** * This will set the port to whatever value it is given. If * the value is 0 or less then the <code>toString</code> will * will not contain the optional port. If port number is above * 0 then the <code>toString</code> method will produce a URI * like <code>http://host:123/path</code> but only if there is * a valid domain. * * @param port the port value that this URI is to have */ public void setPort(int port) { this.port = port; } /** * This will set the path to whatever value it is given. If the * value is null then this <code>Address.toString</code> method will * not contain the path, that is if path is null then it will be * interpreted as <code>/</code>. * <p> * This will reset the parameters this URI has. If the value * given to this method has embedded parameters these will form * the parameters of this URI. The value given may not be the * same value that the <code>getPath</code> produces. The path * will have all back references and parameters stripped. * * @param text the path that this URI is to be set with */ public void setPath(String text) { if(!text.startsWith("/")){ text = "/" + text; } domain.toString(); query.toString(); scheme.toString(); param.clear(); path.clear(); parsePath(text); /*extract params*/ } /** * This will set the path to whatever value it is given. If the * value is null then this <code>Address.toString</code> method * will not contain the path, that is if path is null then it will * be interpreted as <code>/</code>. * <p> * This will reset the parameters this URI has. 
If the value * given to this method has embedded parameters these will form * the parameters of this URI. The value given may not be the * same value that the <code>getPath</code> produces. The path * will have all back references and parameters stripped. * * @param path the path that this URI is to be set with */ public void setPath(Path path) { if(path != null){ normal = path; }else { setPath("/"); } } /** * This is used to parse the path given with the <code>setPath</code> * method. The path contains name and value pairs. These parameters * are embedded into the path segments using a semicolon character, * ';'. Since the parameters to not form part of the actual path * mapping they are removed from the path and stored. Each parameter * can then be extracted from this parser using the methods provided * by the <code>Address</code> interface. * * @param path this is the path that is to be parsed and have the * parameter values extracted */ private void parsePath(String path){ count = path.length(); ensureCapacity(count); path.getChars(0, count, buf, 0); normal = null; off = 0; path(); } /** * This will set the query to whatever value it is given. If the * value is null then this <code>Address.toString</code> method * will not contain the query. If the query was <code>abc</code> * then the <code>toString</code> method would produce a string * like <code>http://host:port/path?abc</code>. If the query is * null this URI would have no query part. The query must not * contain the <code>?</code> character. * * @param value the query that this uniform resource identifier * is to be set to if it is non-null */ public void setQuery(String value) { query.value = value; data = null; } /** * This will set the query to whatever value it is given. If the * value is null then this <code>Address.toString</code> method * will not contain the query. If the <code>Query.toString</code> * returns null then the query will be empty. 
This is basically * the <code>setQuery(String)</code> method with the string value * from the issued <code>Query.toString</code> method. * * @param query a <code>Query</code> object that contains * the name value parameters for the query */ public void setQuery(Query query) { if(value != null) { data = query; }else { setQuery(""); } } /** * This will check to see what type of URI this is if it is an * <code>absoluteURI</code> or a <code>relativeURI</code>. To * see the definition of a URI see RFC 2616 for the definition * of a URL and for more specifics see RFC 2396 for the * expressions. */ protected void parse(){ if(count > 0){ if(buf[0] == '/'){ relativeURI(); }else{ absoluteURI(); } } } /** * This will empty each tokens cache. A tokens cache is used * to represent a token once the token's <code>toString</code> * method has been called. Thus when the <code>toString</code> * method is called then the token depends on the value of the * cache alone in further calls to <code>toString</code>. * However if a URI has just been parsed and that method has * not been invoked then the cache is created from the buf if * its length is greater than zero. */ protected void init(){ param.clear(); domain.clear(); path.clear(); query.clear(); scheme.clear(); off =port = 0; normal = null; data = null; } /** * This is a specific definition of a type of URI. An absolute * URI is a URI that contains a host and port. It is the most * frequently used type of URI. This will define the host and * the optional port part. As well as the relative URI part. * This uses a simpler syntax than the one specified in RFC 2396 * <code><pre> * * absoluteURI = scheme ":" ("//" netpath | relativeURI) * relativeURI = path ["?" querypart] * netpath = domain [":" port] relativeURI * path = *("/" segment) * segment = *pchar *( ";" param ) * * </pre></code> * This syntax is sufficient to handle HTTP style URI's as well * as GOPHER and FTP and various other 'simple' schemes. 
See * RFC 2396 for the syntax of an <code>absoluteURI</code>. */ private void absoluteURI(){ scheme(); netPath(); } /** * This will check to see if there is a scheme in the URI. If * there is a scheme found in the URI this returns true and * removes that scheme tag of the form "ftp:" or "http:" * or whatever the protocol scheme tag may be for the URI. * <p> * The syntax for the scheme is given in RFC 2396 as follows * <code><pre> * * scheme = alpha *( alpha | digit | "+" | "-" | "." ) * * </pre></code> * This will however also skips the "://" from the tag * so of the URI was <code>gopher://domain/path</code> then * the URI would be <code>domain/path</code> afterwards. */ private void scheme(){ int mark = off; int pos = off; if(alpha(buf[off])){ while(off < count){ char next = buf[off++]; if(schemeChar(next)){ pos++; }else if(next == ':'){ if(!skip("//")) { off = mark; pos = mark; } break; }else{ off = mark; pos = mark; break; } } scheme.len = pos - mark; scheme.off = mark; } } /** * This method is used to assist the scheme method. This will * check to see if the type of the character is the same as * those described in RFC 2396 for a scheme character. The * scheme tag can contain an alphanumeric of the following * <code>"+", "-", "."</code>. * * @param c this is the character that is being checked * * @return this returns true if the character is a valid * scheme character */ private boolean schemeChar(char c){ switch(c){ case '+': case '-': case '.': return true; default: return alphanum(c); } } /** * The network path is the path that contains the network * address of the host that this URI is targeted at. This * will parse the domain name of the host and also a port * number before parsing a relativeURI * <code><pre> * * netpath = domain [":" port] relativeURI * * </pre></code> * This syntax is modified from the URI specification on * RFC 2396. 
*/ private void netPath(){ domain(); if(skip(":")){ port(); } relativeURI(); } /** * This is a specific definition of a type of URI. A relative * URI is a URI that contains no host or port. It is basically * the resource within the host. This will extract the path and * the optional query part of the URI. Rfc2396 has the proper * definition of a <code>relativeURI</code>. */ private void relativeURI(){ path(); if(skip("?")){ query(); } } /** * This is used to extract the optional port from a given URI. * This will read a sequence of digit characters and convert * the <code>String</code> of digit characters into a decimal * number. The digits will be added to the port variable. If * there is no port number this will not update the read offset. */ private void port() { while(off < count){ if(!digit(buf[off])){ break; } port *= 10; port += buf[off]; port -= '0'; off++; } } /** * This is used to extract the domain from the given URI. This * will firstly initialize the token object that represents the * domain. This allows the token's <code>toString</code> method to * return the extracted value of the token rather than getting * confused with previous values set by a previous parse method. * <p> * This uses the following delimiters to determine the end of the * domain <code>?</code>,<code>:</code> and <code>/<code>. This * ensures that the read offset does not go out of bounds and * consequently throw an <code>IndexOutOfBoundsException</code>. */ private void domain(){ int mark = off; loop: while(off < count){ switch(buf[off]){ case '/': case ':': case '?': break loop; default: off++; } } domain.len = off - mark; domain.off = mark; } /** * This is used to extract the segments from the given URI. This * will firstly initialize the token object that represents the * path. This allows the token's <code>toString</code> method to * return the extracted value of the token rather than getting * confused with previous values set by a previous parse method. 
* <p> * This is slightly different from RFC 2396 in that it defines a * pchar as the RFC 2396 definition of a pchar without the escaped * chars. So this method has to ensure that no escaped chars go * unchecked. This ensures that the read offset does not go out * of bounds and throw an <code>IndexOutOfBoundsException</code>. */ private void path(){ int mark = off; int pos = off; while(skip("/")) { buf[pos++] = '/'; while(off < count){ if(buf[off]==';'){ while(skip(";")){ param(); insert(); } break; } if(buf[off]=='%'){ escape(); }else if(!pchar(buf[off])){ break; } buf[pos++]=buf[off++]; } } path.len = pos -mark; path.off = mark; } /** * This is used to extract the query from the given URI. This * will firstly initialize the token object that represents the * query. This allows the token's <code>toString</code> method * to return the extracted value of the token rather than getting * confused with previous values set by a previous parse method. * The calculation of the query part of a URI is basically the * end of the URI. */ private void query() { query.len = count - off; query.off = off; } /** * This is an expression that is defined by RFC 2396 it is used * in the definition of a segment expression. This is basically * a list of pchars. * <p> * This method has to ensure that no escaped chars go unchecked. * This ensures that the read offset does not goe out of bounds * and consequently throw an out of bounds exception. */ private void param() { name(); if(skip("=")){ /* in case of error*/ value(); } } /** * This extracts the name of the parameter from the character * buffer. The name of a parameter is defined as a set of * pchars including escape sequences. This will extract the * parameter name and buffer the chars. The name ends when a * equals character, "=", is encountered or in the case of a * malformed parameter when the next character is not a pchar. 
*/ private void name(){ int mark = off; int pos = off; while(off < count){ if(buf[off]=='%'){ /* escaped */ escape(); }else if(buf[off]=='=') { break; }else if(!pchar(buf[off])){ break; } buf[pos++] = buf[off++]; } name.len = pos - mark; name.off = mark; } /** * This extracts a parameter value from a path segment. The * parameter value consists of a sequence of pchars and some * escape sequences. The parameter value is buffered so that * the name and values can be paired. The end of the value * is determined as the end of the buffer or the last pchar. */ private void value(){ int mark = off; int pos = off; while(off < count){ if(buf[off]=='%'){ /* escaped */ escape(); }else if(!pchar(buf[off])) { break; } buf[pos++] = buf[off++]; } value.len = pos - mark; value.off = mark; } /** * This method adds the name and value to a map so that the next * name and value can be collected. The name and value are added * to the map as string objects. Once added to the map the * <code>Token</code> objects are set to have zero length so they * can be reused to collect further values. This will add the * values to the map as an array of type string. This is done so * that if there are multiple values that they can be stored. */ private void insert(){ if(value.length() > 0){ if(name.length() > 0) insert(name,value); } name.clear(); value.clear(); } /** * This will add the given name and value to the parameters map. * This will only store a single value per parameter name, so * only the parameter that was latest encountered will be saved. * The <code>getQuery</code> method can be used to collect * the parameter values using the parameter name. * * @param name this is the name of the value to be inserted * @param value this is the value of a that is to be inserted */ private void insert(Token name, Token value){ insert(name.toString(), value.toString()); } /** * This will add the given name and value to the parameters map. 
* This will only store a single value per parameter name, so * only the parameter that was latest encountered will be saved. * The <code>getQuery</code> method can be used to collect * the parameter values using the parameter name. * * @param name this is the name of the value to be inserted * @param value this is the value of a that is to be inserted */ private void insert(String name, String value) { param.put(name, value); } /** * This converts an encountered escaped sequence, that is all * embedded hexidecimal characters into a native UCS character * value. This does not take any characters from the stream it * just prepares the buffer with the correct byte. The escaped * sequence within the URI will be interpreded as UTF-8. * <p> * This will leave the next character to read from the buffer * as the character encoded from the URI. If there is a fully * valid escaped sequence, that is <code>"%" HEX HEX</code>. * This decodes the escaped sequence using UTF-8 encoding, all * encoded sequences should be in UCS-2 to fit in a Java char. */ private void escape() { int peek = peek(off); if(!unicode(peek)) { binary(peek); } } /** * This method determines, using a peek character, whether the * sequence of escaped characters within the URI is binary data. * If the data within the escaped sequence is binary then this * will ensure that the next character read from the URI is the * binary octet. This is used strictly for backward compatible * parsing of URI strings, binary data should never appear. * * @param peek this is the first escaped character from the URI * * @return currently this implementation always returns true */ private boolean binary(int peek) { if(off + 2 < count) { off += 2; buf[off]= bits(peek); } return true; } /** * This method determines, using a peek character, whether the * sequence of escaped characters within the URI is in UTF-8. If * a UTF-8 character can be successfully decoded from the URI it * will be the next character read from the buffer. 
This can * check for both UCS-2 and UCS-4 characters. However, because * the Java <code>char</code> can only hold UCS-2, the UCS-4 * characters will have only the low order octets stored. * <p> * The WWW Consortium provides a reference implementation of a * UTF-8 decoding for Java, in this the low order octets in the * UCS-4 sequence are used for the character. So, in the * absence of a defined behaviour, the W3C behaviour is assumed. * * @param peek this is the first escaped character from the URI * * @return this returns true if a UTF-8 character is decoded */ private boolean unicode(int peek) { if((peek & 0x80) == 0x00){ return unicode(peek, 0); } if((peek & 0xe0) == 0xc0){ return unicode(peek & 0x1f, 1); } if((peek & 0xf0) == 0xe0){ return unicode(peek & 0x0f, 2); } if((peek & 0xf8) == 0xf0){ return unicode(peek & 0x07, 3); } if((peek & 0xfc) == 0xf8){ return unicode(peek & 0x03, 4); } if((peek & 0xfe) == 0xfc){ return unicode(peek & 0x01, 5); } return false; } /** * This method will decode the specified amount of escaped * characters from the URI and convert them into a single Java * UCS-2 character. If there are not enough characters within * the URI then this will return false and leave the URI alone. * <p> * The number of characters left is determined from the first * UTF-8 octet, as specified in RFC 2279, and because this is * a URI there must that number of <code>"%" HEX HEX</code> * sequences left. If successful the next character read is * the UTF-8 sequence decoded into a native UCS-2 character. * * @param peek contains the bits read from the first UTF octet * @param more this specifies the number of UTF octets left * * @return this returns true if a UTF-8 character is decoded */ private boolean unicode(int peek, int more) { if(off + more * 3 >= count) { return false; } return unicode(peek,more,off); } /** * This will decode the specified amount of trailing UTF-8 bits * from the URI. 
The trailing bits are those following the first * UTF-8 octet, which specifies the length, in octets, of the * sequence. The trailing octets are if the form 10xxxxxx, for * each of these octets only the last six bits are valid UCS * bits. So a conversion is basically an accumulation of these. * <p> * If at any point during the accumulation of the UTF-8 bits * there is a parsing error, then parsing is aborted an false * is returned, as a result the URI is left unchanged. * * @param peek bytes that have been accumulated from the URI * @param more this specifies the number of UTF octets left * @param pos this specifies the position the parsing begins * * @return this returns true if a UTF-8 character is decoded */ private boolean unicode(int peek, int more, int pos) { while(more-- > 0) { if(buf[pos] == '%'){ int next = pos + 3; int hex = peek(next); if((hex & 0xc0) == 0x80){ peek = (peek<<6)|(hex&0x3f); pos = next; continue; } } return false; } if(pos + 2 < count) { off = pos + 2; buf[off]= bits(peek); } return true; } /** * Defines behaviour for UCS-2 versus UCS-4 conversion from four * octets. The UTF-8 encoding scheme enables UCS-4 characters to * be encoded and decodeded. However, Java supports the 16-bit * UCS-2 character set, and so the 32-bit UCS-4 character set is * not compatable. This basically decides what to do with UCS-4. * * @param data up to four octets to be converted to UCS-2 format * * @return this returns a native UCS-2 character from the int */ private char bits(int data) { return (char)data; } /** * This will return the escape expression specified from the URI * as an integer value of the hexidecimal sequence. This does * not make any changes to the buffer it simply checks to see if * the characters at the position specified are an escaped set * characters of the form <code>"%" HEX HEX</code>, if so, then * it will convert that hexidecimal string in to an integer * value, or -1 if the expression is not hexidecimal. 
* * @param pos this is the position the expression starts from * * @return the integer value of the hexidecimal expression */ private int peek(int pos) { if(buf[pos] == '%'){ if(count <= pos + 2) { return -1; } char high = buf[pos + 1]; char low = buf[pos + 2]; return convert(high, low); } return -1; } /** * This will convert the two hexidecimal characters to a real * integer value, which is returned. This requires characters * within the range of 'A' to 'F' and 'a' to 'f', and also * the digits '0' to '9'. The characters encoded using the * ISO-8859-1 encoding scheme, if the characters are not with * in the range specified then this returns -1. * * @param high this is the high four bits within the integer * @param low this is the low four bits within the integer * * @return this returns the indeger value of the conversion */ private int convert(char high, char low) { int hex = 0x00; if(hex(high) && hex(low)){ if('A' <= high && high <= 'F'){ high -= 'A' - 'a'; } if(high >= 'a') { hex ^= (high-'a')+10; } else { hex ^= high -'0'; } hex <<= 4; if('A' <= low && low <= 'F') { low -= 'A' - 'a'; } if(low >= 'a') { hex ^= (low-'a')+10; } else { hex ^= low-'0'; } return hex; } return -1; } /** * This is used to determine wheather a char is a hexidecimal * <code>char</code> or not. A hexidecimal character is consdered * to be a character within the range of <code>0 - 9</code> and * between <code>a - f</code> and <code>A - F</code>. This will * return <code>true</code> if the character is in this range. 
* * @param ch this is the character which is to be determined here * * @return true if the character given has a hexidecimal value */ private boolean hex(char ch) { if(ch >= '0' && ch <= '9') { return true; } else if(ch >='a' && ch <= 'f') { return true; } else if(ch >= 'A' && ch <= 'F') { return true; } return false; } /** * This is a character set defined by RFC 2396 it is used to * determine the valididity of certain <code>chars</code> * within a Uniform Resource Identifier. RFC 2396 defines * an unreserved char as <code>alphanum | mark</code>. * * @param c the character value that is being checked * * @return true if the character has an unreserved value */ private boolean unreserved(char c){ return mark(c) || alphanum(c); } /** * This is used to determine wheather or not a given unicode * character is an alphabetic character or a digit character. * That is withing the range <code>0 - 9</code> and between * <code>a - z</code> it uses <code>iso-8859-1</code> to * compare the character. * * @param c the character value that is being checked * * @return true if the character has an alphanumeric value */ private boolean alphanum(char c){ return digit(c) || alpha(c); } /** * This is used to determine wheather or not a given unicode * character is an alphabetic character. This uses encoding * <code>iso-8859-1</code> to compare the characters. * * @param c the character value that is being checked * * @return true if the character has an alphabetic value */ private boolean alpha(char c){ return (c <= 'z' && 'a' <= c) || (c <= 'Z' && 'A' <= c); } /** * This is a character set defined by RFC 2396 it checks * the valididity of cetain chars within a uniform resource * identifier. The RFC 2396 defines a mark char as <code>"-", * "_", ".", "!", "~", "*", "'", "(", ")"</code>. 
* * @param c the character value that is being checked * * @return true if the character is a mark character */ private boolean mark(char c){ switch(c){ case '-': case '_': case '.': case '!': case '~': case '*': case '\'': case '(': case ')': return true; default: return false; } } /** * This is a character set defined by RFC 2396 it is used to check * the valididity of cetain chars within a generic uniform resource * identifier. The RFC 2396 defines a pchar char as unreserved or * escaped or one of the following characters <code>":", "@", "=", * "&", "+", "$", ","</code> this will not check to see if the * char is an escaped char, that is <code>% HEX HEX</code>. Because * this takes 3 chars. * * @param c the character value that is being checked * * @return true if the character is a pchar character */ private boolean pchar(char c){ switch(c){ case '@': case '&': case '=': case '+': case '$': case ',': case ':': return true; default: return unreserved(c); } } /** * This is a character set defined by RFC 2396, it checks the * valididity of certain chars in a uniform resource identifier. * The RFC 2396 defines a reserved char as <code>";", "/", "?", * ":", "@", "&", "=", "+", "$", ","</code>. * * @param c the character value that is being checked * * @return true if the character is a reserved character */ @SuppressWarnings("unused") private boolean reserved(char c){ switch(c){ case ';': case '/': case '?': case '@': case '&': case ':': case '=': case '+': case '$': case ',': return true; default: return false; } } /** * This is used to convert this URI object into a <code>String</code> * object. This will only convert the parts of the URI that exist, so * the URI may not contain the domain or the query part and it will * not contain the path parameters. If the URI contains all these * parts then it will return somthing like * <pre> * scheme://host:port/path/path?querypart * </pre> * <p> * It can return <code>/path/path?querypart</code> style relative * URI's. 
If any of the parts are set to null then that part will be * missing, for example if <code>setDomain</code> method is invoked * with a null parameter then the domain and port will be missing * from the resulting URI. If the path part is set to null using the * <code>setPath</code> then the path will be <code>/</code>. An * example URI with the path part of null would be * <pre> * scheme://host:port/?querypart * </pre> * * @return the URI with only the path part and the non-null optional * parts of the uniform resource identifier */ public String toString() { return (scheme.length() > 0 ? scheme +"://": "") + (domain.length() > 0 ? domain + (port > 0 ? ":"+port : "") : "")+ getPath() + (param.size() > 0 ? param : "")+ (query.length()>0?"?"+query :""); } /** * The <code>ParameterMap</code> is uses to store the parameters * that are to be encoded in to the address. This will append all * of the parameters to the end of the path. These can later be * extracted by parsing the address. * * @author Niall Gallagher */ private class ParameterMap extends KeyMap<String> { /** * */ private static final long serialVersionUID = -7391825649971667162L; /** * This will return the parameters encoded in such a way that * it can be appended to the end of the path. These parameters * can be added to the address such that they do not form a * query parameter. Values such as session identifiers are * often added as the path parameters to the address. * * @return this returns the representation of the parameters */ private String encode() { StringBuilder text = new StringBuilder(); for(String name : param) { String value = param.get(name); text.append(";"); text.append(name); if(value != null) { text.append("="); text.append(value);; } } return text.toString(); } /** * This will return the parameters encoded in such a way that * it can be appended to the end of the path. These parameters * can be added to the address such that they do not form a * query parameter. 
Values such as session identifiers are
       * often added as the path parameters to the address.
       *
       * @return this returns the representation of the parameters
       */
      @Override
      public String toString() {
         return encode();
      }
   }

   /**
    * This is used as an alternative to the <code>ParseBuffer</code>
    * for extracting tokens from the URI without allocating memory.
    * A token marks out a region of the buffer by offset and length,
    * and the region is only turned in to a <code>String</code> when
    * the token value is actually required.
    */
   private class Token {

      /**
       * This can be used to override the value for this token.
       */
      public String value;

      /**
       * This represents the start offset within the buffer.
       */
      public int off;

      /**
       * This represents the number of characters in the token.
       */
      public int len;

      /**
       * Resets the token so that it can be reused for a new URI.
       * The override value is discarded and the length is set to
       * zero, after which the string form of the token is null.
       */
      public void clear() {
         len = 0;
         value = null;
      }

      /**
       * Provides the number of characters this token contains.
       * When a value has been set the length of that value is
       * used, otherwise the length of the marked region is used.
       *
       * @return this returns the number of characters this uses
       */
      public int length() {
         return (value != null) ? value.length() : len;
      }

      /**
       * Converts the <code>Token</code> into its <code>String</code>
       * equivalent. If no value is present and the region is not
       * empty, the region is copied out of the buffer and kept as
       * the value. The value is then returned, which is null when
       * there is neither a value nor any characters in the region.
       *
       * @return this returns a value representing the token
       */
      @Override
      public String toString() {
         if(value == null && len > 0) {
            value = new String(buf, off, len);
         }
         return value;
      }
   }
}