/* ************************************************************************
#
# DivConq
#
# http://divconq.com/
#
# Copyright:
# Copyright 2014 eTimeline, LLC. All rights reserved.
#
# License:
# See the license.txt file in the project's top-level directory for details.
#
# Authors:
# * Andy White
#
************************************************************************ */
/*
* AddressParser.java February 2001
*
* Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package divconq.www.http.parse;
import divconq.www.http.Address;
import divconq.www.http.Path;
import divconq.www.http.Query;
import divconq.www.util.KeyMap;
import divconq.www.util.parse.Parser;
/**
* This parser is used to parse uniform resource identifiers.
* The uniform resource identifier syntax is given in RFC 2396.
* This parser can parse relative and absolute URI's. The
* uniform resource identifier syntax that this parser will
* parse are based on the generic web based URL similar to
* the syntax represented in RFC 2616 section 3.2.2. The syntax
* used to parse this URI is a modified version of RFC 2396
* <pre>
*
* URI = (absoluteURI | relativeURI)
* absoluteURI = scheme ":" ("//" netpath | relativeURI)
* relativeURI = path ["?" querypart]
* netpath = domain [":" port] relativeURI
* path = *("/" segment)
* segment = *pchar *( ";" param )
*
* </pre>
* This implements the <code>Address</code> interface and provides
* methods that access the various parts of the URI. The parameters
* in the path segments of the uniform resource identifier are
* stored in name value pairs. If parameter names are not unique
* across the path segments then only the deepest parameter will be
* stored from the path segment. For example if the URI represented
* was <code>http://domain/path1;x=y/path2;x=z</code> the value for
* the parameter named <code>x</code> would be <code>z</code>.
* <p>
* This will normalize the path part of the uniform resource
* identifier. A normalized path is one that contains no back
* references like "./" and "../". The normalized path will not
* contain the path parameters.
* <p>
* The <code>setPath</code> method is used to reset the path this
* uniform resource identifier has, it also resets the parameters.
* The parameters are extracted from the new path given.
*
* @author Niall Gallagher
*/
public class AddressParser extends Parser implements Address {
/**
* Parameters extracted from the path segments are stored here
* so that they can be viewed through <code>getParameters</code>.
*/
private ParameterMap param;
/**
* This is the path used to represent the address path. It is
* created on demand by <code>getPath</code> and is set back to
* null whenever the path is parsed again.
*/
private Path normal;
/**
* This contains the query parameters for the address. It is
* created on demand by <code>getQuery</code> and is set back to
* null whenever the query text changes.
*/
private Query data;
/**
* Used to track the characters that form the path.
*/
private Token path;
/**
* Used to track the characters that form the domain.
*/
private Token domain;
/**
* Used to track the characters that form the query.
*/
private Token query;
/**
* Used to track the name characters of a parameter.
*/
private Token name;
/**
* Used to track the value characters of a parameter.
*/
private Token value;
/**
* References the scheme that this URI contains.
*/
private Token scheme;
/**
* Contains the port number if it was specified. A value of zero
* or less is reported by <code>getPort</code> as <code>-1</code>.
*/
private int port;
/**
* Default constructor will create a <code>AddressParser</code>
* that contains no specifics. The instance will return
* <code>null</code> for all the get methods. The parsers
* get methods are populated by using the <code>parse</code>
* method.
*/
public AddressParser(){
   // one token per URI component that the parser tracks
   scheme = new Token();
   domain = new Token();
   path = new Token();
   query = new Token();
   // scratch tokens reused for each path parameter
   name = new Token();
   value = new Token();
   // destination for the collected path parameters
   param = new ParameterMap();
}
/**
* This is primarily a convenience constructor. This will parse
* the <code>String</code> given to extract the specifics. This
* could be achieved by calling the default no-arg constructor
* and then using the instance to invoke the <code>parse</code>
* method on that <code>String</code> to extract the parts.
*
* @param text a <code>String</code> containing a URI value
*/
public AddressParser(String text){
   // set up the tokens and parameter map first
   this();
   // then extract the URI parts from the given text
   this.parse(text);
}
/**
* This allows the scheme of the URL given to be returned.
* If the URI does not contain a scheme then this will
* return null. The scheme of the URI is the part that
* specifies the type of protocol that the URI is used
* for, an example <code>gopher://domain/path</code> is
* a URI that is intended for the gopher protocol. The
* scheme is the string <code>gopher</code>.
*
* @return this returns the scheme tag for the URI if
* there is one specified for it
*/
public String getScheme(){
   // the scheme token supplies the text, whether set or parsed
   final String text = scheme.toString();
   return text;
}
/**
* This is used to retrieve the domain of this URI. The
* domain part in the URI is an optional part, an example
* <code>http://domain/path?querypart</code>. This will
* return the value of the domain part. If there is no
* domain part then this will return null otherwise the
* domain value found in the uniform resource identifier.
*
* @return the domain part of this uniform resource
* identifier this represents
*/
public String getDomain(){
   // the domain token supplies the text, whether set or parsed
   final String text = domain.toString();
   return text;
}
/**
* This is used to retrieve the path of this URI. The path part
* is the most fundamental part of the URI. This will return
* the value of the path. If there is no path part then this
* will return <code>/</code> to indicate the root.
* <p>
* The <code>Path</code> object returned by this will contain
* no path parameters. The path parameters are available using
* the <code>Address</code> methods. The reason that this does not
* contain any of the path parameters is so that if the path is
* needed to be converted into an OS specific path then the path
* parameters will not need to be separately parsed out.
*
* @return the path that this URI contains, this value will not
* contain any back references such as "./" and "../" or any
* path parameters
*/
public Path getPath(){
   // reuse the path object if one was already built or supplied
   if(normal != null) {
      return normal;
   }
   String text = path.toString();
   // a missing path is represented by the root path
   normal = (text == null) ? new PathParser("/") : new PathParser(text);
   return normal;
}
/**
* This is used to retrieve the query of this URI. The query part
* in the URI is an optional part. This will return the value
* of the query part. If there is no query part then this will
* return an empty <code>Query</code> object. The query is
* an optional member of a URI and comes after the path part, it
* is preceded by a question mark, <code>?</code> character.
* For example the following URI contains <code>query</code> for
* its query part, <code>http://host:port/path?query</code>.
* <p>
* This returns a <code>divconq.www.http.Query</code>
* object that can be used to interact directly with the query
* values. The <code>Query</code> object is a read-only interface
* to the query parameters, and so will not affect the URI.
*
* @return a <code>Query</code> object for the query part
*/
public Query getQuery(){
   // reuse the query object if one was already built or supplied
   if(data != null) {
      return data;
   }
   String text = query.toString();
   // no query text at all yields a parser created without input
   data = (text == null) ? new QueryParser() : new QueryParser(text);
   return data;
}
/**
* This is used to retrieve the port of the uniform resource
* identifier. The port part in this is an optional part, an
* example <code>http://host:port/path?querypart</code>. This
* will return the value of the port. If there is no port then
* this will return <code>-1</code> because this represents
* an impossible uniform resource identifier port. The port
* is an optional part.
*
* @return this returns the port of the uniform resource
* identifier
*/
public int getPort(){
   // only a positive port counts as specified
   if(port > 0) {
      return port;
   }
   return -1;
}
/**
* This extracts the parameter values from the uniform resource
* identifier represented by this object. The parameters that a
* uniform resource identifier contains are embedded in the path
* part of the URI. If the path contains no parameters then this
* will return an empty <code>Map</code> instance.
* <p>
* This will produce unique name and value parameters. Thus if the
* URI contains several path segments with similar parameter names
* this will return the deepest parameter. For example if the URI
* represented was <code>http://domain/path1;x=y/path2;x=z</code>
* the value for the parameter named <code>x</code> would be
* <code>z</code>.
*
* @return this will return the parameter names found in the URI
*/
public KeyMap<String> getParameters(){
   // the live map is handed out, it is not copied
   return this.param;
}
/**
* This allows the scheme for the URI to be specified.
* If the URI does not contain a scheme then this will
* attach the scheme and the <code>://</code> identifier
* to ensure that the <code>Address.toString</code> will
* produce the correct syntax.
* <p>
* Caution must be taken to ensure that the port and
* the scheme are consistent. So if the original URI
* was <code>http://domain:80/path</code> and the scheme
* was changed to <code>ftp</code> the port number that
* remains is the standard HTTP port not the FTP port.
*
* @param value this specifies the protocol this URI
* is intended for
*/
public void setScheme(String value){
   // store the text directly on the token, overriding any parsed scheme
   this.scheme.value = value;
}
/**
* This will set the domain to whatever value is in the
* string parameter. If the string is null then this URI
* objects <code>toString</code> method will not contain
* the domain. The result of the <code>toString</code>
* method will be <code>/path/path?query</code>. If the
* path is non-null this URI will contain the path.
*
* @param value this will be the new domain of this
* uniform resource identifier, if it is not null
*/
public void setDomain(String value){
   // store the text directly on the token, overriding any parsed domain
   this.domain.value = value;
}
/**
* This will set the port to whatever value it is given. If
* the value is 0 or less then the <code>toString</code> will
* not contain the optional port. If the port number is above
* 0 then the <code>toString</code> method will produce a URI
* like <code>http://host:123/path</code> but only if there is
* a valid domain.
*
* @param port the port value that this URI is to have
*/
public void setPort(int port) {
   // no validation here, getPort maps zero or less to -1
   final int number = port;
   this.port = number;
}
/**
 * This will set the path to whatever value it is given. If the
 * value is null then it is interpreted as <code>/</code>, and if
 * it does not begin with <code>/</code> one is prepended.
 * <p>
 * This will reset the parameters this URI has. If the value
 * given to this method has embedded parameters these will form
 * the parameters of this URI. The value given may not be the
 * same value that the <code>getPath</code> produces. The path
 * will have all back references and parameters stripped.
 *
 * @param text the path that this URI is to be set with, a null
 * value is treated as the root path
 */
public void setPath(String text) {
   if(text == null) {
      text = "/";
   }else if(!text.startsWith("/")){
      text = "/" + text;
   }
   /* parsing the path overwrites the buffer, so the other tokens
      are asked for their strings first, which lets them cache the
      current values before the characters are replaced */
   domain.toString();
   query.toString();
   scheme.toString();
   param.clear();
   path.clear();
   parsePath(text); /*extract params*/
}
/**
* This will set the path to whatever value it is given. If the
* value is null then this <code>Address.toString</code> method
* will not contain the path, that is if path is null then it will
* be interpreted as <code>/</code>.
* <p>
* This will reset the parameters this URI has. If the value
* given to this method has embedded parameters these will form
* the parameters of this URI. The value given may not be the
* same value that the <code>getPath</code> produces. The path
* will have all back references and parameters stripped.
*
* @param path the path that this URI is to be set with
*/
public void setPath(Path path) {
   if(path == null){
      // no path object, fall back to the root via the string overload
      setPath("/");
      return;
   }
   // keep the supplied object as the value handed out by getPath
   normal = path;
}
/**
* This is used to parse the path given with the <code>setPath</code>
* method. The path contains name and value pairs. These parameters
* are embedded into the path segments using a semicolon character,
* ';'. Since the parameters do not form part of the actual path
* mapping they are removed from the path and stored. Each parameter
* can then be extracted from this parser using the methods provided
* by the <code>Address</code> interface.
*
* @param path this is the path that is to be parsed and have the
* parameter values extracted
*/
private void parsePath(String path){
   // the buffer now holds nothing but the new path
   count = path.length();
   ensureCapacity(count);
   path.getChars(0, count, buf, 0);
   // start reading from the beginning and drop the stale path object
   off = 0;
   normal = null;
   this.path();
}
/**
* This will set the query to whatever value it is given. If the
* value is null then this <code>Address.toString</code> method
* will not contain the query. If the query was <code>abc</code>
* then the <code>toString</code> method would produce a string
* like <code>http://host:port/path?abc</code>. If the query is
* null this URI would have no query part. The query must not
* contain the <code>?</code> character.
*
* @param value the query that this uniform resource identifier
* is to be set to if it is non-null
*/
public void setQuery(String value) {
   // drop the cached object so getQuery rebuilds it from the new text
   data = null;
   query.value = value;
}
/**
 * This will set the query to the <code>Query</code> object it
 * is given, that object is then the one <code>getQuery</code>
 * returns. If the object given is null then this behaves like
 * <code>setQuery(String)</code> called with an empty string,
 * which replaces the query text and discards any query object
 * that was previously set or created.
 *
 * @param query a <code>Query</code> object that contains
 * the name value parameters for the query, or null to clear it
 */
public void setQuery(Query query) {
   /* the parameter must be tested here, the value token of
      this parser is never null */
   if(query != null) {
      data = query;
   }else {
      setQuery("");
   }
}
/**
* This will check to see what type of URI this is if it is an
* <code>absoluteURI</code> or a <code>relativeURI</code>. To
* see the definition of a URI see RFC 2616 for the definition
* of a URL and for more specifics see RFC 2396 for the
* expressions.
*/
protected void parse(){
   // nothing to do for an empty buffer
   if(count <= 0){
      return;
   }
   // a leading slash means there is no scheme or domain to read
   if(buf[0] == '/'){
      relativeURI();
      return;
   }
   absoluteURI();
}
/**
* This will empty each tokens cache. A tokens cache is used
* to represent a token once the token's <code>toString</code>
* method has been called. Thus when the <code>toString</code>
* method is called then the token depends on the value of the
* cache alone in further calls to <code>toString</code>.
* However if a URI has just been parsed and that method has
* not been invoked then the cache is created from the buf if
* its length is greater than zero.
*/
protected void init(){
   // forget the objects derived from the previous URI
   normal = null;
   data = null;
   // rewind the read offset and drop any port
   off = 0;
   port = 0;
   // empty the component tokens and the path parameters
   scheme.clear();
   domain.clear();
   path.clear();
   query.clear();
   param.clear();
}
/**
* This is a specific definition of a type of URI. An absolute
* URI is a URI that contains a host and port. It is the most
* frequently used type of URI. This will define the host and
* the optional port part. As well as the relative URI part.
* This uses a simpler syntax than the one specified in RFC 2396
* <code><pre>
*
* absoluteURI = scheme ":" ("//" netpath | relativeURI)
* relativeURI = path ["?" querypart]
* netpath = domain [":" port] relativeURI
* path = *("/" segment)
* segment = *pchar *( ";" param )
*
* </pre></code>
* This syntax is sufficient to handle HTTP style URI's as well
* as GOPHER and FTP and various other 'simple' schemes. See
* RFC 2396 for the syntax of an <code>absoluteURI</code>.
*/
private void absoluteURI(){
   // optional scheme first, e.g. "http://"
   this.scheme();
   // then the domain, optional port and the relative part
   this.netPath();
}
/**
 * This will check to see if there is a scheme in the URI. If a
 * scheme tag of the form "ftp://" or "http://" is found then
 * the scheme token is set to cover the scheme name and the read
 * offset is moved past the "://" separator. If no such tag is
 * found the read offset is left where it was and the scheme
 * token is left empty.
 * <p>
 * The syntax for the scheme is given in RFC 2396 as follows
 * <code><pre>
 *
 *    scheme = alpha *( alpha | digit | "+" | "-" | "." )
 *
 * </pre></code>
 * This will however also skip the "://" from the tag
 * so if the URI was <code>gopher://domain/path</code> then
 * the URI would be <code>domain/path</code> afterwards. A run
 * of scheme characters that is not followed by "://", which
 * includes one that reaches the end of the URI, is not taken
 * to be a scheme.
 */
private void scheme(){
   int mark = off;
   int pos = off;
   if(alpha(buf[off])){
      boolean found = false;
      while(off < count){
         char next = buf[off++];
         if(schemeChar(next)){
            pos++;
         }else{
            /* only a ":" directly followed by "//" ends a scheme */
            found = next == ':' && skip("//");
            break;
         }
      }
      if(!found) {
         /* nothing consumed, this also covers running off the end
            of the buffer without seeing the ":" terminator */
         off = mark;
         pos = mark;
      }
      scheme.len = pos - mark;
      scheme.off = mark;
   }
}
/**
* This method is used to assist the scheme method. This will
* check to see if the type of the character is the same as
* those described in RFC 2396 for a scheme character. The
* scheme tag can contain an alphanumeric of the following
* <code>"+", "-", "."</code>.
*
* @param c this is the character that is being checked
*
* @return this returns true if the character is a valid
* scheme character
*/
private boolean schemeChar(char c){
   // the three punctuation characters allowed in a scheme
   if(c == '+' || c == '-' || c == '.'){
      return true;
   }
   return alphanum(c);
}
/**
* The network path is the path that contains the network
* address of the host that this URI is targeted at. This
* will parse the domain name of the host and also a port
* number before parsing a relativeURI
* <code><pre>
*
* netpath = domain [":" port] relativeURI
*
* </pre></code>
* This syntax is modified from the URI specification on
* RFC 2396.
*/
private void netPath(){
   this.domain();
   // a colon after the domain introduces the port digits
   boolean hasPort = skip(":");
   if(hasPort){
      this.port();
   }
   relativeURI();
}
/**
* This is a specific definition of a type of URI. A relative
* URI is a URI that contains no host or port. It is basically
* the resource within the host. This will extract the path and
* the optional query part of the URI. Rfc2396 has the proper
* definition of a <code>relativeURI</code>.
*/
private void relativeURI(){
   this.path();
   // a question mark after the path introduces the query
   boolean hasQuery = skip("?");
   if(hasQuery){
      this.query();
   }
}
/**
* This is used to extract the optional port from a given URI.
* This will read a sequence of digit characters and convert
* the <code>String</code> of digit characters into a decimal
* number. The digits will be added to the port variable. If
* there is no port number this will not update the read offset.
*/
private void port() {
   // accumulate decimal digits until a non digit or the end of the buffer
   for(; off < count && digit(buf[off]); off++){
      port = port * 10 + (buf[off] - '0');
   }
}
/**
* This is used to extract the domain from the given URI. This
* will firstly initialize the token object that represents the
* domain. This allows the token's <code>toString</code> method to
* return the extracted value of the token rather than getting
* confused with previous values set by a previous parse method.
* <p>
* This uses the following delimiters to determine the end of the
* domain <code>?</code>, <code>:</code> and <code>/</code>. This
* ensures that the read offset does not go out of bounds and
* consequently throw an <code>IndexOutOfBoundsException</code>.
*/
private void domain(){
   final int start = off;
   while(off < count){
      char next = buf[off];
      // any of these delimiters ends the domain
      if(next == '/' || next == ':' || next == '?'){
         break;
      }
      off++;
   }
   domain.off = start;
   domain.len = off - start;
}
/**
* This is used to extract the segments from the given URI. This
* will firstly initialize the token object that represents the
* path. This allows the token's <code>toString</code> method to
* return the extracted value of the token rather than getting
* confused with previous values set by a previous parse method.
* <p>
* This is slightly different from RFC 2396 in that it defines a
* pchar as the RFC 2396 definition of a pchar without the escaped
* chars. So this method has to ensure that no escaped chars go
* unchecked. This ensures that the read offset does not go out
* of bounds and throw an <code>IndexOutOfBoundsException</code>.
*/
private void path(){
/* mark is where the path starts, pos is the write position used
   to compact the path in place, off is the read position */
int mark = off;
int pos = off;
while(skip("/")) {
buf[pos++] = '/';
while(off < count){
if(buf[off]==';'){
/* read every ";name=value" parameter of this segment and store
   it, none of these characters are copied into the path */
while(skip(";")){
param();
insert();
}
break;
}
if(buf[off]=='%'){
/* escape may move off forward and place the decoded character
   at buf[off], which is then copied by the statement below */
escape();
}else if(!pchar(buf[off])){
break;
}
buf[pos++]=buf[off++];
}
}
/* the token covers only the compacted characters written so far */
path.len = pos -mark;
path.off = mark;
}
/**
* This is used to extract the query from the given URI. This
* will firstly initialize the token object that represents the
* query. This allows the token's <code>toString</code> method
* to return the extracted value of the token rather than getting
* confused with previous values set by a previous parse method.
* The calculation of the query part of a URI is basically the
* end of the URI.
*/
private void query() {
   // the query is everything from the read offset to the end
   final int start = off;
   query.off = start;
   query.len = count - start;
}
/**
* This is an expression that is defined by RFC 2396 it is used
* in the definition of a segment expression. This is basically
* a list of pchars.
* <p>
* This method has to ensure that no escaped chars go unchecked.
* This ensures that the read offset does not go out of bounds
* and consequently throw an out of bounds exception.
*/
private void param() {
   this.name();
   // a value is only read when the name is followed by "="
   boolean hasValue = skip("=");
   if(hasValue){
      this.value();
   }
}
/**
* This extracts the name of the parameter from the character
* buffer. The name of a parameter is defined as a set of
* pchars including escape sequences. This will extract the
* parameter name and buffer the chars. The name ends when a
* equals character, "=", is encountered or in the case of a
* malformed parameter when the next character is not a pchar.
*/
private void name(){
   final int start = off;
   int write = start;
   while(off < count){
      char next = buf[off];
      if(next == '%'){
         // may move off and place the decoded character at buf[off]
         escape();
      }else if(next == '=' || !pchar(next)){
         break;
      }
      // buf[off] is read again here because escape may have changed it
      buf[write++] = buf[off++];
   }
   name.off = start;
   name.len = write - start;
}
/**
* This extracts a parameter value from a path segment. The
* parameter value consists of a sequence of pchars and some
* escape sequences. The parameter value is buffered so that
* the name and values can be paired. The end of the value
* is determined as the end of the buffer or the last pchar.
*/
private void value(){
   final int start = off;
   int write = start;
   while(off < count){
      char next = buf[off];
      if(next == '%'){
         // may move off and place the decoded character at buf[off]
         escape();
      }else if(!pchar(next)){
         break;
      }
      // buf[off] is read again here because escape may have changed it
      buf[write++] = buf[off++];
   }
   value.off = start;
   value.len = write - start;
}
/**
* This method adds the name and value to a map so that the next
* name and value can be collected. The name and value are added
* to the map as string objects. Once added to the map the
* <code>Token</code> objects are set to have zero length so they
* can be reused to collect further values. This will add the
* values to the map as an array of type string. This is done so
* that if there are multiple values that they can be stored.
*/
private void insert(){
   // only complete pairs are stored, both parts must be non empty
   boolean complete = value.length() > 0 && name.length() > 0;
   if(complete){
      insert(name, value);
   }
   // make both tokens ready for the next parameter
   name.clear();
   value.clear();
}
/**
* This will add the given name and value to the parameters map.
* This will only store a single value per parameter name, so
* only the parameter that was latest encountered will be saved.
* The <code>getQuery</code> method can be used to collect
* the parameter values using the parameter name.
*
* @param name this is the name of the value to be inserted
* @param value this is the value of a that is to be inserted
*/
private void insert(Token name, Token value){
   // convert both tokens to strings before storing them
   String key = name.toString();
   String text = value.toString();
   insert(key, text);
}
/**
* This will add the given name and value to the parameters map.
* This will only store a single value per parameter name, so
* only the parameter that was latest encountered will be saved.
* The <code>getQuery</code> method can be used to collect
* the parameter values using the parameter name.
*
* @param name this is the name of the value to be inserted
* @param value this is the value of a that is to be inserted
*/
private void insert(String name, String value) {
   // a later value for the same name replaces the earlier one
   this.param.put(name, value);
}
/**
* This converts an encountered escaped sequence, that is all
* embedded hexidecimal characters into a native UCS character
* value. This does not take any characters from the stream it
* just prepares the buffer with the correct byte. The escaped
* sequence within the URI will be interpreded as UTF-8.
* <p>
* This will leave the next character to read from the buffer
* as the character encoded from the URI. If there is a fully
* valid escaped sequence, that is <code>"%" HEX HEX</code>.
* This decodes the escaped sequence using UTF-8 encoding, all
* encoded sequences should be in UCS-2 to fit in a Java char.
*/
private void escape() {
   // value of the "%" HEX HEX sequence at the read offset, or -1
   final int octet = peek(off);
   // try UTF-8 first, fall back to treating it as a single octet
   boolean decoded = unicode(octet);
   if(!decoded) {
      binary(octet);
   }
}
/**
 * This method is the fallback used when the escaped sequence at
 * the read offset could not be decoded as UTF-8. If the peek
 * value is a valid octet then the read offset is moved to the
 * last character of the <code>"%" HEX HEX</code> sequence and
 * that character is replaced with the octet, so that it is the
 * next character read from the URI. This is used strictly for
 * backward compatible parsing of URI strings, binary data
 * should never appear.
 * <p>
 * If the peek value is negative, which is what is produced for
 * a sequence that is truncated or not hexadecimal, the buffer
 * and the read offset are left untouched so that the characters
 * are treated literally.
 *
 * @param peek this is the value of the escaped octet from the
 * URI, or a negative value if the sequence was not valid
 *
 * @return true if the octet was placed in the buffer, false if
 * the buffer was left unchanged
 */
private boolean binary(int peek) {
   /* never write the -1 marker into the buffer as a character */
   if(peek < 0 || off + 2 >= count) {
      return false;
   }
   off += 2;
   buf[off] = bits(peek);
   return true;
}
/**
* This method determines, using a peek character, whether the
* sequence of escaped characters within the URI is in UTF-8. If
* a UTF-8 character can be successfully decoded from the URI it
* will be the next character read from the buffer. This can
* check for both UCS-2 and UCS-4 characters. However, because
* the Java <code>char</code> can only hold UCS-2, the UCS-4
* characters will have only the low order octets stored.
* <p>
* The WWW Consortium provides a reference implementation of a
* UTF-8 decoding for Java, in this the low order octets in the
* UCS-4 sequence are used for the character. So, in the
* absence of a defined behaviour, the W3C behaviour is assumed.
*
* @param peek this is the first escaped character from the URI
*
* @return this returns true if a UTF-8 character is decoded
*/
private boolean unicode(int peek) {
   // for a first octet announcing i trailing octets: lead[i] selects
   // its marker bits, want[i] is the marker, keep[i] its payload bits
   final int[] lead = {0x80, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
   final int[] want = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
   final int[] keep = {~0, 0x1f, 0x0f, 0x07, 0x03, 0x01};
   for(int more = 0; more < lead.length; more++) {
      if((peek & lead[more]) == want[more]) {
         return unicode(peek & keep[more], more);
      }
   }
   // not a valid first octet
   return false;
}
/**
* This method will decode the specified amount of escaped
* characters from the URI and convert them into a single Java
* UCS-2 character. If there are not enough characters within
* the URI then this will return false and leave the URI alone.
* <p>
* The number of characters left is determined from the first
* UTF-8 octet, as specified in RFC 2279, and because this is
* a URI there must be that number of <code>"%" HEX HEX</code>
* sequences left. If successful the next character read is
* the UTF-8 sequence decoded into a native UCS-2 character.
*
* @param peek contains the bits read from the first UTF octet
* @param more this specifies the number of UTF octets left
*
* @return this returns true if a UTF-8 character is decoded
*/
private boolean unicode(int peek, int more) {
   // the start of the last escaped octet must lie inside the buffer
   boolean fits = off + more * 3 < count;
   return fits && unicode(peek, more, off);
}
/**
* This will decode the specified amount of trailing UTF-8 bits
* from the URI. The trailing bits are those following the first
* UTF-8 octet, which specifies the length, in octets, of the
* sequence. The trailing octets are of the form 10xxxxxx, for
* each of these octets only the last six bits are valid UCS
* bits. So a conversion is basically an accumulation of these.
* <p>
* If at any point during the accumulation of the UTF-8 bits
* there is a parsing error, then parsing is aborted and false
* is returned, as a result the URI is left unchanged.
*
* @param peek bytes that have been accumulated from the URI
* @param more this specifies the number of UTF octets left
* @param pos this specifies the position the parsing begins
*
* @return this returns true if a UTF-8 character is decoded
*/
private boolean unicode(int peek, int more, int pos) {
   for(int left = more; left > 0; left--) {
      if(buf[pos] != '%'){
         return false;
      }
      int next = pos + 3;
      int hex = peek(next);
      // a trailing octet must have the form 10xxxxxx
      if((hex & 0xc0) != 0x80){
         return false;
      }
      // append its six payload bits and step to that sequence
      peek = (peek << 6) | (hex & 0x3f);
      pos = next;
   }
   // place the decoded character on the last char of the final sequence
   if(pos + 2 < count) {
      off = pos + 2;
      buf[off] = bits(peek);
   }
   return true;
}
/**
* Defines behaviour for UCS-2 versus UCS-4 conversion from four
* octets. The UTF-8 encoding scheme enables UCS-4 characters to
* be encoded and decodeded. However, Java supports the 16-bit
* UCS-2 character set, and so the 32-bit UCS-4 character set is
* not compatable. This basically decides what to do with UCS-4.
*
* @param data up to four octets to be converted to UCS-2 format
*
* @return this returns a native UCS-2 character from the int
*/
private char bits(int data) {
   // the narrowing cast keeps only the low sixteen bits
   final char ucs = (char) data;
   return ucs;
}
/**
* This will return the escape expression specified from the URI
* as an integer value of the hexidecimal sequence. This does
* not make any changes to the buffer it simply checks to see if
* the characters at the position specified are an escaped set
* characters of the form <code>"%" HEX HEX</code>, if so, then
* it will convert that hexidecimal string in to an integer
* value, or -1 if the expression is not hexidecimal.
*
* @param pos this is the position the expression starts from
*
* @return the integer value of the hexidecimal expression
*/
private int peek(int pos) {
   // an escape has to begin with a percent sign
   if(buf[pos] != '%'){
      return -1;
   }
   // both hex characters must be inside the buffer
   if(pos + 2 >= count) {
      return -1;
   }
   return convert(buf[pos + 1], buf[pos + 2]);
}
/**
* This will convert the two hexidecimal characters to a real
* integer value, which is returned. This requires characters
* within the range of 'A' to 'F' and 'a' to 'f', and also
* the digits '0' to '9'. The characters encoded using the
* ISO-8859-1 encoding scheme, if the characters are not with
* in the range specified then this returns -1.
*
* @param high this is the high four bits within the integer
* @param low this is the low four bits within the integer
*
* @return this returns the indeger value of the conversion
*/
private int convert(char high, char low) {
   // reject anything outside 0-9, a-f and A-F
   if(!hex(high) || !hex(low)){
      return -1;
   }
   // both characters are known to be ASCII hex digits at this point
   int top = Character.digit(high, 16);
   int bottom = Character.digit(low, 16);
   return (top << 4) | bottom;
}
/**
* This is used to determine wheather a char is a hexidecimal
* <code>char</code> or not. A hexidecimal character is consdered
* to be a character within the range of <code>0 - 9</code> and
* between <code>a - f</code> and <code>A - F</code>. This will
* return <code>true</code> if the character is in this range.
*
* @param ch this is the character which is to be determined here
*
* @return true if the character given has a hexidecimal value
*/
private boolean hex(char ch) {
   boolean decimal = '0' <= ch && ch <= '9';
   boolean lower = 'a' <= ch && ch <= 'f';
   boolean upper = 'A' <= ch && ch <= 'F';
   return decimal || lower || upper;
}
/**
* This is a character set defined by RFC 2396 it is used to
* determine the valididity of certain <code>chars</code>
* within a Uniform Resource Identifier. RFC 2396 defines
* an unreserved char as <code>alphanum | mark</code>.
*
* @param c the character value that is being checked
*
* @return true if the character has an unreserved value
*/
private boolean unreserved(char c){
   // unreserved = mark | alphanum
   if(mark(c)){
      return true;
   }
   return alphanum(c);
}
/**
* This is used to determine wheather or not a given unicode
* character is an alphabetic character or a digit character.
* That is within the range <code>0 - 9</code> and between
* <code>a - z</code> it uses <code>iso-8859-1</code> to
* compare the character.
*
* @param c the character value that is being checked
*
* @return true if the character has an alphanumeric value
*/
private boolean alphanum(char c){
   // alphanum = digit | alpha
   if(digit(c)){
      return true;
   }
   return alpha(c);
}
/**
 * Determines whether a given character is an alphabetic
 * character. Only the ranges <code>a - z</code> and
 * <code>A - Z</code> are treated as alphabetic, so accented
 * and other non-ASCII letters are rejected.
 *
 * @param c the character value that is being checked
 *
 * @return true if the character has an alphabetic value
 */
private boolean alpha(char c){
   boolean lower = 'a' <= c && c <= 'z';
   boolean upper = 'A' <= c && c <= 'Z';

   return lower || upper;
}
/**
 * Checks a character against the mark character set of RFC
 * 2396, which is used to check the validity of certain chars
 * within a uniform resource identifier. The RFC defines a mark
 * char as one of <code>"-", "_", ".", "!", "~", "*", "'", "(",
 * ")"</code>.
 *
 * @param c the character value that is being checked
 *
 * @return true if the character is a mark character
 */
private boolean mark(char c){
   // every mark character, membership is a simple lookup
   String marks = "-_.!~*'()";

   return marks.indexOf(c) >= 0;
}
/**
 * Checks a character against the pchar character set of RFC
 * 2396, which is used to check the validity of certain chars
 * within a generic uniform resource identifier. A character is
 * accepted if it is one of <code>":", "@", "=", "&amp;", "+",
 * "$", ","</code> or if it is an unreserved character. The RFC
 * also allows an escaped char, that is <code>% HEX HEX</code>,
 * but this is not checked here because it takes 3 chars.
 *
 * @param c the character value that is being checked
 *
 * @return true if the character is a pchar character
 */
private boolean pchar(char c){
   // the extra characters a pchar allows beyond the unreserved set
   String extras = "@&=+$,:";

   if(extras.indexOf(c) >= 0) {
      return true;
   }
   return unreserved(c);
}
/**
 * Checks a character against the reserved character set of RFC
 * 2396, which is used to check the validity of certain chars in
 * a uniform resource identifier. The RFC defines a reserved char
 * as one of <code>";", "/", "?", ":", "@", "&amp;", "=", "+",
 * "$", ","</code>.
 *
 * @param c the character value that is being checked
 *
 * @return true if the character is a reserved character
 */
@SuppressWarnings("unused")
private boolean reserved(char c){
   // every reserved character, membership is a simple lookup
   String set = ";/?@&:=+$,";

   return set.indexOf(c) >= 0;
}
/**
 * This is used to convert this URI object into a <code>String</code>
 * object. Only the parts of the URI that exist are written, so a
 * URI that has every part will look something like
 * <pre>
 *
 *    scheme://host:port/path/path;name=value?querypart
 *
 * </pre>
 * The parts are written in the following order. The scheme is
 * written with a trailing <code>://</code> only if it is not empty.
 * The domain is written only if it is not empty, and the port is
 * written after it, following a colon, only if it is greater than
 * zero, so a port without a domain is never written. The value of
 * <code>getPath</code> is always written. The path parameters are
 * written only if there is at least one, and the query is written
 * after a <code>?</code> only if it is not empty.
 *
 * @return the URI with the path part and the optional parts of the
 * uniform resource identifier that are not empty
 */
public String toString() {
   StringBuilder text = new StringBuilder();

   if(scheme.length() > 0) {
      text.append(scheme).append("://");
   }
   if(domain.length() > 0) {
      text.append(domain);

      // the port is only meaningful alongside a domain
      if(port > 0) {
         text.append(":").append(port);
      }
   }
   text.append(getPath());

   if(param.size() > 0) {
      text.append(param);
   }
   if(query.length() > 0) {
      text.append("?").append(query);
   }
   return text.toString();
}
/**
 * The <code>ParameterMap</code> is used to store the parameters
 * that are to be encoded in to the address. Its string form is
 * the text that is appended to the end of the path, from which
 * the parameters can later be extracted by parsing the address.
 *
 * @author Niall Gallagher
 */
private class ParameterMap extends KeyMap<String> {

   /**
    * This is the version identifier used when serializing the map.
    */
   private static final long serialVersionUID = -7391825649971667162L;

   /**
    * This builds the text that is appended to the end of the path.
    * Each parameter is written as <code>;name</code>, and it is
    * followed by <code>=value</code> only if the value stored for
    * that name is not null. Because the parameters are written
    * to the path they do not form part of the query.
    *
    * @return this returns the representation of the parameters
    */
   private String encode() {
      StringBuilder encoded = new StringBuilder();

      // NOTE(review): this walks the enclosing parser's param field
      // rather than this map, the result only describes this map
      // while param refers to this instance - confirm with the owner
      for(String key : param) {
         String text = param.get(key);

         encoded.append(";").append(key);

         if(text != null) {
            encoded.append("=").append(text);
         }
      }
      return encoded.toString();
   }

   /**
    * This provides the encoded form of the parameters, which is
    * the text that is appended to the end of the path. It is the
    * same text that is built by the <code>encode</code> method.
    *
    * @return this returns the representation of the parameters
    */
   public String toString() {
      return encode();
   }
}
/**
 * This is used to mark out a region of the parse buffer that
 * represents a token of the URI. The region is described by an
 * offset and a length, and it is only converted to a
 * <code>String</code> when the token value is asked for. A value
 * can also be assigned directly, in which case it takes the
 * place of the marked region.
 */
private class Token {

   /**
    * This can be used to override the value for this token. It
    * also holds the converted region once it has been created.
    */
   public String value;

   /**
    * This represents the start offset within the buffer.
    */
   public int off;

   /**
    * This represents the number of characters in the token.
    */
   public int len;

   /**
    * This resets the token so that it can be reused for a new
    * URI. The value is discarded and the length is set to zero,
    * the offset is left as it is. Once cleared the token has a
    * string form of null until a region or a value is set.
    */
   public void clear() {
      len = 0;
      value = null;
   }

   /**
    * This is used to determine the number of characters this
    * token contains. If a value has been set, or has already
    * been created from the region, then the length of that value
    * is given, otherwise the length of the region is given.
    *
    * @return this returns the number of characters this uses
    */
   public int length() {
      return value != null ? value.length() : len;
   }

   /**
    * This converts the <code>Token</code> into its
    * <code>String</code> equivalent. If there is no value and
    * the region is not empty the region of the buffer is copied
    * in to a new <code>String</code>, which is kept as the value
    * so that it is created only once. If there is no value and
    * the region is empty then null is returned.
    *
    * @return this returns a value representing the token
    */
   public String toString() {
      if(value == null && len > 0) {
         value = new String(buf, off, len);
      }
      return value;
   }
}
}