/*
* PathParser.java February 2001
*
* Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.simpleframework.http.parse;
import org.simpleframework.http.Path;
import org.simpleframework.util.parse.Parser;
/**
* This is used to parse a path given as part of a URI. This will read the path,
* normalize it, and break it up into its components. The normalization of the
* path is the conversion of the path given into its actual path by removing
* the references to the parent directories and to the current directory.
* <p>
* If the path that this represents is <code>/usr/bin/../etc/./README</code>
* then the actual path, normalized, is <code>/usr/etc/README</code>. Once the
* path has been normalized it is possible to acquire the segments as an array
* of strings, which allows simple manipulation of the path.
* <p>
* Although RFC 2396 defines the path within a URI to have parameters, this does
* not extract those parameters; it simply normalizes the path and includes
* the path parameters in the path. If the path is to be converted into an OS
* specific file system path that has the parameters extracted, then the
* <code>AddressParser</code> should be used.
*
* @author Niall Gallagher
*/
public class PathParser extends Parser implements Path {
/**
* Used to store the individual path segments.
*/
private TokenList list;
/**
* Used to store consumed name characters.
*/
private Token name;
/**
* Used to store consumed file extension.
*/
private Token ext;
/**
* Used to store the highest directory path.
*/
private Token dir;
/**
* Used to store consumed normalized path name.
*/
private Token path;
/**
 * Creates an empty <code>PathParser</code>. Until a path has been parsed
 * every token has a length of zero, so <code>getPath()</code>,
 * <code>getName()</code>, <code>getExtension()</code> and
 * <code>getDirectory()</code> all return <code>null</code>. The values are
 * populated by invoking the parse method with a path.
 */
public PathParser() {
    this.path = new Token();
    this.dir = new Token();
    this.name = new Token();
    this.ext = new Token();
    this.list = new TokenList();
}
/**
 * Convenience constructor that builds an empty parser and immediately
 * parses the given <code>String</code>. This is equivalent to calling the
 * no-arg constructor and then invoking <code>parse</code> with the same
 * value.
 *
 * @param path
 *            a <code>String</code> containing a path value
 */
public PathParser(String path) {
    this();
    parse(path);
}
/**
 * Parses the buffered path in five ordered steps. The order matters because
 * each step works on the state left by the previous one:
 * <ol>
 * <li><code>normalize</code> rewrites the buffer in place, removing
 * <code>./</code>, <code>../</code> and empty segments;</li>
 * <li><code>path</code> records the whole normalized region;</li>
 * <li><code>segments</code> records each segment and the directory;</li>
 * <li><code>name</code> narrows the offset and count to the last
 * segment;</li>
 * <li><code>extension</code> scans that last segment for a period.</li>
 * </ol>
 * Normalizing means that <code>/path/../path</code> and <code>/path</code>
 * yield the same result. A path that backs out of the root, such as
 * <code>/../</code>, is normalized to nothing, so the token getters return
 * <code>null</code>.
 */
@Override
protected void parse() {
    normalize();
    path();
    segments();
    name();
    extension();
}
/**
 * Resets the parser so that another path can be parsed with the same
 * instance. All tokens and the segment list are cleared and the read offset
 * is moved back to the start of the buffer. The count is not touched here;
 * per the original documentation it is reset by
 * <code>Parser.parse</code>.
 */
@Override
protected void init() {
    this.off = 0;
    this.path.clear();
    this.dir.clear();
    this.name.clear();
    this.ext.clear();
    this.list.clear();
}
/**
 * Provides the file extension of the name, which is the text following the
 * last period in the name. For <code>file.en_US.extension</code> this is
 * <code>extension</code>. If the name has no period, or nothing follows the
 * last period, this returns <code>null</code>.
 *
 * @return the extension of the file name, or <code>null</code> if none
 */
@Override
public String getExtension() {
    return ext.toString();
}
/**
 * Provides the name of the file without the leading path. In terms of RFC
 * 2396 the name is the last path segment, so for <code>/usr/README</code>
 * the name is <code>README</code>. Path parameters that follow the last
 * segment are not part of the name. If no name was found this returns
 * <code>null</code>.
 *
 * @return the name of the file in the path, or <code>null</code> if none
 */
@Override
public String getName() {
    return name.toString();
}
/**
 * Provides the normalized path, that is, the path with no references to the
 * parent or current directory. For <code>/usr/../etc/./</code> this is
 * <code>/etc/</code>. Path parameters, if present, are kept. When the path
 * normalizes to nothing, for example an immediate back reference, this
 * returns <code>null</code>.
 *
 * @return the normalized path without <code>../</code> or <code>./</code>
 */
@Override
public String getPath() {
    return path.toString();
}
/**
 * Provides the normalized path starting at the given segment and running
 * to the end of the last segment. The result begins with the '/' that
 * precedes the starting segment and keeps any path parameters. This is
 * useful for extracting context based paths. The index is not validated
 * here; an index that does not refer to a parsed segment ends in an index
 * exception from the underlying list.
 *
 * @param from
 *            the index of the first segment to include
 *
 * @return the normalized path from that segment onward
 */
@Override
public String getPath(int from) {
    return list.segment(from);
}
/**
 * Provides a slice of the normalized path that starts at the given segment
 * and spans the given number of segments. The result begins with the '/'
 * that precedes the starting segment. If the slice reaches or passes the
 * last segment it simply ends at the end of the last segment. The index is
 * not validated here; an index that does not refer to a parsed segment
 * ends in an index exception from the underlying list.
 *
 * @param from
 *            the index of the first segment to include
 * @param count
 *            the number of path segments to include
 *
 * @return the normalized path covering the requested segments
 */
@Override
public String getPath(int from, int count) {
    return list.segment(from, count);
}
/**
 * Provides the highest directory within the path, which is everything up
 * to and including the last '/' of the normalized path. For
 * <code>/pub/./bin/README</code> this is <code>/pub/bin/</code>. The
 * result always ends with '/'. If the normalized path has no '/' this
 * returns <code>null</code>.
 *
 * @return the highest directory, or <code>null</code> if there is none
 */
@Override
public String getDirectory() {
    return dir.toString();
}
/**
 * Provides the individual segments of the normalized path, see RFC 2396.
 * For <code>/usr/bin/../etc</code> the segments are <code>usr</code> and
 * <code>etc</code>, because normalization happens before the segments are
 * extracted. The array is built once and cached, so repeated calls return
 * the same array until the parser is reset.
 *
 * @return the path segments in the order they appear in the path
 */
@Override
public String[] getSegments() {
    return list.list();
}
/**
 * Provides this path relative to the directory of the given path. The
 * given path is parsed, and if its highest directory (as of
 * <code>getDirectory</code>) is a prefix of this path, that prefix is
 * chopped off. For example, if this object represents
 * <code>/usr/share/rfc/rfc2396.txt</code> and the given path is
 * <code>/usr/share/text.txt</code>, the result is
 * <code>/rfc/rfc2396.txt</code>.
 *
 * @param path
 *            the path whose directory is used as the prefix
 *
 * @return the path relative to the given one, otherwise <code>null</code>
 */
@Override
public String getRelative(String path) {
    PathParser prefix = new PathParser(path);
    return getRelative(prefix);
}
/**
 * Helper for <code>getRelative(String)</code>. It takes the already parsed
 * prefix path and hands the buffer region of its highest directory to the
 * character based comparison. If the prefix has no directory, for example
 * because it normalized to nothing, the result is <code>null</code>. The
 * returned path begins with a '/'.
 *
 * @param path
 *            the parsed path whose directory is used as the prefix
 *
 * @return the path relative to the given one, otherwise <code>null</code>
 */
private String getRelative(PathParser path) {
    Token base = path.dir;
    return getRelative(path.buf, base.off, base.len);
}
/**
 * Compares a directory prefix with the start of this path and, if it
 * matches, returns the remainder of this path. The remainder starts at the
 * final '/' of the prefix, so it always begins with '/'. For example, if
 * this object represents <code>/usr/share/rfc/rfc2396.txt</code> and the
 * prefix is <code>/usr/share/</code>, the result is
 * <code>/rfc/rfc2396.txt</code>.
 * <p>
 * An empty prefix, or a prefix longer than this path, can never match and
 * yields <code>null</code>. The length check also keeps the comparison
 * from reading buffer characters beyond the end of this path.
 *
 * @param text
 *            the buffer holding the directory prefix
 * @param off
 *            the offset within the text where the prefix starts
 * @param len
 *            the number of characters in the prefix
 *
 * @return the path relative to the prefix, otherwise <code>null</code>
 */
private String getRelative(char[] text, int off, int len) {
    if (len <= 0 || len > this.path.len) {
        return null; /* nothing to match, or prefix longer than path */
    }
    int start = this.path.off;
    for (int i = 0; i < len; i++) {
        if (text[off + i] != this.buf[start + i]) {
            return null;
        }
    }
    int pos = (start + len) - 1; /* last '/' of the prefix */
    int size = (this.path.len - len) + 1; /* remainder plus that '/' */
    return new String(this.buf, pos, size);
}
/**
 * Records the region of the buffer that holds the normalized path. After
 * normalization the path starts at index zero and is <code>count</code>
 * characters long. If normalization left nothing, the count is zero, the
 * token is left untouched, and <code>getPath</code> returns
 * <code>null</code>.
 */
private void path() {
    if (count <= 0) {
        return;
    }
    path.off = 0;
    path.len = count;
}
/**
 * Scans the current region, which <code>name</code> has narrowed to the
 * file name, backwards for the last period. Everything after that period
 * is recorded as the extension. If no period is found nothing is recorded.
 * When a period is found the count is set to the index of that period.
 */
private void extension() {
    int end = off + count; /* index.html[] */
    for (int i = end - 1; i >= off; i--) {
        if (buf[i] == '.') { /* index[.]html */
            ext.off = i + 1;
            ext.len = (end - 1) - i; /* characters after the period */
            count = i;
            break;
        }
    }
}
/**
 * Extracts the segments and the highest directory from the normalized
 * path. The buffer is walked from the end toward the start. Each '/' found
 * closes the segment that follows it, and that segment is added to the
 * list, so segments are added last to first. The directory is the path up
 * to and including the last '/'. A trailing '/' marks the whole path as
 * the directory and adds no empty segment. A leading segment that is not
 * preceded by '/' is never added.
 */
private void segments() {
    if (count <= 0) {
        return;
    }
    int end = count; /* exclusive end of the segment being scanned */
    if (buf[end - 1] == '/') { /* /pub/bin[/] */
        dir.off = 0;
        dir.len = count;
        end--;
    }
    for (int i = end - 1; i >= off; i--) {
        if (buf[i] != '/') {
            continue;
        }
        if (dir.len == 0) { /* first '/' seen from the end */
            dir.off = 0;
            dir.len = i + 1;
        }
        list.add(i + 1, (end - i) - 1); /* /pub[/]bin */
        end = i;
    }
}
/**
 * Normalizes the path in place by removing references to the parent
 * directory and to the current directory. So if the path given was
 * <code>/usr/bin/../etc/./README</code> then the normalized path is
 * <code>/usr/etc/README</code>. Empty segments such as <code>//</code> are
 * collapsed as well.
 * <p>
 * The buffer is read at <code>pos</code> and written at <code>count</code>.
 * The write index never passes the read index, so the same array is used
 * for both. When finished, the normalized path occupies the buffer from
 * index zero for <code>count</code> characters and the offset is zero.
 * <p>
 * If a back reference would step above the first segment, as in
 * <code>/../</code>, the path is evaluated as empty. A segment is treated
 * as a back reference only if it is exactly <code>..</code>. A segment
 * that merely starts with a period, such as <code>.x</code>, is an
 * ordinary name and is kept.
 */
private void normalize() {
    int size = this.count + this.off;
    int pos = this.off;
    for (this.off = this.count = 0; pos < size; pos++) {
        this.buf[this.count++] = this.buf[pos];
        if (this.buf[pos] == '/') {
            if ((this.count - 1) > 0) {
                if (this.buf[this.count - 2] == '/') {
                    this.count--; /* /[/]path -- collapse empty segment */
                }
            }
        } else if (this.buf[pos] == '.') { /* //[.]/path/ */
            if ((this.count - 1) > 0) {
                if (this.buf[this.count - 2] != '/') {
                    continue; /* /path[.] -- period inside a segment */
                }
            }
            if ((pos + 2) > size) { /* /path/[.] -- trailing "." */
                this.count--;
            } else {
                if (this.buf[pos + 1] == '/') { /* /.[/]path */
                    pos++; /* skip the '/' that follows the "." */
                    this.count--; /* drop the "." just written */
                }
                if (this.buf[pos] != '.') { /* "./" was consumed above */
                    continue;
                }
                if (this.buf[pos + 1] != '.') { /* /[.]x -- a name, not ".." */
                    continue;
                }
                if ((pos + 2) < size) {
                    if (this.buf[pos + 2] != '/') {
                        continue; /* /[.].path -- name starting with ".." */
                    }
                }
                if ((this.count - 2) > 0) {
                    /* /path/[.]. -- unwind to the '/' before the parent */
                    for (this.count -= 2; (this.count - 1) > 0;) {
                        if (this.buf[this.count - 1] == '/') { /* [/]path/.. */
                            break;
                        }
                        this.count--;
                    }
                } else { /* /../ -- nothing left to back out of */
                    this.count = 0;
                    this.off = 0;
                    break;
                }
                pos += 2; /* /path/.[.]/ -- skip second "." and the '/' */
            }
        }
    }
}
/**
 * Extracts the name of the file without the path. In terms of RFC 2396 the
 * name is the last path segment, so for <code>/usr/README</code> the name
 * is <code>README</code>. Path parameters are not part of the name, so
 * "/usr/bin;param=value/;param=value" results in the name "bin".
 * <p>
 * The region is scanned backwards. A ';' discards everything counted so
 * far, and a '/' directly before that ';' is skipped so the directory
 * before it can supply the name. The first remaining '/' ends the scan and
 * narrows the offset and count to the name. If no '/' ends the scan the
 * offset and count are left as they were. A ';' at the very start of the
 * region has no preceding character, so no look-behind is done for it.
 */
private void name() {
    int pos = this.count;
    int len = 0;
    while (pos-- > this.off) { /* /usr/bin/;para[m] */
        char next = this.buf[pos];
        if (next == ';') { /* /usr/bin/[;]param */
            /* only look behind when a character precedes the ';' */
            if (pos > this.off && this.buf[pos - 1] == '/') {
                pos--; /* /usr/bin[/];param */
            }
            len = 0; /* parameters are not part of the name */
        } else if (next == '/') { /* /usr[/]bin */
            this.off = pos + 1; /* /usr/[b]in */
            this.count = len; /* [b]in */
            break;
        } else {
            len++;
        }
    }
    this.name.len = this.count;
    this.name.off = this.off;
}
/**
 * Provides the normalized path as the string form of this object. This is
 * the same value as <code>getPath()</code>: the path without
 * <code>../</code> or <code>./</code> references, including any path
 * parameters, or <code>null</code> if the path normalized to nothing.
 *
 * @return the normalized path, or <code>null</code> if there is none
 */
@Override
public String toString() {
    return getPath();
}
/**
 * A lightweight view onto a region of the parser's character buffer. No
 * characters are copied into a separate buffer; the token only remembers
 * where its region starts and how long it is. The <code>String</code> for
 * the region is created on first request and then cached, so it is not
 * created again until the token is cleared.
 */
private class Token {
    /**
     * Cached string form of the token, created on demand.
     */
    public String value;
    /**
     * Offset within the buffer that the token starts.
     */
    public int off;
    /**
     * Length of the region that the token consumes.
     */
    public int len;

    /**
     * Prepares the token for reuse by dropping the cached value and
     * setting the length to zero. A token with a length of zero has no
     * string form, so <code>toString</code> returns <code>null</code>
     * after this. The offset is left as it is.
     */
    public void clear() {
        this.len = 0;
        this.value = null;
    }

    /**
     * Converts the token to its <code>String</code> equivalent. The
     * string is built from the buffer region the first time it is needed
     * and the cached value is returned after that. A token with a length
     * of zero has no value.
     *
     * @return the string for the token, or <code>null</code> if the token
     *         is empty
     */
    @Override
    public String toString() {
        if (this.value == null && this.len > 0) {
            this.value = new String(PathParser.this.buf, this.off, this.len);
        }
        return this.value;
    }
}
/**
 * The <code>TokenList</code> stores path segments as pairs of ints, an
 * offset and a length into the parser's buffer. The parser adds the
 * segments from the end of the path toward the start, so the entries are
 * held in reverse path order. The accessors turn that around again so that
 * index zero refers to the first segment of the path. The backing array
 * grows when it is full.
 */
private class TokenList {
    /**
     * Cached segment strings, built on the first call to list.
     */
    private String[] cache;
    /**
     * Offset and length pairs of the tokens.
     */
    private int[] list;
    /**
     * Number of ints in use, which is twice the number of tokens.
     */
    private int count;

    /**
     * Creates an empty list. The backing array starts with sixteen ints,
     * which is enough to store eight tokens.
     */
    private TokenList() {
        this.list = new int[16];
    }

    /**
     * Builds the path that starts at the given segment and runs to the end
     * of the last segment.
     *
     * @param from
     *            the index of the first segment to include
     *
     * @return the path from that segment onward, starting with '/'
     */
    public String segment(int from) {
        int remaining = (this.count / 2) - from;
        return this.segment(from, remaining);
    }

    /**
     * Builds the path that starts at the given segment and spans the given
     * number of segments. The string is taken straight from the buffer,
     * starting at the '/' in front of the first segment. If the span
     * reaches or passes the last segment the string ends where the last
     * segment ends. Indexes are not validated.
     *
     * @param from
     *            the index of the first segment to include
     * @param total
     *            the number of segments to include
     *
     * @return the path covering the requested segments
     */
    public String segment(int from, int total) {
        int segments = this.count / 2;
        int end;
        if ((from + total) < segments) {
            /* stop at the start of the first excluded segment */
            end = this.offset(from + total);
        } else {
            /* entry zero is the last path segment; one past its end */
            end = this.list[0] + this.list[1] + 1;
        }
        int begin = this.offset(from);
        /* begin - 1 is the '/' in front of the segment */
        return new String(PathParser.this.buf, begin - 1, end - begin);
    }

    /**
     * Looks up the buffer offset at which the given segment starts.
     * Entries are stored in reverse path order, so segment zero is the
     * pair that was added last.
     *
     * @param segment
     *            the index of the segment in path order
     *
     * @return the offset within the buffer where the segment starts
     */
    private int offset(int segment) {
        return this.list[(this.count - 2) - (segment * 2)];
    }

    /**
     * Appends a token to the list as an offset and length pair. The
     * backing array is doubled first if it is full.
     *
     * @param off
     *            the offset of the token within the buffer
     * @param len
     *            the number of characters within the token
     */
    public void add(int off, int len) {
        if (this.count >= this.list.length) {
            this.resize(this.count * 2);
        }
        this.list[this.count] = off;
        this.list[this.count + 1] = len;
        this.count += 2;
    }

    /**
     * Provides the tokens as strings in path order. The array is built on
     * the first call and the same array is returned after that, until the
     * list is cleared.
     *
     * @return the token strings in path order
     */
    public String[] list() {
        String[] result = this.cache;
        if (result == null) {
            result = this.build();
            this.cache = result;
        }
        return result;
    }

    /**
     * Creates the array of token strings. The pairs are held in reverse
     * path order, so the pair at position <code>k</code> is written to the
     * mirrored index of the result.
     *
     * @return the token strings in path order
     */
    private String[] build() {
        int total = this.count / 2;
        String[] value = new String[total];
        for (int k = 0; k < total; k++) {
            int off = this.list[k * 2];
            int size = this.list[(k * 2) + 1];
            value[(total - 1) - k] = new String(PathParser.this.buf, off, size);
        }
        return value;
    }

    /**
     * Removes all tokens and drops the cached strings so that nothing from
     * a previously parsed path remains visible. The backing array is kept
     * for reuse.
     */
    public void clear() {
        this.count = 0;
        this.cache = null;
    }

    /**
     * Replaces the backing array with one of the given size and copies the
     * ints that are in use across to it.
     *
     * @param size
     *            the capacity of the new backing array
     */
    private void resize(int size) {
        int[] larger = new int[size];
        System.arraycopy(this.list, 0, larger, 0, this.count);
        this.list = larger;
    }
}
}