/* * PathParser.java February 2001 * * Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.simpleframework.http.parse; import org.simpleframework.http.Path; import org.simpleframework.util.parse.Parser; /** * This is used to parse a path given as part of a URI. This will read the path, * normalize it, and break it up into its components. The normalization of the * path is the conversion of the path given into it's actual path by removing * the references to the parent directories and to the current dir. * <p> * If the path that this represents is <code>/usr/bin/../etc/./README</code> * then the actual path, normalized, is <code>/usr/etc/README</code>. Once the * path has been normalized it is possible to acquire the segments as an array * of strings, which allows simple manipulation of the path. * <p> * Although RFC 2396 defines the path within a URI to have parameters this does * not extract those parameters this will simply normalize the path and include * the path parameters in the path. If the path is to be converted into a OS * specific file system path that has the parameters extracted then the * <code>AddressParser</code> should be used. * * @author Niall Gallagher */ public class PathParser extends Parser implements Path { /** * Used to store the individual path segments. */ private TokenList list; /** * Used to store consumed name characters. */ private Token name; /** * Used to store consumed file extension. */ private Token ext; /** * Used to store the highest directory path. */ private Token dir; /** * Used to store consumed normalized path name. */ private Token path; /** * The default constructor will create a <code>PathParser</code> that * contains no specifics. The instance will return <code>null</code> for all * the get methods. The <code>PathParser</code>'s get methods may be * populated by using the parse method. */ public PathParser() { this.list = new TokenList(); this.ext = new Token(); this.dir = new Token(); this.path = new Token(); this.name = new Token(); } /** * This is primarily a convineance constructor. This will parse the * <code>String</code> given to extract the specifics. This could be achived * by calling the default no-arg constructor and then using the instance to * invoke the <code>parse</code> method on that <code>String</code> to * extract the parts. * * @param path * a <code>String</code> containing a path value */ public PathParser(String path) { this(); this.parse(path); } /** * This will parse the path in such a way that it ensures that at no stage * there are trailing back references, using path normalization. The need to * remove the back references is so that this <code>PathParser</code> will * create the same <code>String</code> path given a set of paths that have * different back references. For example the paths * <code>/path/../path</code> and <code>/path</code> are the same path but * different <code>String</code>'s. * <p> * This will NOT parse an immediate back reference as this signifies a path * that cannot exist. So a path such as <code>/../</code> will result in a * null for all methods. Paths such as <code>../bin</code> will not be * allowed. */ @Override protected void parse() { this.normalize(); this.path(); this.segments(); this.name(); this.extension(); } /** * This will initialize the parser so that it is in a ready state. This * allows the parser to be used to parse many paths. This will clear the * parse buffer objects and reset the offset to point to the start of the * char buffer. The count variable is reset by the <code>Parser.parse</code> * method. */ @Override protected void init() { this.list.clear(); this.ext.clear(); this.dir.clear(); this.name.clear(); this.path.clear(); this.off = 0; } /** * This will return the extension that the file name contains. For example a * file name <code>file.en_US.extension</code> will produce an extension of * <code>extension</code>. This will return null if the path contains no * file extension. * * @return this will return the extension this path contains */ @Override public String getExtension() { return this.ext.toString(); } /** * This will return the full name of the file without the path. As regargs * the definition of the path in RFC 2396 the name would be considered the * last path segment. So if the path was <code>/usr/README</code> the name * is <code>README</code>. Also for directorys the name of the directory in * the last path segment is returned. This returns the name without any of * the path parameters. As RFC 2396 defines the path to have path parameters * after the path segments. * * @return this will return the name of the file in the path */ @Override public String getName() { return this.name.toString(); } /** * This will return the normalized path. The normalized path is the path * without any references to its parent or itself. So if the path to be * parsed is <code>/usr/../etc/./</code> the path is <code>/etc/</code>. If * the path that this represents is a path with an immediate back reference * then this will return null. This is the path with all its information * even the parameter information if it was defined in the path. * * @return this returns the normalize path without <code>../</code> or * <code>./</code> */ @Override public String getPath() { return this.path.toString(); } /** * This will return the normalized path from the specified path segment. * This allows various path parts to be acquired in an efficient means what * does not require copy operations of the use of <code>substring</code> * invocations. Of particular interest is the extraction of context based * paths. This is the path with all its information even the parameter * information if it was defined in the path. * * @param from * this is the segment offset to get the path for * * @return this returns the normalize path without <code>../</code> or * <code>./</code> */ @Override public String getPath(int from) { return this.list.segment(from); } /** * This will return the normalized path from the specified path segment. * This allows various path parts to be acquired in an efficient means what * does not require copy operations of the use of <code>substring</code> * invocations. Of particular interest is the extraction of context based * paths. This is the path with all its information even the parameter * information if it was defined in the path. * * @param from * this is the segment offset to get the path for * @param count * this is the number of path segments to include * * @return this returns the normalize path without <code>../</code> or * <code>./</code> */ @Override public String getPath(int from, int count) { return this.list.segment(from, count); } /** * This will return the highest directory that exists within the path. This * is used to that files within the same path can be acquired. An example of * that this would do given the path <code>/pub/./bin/README</code> would be * to return the highest directory path <code>/pub/bin/</code>. The "/" * character will allways be the last character in the path. * * @return this method will return the highest directory */ @Override public String getDirectory() { return this.dir.toString(); } /** * This method is used to break the path into individual parts called * segments, see RFC 2396. This can be used as an easy way to compare paths * and to examine the directory tree that the path points to. For example, * if an path was broken from the string <code>/usr/bin/../etc</code> then * the segments returned would be <code>usr</code> and <code>etc</code> as * the path is normalized before the segments are extracted. * * @return return all the path segments within the directory */ @Override public String[] getSegments() { return this.list.list(); } /** * This will return the path as it is relative to the issued path. This in * effect will chop the start of this path if it's start matches the highest * directory of the given path as of <code>getDirectory</code>. This is * useful if paths that are relative to a specific location are required. To * illustrate what this method will do the following example is provided. If * this object represented the path string * <code>/usr/share/rfc/rfc2396.txt</code> and the issued path was * <code>/usr/share/text.txt</code> then this will return the path string * <code>/rfc/rfc2396.txt</code>. * * @param path * the path prefix to acquire a relative path * * @return returns a path relative to the one it is given otherwize this * method will return null */ @Override public String getRelative(String path) { return this.getRelative(new PathParser(path)); } /** * This is used by the <code>getRelative(String)</code> to normalize the * path string and determine if it contains a highest directory which is * shared with the path that is represented by this object. If the path has * leading back references, such as <code>../</code>, then the result of * this is null. The returned path begins with a '/'. * * @param path * the path prefix to acquire a relative path * * @return returns a path relative to the one it is given otherwize this * method will return null */ private String getRelative(PathParser path) { char[] text = path.buf; int off = path.dir.off; int len = path.dir.len; return this.getRelative(text, off, len); } /** * This will return the path as it is relative to the issued path. This in * effect will chop the start of this path if it's start matches the highest * directory of the given path as of <code>getDirectory</code>. This is * useful if paths that are relative to a specific location are required. To * illustrate what this method will do the following example is provided. If * this object represented the path string * <code>/usr/share/rfc/rfc2396.txt</code> and the issued path was * <code>/usr/share/text.txt</code> then this will return the path string * <code>/rfc/rfc2396.txt</code>. * * @param text * the path prefix to acquire a relative path * @param off * this is the offset within the text to read * @param len * this is the number of characters in the path * * @return returns a path relative to the one it is given otherwize this * method will return null */ private String getRelative(char[] text, int off, int len) { int size = (this.path.len - len) + 1; /* '/' */ int pos = (this.path.off + len) - 1; for (int i = 0; i < len; i++) { if (text[off++] != this.buf[this.path.off + i]) return null; } if (pos < 0) return null; return new String(this.buf, pos, size); } /** * This will extract the path of the given <code>String</code> after it has * been normalized. If the path can not be normalized then the count is set * to -1 and the path cannot be extracted. When this happens then the path * parameter is <code>null</code>. */ private void path() { if (this.count > 0) { this.path.len = this.count; this.path.off = 0; } } /** * This will simply read the characters from the end of the buffer until it * encounters the first peroid character. When this is read it will store * the file extension and remove the characters from the buffer. */ private void extension() { int pos = this.off + this.count; /* index.html[] */ int len = 0; while ((pos - 1) >= this.off) { /* index.htm[l] */ if (this.buf[--pos] == '.') { /* index[.]html */ this.ext.off = pos + 1; this.ext.len = len; this.count = pos; break; } len++; } } /** * This wil extract each individual segment from the path and also extract * the highest directory. The path segments are basically the strings * delimited by the '/' character of a normalized path. As well as * extracting the path segments this will also extract the directory of * path, that is, the the path up to the last occurance of the '/' * character. */ private void segments() { int pos = this.count - 1; int len = 1; if (this.count > 0) { if (this.buf[pos] == '/') { /* /pub/bin[/] */ this.dir.len = pos + 1; this.dir.off = 0; pos--; /* /pub/bi[n]/ */ } while (pos >= this.off) { if (this.buf[pos] == '/') { /* /pub[/]bin/ */ if (this.dir.len == 0) { this.dir.len = pos + 1; /* [/] is 0 */ this.dir.off = 0; } this.list.add(pos + 1, len - 1); len = 0; } len++; pos--; } } } /** * The normalization of the path is the conversion of the path given into * it's actual path by removing the references to the parent directorys and * to the current dir. So if the path given was * <code>/usr/bin/../etc/./README</code> then the actual path, the * normalized path, is <code>/usr/etc/README</code>. * <p> * This method ensures the if there are an illegal number of back references * that the path will be evaluated as empty. This can evaluate any path * configuration, this includes any references like <code>../</code> or * <code>/..</code> within the path. This will also remove empty segments * like <code>//</code>. */ private void normalize() { int size = this.count + this.off; int pos = this.off; for (this.off = this.count = 0; pos < size; pos++) { this.buf[this.count++] = this.buf[pos]; if (this.buf[pos] == '/') { if ((this.count - 1) > 0) { if (this.buf[this.count - 2] == '/') { this.count--; /* /[/]./path/ */ } } } else if (this.buf[pos] == '.') { /* //[.]/path/ */ if ((this.count - 1) > 0) { /* /[/]./path/ */ if (this.buf[this.count - 2] != '/') { continue; /* /path.[/] */ } } if ((pos + 2) > size) { /* /path/[.] */ this.count--; } else { if (this.buf[pos + 1] == '/') { /* /.[/]path */ pos++;/* /[/]. */ this.count--; /* /.[/]path */ } if (this.buf[pos] != '.') { /* /.[/]path */ continue; } if ((pos + 2) < size) { if (this.buf[pos + 2] != '/') { continue; /* /[.].path */ } } if ((this.count - 2) > 0) { for (this.count -= 2; (this.count - 1) > 0;) { /* * /path[/ * ].. */ if (this.buf[this.count - 1] == '/') { /* [/]path/.. */ break; } this.count--; } } else { /* /../ */ this.count = 0; this.off = 0; break; } pos += 2; /* /path/.[.]/ */ } } } } /** * This will extract the full name of the file without the path. As regards * the definition of the path in RFC 2396 the name would be considered the * last path segment. So if the path was <code>/usr/README</code> the name * is <code>README</code>. Also for directorys the name of the directory in * the last path segment is returned. This returns the name without any of * the path parameters. As RFC 2396 defines the path to have path parameters * after the path segments. So the path for the directory * "/usr/bin;param=value/;param=value" would result in the name "bin". If * the path given was "/" then there will be nothing in the buffer because * <code>extract</code> will have removed it. */ private void name() { int pos = this.count; int len = 0; while (pos-- > this.off) { /* /usr/bin/;para[m] */ if (this.buf[pos] == ';') { /* /usr/bin/[;]param */ if (this.buf[pos - 1] == '/') { /* /usr/bin[/];param */ pos--; /* /usr/bin[/];param */ } len = 0; /* /usr/bin[/] */ } else if (this.buf[pos] == '/') { /* /usr[/]bin */ this.off = pos + 1; /* /usr/[b]in */ this.count = len; /* [b]in */ break; } else { len++; } } this.name.len = this.count; this.name.off = this.off; } /** * This will return the normalized path. The normalized path is the path * without any references to its parent or itself. So if the path to be * parsed is <code>/usr/../etc/./</code> the path is <code>/etc/</code>. If * the path that this represents is a path with an immediate back reference * then this will return null. This is the path with all its information * even the parameter information if it was defined in the path. * * @return this returns the normalize path without <code>../</code> or * <code>./</code> */ @Override public String toString() { return this.getPath(); } /** * This is used so that the <code>PathParser</code> can speed up the parsing * of the data. Rather than using a buffer like a <code>ParseBuffer</code> * or worse a <code>StringBuffer</code> this just keeps an index into the * character array from the start and end of the token. Also this enables a * cache to be kept so that a <code>String</code> does not need to be made * again after the first time it is created. */ private class Token { /** * Provides a quick retrieval of the token value. */ public String value; /** * Offset within the buffer that the token starts. */ public int off; /** * Length of the region that the token consumes. */ public int len; /** * If the <code>Token</code> is to be reused this will clear all * previous data. Clearing the buffer allows it to be reused if there is * a new URI to be parsed. This ensures that a null is returned if the * token length is zero. */ public void clear() { this.value = null; this.len = 0; } /** * This method will convert the <code>Token</code> into it's * <code>String</code> equivelant. This will firstly check to see if * there is a value, for the string representation, if there is the * value is returned, otherwise the region is converted into a * <code>String</code> and returned. * * @return this returns a value representing the token */ @Override public String toString() { if (this.value != null) return this.value; if (this.len > 0) { this.value = new String(PathParser.this.buf, this.off, this.len); } return this.value; } } /** * The <code>TokenList</code> class is used to store a list of tokens. This * provides an <code>add</code> method which can be used to store an offset * and length of a token within the buffer. Once the tokens have been added * to they can be examined, in the order they were added, using the provided * <code>list</code> method. This has a scalable capacity. */ private class TokenList { /** * This is used to cache the segments that are created. */ private String[] cache; /** * Contains the offsets and lengths of the tokens. */ private int[] list; /** * Determines the write offset into the array. */ private int count; /** * Constructor for the <code>TokenList</code> is used to create a * scalable list to store tokens. The initial list is created with an * array of sixteen ints, which is enough to store eight tokens. */ private TokenList() { this.list = new int[16]; } /** * This is used to acquire the path from the segment that is specified. * This provides an efficient means to get the path without having to * perform expensive copy of substring operations. * * @param from * this is the path segment to get the path * * @return the string that is the path segment created */ public String segment(int from) { int total = this.count / 2; int left = total - from; return this.segment(from, left); } /** * This is used to acquire the path from the segment that is specified. * This provides an efficient means to get the path without having to * perform expensive copy of substring operations. * * @param from * this is the path segment to get the path * @param total * this is the number of segments to use * * @return the string that is the path segment created */ public String segment(int from, int total) { int last = this.list[0] + this.list[1] + 1; if ((from + total) < (this.count / 2)) { last = this.offset(from + total); } int start = this.offset(from); int length = last - start; return new String(PathParser.this.buf, start - 1, length); } /** * This is used to acquire the offset within the buffer of the specified * segment. This allows a path to be created that is constructed from a * given segment. * * @param segment * this is the segment offset to use * * @return this returns the offset start for the segment */ private int offset(int segment) { int last = this.count - 2; int shift = segment * 2; int index = last - shift; return this.list[index]; } /** * This is used to add a new token to the list. Tokens will be available * from the <code>list</code> method in the order it was added, so the * first to be added will at index zero and the last with be in the last * index. * * @param off * this is the read offset within the buffer * @param len * the number of characters within the token */ public void add(int off, int len) { if ((this.count + 1) > this.list.length) { this.resize(this.count * 2); } this.list[this.count++] = off; this.list[this.count++] = len; } /** * This is used to retrieve the list of tokens inserted to this list * using the <code>add</code> method. The indexes of the tokens * represents the order that the tokens were added to the list. * * @return returns an ordered list of token strings */ public String[] list() { if (this.cache == null) { this.cache = this.build(); } return this.cache; } /** * This is used to retrieve the list of tokens inserted to this list * using the <code>add</code> method. The indexes of the tokens * represents the order that the tokens were added to the list. * * @return returns an ordered list of token strings */ private String[] build() { String[] value = new String[this.count / 2]; for (int i = 0, j = this.count / 2; i < this.count; i += 2) { int index = j - (i / 2) - 1; int off = this.list[i]; int size = this.list[i + 1]; value[index] = new String(PathParser.this.buf, off, size); } return value; } /** * This is used to clear all tokens previously stored in the list. This * is required so that initialization of the parser with the * <code>init</code> method can ensure that there are no tokens from * previous data. */ public void clear() { this.cache = null; this.count = 0; } /** * Scales the internal array used should the number of tokens exceed the * initial capacity. This will just copy across the ints used to * represent the token. * * @param size * length the capacity is to increase to */ private void resize(int size) { int[] copy = new int[size]; System.arraycopy(this.list, 0, copy, 0, this.count); this.list = copy; } } }