/* Copyright (c) 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.gdata.util.common.base;
import static com.google.gdata.util.common.base.Preconditions.checkNotNull;
import java.io.IOException;
/**
* Utility functions for dealing with {@code CharEscaper}s, and some commonly
* used {@code CharEscaper} instances.
*
*
*
*/
public final class CharEscapers {
// Not instantiable: this class only exposes static factory methods.
private CharEscapers() {}
// For each xxxEscaper method, please add links to external
// reference pages that we consider authoritative for what
// that escaper should exactly be doing.
/**
 * Identity escaper: every input is handed back unchanged, while the null
 * checks demanded by the {@code CharEscaper} contract are still enforced.
 */
private static final CharEscaper NULL_ESCAPER = new CharEscaper() {
  @Override
  protected char[] escape(char c) {
    // A null result means "no escaping for this character".
    return null;
  }

  @Override
  public String escape(String string) {
    checkNotNull(string);
    return string;
  }

  @Override
  public Appendable escape(final Appendable out) {
    checkNotNull(out);
    // Returning 'out' itself is not an option: the CharEscaper contract says
    // the returned Appendable throws a NullPointerException when asked to
    // append null, so 'out' is wrapped in a null-rejecting delegate.
    return new Appendable() {
      public Appendable append(char c) throws IOException {
        out.append(c);
        return this;
      }

      public Appendable append(CharSequence text) throws IOException {
        checkNotNull(text);
        out.append(text);
        return this;
      }

      public Appendable append(CharSequence text, int start, int end)
          throws IOException {
        checkNotNull(text);
        out.append(text, start, end);
        return this;
      }
    };
  }
};
/**
* Returns a {@link CharEscaper} that does no escaping: strings passed to it
* come back unchanged.
*
* @return the shared no-op escaper instance
*/
public static CharEscaper nullEscaper() {
return NULL_ESCAPER;
}
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in an XML document in either element
* content or attribute values.
*
* <p><b>Note</b>: silently removes null characters and non-whitespace control
* characters, as there is no way to represent them in XML.
*
* @return the shared XML escaper, usable for element content and attribute
*     values
*/
public static CharEscaper xmlEscaper() {
return XML_ESCAPER;
}
/**
* Escapes special characters from a string so it can safely be included in an
* XML document in either element content or attribute values. Also removes
* null-characters and control characters, as there is no way to represent
* them in XML.
*/
private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder()
.addEscape('"', """)
.addEscape('\'', "'")
.toEscaper();
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in an XML document in element content.
*
* <p><b>Note</b>: double and single quotes are not escaped, so it is not
* safe to use this escaper to escape attribute values. Use the
* {@link #xmlEscaper()} escaper to escape attribute values or if you are
* unsure. Also silently removes non-whitespace control characters, as there
* is no way to represent them in XML.
*
* @return the shared XML escaper for element content only
*/
public static CharEscaper xmlContentEscaper() {
return XML_CONTENT_ESCAPER;
}
/**
* Escapes special characters from a string so it can safely be included in an
* XML document in element content. Note that quotes are <em>not</em>
* escaped, so <em>this is not safe for use in attribute values</em>. Use
* {@link #XML_ESCAPER} for attribute values, or if you are unsure. Also
* removes non-whitespace control characters, as there is no way to represent
* them in XML.
*
* <p>Built from the basic XML builder without any additional rules.
*/
private static final CharEscaper XML_CONTENT_ESCAPER =
newBasicXmlEscapeBuilder().toEscaper();
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in an HTML document in either element
* content or attribute values.
*
* <p><b>Note</b>: alters non-ASCII characters. Every character above
* {@code '~'} (which includes the DEL control character) is replaced, either
* by the entity registered for it or by a decimal numeric character
* reference.
*
* <p>The entity list was taken from the
* <a href="http://www.w3.org/TR/html4/sgml/entities.html">HTML 4 character
* entity references</a>.
*
* @return the shared HTML escaper held by {@code HtmlEscaperHolder}
*/
public static CharEscaper htmlEscaper() {
return HtmlEscaperHolder.HTML_ESCAPER;
}
/**
* A lazy initialization holder for HTML_ESCAPER.
*/
private static class HtmlEscaperHolder {
private static final CharEscaper HTML_ESCAPER
= new HtmlCharEscaper(new CharEscaperBuilder()
.addEscape('"', """)
.addEscape('\'', "'")
.addEscape('&', "&")
.addEscape('<', "<")
.addEscape('>', ">")
.addEscape('\u00A0', " ")
.addEscape('\u00A1', "¡")
.addEscape('\u00A2', "¢")
.addEscape('\u00A3', "£")
.addEscape('\u00A4', "¤")
.addEscape('\u00A5', "¥")
.addEscape('\u00A6', "¦")
.addEscape('\u00A7', "§")
.addEscape('\u00A8', "¨")
.addEscape('\u00A9', "©")
.addEscape('\u00AA', "ª")
.addEscape('\u00AB', "«")
.addEscape('\u00AC', "¬")
.addEscape('\u00AD', "")
.addEscape('\u00AE', "®")
.addEscape('\u00AF', "¯")
.addEscape('\u00B0', "°")
.addEscape('\u00B1', "±")
.addEscape('\u00B2', "²")
.addEscape('\u00B3', "³")
.addEscape('\u00B4', "´")
.addEscape('\u00B5', "µ")
.addEscape('\u00B6', "¶")
.addEscape('\u00B7', "·")
.addEscape('\u00B8', "¸")
.addEscape('\u00B9', "¹")
.addEscape('\u00BA', "º")
.addEscape('\u00BB', "»")
.addEscape('\u00BC', "¼")
.addEscape('\u00BD', "½")
.addEscape('\u00BE', "¾")
.addEscape('\u00BF', "¿")
.addEscape('\u00C0', "À")
.addEscape('\u00C1', "Á")
.addEscape('\u00C2', "Â")
.addEscape('\u00C3', "Ã")
.addEscape('\u00C4', "Ä")
.addEscape('\u00C5', "Å")
.addEscape('\u00C6', "Æ")
.addEscape('\u00C7', "Ç")
.addEscape('\u00C8', "È")
.addEscape('\u00C9', "É")
.addEscape('\u00CA', "Ê")
.addEscape('\u00CB', "Ë")
.addEscape('\u00CC', "Ì")
.addEscape('\u00CD', "Í")
.addEscape('\u00CE', "Î")
.addEscape('\u00CF', "Ï")
.addEscape('\u00D0', "Ð")
.addEscape('\u00D1', "Ñ")
.addEscape('\u00D2', "Ò")
.addEscape('\u00D3', "Ó")
.addEscape('\u00D4', "Ô")
.addEscape('\u00D5', "Õ")
.addEscape('\u00D6', "Ö")
.addEscape('\u00D7', "×")
.addEscape('\u00D8', "Ø")
.addEscape('\u00D9', "Ù")
.addEscape('\u00DA', "Ú")
.addEscape('\u00DB', "Û")
.addEscape('\u00DC', "Ü")
.addEscape('\u00DD', "Ý")
.addEscape('\u00DE', "Þ")
.addEscape('\u00DF', "ß")
.addEscape('\u00E0', "à")
.addEscape('\u00E1', "á")
.addEscape('\u00E2', "â")
.addEscape('\u00E3', "ã")
.addEscape('\u00E4', "ä")
.addEscape('\u00E5', "å")
.addEscape('\u00E6', "æ")
.addEscape('\u00E7', "ç")
.addEscape('\u00E8', "è")
.addEscape('\u00E9', "é")
.addEscape('\u00EA', "ê")
.addEscape('\u00EB', "ë")
.addEscape('\u00EC', "ì")
.addEscape('\u00ED', "í")
.addEscape('\u00EE', "î")
.addEscape('\u00EF', "ï")
.addEscape('\u00F0', "ð")
.addEscape('\u00F1', "ñ")
.addEscape('\u00F2', "ò")
.addEscape('\u00F3', "ó")
.addEscape('\u00F4', "ô")
.addEscape('\u00F5', "õ")
.addEscape('\u00F6', "ö")
.addEscape('\u00F7', "÷")
.addEscape('\u00F8', "ø")
.addEscape('\u00F9', "ù")
.addEscape('\u00FA', "ú")
.addEscape('\u00FB', "û")
.addEscape('\u00FC', "ü")
.addEscape('\u00FD', "ý")
.addEscape('\u00FE', "þ")
.addEscape('\u00FF', "ÿ")
.addEscape('\u0152', "Œ")
.addEscape('\u0153', "œ")
.addEscape('\u0160', "Š")
.addEscape('\u0161', "š")
.addEscape('\u0178', "Ÿ")
.addEscape('\u0192', "ƒ")
.addEscape('\u02C6', "ˆ")
.addEscape('\u02DC', "˜")
.addEscape('\u0391', "Α")
.addEscape('\u0392', "Β")
.addEscape('\u0393', "Γ")
.addEscape('\u0394', "Δ")
.addEscape('\u0395', "Ε")
.addEscape('\u0396', "Ζ")
.addEscape('\u0397', "Η")
.addEscape('\u0398', "Θ")
.addEscape('\u0399', "Ι")
.addEscape('\u039A', "Κ")
.addEscape('\u039B', "Λ")
.addEscape('\u039C', "Μ")
.addEscape('\u039D', "Ν")
.addEscape('\u039E', "Ξ")
.addEscape('\u039F', "Ο")
.addEscape('\u03A0', "Π")
.addEscape('\u03A1', "Ρ")
.addEscape('\u03A3', "Σ")
.addEscape('\u03A4', "Τ")
.addEscape('\u03A5', "Υ")
.addEscape('\u03A6', "Φ")
.addEscape('\u03A7', "Χ")
.addEscape('\u03A8', "Ψ")
.addEscape('\u03A9', "Ω")
.addEscape('\u03B1', "α")
.addEscape('\u03B2', "β")
.addEscape('\u03B3', "γ")
.addEscape('\u03B4', "δ")
.addEscape('\u03B5', "ε")
.addEscape('\u03B6', "ζ")
.addEscape('\u03B7', "η")
.addEscape('\u03B8', "θ")
.addEscape('\u03B9', "ι")
.addEscape('\u03BA', "κ")
.addEscape('\u03BB', "λ")
.addEscape('\u03BC', "μ")
.addEscape('\u03BD', "ν")
.addEscape('\u03BE', "ξ")
.addEscape('\u03BF', "ο")
.addEscape('\u03C0', "π")
.addEscape('\u03C1', "ρ")
.addEscape('\u03C2', "ς")
.addEscape('\u03C3', "σ")
.addEscape('\u03C4', "τ")
.addEscape('\u03C5', "υ")
.addEscape('\u03C6', "φ")
.addEscape('\u03C7', "χ")
.addEscape('\u03C8', "ψ")
.addEscape('\u03C9', "ω")
.addEscape('\u03D1', "ϑ")
.addEscape('\u03D2', "ϒ")
.addEscape('\u03D6', "ϖ")
.addEscape('\u2002', " ")
.addEscape('\u2003', " ")
.addEscape('\u2009', " ")
.addEscape('\u200C', "")
.addEscape('\u200D', "")
.addEscape('\u200E', "")
.addEscape('\u200F', "")
.addEscape('\u2013', "–")
.addEscape('\u2014', "—")
.addEscape('\u2018', "‘")
.addEscape('\u2019', "’")
.addEscape('\u201A', "‚")
.addEscape('\u201C', "“")
.addEscape('\u201D', "”")
.addEscape('\u201E', "„")
.addEscape('\u2020', "†")
.addEscape('\u2021', "‡")
.addEscape('\u2022', "•")
.addEscape('\u2026', "…")
.addEscape('\u2030', "‰")
.addEscape('\u2032', "′")
.addEscape('\u2033', "″")
.addEscape('\u2039', "‹")
.addEscape('\u203A', "›")
.addEscape('\u203E', "‾")
.addEscape('\u2044', "⁄")
.addEscape('\u20AC', "€")
.addEscape('\u2111', "ℑ")
.addEscape('\u2118', "℘")
.addEscape('\u211C', "ℜ")
.addEscape('\u2122', "™")
.addEscape('\u2135', "ℵ")
.addEscape('\u2190', "←")
.addEscape('\u2191', "↑")
.addEscape('\u2192', "→")
.addEscape('\u2193', "↓")
.addEscape('\u2194', "↔")
.addEscape('\u21B5', "↵")
.addEscape('\u21D0', "⇐")
.addEscape('\u21D1', "⇑")
.addEscape('\u21D2', "⇒")
.addEscape('\u21D3', "⇓")
.addEscape('\u21D4', "⇔")
.addEscape('\u2200', "∀")
.addEscape('\u2202', "∂")
.addEscape('\u2203', "∃")
.addEscape('\u2205', "∅")
.addEscape('\u2207', "∇")
.addEscape('\u2208', "∈")
.addEscape('\u2209', "∉")
.addEscape('\u220B', "∋")
.addEscape('\u220F', "∏")
.addEscape('\u2211', "∑")
.addEscape('\u2212', "−")
.addEscape('\u2217', "∗")
.addEscape('\u221A', "√")
.addEscape('\u221D', "∝")
.addEscape('\u221E', "∞")
.addEscape('\u2220', "∠")
.addEscape('\u2227', "∧")
.addEscape('\u2228', "∨")
.addEscape('\u2229', "∩")
.addEscape('\u222A', "∪")
.addEscape('\u222B', "∫")
.addEscape('\u2234', "∴")
.addEscape('\u223C', "∼")
.addEscape('\u2245', "≅")
.addEscape('\u2248', "≈")
.addEscape('\u2260', "≠")
.addEscape('\u2261', "≡")
.addEscape('\u2264', "≤")
.addEscape('\u2265', "≥")
.addEscape('\u2282', "⊂")
.addEscape('\u2283', "⊃")
.addEscape('\u2284', "⊄")
.addEscape('\u2286', "⊆")
.addEscape('\u2287', "⊇")
.addEscape('\u2295', "⊕")
.addEscape('\u2297', "⊗")
.addEscape('\u22A5', "⊥")
.addEscape('\u22C5', "⋅")
.addEscape('\u2308', "⌈")
.addEscape('\u2309', "⌉")
.addEscape('\u230A', "⌊")
.addEscape('\u230B', "⌋")
.addEscape('\u2329', "〈")
.addEscape('\u232A', "〉")
.addEscape('\u25CA', "◊")
.addEscape('\u2660', "♠")
.addEscape('\u2663', "♣")
.addEscape('\u2665', "♥")
.addEscape('\u2666', "♦")
.toArray());
}
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in an HTML document in either element
* content or attribute values.
*
* <p><b>Note</b>: does not alter non-ASCII and control characters.
*
* @return the shared escaper that only rewrites the five markup-significant
*     ASCII characters (double quote, single quote, ampersand, less-than and
*     greater-than)
*/
public static CharEscaper asciiHtmlEscaper() {
return ASCII_HTML_ESCAPER;
}
/**
* Escapes special characters from a string so it can safely be included in an
* HTML document in either element content or attribute values. Does
* <em>not</em> alter non-ASCII characters or control characters.
*/
private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder()
.addEscape('"', """)
.addEscape('\'', "'")
.addEscape('&', "&")
.addEscape('<', "<")
.addEscape('>', ">")
.toEscaper();
/**
* Returns an {@link Escaper} instance that escapes Java chars so they can be
* safely included in URIs. For details on escaping URIs, see section 2.4 of
* <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
*
* <p>When encoding a String, the following rules apply:
* <ul>
* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
* through "9" remain the same.
* <li>The special characters ".", "-", "*", and "_" remain the same.
* <li>The space character " " is converted into a plus sign "+".
* <li>All other characters are converted into one or more bytes using UTF-8
* encoding and each byte is then represented by the 3-character string
* "%XY", where "XY" is the two-digit, uppercase, hexadecimal
* representation of the byte value.
* </ul>
*
* <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
* hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
* RFC 3986</a>:<br>
* <i>"URI producers and normalizers should use uppercase hexadecimal digits
* for all percent-encodings."</i>
*
* <p>This escaper has identical behavior to (but is potentially much faster
* than):
* <ul>
* <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String)}
* <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String,String)}
* with the encoding name "UTF-8"
* <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String)}
* <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String,java.nio.charset.Charset)}
* with the UTF_8 Charset
* <li>{@link java.net.URLEncoder#encode(String, String)}
* with the encoding name "UTF-8"
* </ul>
*
* <p>This method is equivalent to {@code uriEscaper(true)}.
*
* @return the shared URI escaper that encodes spaces as "+"
*/
public static Escaper uriEscaper() {
return uriEscaper(true);
}
/**
* Returns an {@link Escaper} instance that escapes Java chars so they can be
* safely included in URI path segments. For details on escaping URIs, see
* section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
*
* <p>When encoding a String, the following rules apply:
* <ul>
* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
* through "9" remain the same.
* <li>The unreserved characters ".", "-", "~", and "_" remain the same.
* <li>The general delimiters "@" and ":" remain the same.
* <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", ",", ";",
* and "=" remain the same.
* <li>The space character " " is converted into %20.
* <li>All other characters are converted into one or more bytes using UTF-8
* encoding and each byte is then represented by the 3-character string
* "%XY", where "XY" is the two-digit, uppercase, hexadecimal
* representation of the byte value.
* </ul>
*
* <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
* hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
* RFC 3986</a>:<br>
* <i>"URI producers and normalizers should use uppercase hexadecimal digits
* for all percent-encodings."</i>
*
* @return the shared escaper for URI path segments
*/
public static Escaper uriPathEscaper() {
return URI_PATH_ESCAPER;
}
/**
* Returns an {@link Escaper} instance that escapes Java chars so they can be
* safely included in URI query string segments. When the query string
* consists of a sequence of name=value pairs separated by &amp;, the names
* and values should be individually encoded. If you escape an entire query
* string in one pass with this escaper, then the "=" and "&amp;" characters
* used as separators will also be escaped.
*
* <p>This escaper is also suitable for escaping fragment identifiers.
*
* <p>For details on escaping URIs, see
* section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
*
* <p>When encoding a String, the following rules apply:
* <ul>
* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
* through "9" remain the same.
* <li>The unreserved characters ".", "-", "~", and "_" remain the same.
* <li>The general delimiters "@" and ":" remain the same.
* <li>The path delimiters "/" and "?" remain the same.
* <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";"
* remain the same.
* <li>The space character " " is converted into %20.
* <li>The equals sign "=" is converted into %3D.
* <li>The ampersand "&amp;" is converted into %26.
* <li>All other characters are converted into one or more bytes using UTF-8
* encoding and each byte is then represented by the 3-character string
* "%XY", where "XY" is the two-digit, uppercase, hexadecimal
* representation of the byte value.
* </ul>
*
* <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
* hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
* RFC 3986</a>:<br>
* <i>"URI producers and normalizers should use uppercase hexadecimal digits
* for all percent-encodings."</i>
*
* @return the shared escaper for URI query string segments and fragments
*/
public static Escaper uriQueryStringEscaper() {
return URI_QUERY_STRING_ESCAPER;
}
/**
 * Returns an {@link Escaper} instance that makes Java characters safe for
 * inclusion in URIs, with a selectable encoding for the space character. See
 * section 2.4 of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
 * for background on URI escaping.
 *
 * <p>Encoding rules:
 * <ul>
 * <li>"a" through "z", "A" through "Z" and "0" through "9" are left as they
 *     are.
 * <li>The special characters ".", "-", "*", and "_" are left as they are.
 * <li>A space " " becomes a plus sign "+" when {@code plusForSpace} is
 *     {@code true}, and "%20" otherwise.
 * <li>Every other character is encoded as UTF-8, and each resulting byte is
 *     written as the 3-character string "%XY", where "XY" is the two-digit,
 *     uppercase, hexadecimal value of the byte.
 * </ul>
 *
 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
 * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
 * RFC 3986</a>:<br>
 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
 * for all percent-encodings."</i>
 *
 * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
 *     it is escaped to {@code %20}. Although common, the escaping of spaces
 *     as plus signs has a very ambiguous status in the relevant
 *     specifications. You should prefer {@code %20} unless you are doing
 *     exact character-by-character comparisons of URLs and backwards
 *     compatibility requires you to use plus signs.
 * @return the shared escaper matching the requested space handling
 *
 * @see #uriEscaper()
 */
public static Escaper uriEscaper(boolean plusForSpace) {
  if (plusForSpace) {
    return URI_ESCAPER;
  }
  return URI_ESCAPER_NO_PLUS;
}
// Shared percent-escaper instances behind the uriXxxEscaper() accessors.
// The boolean constructor argument appears to be the plusForSpace flag
// (true: space becomes "+"; false: space becomes "%20"), judging from how
// uriEscaper(boolean) chooses between the first two -- confirm against
// PercentEscaper.

// General URI escaper, spaces encoded as "+".
private static final Escaper URI_ESCAPER =
new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true);
// General URI escaper, same safe characters but without "+" for spaces.
private static final Escaper URI_ESCAPER_NO_PLUS =
new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false);
// Escaper for URI path segments.
private static final Escaper URI_PATH_ESCAPER =
new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false);
// Escaper for URI query string segments and fragment identifiers.
private static final Escaper URI_QUERY_STRING_ESCAPER =
new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false);
/**
* Returns a {@link Escaper} instance that escapes Java characters in a manner
* compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape}
* set).
*
* <p>When encoding a String, the following rules apply:
* <ul>
* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
* through "9" remain the same.
* <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/"
* and ":" remain the same.
* <li>The space character " " is converted into a plus sign "+".
* <li>All other characters are converted into one or more bytes using UTF-8
* encoding and each byte is then represented by the 3-character string
* "%XY", where "XY" is the two-digit, uppercase, hexadecimal
* representation of the byte value.
* </ul>
*
* <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
* hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
* RFC 3986</a>:<br>
* <i>"URI producers and normalizers should use uppercase hexadecimal digits
* for all percent-encodings."</i>
*
* <p><b>Note</b>: This escaper is a special case and is <em>not
* compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt">
* RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is
* only provided for certain limited use cases and you should favor using
* {@link #uriEscaper()} whenever possible.
*
* @return the shared C++-compatible URI escaper
*/
public static Escaper cppUriEscaper() {
return CPP_URI_ESCAPER;
}
// Safe-character set based on comments from FastURLEncoder:
// These octets mimic the ones escaped by the C++ webutil/url URL class --
// the kGoogle1Escape set.
// To produce the same escaping as C++, use this set with the plusForSpace
// option (hence the 'true' argument below).
// WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here.
private static final Escaper CPP_URI_ESCAPER =
new PercentEscaper("!()*-._~,/:", true);
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in a Java string literal.
*
* <p><b>Note</b>: does not escape single quotes, so use the escaper
* returned by {@link #javaCharEscaper()} if you are generating char
* literals or if you are unsure.
*
* @return the shared escaper for Java string literals
*/
public static CharEscaper javaStringEscaper() {
return JAVA_STRING_ESCAPER;
}
/**
* Escapes special characters from a string so it can safely be included in a
* Java string literal. Does <em>not</em> escape single-quotes, so use
* {@link #JAVA_CHAR_ESCAPER} if you are generating char literals, or if you
* are unsure.
*
* <p>The table below covers the characters that have a dedicated backslash
* escape. Any other character outside the printable ASCII range is octal or
* Unicode escaped by {@code JavaCharEscaper}.
*/
private static final CharEscaper JAVA_STRING_ESCAPER
= new JavaCharEscaper(new CharEscaperBuilder()
.addEscape('\b', "\\b")
.addEscape('\f', "\\f")
.addEscape('\n', "\\n")
.addEscape('\r', "\\r")
.addEscape('\t', "\\t")
.addEscape('\"', "\\\"")
.addEscape('\\', "\\\\")
.toArray());
/**
* Returns a {@link CharEscaper} instance that escapes special characters in a
* string so it can safely be included in a Java char or string literal. The
* behavior of this escaper is the same as that of the
* {@link #javaStringEscaper()}, except it also escapes single quotes.
*
* @return the shared escaper for Java char and string literals
*/
public static CharEscaper javaCharEscaper() {
return JAVA_CHAR_ESCAPER;
}
/**
* Escapes special characters from a string so it can safely be included in a
* Java char literal or string literal.
*
* <p>The table below covers the characters that have a dedicated backslash
* escape. Any other character outside the printable ASCII range is octal or
* Unicode escaped by {@code JavaCharEscaper}.
*
* <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes
* single quotes.
*/
private static final CharEscaper JAVA_CHAR_ESCAPER
= new JavaCharEscaper(new CharEscaperBuilder()
.addEscape('\b', "\\b")
.addEscape('\f', "\\f")
.addEscape('\n', "\\n")
.addEscape('\r', "\\r")
.addEscape('\t', "\\t")
.addEscape('\'', "\\'")
.addEscape('\"', "\\\"")
.addEscape('\\', "\\\\")
.toArray());
/**
* Returns a {@link CharEscaper} instance that replaces non-ASCII characters
* in a string with their Unicode escape sequences ({@code \\uxxxx} where
* {@code xxxx} is a hex number). Existing escape sequences won't be affected.
*
* @return the shared escaper that rewrites only characters above 127
*/
public static CharEscaper javaStringUnicodeEscaper() {
return JAVA_STRING_UNICODE_ESCAPER;
}
/**
 * Replaces every non-ASCII character (code above 127) with its Unicode
 * escape sequence {@code \\uxxxx}, where {@code xxxx} is the four-digit
 * lowercase hex value of the character. ASCII characters, and therefore any
 * escape sequences already present in the input, pass through untouched.
 */
private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER
    = new CharEscaper() {
      @Override protected char[] escape(char c) {
        if (c > 127) {
          // Backslash, 'u', then the four nibbles from most to least
          // significant.
          return new char[] {
            '\\',
            'u',
            HEX_DIGITS[(c >>> 12) & 15],
            HEX_DIGITS[(c >>> 8) & 15],
            HEX_DIGITS[(c >>> 4) & 15],
            HEX_DIGITS[c & 15]
          };
        }
        // ASCII: no escaping needed.
        return null;
      }
    };
/**
* Returns a {@link CharEscaper} instance that escapes special characters from
* a string so it can safely be included in a Python string literal. Does not
* have any special handling for non-ASCII characters.
*
* @return the shared escaper for Python string literals
*/
public static CharEscaper pythonEscaper() {
return PYTHON_ESCAPER;
}
/**
* Escapes special characters in a string so it can safely be included in a
* Python string literal. Does not have any special handling for non-ASCII
* characters.
*
* <p>Newline, carriage return and tab are written as their backslash
* escapes; backslash and both quote characters are prefixed with a backslash.
*/
private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
.addEscape('\n', "\\n")
.addEscape('\r', "\\r")
.addEscape('\t', "\\t")
.addEscape('\\', "\\\\")
.addEscape('\"', "\\\"")
.addEscape('\'', "\\\'")
.toEscaper();
/**
* Returns a {@link CharEscaper} instance that escapes non-ASCII characters in
* a string so it can safely be included in a Javascript string literal.
* Non-ASCII characters are replaced with their ASCII javascript escape
* sequences (e.g., \\uhhhh or \xhh).
*
* @return the shared escaper for Javascript string literals
*/
public static CharEscaper javascriptEscaper() {
return JAVASCRIPT_ESCAPER;
}
/**
* {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
* characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
*
* <p>The table below additionally hex-escapes the quote characters and the
* characters {@code <}, {@code =}, {@code >} and {@code &}, and uses the
* short backslash escapes for backspace, tab, newline, form feed, carriage
* return and backslash.
*/
private static final CharEscaper JAVASCRIPT_ESCAPER
= new JavascriptCharEscaper(new CharEscaperBuilder()
.addEscape('\'', "\\x27")
.addEscape('"', "\\x22")
.addEscape('<', "\\x3c")
.addEscape('=', "\\x3d")
.addEscape('>', "\\x3e")
.addEscape('&', "\\x26")
.addEscape('\b', "\\b")
.addEscape('\t', "\\t")
.addEscape('\n', "\\n")
.addEscape('\f', "\\f")
.addEscape('\r', "\\r")
.addEscape('\\', "\\\\")
.toArray());
/**
 * Creates a builder preloaded with the escaping rules shared by all XML
 * escapers in this class.
 *
 * <ul>
 * <li>{@code &}, {@code <} and {@code >} are replaced by the predefined
 *     entity references {@code &amp;}, {@code &lt;} and {@code &gt;}.
 * <li>The control characters 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F are mapped
 *     to the empty string, i.e. removed. Tab (0x09), line feed (0x0A) and
 *     carriage return (0x0D) are deliberately not in the list and pass
 *     through.
 * </ul>
 *
 * @return a fresh builder that callers may extend with further rules
 */
private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
  return new CharEscaperBuilder()
      .addEscape('&', "&amp;")
      .addEscape('<', "&lt;")
      .addEscape('>', "&gt;")
      .addEscapes(new char[] {
          '\000', '\001', '\002', '\003', '\004',
          '\005', '\006', '\007', '\010', '\013',
          '\014', '\016', '\017', '\020', '\021',
          '\022', '\023', '\024', '\025', '\026',
          '\027', '\030', '\031', '\032', '\033',
          '\034', '\035', '\036', '\037'}, "");
}
/**
* Returns a composite {@link CharEscaper} instance that tries to escape
* characters using a primary {@code CharEscaper} first and falls back to a
* secondary one if there is no escaping.
*
* <p>The returned escaper will attempt to escape each character using the
* primary escaper, and if the primary escaper has no escaping for that
* character, it will use the secondary escaper. If the secondary escaper has
* no escaping for a character either, the original character will be used.
* If the primary escaper has an escape for a character, the secondary escaper
* will not be used at all for that character; the escaped output of the
* primary is not run through the secondary. For a case where you would like
* to first escape with one escaper, and then with another, it is recommended
* that you call each escaper in order.
*
* @param primary The primary {@code CharEscaper} to use
* @param secondary The secondary {@code CharEscaper} to use if the first one
* has no escaping rule for a character
* @return a new composite escaper wrapping {@code primary} and
* {@code secondary}
* @throws NullPointerException if any of the arguments is null
*/
public static CharEscaper fallThrough(CharEscaper primary,
CharEscaper secondary) {
checkNotNull(primary);
checkNotNull(secondary);
return new FallThroughCharEscaper(primary, secondary);
}
/**
 * Base class for fast, table-driven {@link CharEscaper}s. It combines an
 * array of replacement character sequences, indexed by character value, with
 * an inclusive range of characters that are considered safe.
 *
 * <p>{@link #escape(String)} is overridden so that strings needing no
 * escaping are returned without entering the slow path. According to the
 * original author's rough benchmarking this almost doubles the speed for such
 * strings, provided the escape test itself is efficient.
 */
private static abstract class FastCharEscaper extends CharEscaper {
  /** Replacement for character {@code c} at index {@code c}, or null. */
  protected final char[][] replacements;
  /** Cached {@code replacements.length}. */
  protected final int replacementLength;
  /** Lowest character of the safe range (inclusive). */
  protected final char safeMin;
  /** Highest character of the safe range (inclusive). */
  protected final char safeMax;

  public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
    this.replacements = replacements;
    this.replacementLength = replacements.length;
    this.safeMin = safeMin;
    this.safeMax = safeMax;
  }

  /**
   * Scans for the first character that needs escaping and hands over to the
   * slow path from there; returns {@code s} itself when nothing needs
   * escaping. Overridden for performance (see {@link FastCharEscaper}).
   */
  @Override public String escape(String s) {
    final int length = s.length();
    int position = 0;
    while (position < length) {
      final char current = s.charAt(position);
      final boolean hasReplacement =
          current < replacementLength && replacements[current] != null;
      final boolean outsideSafeRange = current < safeMin || current > safeMax;
      if (hasReplacement || outsideSafeRange) {
        return escapeSlow(s, position);
      }
      position++;
    }
    return s;
  }
}
/**
 * Escaper for Java source literals. A replacement table handles the
 * characters with dedicated escapes; everything else outside the printable
 * ASCII range {@code ' '} to {@code '~'} is computed on the fly. The table
 * lookup is done inline rather than through a decorating escaper in order to
 * keep this path as fast as possible.
 */
private static class JavaCharEscaper extends FastCharEscaper {
  public JavaCharEscaper(char[][] replacements) {
    super(replacements, ' ', '~');
  }

  @Override protected char[] escape(char c) {
    // The table wins whenever it has an entry for this character.
    final char[] fromTable = (c < replacementLength) ? replacements[c] : null;
    if (fromTable != null) {
      return fromTable;
    }
    // Printable ASCII is emitted as it is.
    if (c >= safeMin && c <= safeMax) {
      return null;
    }
    if (c <= 0xFF) {
      // Three-digit octal escape, same as String.format("\\%03o", (int) c).
      return new char[] {
        '\\',
        HEX_DIGITS[(c >>> 6) & 7],
        HEX_DIGITS[(c >>> 3) & 7],
        HEX_DIGITS[c & 7]
      };
    }
    // Four-digit hex escape, same as String.format("\\u%04x", (int) c).
    return new char[] {
      '\\',
      'u',
      HEX_DIGITS[(c >>> 12) & 15],
      HEX_DIGITS[(c >>> 8) & 15],
      HEX_DIGITS[(c >>> 4) & 15],
      HEX_DIGITS[c & 15]
    };
  }
}
/**
 * Escaper for Javascript string literals. A replacement table handles the
 * characters with dedicated escapes; everything else outside the printable
 * ASCII range {@code ' '} to {@code '~'} is computed on the fly. The table
 * lookup is done inline rather than through a decorating escaper in order to
 * keep this path as fast as possible.
 */
private static class JavascriptCharEscaper extends FastCharEscaper {
  public JavascriptCharEscaper(char[][] replacements) {
    super(replacements, ' ', '~');
  }

  @Override protected char[] escape(char c) {
    // The table wins whenever it has an entry for this character.
    final char[] fromTable = (c < replacementLength) ? replacements[c] : null;
    if (fromTable != null) {
      return fromTable;
    }
    // Printable ASCII is emitted as it is.
    if (c >= safeMin && c <= safeMax) {
      return null;
    }
    if (c < 0x100) {
      // Characters below 0x100 fit the short two-digit hex form.
      return new char[] {
        '\\',
        'x',
        HEX_DIGITS[(c >>> 4) & 0xf],
        HEX_DIGITS[c & 0xf]
      };
    }
    // Everything else needs the four-digit hex form.
    return new char[] {
      '\\',
      'u',
      HEX_DIGITS[(c >>> 12) & 0xf],
      HEX_DIGITS[(c >>> 8) & 0xf],
      HEX_DIGITS[(c >>> 4) & 0xf],
      HEX_DIGITS[c & 0xf]
    };
  }
}
/**
 * Escaper for HTML. A replacement table supplies the known entities; any
 * other character above {@code '~'} is turned into a decimal numeric
 * character reference computed on the fly. The table lookup is done inline
 * rather than through a decorating escaper in order to keep this path as
 * fast as possible.
 */
private static class HtmlCharEscaper extends FastCharEscaper {
  public HtmlCharEscaper(char[][] replacements) {
    super(replacements, Character.MIN_VALUE, '~');
  }

  @Override protected char[] escape(char c) {
    // The table wins whenever it has an entry for this character.
    final char[] fromTable = (c < replacementLength) ? replacements[c] : null;
    if (fromTable != null) {
      return fromTable;
    }
    // '~' (ASCII 126) is the highest character that may stay unescaped.
    if (c <= safeMax) {
      return null;
    }
    // Anything reaching this point is at least 127, so it has three to five
    // decimal digits.
    final int digitCount;
    if (c < 1000) {
      digitCount = 3;
    } else if (c < 10000) {
      digitCount = 4;
    } else {
      digitCount = 5;
    }
    // Layout: '&', '#', the digits, ';'.
    final char[] reference = new char[digitCount + 3];
    reference[0] = '&';
    reference[1] = '#';
    reference[digitCount + 2] = ';';
    // Fill in the digits from least to most significant. HEX_DIGITS doubles
    // as a decimal digit table because only indices 0-9 are used.
    int remaining = c;
    for (int slot = digitCount + 1; slot >= 2; slot--) {
      reference[slot] = HEX_DIGITS[remaining % 10];
      remaining /= 10;
    }
    return reference;
  }
}
/**
 * Composite {@code CharEscaper} that consults a primary escaper for each
 * character and only asks the secondary escaper when the primary has no
 * escaping (returns null) for that character.
 */
private static class FallThroughCharEscaper extends CharEscaper {
  private final CharEscaper primary;
  private final CharEscaper secondary;

  public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
    this.primary = primary;
    this.secondary = secondary;
  }

  @Override
  protected char[] escape(char c) {
    final char[] fromPrimary = primary.escape(c);
    if (fromPrimary != null) {
      return fromPrimary;
    }
    // May itself be null, meaning neither escaper changes this character.
    return secondary.escape(c);
  }
}
// Lowercase digit lookup table shared by the escapers above. It is indexed
// with hex nibbles (0-15) and also with octal (0-7) and decimal (0-9) digit
// values, which all map to the expected characters.
private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
}