CharEscapers.java example

Explorer
gdata-java-client-master
- java
/* Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package com.google.gdata.util.common.base;

import static com.google.gdata.util.common.base.Preconditions.checkNotNull;

import java.io.IOException;

/**
 * Utility functions for dealing with {@code CharEscaper}s, and some commonly
 * used {@code CharEscaper} instances.
 *
 * 
 * 
 */
public final class CharEscapers {
  /**
   * Private constructor: this class only exposes static members and is not
   * meant to be instantiated.
   */
  private CharEscapers() {}

  //                    For each xxxEscaper method, please add links to external
  //                    reference pages that we consider authoritative for what
  //                    that escaper should exactly be doing.

  /**
   * An escaper that leaves its input untouched.
   */
  private static final CharEscaper NULL_ESCAPER = new CharEscaper() {
      @Override
      protected char[] escape(char c) {
        // No character ever has a replacement.
        return null;
      }

      @Override
      public String escape(String input) {
        checkNotNull(input);
        return input;
      }

      @Override
      public Appendable escape(final Appendable target) {
        checkNotNull(target);

        // Handing back target itself is not an option: the CharEscaper
        // contract requires the returned Appendable to throw a
        // NullPointerException when asked to append null, so wrap target in
        // a thin forwarding Appendable that performs that check.
        return new Appendable() {
            public Appendable append(char ch) throws IOException {
              target.append(ch);
              return this;
            }

            public Appendable append(CharSequence chars) throws IOException {
              checkNotNull(chars);
              target.append(chars);
              return this;
            }

            public Appendable append(CharSequence chars, int from, int to)
                throws IOException {
              checkNotNull(chars);
              target.append(chars, from, to);
              return this;
            }
          };
      }
    };

  /**
   * Returns a {@link CharEscaper} that does no escaping.
   *
   * @return the shared no-op escaper instance
   */
  public static CharEscaper nullEscaper() {
    return NULL_ESCAPER;
  }

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an XML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: silently removes null-characters and control
   * characters, as there is no way to represent them in XML.
   *
   * @return the shared XML escaper instance
   */
  public static CharEscaper xmlEscaper() {
    return XML_ESCAPER;
  }

  /**
   * Escapes special characters from a string so it can safely be included in an
   * XML document in either element content or attribute values.  Also removes
   * null-characters and control characters, as there is no way to represent
   * them in XML.
   */
  private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder()
      .addEscape('"', """)
      .addEscape('\'', "'")
      .toEscaper();

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an XML document in element content.
   *
   * <p><b>Note</b>: double and single quotes are not escaped, so it is not
   * safe to use this escaper to escape attribute values. Use the
   * {@link #xmlEscaper()} escaper to escape attribute values or if you are
   * unsure. Also silently removes non-whitespace control characters, as there
   * is no way to represent them in XML.
   *
   * @return the shared XML element-content escaper instance
   */
  public static CharEscaper xmlContentEscaper() {
    return XML_CONTENT_ESCAPER;
  }

  /**
   * Escapes special characters from a string so it can safely be included in an
   * XML document in element content.  Note that quotes are <em>not</em>
   * escaped, so <em>this is not safe for use in attribute values</em>. Use
   * {@link #XML_ESCAPER} for attribute values, or if you are unsure.  Also
   * removes non-whitespace control characters, as there is no way to represent
   * them in XML.
   *
   * <p>Built directly from {@code newBasicXmlEscapeBuilder()} with no
   * additional rules.
   */
  private static final CharEscaper XML_CONTENT_ESCAPER =
      newBasicXmlEscapeBuilder().toEscaper();

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an HTML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: alters non-ASCII and control characters.
   *
   * <p>The entity list was taken from:
   * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a>
   *
   * @return the shared HTML escaper instance, created on first use
   */
  public static CharEscaper htmlEscaper() {
    return HtmlEscaperHolder.HTML_ESCAPER;
  }

  /**
   * A lazy initialization holder for HTML_ESCAPER.
   */
  private static class HtmlEscaperHolder {
    private static final CharEscaper HTML_ESCAPER
        = new HtmlCharEscaper(new CharEscaperBuilder()
            .addEscape('"',      """)
            .addEscape('\'',     "'")
            .addEscape('&',      "&")
            .addEscape('<',      "<")
            .addEscape('>',      ">")
            .addEscape('\u00A0', " ")
            .addEscape('\u00A1', "¡")
            .addEscape('\u00A2', "¢")
            .addEscape('\u00A3', "£")
            .addEscape('\u00A4', "¤")
            .addEscape('\u00A5', "¥")
            .addEscape('\u00A6', "¦")
            .addEscape('\u00A7', "§")
            .addEscape('\u00A8', "¨")
            .addEscape('\u00A9', "©")
            .addEscape('\u00AA', "ª")
            .addEscape('\u00AB', "«")
            .addEscape('\u00AC', "¬")
            .addEscape('\u00AD', "")
            .addEscape('\u00AE', "®")
            .addEscape('\u00AF', "¯")
            .addEscape('\u00B0', "°")
            .addEscape('\u00B1', "±")
            .addEscape('\u00B2', "²")
            .addEscape('\u00B3', "³")
            .addEscape('\u00B4', "´")
            .addEscape('\u00B5', "µ")
            .addEscape('\u00B6', "¶")
            .addEscape('\u00B7', "·")
            .addEscape('\u00B8', "¸")
            .addEscape('\u00B9', "¹")
            .addEscape('\u00BA', "º")
            .addEscape('\u00BB', "»")
            .addEscape('\u00BC', "¼")
            .addEscape('\u00BD', "½")
            .addEscape('\u00BE', "¾")
            .addEscape('\u00BF', "¿")
            .addEscape('\u00C0', "À")
            .addEscape('\u00C1', "Á")
            .addEscape('\u00C2', "Â")
            .addEscape('\u00C3', "Ã")
            .addEscape('\u00C4', "Ä")
            .addEscape('\u00C5', "Å")
            .addEscape('\u00C6', "Æ")
            .addEscape('\u00C7', "Ç")
            .addEscape('\u00C8', "È")
            .addEscape('\u00C9', "É")
            .addEscape('\u00CA', "Ê")
            .addEscape('\u00CB', "Ë")
            .addEscape('\u00CC', "Ì")
            .addEscape('\u00CD', "Í")
            .addEscape('\u00CE', "Î")
            .addEscape('\u00CF', "Ï")
            .addEscape('\u00D0', "Ð")
            .addEscape('\u00D1', "Ñ")
            .addEscape('\u00D2', "Ò")
            .addEscape('\u00D3', "Ó")
            .addEscape('\u00D4', "Ô")
            .addEscape('\u00D5', "Õ")
            .addEscape('\u00D6', "Ö")
            .addEscape('\u00D7', "×")
            .addEscape('\u00D8', "Ø")
            .addEscape('\u00D9', "Ù")
            .addEscape('\u00DA', "Ú")
            .addEscape('\u00DB', "Û")
            .addEscape('\u00DC', "Ü")
            .addEscape('\u00DD', "Ý")
            .addEscape('\u00DE', "Þ")
            .addEscape('\u00DF', "ß")
            .addEscape('\u00E0', "à")
            .addEscape('\u00E1', "á")
            .addEscape('\u00E2', "â")
            .addEscape('\u00E3', "ã")
            .addEscape('\u00E4', "ä")
            .addEscape('\u00E5', "å")
            .addEscape('\u00E6', "æ")
            .addEscape('\u00E7', "ç")
            .addEscape('\u00E8', "è")
            .addEscape('\u00E9', "é")
            .addEscape('\u00EA', "ê")
            .addEscape('\u00EB', "ë")
            .addEscape('\u00EC', "ì")
            .addEscape('\u00ED', "í")
            .addEscape('\u00EE', "î")
            .addEscape('\u00EF', "ï")
            .addEscape('\u00F0', "ð")
            .addEscape('\u00F1', "ñ")
            .addEscape('\u00F2', "ò")
            .addEscape('\u00F3', "ó")
            .addEscape('\u00F4', "ô")
            .addEscape('\u00F5', "õ")
            .addEscape('\u00F6', "ö")
            .addEscape('\u00F7', "÷")
            .addEscape('\u00F8', "ø")
            .addEscape('\u00F9', "ù")
            .addEscape('\u00FA', "ú")
            .addEscape('\u00FB', "û")
            .addEscape('\u00FC', "ü")
            .addEscape('\u00FD', "ý")
            .addEscape('\u00FE', "þ")
            .addEscape('\u00FF', "ÿ")
            .addEscape('\u0152', "Œ")
            .addEscape('\u0153', "œ")
            .addEscape('\u0160', "Š")
            .addEscape('\u0161', "š")
            .addEscape('\u0178', "Ÿ")
            .addEscape('\u0192', "ƒ")
            .addEscape('\u02C6', "ˆ")
            .addEscape('\u02DC', "˜")
            .addEscape('\u0391', "Α")
            .addEscape('\u0392', "Β")
            .addEscape('\u0393', "Γ")
            .addEscape('\u0394', "Δ")
            .addEscape('\u0395', "Ε")
            .addEscape('\u0396', "Ζ")
            .addEscape('\u0397', "Η")
            .addEscape('\u0398', "Θ")
            .addEscape('\u0399', "Ι")
            .addEscape('\u039A', "Κ")
            .addEscape('\u039B', "Λ")
            .addEscape('\u039C', "Μ")
            .addEscape('\u039D', "Ν")
            .addEscape('\u039E', "Ξ")
            .addEscape('\u039F', "Ο")
            .addEscape('\u03A0', "Π")
            .addEscape('\u03A1', "Ρ")
            .addEscape('\u03A3', "Σ")
            .addEscape('\u03A4', "Τ")
            .addEscape('\u03A5', "Υ")
            .addEscape('\u03A6', "Φ")
            .addEscape('\u03A7', "Χ")
            .addEscape('\u03A8', "Ψ")
            .addEscape('\u03A9', "Ω")
            .addEscape('\u03B1', "α")
            .addEscape('\u03B2', "β")
            .addEscape('\u03B3', "γ")
            .addEscape('\u03B4', "δ")
            .addEscape('\u03B5', "ε")
            .addEscape('\u03B6', "ζ")
            .addEscape('\u03B7', "η")
            .addEscape('\u03B8', "θ")
            .addEscape('\u03B9', "ι")
            .addEscape('\u03BA', "κ")
            .addEscape('\u03BB', "λ")
            .addEscape('\u03BC', "μ")
            .addEscape('\u03BD', "ν")
            .addEscape('\u03BE', "ξ")
            .addEscape('\u03BF', "ο")
            .addEscape('\u03C0', "π")
            .addEscape('\u03C1', "ρ")
            .addEscape('\u03C2', "ς")
            .addEscape('\u03C3', "σ")
            .addEscape('\u03C4', "τ")
            .addEscape('\u03C5', "υ")
            .addEscape('\u03C6', "φ")
            .addEscape('\u03C7', "χ")
            .addEscape('\u03C8', "ψ")
            .addEscape('\u03C9', "ω")
            .addEscape('\u03D1', "ϑ")
            .addEscape('\u03D2', "ϒ")
            .addEscape('\u03D6', "ϖ")
            .addEscape('\u2002', " ")
            .addEscape('\u2003', " ")
            .addEscape('\u2009', " ")
            .addEscape('\u200C', "‌")
            .addEscape('\u200D', "‍")
            .addEscape('\u200E', "‎")
            .addEscape('\u200F', "‏")
            .addEscape('\u2013', "–")
            .addEscape('\u2014', "—")
            .addEscape('\u2018', "‘")
            .addEscape('\u2019', "’")
            .addEscape('\u201A', "‚")
            .addEscape('\u201C', "“")
            .addEscape('\u201D', "”")
            .addEscape('\u201E', "„")
            .addEscape('\u2020', "†")
            .addEscape('\u2021', "‡")
            .addEscape('\u2022', "•")
            .addEscape('\u2026', "…")
            .addEscape('\u2030', "‰")
            .addEscape('\u2032', "′")
            .addEscape('\u2033', "″")
            .addEscape('\u2039', "‹")
            .addEscape('\u203A', "›")
            .addEscape('\u203E', "‾")
            .addEscape('\u2044', "⁄")
            .addEscape('\u20AC', "€")
            .addEscape('\u2111', "ℑ")
            .addEscape('\u2118', "℘")
            .addEscape('\u211C', "ℜ")
            .addEscape('\u2122', "™")
            .addEscape('\u2135', "ℵ")
            .addEscape('\u2190', "←")
            .addEscape('\u2191', "↑")
            .addEscape('\u2192', "→")
            .addEscape('\u2193', "↓")
            .addEscape('\u2194', "↔")
            .addEscape('\u21B5', "↵")
            .addEscape('\u21D0', "⇐")
            .addEscape('\u21D1', "⇑")
            .addEscape('\u21D2', "⇒")
            .addEscape('\u21D3', "⇓")
            .addEscape('\u21D4', "⇔")
            .addEscape('\u2200', "∀")
            .addEscape('\u2202', "∂")
            .addEscape('\u2203', "∃")
            .addEscape('\u2205', "∅")
            .addEscape('\u2207', "∇")
            .addEscape('\u2208', "∈")
            .addEscape('\u2209', "∉")
            .addEscape('\u220B', "∋")
            .addEscape('\u220F', "∏")
            .addEscape('\u2211', "∑")
            .addEscape('\u2212', "−")
            .addEscape('\u2217', "∗")
            .addEscape('\u221A', "√")
            .addEscape('\u221D', "∝")
            .addEscape('\u221E', "∞")
            .addEscape('\u2220', "∠")
            .addEscape('\u2227', "∧")
            .addEscape('\u2228', "∨")
            .addEscape('\u2229', "∩")
            .addEscape('\u222A', "∪")
            .addEscape('\u222B', "∫")
            .addEscape('\u2234', "∴")
            .addEscape('\u223C', "∼")
            .addEscape('\u2245', "≅")
            .addEscape('\u2248', "≈")
            .addEscape('\u2260', "≠")
            .addEscape('\u2261', "≡")
            .addEscape('\u2264', "≤")
            .addEscape('\u2265', "≥")
            .addEscape('\u2282', "⊂")
            .addEscape('\u2283', "⊃")
            .addEscape('\u2284', "⊄")
            .addEscape('\u2286', "⊆")
            .addEscape('\u2287', "⊇")
            .addEscape('\u2295', "⊕")
            .addEscape('\u2297', "⊗")
            .addEscape('\u22A5', "⊥")
            .addEscape('\u22C5', "⋅")
            .addEscape('\u2308', "⌈")
            .addEscape('\u2309', "⌉")
            .addEscape('\u230A', "⌊")
            .addEscape('\u230B', "⌋")
            .addEscape('\u2329', "⟨")
            .addEscape('\u232A', "⟩")
            .addEscape('\u25CA', "◊")
            .addEscape('\u2660', "♠")
            .addEscape('\u2663', "♣")
            .addEscape('\u2665', "♥")
            .addEscape('\u2666', "♦")
            .toArray());
  }

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an HTML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: does not alter non-ASCII and control characters.
   *
   * @return the shared ASCII-only HTML escaper instance
   */
  public static CharEscaper asciiHtmlEscaper() {
    return ASCII_HTML_ESCAPER;
  }

  /**
   * Escapes special characters from a string so it can safely be included in an
   * HTML document in either element content or attribute values. Does
   * <em>not</em> alter non-ASCII characters or control characters.
   */
  private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder()
      .addEscape('"', """)
      .addEscape('\'', "'")
      .addEscape('&', "&")
      .addEscape('<', "<")
      .addEscape('>', ">")
      .toEscaper();

  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URIs. For details on escaping URIs, see section 2.4 of
   * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The special characters ".", "-", "*", and "_" remain the same.
   * <li>The space character " " is converted into a plus sign "+".
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * <p>This escaper has identical behavior to (but is potentially much faster
   * than):
   * <ul>
   * <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String)}
   * <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String,String)}
   *     with the encoding name "UTF-8"
   * <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String)}
   * <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String,java.nio.charset.Charset)}
   *     with the UTF_8 Charset
   * <li>{@link java.net.URLEncoder#encode(String, String)}
   *     with the encoding name "UTF-8"
   * </ul>
   *
   * <p>This method is equivalent to {@code uriEscaper(true)}.
   *
   * @return the URI escaper that converts spaces to plus signs
   */
  public static Escaper uriEscaper() {
    return uriEscaper(true);
  }

  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URI path segments. For details on escaping URIs, see
   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
   * <li>The general delimiters "@" and ":" remain the same.
   * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", ",", ";",
   *     and "=" remain the same.
   * <li>The space character " " is converted into %20.
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * @return the shared URI path-segment escaper instance
   */
  public static Escaper uriPathEscaper() {
    return URI_PATH_ESCAPER;
  }

  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URI query string segments. When the query string
   * consists of a sequence of name=value pairs separated by "&amp;", the names
   * and values should be individually encoded. If you escape an entire query
   * string in one pass with this escaper, then the "=" and "&amp;" characters
   * used as separators will also be escaped.
   *
   * <p>This escaper is also suitable for escaping fragment identifiers.
   *
   * <p>For details on escaping URIs, see
   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
   * <li>The general delimiters "@" and ":" remain the same.
   * <li>The path delimiters "/" and "?" remain the same.
   * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";"
   *     remain the same.
   * <li>The space character " " is converted into %20.
   * <li>The equals sign "=" is converted into %3D.
   * <li>The ampersand "&amp;" is converted into %26.
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * @return the shared URI query-string escaper instance
   */
  public static Escaper uriQueryStringEscaper() {
    return URI_QUERY_STRING_ESCAPER;
  }

  /**
   * Returns an {@link Escaper} that makes Java characters safe for inclusion
   * in URIs, with a caller-selected encoding for the space character. See
   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
   * for background on URI escaping.
   *
   * <p>Encoding rules:
   * <ul>
   * <li>"a" through "z", "A" through "Z" and "0" through "9" are left as they
   *     are.
   * <li>The special characters ".", "-", "*", and "_" are left as they are.
   * <li>A space " " becomes a plus sign "+" when {@code plusForSpace} is
   *     {@code true}, and "%20" otherwise.
   * <li>Every other character is converted to one or more bytes using UTF-8,
   *     and each byte is written as the 3-character string "%XY", where "XY"
   *     is the two-digit, uppercase, hexadecimal value of the byte.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
   *        it is escaped to {@code %20}. Although common, the escaping of
   *        spaces as plus signs has a very ambiguous status in the relevant
   *        specifications. You should prefer {@code %20} unless you are doing
   *        exact character-by-character comparisons of URLs and backwards
   *        compatibility requires you to use plus signs.
   * @return the shared escaper instance matching {@code plusForSpace}
   *
   * @see #uriEscaper()
   */
  public static Escaper uriEscaper(boolean plusForSpace) {
    if (plusForSpace) {
      return URI_ESCAPER;
    }
    return URI_ESCAPER_NO_PLUS;
  }

  // NOTE(review): the boolean passed to PercentEscaper below is presumably its
  // plus-for-space option; that matches how uriEscaper(boolean) selects between
  // the first two constants, but the constructor itself is not visible here.

  /** Returned by {@code uriEscaper(true)}; constructed with the flag set. */
  private static final Escaper URI_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true);

  /** Returned by {@code uriEscaper(false)}; constructed with the flag unset. */
  private static final Escaper URI_ESCAPER_NO_PLUS =
      new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false);

  /** Returned by {@link #uriPathEscaper()}. */
  private static final Escaper URI_PATH_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false);

  /** Returned by {@link #uriQueryStringEscaper()}. */
  private static final Escaper URI_QUERY_STRING_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false);

  /**
   * Returns a {@link Escaper} instance that escapes Java characters in a manner
   * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape}
   * set).
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/"
   *     and ":" remain the same.
   * <li>The space character " " is converted into a plus sign "+".
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * <p><b>Note</b>: This escaper is a special case and is <em>not
   * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt">
   * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is
   * only provided for certain limited use cases and you should favor using
   * {@link #uriEscaper()} whenever possible.
   *
   * @return the shared C++-compatible URI escaper instance
   */
  public static Escaper cppUriEscaper() {
    return CPP_URI_ESCAPER;
  }

  // Backing instance for cppUriEscaper().
  // Based on comments from FastURLEncoder:
  // These octets mimic the ones escaped by the C++ webutil/url URL class --
  // the kGoogle1Escape set.
  // To produce the same escaping as C++, use this set with the plusForSpace
  // option (hence the second constructor argument, true).
  // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here.
  private static final Escaper CPP_URI_ESCAPER =
      new PercentEscaper("!()*-._~,/:", true);

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in a Java string literal.
   *
   * <p><b>Note</b>: does not escape single quotes, so use the escaper
   * returned by {@link #javaCharEscaper()} if you are generating char
   * literals or if you are unsure.
   *
   * @return the shared Java string-literal escaper instance
   */
  public static CharEscaper javaStringEscaper() {
    return JAVA_STRING_ESCAPER;
  }

  /**
   * Escapes special characters from a string so it can safely be included in a
   * Java string literal. Does <em>not</em> escape single-quotes, so use
   * {@link #JAVA_CHAR_ESCAPER} if you are generating char literals, or if you
   * are unsure.
   *
   * <p>Note that non-ASCII characters will be octal or Unicode escaped.
   *
   * <p>The table below only lists the characters that have a dedicated
   * backslash escape; it is handed to {@code JavaCharEscaper} as an array.
   */
  private static final CharEscaper JAVA_STRING_ESCAPER
      = new JavaCharEscaper(new CharEscaperBuilder()
          .addEscape('\b', "\\b")
          .addEscape('\f', "\\f")
          .addEscape('\n', "\\n")
          .addEscape('\r', "\\r")
          .addEscape('\t', "\\t")
          .addEscape('\"', "\\\"")
          .addEscape('\\', "\\\\")
          .toArray());

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in a Java char or string literal. The
   * behavior of this escaper is the same as that of the
   * {@link #javaStringEscaper()}, except it also escapes single quotes.
   *
   * @return the shared Java char-literal escaper instance
   */
  public static CharEscaper javaCharEscaper() {
    return JAVA_CHAR_ESCAPER;
  }

  /**
   * Escapes special characters from a string so it can safely be included in a
   * Java char literal or string literal.
   *
   * <p>Note that non-ASCII characters will be octal or Unicode escaped.
   *
   * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes
   * single quotes.
   */
  private static final CharEscaper JAVA_CHAR_ESCAPER
      = new JavaCharEscaper(new CharEscaperBuilder()
          .addEscape('\b', "\\b")
          .addEscape('\f', "\\f")
          .addEscape('\n', "\\n")
          .addEscape('\r', "\\r")
          .addEscape('\t', "\\t")
          // The one rule JAVA_STRING_ESCAPER does not have.
          .addEscape('\'', "\\'")
          .addEscape('\"', "\\\"")
          .addEscape('\\', "\\\\")
          .toArray());

  /**
   * Returns a {@link CharEscaper} instance that replaces non-ASCII characters
   * in a string with their Unicode escape sequences ({@code \\uxxxx} where
   * {@code xxxx} is a hex number). Existing escape sequences won't be affected.
   *
   * @return the shared Unicode-escaping escaper instance
   */
  public static CharEscaper javaStringUnicodeEscaper() {
    return JAVA_STRING_UNICODE_ESCAPER;
  }

  /**
   * Replaces each non-ASCII character with its Unicode escape sequence
   * {@code \\uxxxx}, where {@code xxxx} is a four-digit hex number. ASCII
   * characters, and therefore escape sequences already present in the input,
   * are left as they are.
   */
  private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER
      = new CharEscaper() {
          @Override protected char[] escape(char c) {
            if (c < 128) {
              // ASCII needs no replacement.
              return null;
            }

            // Backslash, 'u', then the four nibbles of the 16-bit code unit,
            // most significant first.
            return new char[] {
              '\\',
              'u',
              HEX_DIGITS[(c >>> 12) & 15],
              HEX_DIGITS[(c >>> 8) & 15],
              HEX_DIGITS[(c >>> 4) & 15],
              HEX_DIGITS[c & 15]
            };
          }
        };

  /**
   * Returns a {@link CharEscaper} instance that escapes special characters from
   * a string so it can safely be included in a Python string literal. Does not
   * have any special handling for non-ASCII characters.
   *
   * @return the shared Python string-literal escaper instance
   */
  public static CharEscaper pythonEscaper() {
    return PYTHON_ESCAPER;
  }

  /**
   * Escapes special characters in a string so it can safely be included in a
   * Python string literal. Does not have any special handling for non-ASCII
   * characters.
   *
   * <p>Both quote characters are escaped, so the rules do not depend on which
   * quote character delimits the literal.
   */
  private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
      .addEscape('\n', "\\n")
      .addEscape('\r', "\\r")
      .addEscape('\t', "\\t")
      .addEscape('\\', "\\\\")
      .addEscape('\"', "\\\"")
      .addEscape('\'', "\\\'")
      .toEscaper();

  /**
   * Returns a {@link CharEscaper} instance that escapes non-ASCII characters in
   * a string so it can safely be included in a Javascript string literal.
   * Non-ASCII characters are replaced with their ASCII javascript escape
   * sequences (e.g., \\uhhhh or \xhh).
   *
   * @return the shared Javascript string-literal escaper instance
   */
  public static CharEscaper javascriptEscaper() {
    return JAVASCRIPT_ESCAPER;
  }

  /**
   * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
   * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
   *
   * <p>The table below additionally gives explicit replacements to a set of
   * ASCII characters: the quote characters and {@code <}, {@code =},
   * {@code >}, {@code &} are written as {@code \xhh} hex escapes, and the
   * common control characters and the backslash use their short backslash
   * forms.
   */
  private static final CharEscaper JAVASCRIPT_ESCAPER
      = new JavascriptCharEscaper(new CharEscaperBuilder()
          .addEscape('\'', "\\x27")
          .addEscape('"',  "\\x22")
          .addEscape('<',  "\\x3c")
          .addEscape('=',  "\\x3d")
          .addEscape('>',  "\\x3e")
          .addEscape('&',  "\\x26")
          .addEscape('\b', "\\b")
          .addEscape('\t', "\\t")
          .addEscape('\n', "\\n")
          .addEscape('\f', "\\f")
          .addEscape('\r', "\\r")
          .addEscape('\\', "\\\\")
          .toArray());

  /**
   * Creates a builder preloaded with the rules common to the XML escapers in
   * this class: the markup characters {@code &}, {@code <} and {@code >} are
   * replaced by their predefined XML entity references, and the control
   * characters listed below are mapped to the empty string.
   *
   * @return a new builder that callers may extend with further rules
   */
  private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
    return new CharEscaperBuilder()
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        // Octal 000-037 except tab (011), line feed (012) and carriage
        // return (015), which are left untouched.
        .addEscapes(new char[] {
            '\000', '\001', '\002', '\003', '\004',
            '\005', '\006', '\007', '\010', '\013',
            '\014', '\016', '\017', '\020', '\021',
            '\022', '\023', '\024', '\025', '\026',
            '\027', '\030', '\031', '\032', '\033',
            '\034', '\035', '\036', '\037'}, "");
  }

  /**
   * Returns a composite {@link CharEscaper} instance that tries to escape
   * characters using a primary {@code CharEscaper} first and falls back to a
   * secondary one if there is no escaping.
   *
   * <p>The returned escaper will attempt to escape each character using the
   * primary escaper, and if the primary escaper has no escaping for that
   * character, it will use the secondary escaper. If the secondary escaper has
   * no escaping for a character either, the original character will be used.
   * If the primary escaper has an escape for a character, the secondary escaper
   * will not be used at all for that character; the escaped output of the
   * primary is not run through the secondary. For a case where you would like
   * to first escape with one escaper, and then with another, it is recommended
   * that you call each escaper in order.
   *
   * @param primary The primary {@code CharEscaper} to use
   * @param secondary The secondary {@code CharEscaper} to use if the first one
   *     has no escaping rule for a character
   * @return a new escaper combining {@code primary} and {@code secondary}
   * @throws NullPointerException if any of the arguments is null
   */
  public static CharEscaper fallThrough(CharEscaper primary,
      CharEscaper secondary) {
    // Validate both arguments up front, before the composite is constructed.
    checkNotNull(primary);
    checkNotNull(secondary);
    return new FallThroughCharEscaper(primary, secondary);
  }

  /**
   * Base class for escapers that are driven by a table of replacement
   * character arrays plus a contiguous range of safe characters.
   *
   * <p>{@link #escape(String)} is overridden so that a string containing
   * nothing to escape is returned as-is after a single scan. According to the
   * original author's rough benchmarking this nearly doubles throughput for
   * such strings, provided the per-character test itself is cheap.
   */
  private abstract static class FastCharEscaper extends CharEscaper {

    /** Replacement table indexed by character value; null means no entry. */
    protected final char[][] replacements;
    /** Cached {@code replacements.length}. */
    protected final int replacementLength;
    /** Lowest character of the safe range (inclusive). */
    protected final char safeMin;
    /** Highest character of the safe range (inclusive). */
    protected final char safeMax;

    /**
     * @param replacements replacement table indexed by character value
     * @param safeMin lowest character of the safe range
     * @param safeMax highest character of the safe range
     */
    public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
      this.replacements = replacements;
      this.replacementLength = replacements.length;
      this.safeMin = safeMin;
      this.safeMax = safeMax;
    }

    /**
     * Scans {@code s} for the first character that either has a table entry
     * or lies outside the safe range, and hands over to the slow path from
     * that position. Returns {@code s} itself when no such character exists.
     */
    @Override public String escape(String s) {
      final int length = s.length();
      int pos = 0;
      while (pos < length) {
        final char ch = s.charAt(pos);
        final boolean hasTableEntry =
            ch < replacementLength && replacements[ch] != null;
        final boolean outsideSafeRange = ch < safeMin || ch > safeMax;
        if (hasTableEntry || outsideSafeRange) {
          return escapeSlow(s, pos);
        }
        pos++;
      }
      return s;
    }
  }

  /**
   * Escaper producing Java-style character escapes. It combines a replacement
   * table with a computed fallback; the table lookup is done inline here
   * rather than by delegating to a superclass escape call, to keep the
   * per-character path short.
   *
   * <p>Characters from {@code ' '} through {@code '~'} without a table entry
   * are left alone. Other characters up to 0xFF become a backslash followed by
   * three octal digits; everything above becomes a backslash, the letter
   * {@code u} and four lowercase hex digits.
   */
  private static class JavaCharEscaper extends FastCharEscaper {

    public JavaCharEscaper(char[][] replacements) {
      super(replacements, ' ', '~');
    }

    @Override protected char[] escape(char c) {
      // An explicit table entry always wins.
      if (c < replacementLength) {
        final char[] fromTable = replacements[c];
        if (fromTable != null) {
          return fromTable;
        }
      }

      // Inside the safe range nothing needs to be done.
      if (c >= safeMin && c <= safeMax) {
        return null;
      }

      if (c <= 0xFF) {
        // Three-digit octal escape, same output as
        // String.format("\\%03o", (int) c). HEX_DIGITS doubles as the octal
        // digit table because only its first eight entries are addressed.
        return new char[] {
            '\\',
            HEX_DIGITS[(c >>> 6) & 7],
            HEX_DIGITS[(c >>> 3) & 7],
            HEX_DIGITS[c & 7]
        };
      }

      // Four-digit hex escape, same output as
      // String.format("\\u%04x", (int) c).
      return new char[] {
          '\\',
          'u',
          HEX_DIGITS[(c >>> 12) & 15],
          HEX_DIGITS[(c >>> 8) & 15],
          HEX_DIGITS[(c >>> 4) & 15],
          HEX_DIGITS[c & 15]
      };
    }
  }

  /**
   * Escaper producing JavaScript-style character escapes. It combines a
   * replacement table with a computed fallback; the table lookup is done
   * inline here rather than by delegating to a superclass escape call, to keep
   * the per-character path short.
   *
   * <p>Characters from {@code ' '} through {@code '~'} without a table entry
   * are left alone. Other characters below 0x100 become a backslash, the
   * letter {@code x} and two lowercase hex digits; everything else becomes a
   * backslash, the letter {@code u} and four lowercase hex digits.
   */
  private static class JavascriptCharEscaper extends FastCharEscaper {

    public JavascriptCharEscaper(char[][] replacements) {
      super(replacements, ' ', '~');
    }

    @Override protected char[] escape(char c) {
      // An explicit table entry always wins.
      if (c < replacementLength) {
        final char[] fromTable = replacements[c];
        if (fromTable != null) {
          return fromTable;
        }
      }

      // Inside the safe range nothing needs to be done.
      if (c >= safeMin && c <= safeMax) {
        return null;
      }

      // Characters less than 0x100 fit in a two-digit hex escape.
      if (c < 0x100) {
        return new char[] {
            '\\',
            'x',
            HEX_DIGITS[(c >>> 4) & 0xf],
            HEX_DIGITS[c & 0xf]
        };
      }

      // Everything else gets the four-digit hex form.
      return new char[] {
          '\\',
          'u',
          HEX_DIGITS[(c >>> 12) & 0xf],
          HEX_DIGITS[(c >>> 8) & 0xf],
          HEX_DIGITS[(c >>> 4) & 0xf],
          HEX_DIGITS[c & 0xf]
      };
    }
  }

  /**
   * Escaper producing HTML character escapes. It combines a replacement table
   * with a computed fallback; the table lookup is done inline here rather than
   * by delegating to a superclass escape call, to keep the per-character path
   * short.
   *
   * <p>Characters up to and including {@code '~'} (ASCII 126) that have no
   * table entry are left alone. Every character above that is rewritten as a
   * decimal numeric character reference: {@code &#}, the decimal value of the
   * character, then {@code ;}.
   */
  private static class HtmlCharEscaper extends FastCharEscaper {

    public HtmlCharEscaper(char[][] replacements) {
      super(replacements, Character.MIN_VALUE, '~');
    }

    @Override protected char[] escape(char c) {
      // An explicit table entry always wins.
      if (c < replacementLength) {
        final char[] fromTable = replacements[c];
        if (fromTable != null) {
          return fromTable;
        }
      }

      // '~' is the highest character that is emitted without escaping.
      if (c <= safeMax) {
        return null;
      }

      // Anything reaching this point is above safeMax, so its decimal form
      // is sized here as three, four or five digits.
      final int digitCount = (c < 1000) ? 3 : (c < 10000) ? 4 : 5;

      // Layout: '&', '#', <digits>, ';'
      final char[] entity = new char[digitCount + 3];
      entity[0] = '&';
      entity[1] = '#';
      entity[entity.length - 1] = ';';

      // Fill the digits from least to most significant, right to left.
      // HEX_DIGITS serves as the decimal digit table because only its first
      // ten entries are addressed.
      int remaining = c;
      for (int pos = digitCount + 1; pos >= 2; pos--) {
        entity[pos] = HEX_DIGITS[remaining % 10];
        remaining /= 10;
      }
      return entity;
    }
  }

  /**
   * Composite {@code CharEscaper} that asks a primary escaper for each
   * character and consults a secondary escaper only when the primary returns
   * no escape for it.
   */
  private static class FallThroughCharEscaper extends CharEscaper {

    private final CharEscaper primary;
    private final CharEscaper secondary;

    public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
      this.primary = primary;
      this.secondary = secondary;
    }

    /**
     * Returns the primary escaper's result for {@code c} when it is non-null,
     * otherwise whatever the secondary escaper returns (which may itself be
     * null).
     */
    @Override
    protected char[] escape(char c) {
      final char[] fromPrimary = primary.escape(c);
      return (fromPrimary != null) ? fromPrimary : secondary.escape(c);
    }
  }

  /**
   * Lowercase digit characters indexed by value (0-15). The escapers in this
   * class index it with hex nibbles, and also with octal ({@code c & 7}) and
   * decimal ({@code n % 10}) values, which only touch the leading entries.
   */
  private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
}