package com.mzeat.http;
import java.util.HashMap;
import java.util.Map;
/**
 * Encodes and decodes HTML character entity references in {@code String} and
 * {@code StringBuffer} values.
 *
 * <p>Only the characters listed in the table at the bottom of this class are
 * handled. Encoding always emits the named form (for example {@code "&lt;"});
 * decoding accepts both the named form and the decimal numeric form (for
 * example {@code "&#60;"}) of those same characters. Numeric references to
 * characters that are not in the table, and hexadecimal references, are left
 * untouched.
 *
 * @author windhuiyi
 */
public class HTMLEntity {

    /**
     * Longest entity reference, counted from the ampersand up to and including
     * the semicolon, that {@link #unescape(StringBuffer)} will try to look up.
     */
    private static final int MAX_ENTITY_LENGTH = 10;

    /** Maps a single character (as a one-character string) to its named reference. */
    private static final Map<String, String> entityEscapeMap = new HashMap<String, String>();

    /** Maps a named or decimal numeric reference to the character it stands for. */
    private static final Map<String, String> escapeEntityMap = new HashMap<String, String>();

    /**
     * Returns a copy of the given text with every character that has a known
     * named entity replaced by that entity.
     *
     * @param original the text to encode
     * @return the encoded text
     * @throws NullPointerException if {@code original} is {@code null}
     */
    public static String escape(String original) {
        StringBuffer buf = new StringBuffer(original);
        escape(buf);
        return buf.toString();
    }

    /**
     * Replaces, in place, every character of the buffer that has a known named
     * entity by that entity.
     *
     * @param original the buffer to encode; it is modified directly
     */
    public static void escape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            String escaped = entityEscapeMap.get(String.valueOf(original.charAt(index)));
            if (null != escaped) {
                original.replace(index, index + 1, escaped);
                // Jump over the inserted reference so its own '&' is not encoded again.
                index += escaped.length();
            } else {
                index++;
            }
        }
    }

    /**
     * Returns a copy of the given text with every known entity reference
     * replaced by the character it stands for.
     *
     * @param original the text to decode
     * @return the decoded text
     * @throws NullPointerException if {@code original} is {@code null}
     */
    public static String unescape(String original) {
        StringBuffer buf = new StringBuffer(original);
        unescape(buf);
        return buf.toString();
    }

    /**
     * Replaces, in place, every known entity reference in the buffer by the
     * character it stands for. Unknown references and stray ampersands are
     * left as they are. Each reference is decoded once only, so
     * {@code "&amp;lt;"} becomes {@code "&lt;"}.
     *
     * @param original the buffer to decode; it is modified directly
     */
    public static void unescape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            index = original.indexOf("&", index);
            if (-1 == index) {
                break;
            }
            int semicolonIndex = original.indexOf(";", index);
            if (-1 == semicolonIndex) {
                // No terminator is left anywhere, so nothing further can be decoded.
                break;
            }
            // A semicolon that is too far away only rules out THIS ampersand;
            // later references must still be decoded, so skip rather than stop.
            if (semicolonIndex - index < MAX_ENTITY_LENGTH) {
                String entity = escapeEntityMap.get(original.substring(index, semicolonIndex + 1));
                if (null != entity) {
                    original.replace(index, semicolonIndex + 1, entity);
                }
            }
            // Step past the ampersand (or the decoded character) so that a
            // decoded '&' is never interpreted as the start of a new reference.
            index++;
        }
    }

    static {
        // Each row is { named reference, character }. The decimal numeric
        // reference is derived from the character's code point below.
        String[][] entities = {
            /* ISO 8859-1 (Latin-1) */
            {"&nbsp;", "\u00A0"}, // no-break space = non-breaking space
            {"&iexcl;", "\u00A1"}, // inverted exclamation mark
            {"&cent;", "\u00A2"}, // cent sign
            {"&pound;", "\u00A3"}, // pound sign
            {"&curren;", "\u00A4"}, // currency sign
            {"&yen;", "\u00A5"}, // yen sign = yuan sign
            {"&brvbar;", "\u00A6"}, // broken bar = broken vertical bar
            {"&sect;", "\u00A7"}, // section sign
            {"&uml;", "\u00A8"}, // diaeresis = spacing diaeresis
            {"&copy;", "\u00A9"}, // copyright sign
            {"&ordf;", "\u00AA"}, // feminine ordinal indicator
            {"&laquo;", "\u00AB"}, // left-pointing double angle quotation mark = left pointing guillemet
            {"&not;", "\u00AC"}, // not sign = discretionary hyphen
            {"&shy;", "\u00AD"}, // soft hyphen = discretionary hyphen
            {"&reg;", "\u00AE"}, // registered sign = registered trade mark sign
            {"&macr;", "\u00AF"}, // macron = spacing macron = overline = APL overbar
            {"&deg;", "\u00B0"}, // degree sign
            {"&plusmn;", "\u00B1"}, // plus-minus sign = plus-or-minus sign
            {"&sup2;", "\u00B2"}, // superscript two = superscript digit two = squared
            {"&sup3;", "\u00B3"}, // superscript three = superscript digit three = cubed
            {"&acute;", "\u00B4"}, // acute accent = spacing acute
            {"&micro;", "\u00B5"}, // micro sign
            {"&para;", "\u00B6"}, // pilcrow sign = paragraph sign
            {"&middot;", "\u00B7"}, // middle dot = Georgian comma = Greek middle dot
            {"&cedil;", "\u00B8"}, // cedilla = spacing cedilla
            {"&sup1;", "\u00B9"}, // superscript one = superscript digit one
            {"&ordm;", "\u00BA"}, // masculine ordinal indicator
            {"&raquo;", "\u00BB"}, // right-pointing double angle quotation mark = right pointing guillemet
            {"&frac14;", "\u00BC"}, // vulgar fraction one quarter = fraction one quarter
            {"&frac12;", "\u00BD"}, // vulgar fraction one half = fraction one half
            {"&frac34;", "\u00BE"}, // vulgar fraction three quarters = fraction three quarters
            {"&iquest;", "\u00BF"}, // inverted question mark = turned question mark
            {"&Agrave;", "\u00C0"}, // latin capital letter A with grave
            {"&Aacute;", "\u00C1"}, // latin capital letter A with acute
            {"&Acirc;", "\u00C2"}, // latin capital letter A with circumflex
            {"&Atilde;", "\u00C3"}, // latin capital letter A with tilde
            {"&Auml;", "\u00C4"}, // latin capital letter A with diaeresis
            {"&Aring;", "\u00C5"}, // latin capital letter A with ring above
            {"&AElig;", "\u00C6"}, // latin capital letter AE = latin capital ligature AE
            {"&Ccedil;", "\u00C7"}, // latin capital letter C with cedilla
            {"&Egrave;", "\u00C8"}, // latin capital letter E with grave
            {"&Eacute;", "\u00C9"}, // latin capital letter E with acute
            {"&Ecirc;", "\u00CA"}, // latin capital letter E with circumflex
            {"&Euml;", "\u00CB"}, // latin capital letter E with diaeresis
            {"&Igrave;", "\u00CC"}, // latin capital letter I with grave
            {"&Iacute;", "\u00CD"}, // latin capital letter I with acute
            {"&Icirc;", "\u00CE"}, // latin capital letter I with circumflex
            {"&Iuml;", "\u00CF"}, // latin capital letter I with diaeresis
            {"&ETH;", "\u00D0"}, // latin capital letter ETH
            {"&Ntilde;", "\u00D1"}, // latin capital letter N with tilde
            {"&Ograve;", "\u00D2"}, // latin capital letter O with grave
            {"&Oacute;", "\u00D3"}, // latin capital letter O with acute
            {"&Ocirc;", "\u00D4"}, // latin capital letter O with circumflex
            {"&Otilde;", "\u00D5"}, // latin capital letter O with tilde
            {"&Ouml;", "\u00D6"}, // latin capital letter O with diaeresis
            {"&times;", "\u00D7"}, // multiplication sign
            {"&Oslash;", "\u00D8"}, // latin capital letter O with stroke = latin capital letter O slash
            {"&Ugrave;", "\u00D9"}, // latin capital letter U with grave
            {"&Uacute;", "\u00DA"}, // latin capital letter U with acute
            {"&Ucirc;", "\u00DB"}, // latin capital letter U with circumflex
            {"&Uuml;", "\u00DC"}, // latin capital letter U with diaeresis
            {"&Yacute;", "\u00DD"}, // latin capital letter Y with acute
            {"&THORN;", "\u00DE"}, // latin capital letter THORN
            {"&szlig;", "\u00DF"}, // latin small letter sharp s = ess-zed
            {"&agrave;", "\u00E0"}, // latin small letter a with grave
            {"&aacute;", "\u00E1"}, // latin small letter a with acute
            {"&acirc;", "\u00E2"}, // latin small letter a with circumflex
            {"&atilde;", "\u00E3"}, // latin small letter a with tilde
            {"&auml;", "\u00E4"}, // latin small letter a with diaeresis
            {"&aring;", "\u00E5"}, // latin small letter a with ring above
            {"&aelig;", "\u00E6"}, // latin small letter ae = latin small ligature ae
            {"&ccedil;", "\u00E7"}, // latin small letter c with cedilla
            {"&egrave;", "\u00E8"}, // latin small letter e with grave
            {"&eacute;", "\u00E9"}, // latin small letter e with acute
            {"&ecirc;", "\u00EA"}, // latin small letter e with circumflex
            {"&euml;", "\u00EB"}, // latin small letter e with diaeresis
            {"&igrave;", "\u00EC"}, // latin small letter i with grave
            {"&iacute;", "\u00ED"}, // latin small letter i with acute
            {"&icirc;", "\u00EE"}, // latin small letter i with circumflex
            {"&iuml;", "\u00EF"}, // latin small letter i with diaeresis
            {"&eth;", "\u00F0"}, // latin small letter eth
            {"&ntilde;", "\u00F1"}, // latin small letter n with tilde
            {"&ograve;", "\u00F2"}, // latin small letter o with grave
            {"&oacute;", "\u00F3"}, // latin small letter o with acute
            {"&ocirc;", "\u00F4"}, // latin small letter o with circumflex
            {"&otilde;", "\u00F5"}, // latin small letter o with tilde
            {"&ouml;", "\u00F6"}, // latin small letter o with diaeresis
            {"&divide;", "\u00F7"}, // division sign
            {"&oslash;", "\u00F8"}, // latin small letter o with stroke = latin small letter o slash
            {"&ugrave;", "\u00F9"}, // latin small letter u with grave
            {"&uacute;", "\u00FA"}, // latin small letter u with acute
            {"&ucirc;", "\u00FB"}, // latin small letter u with circumflex
            {"&uuml;", "\u00FC"}, // latin small letter u with diaeresis
            {"&yacute;", "\u00FD"}, // latin small letter y with acute
            {"&thorn;", "\u00FE"}, // latin small letter thorn
            {"&yuml;", "\u00FF"}, // latin small letter y with diaeresis
            {"&fnof;", "\u0192"}, // latin small f with hook = function = florin

            /* Greek (there is no capital final sigma, so U+03A2 is absent) */
            {"&Alpha;", "\u0391"}, // greek capital letter alpha
            {"&Beta;", "\u0392"}, // greek capital letter beta
            {"&Gamma;", "\u0393"}, // greek capital letter gamma
            {"&Delta;", "\u0394"}, // greek capital letter delta
            {"&Epsilon;", "\u0395"}, // greek capital letter epsilon
            {"&Zeta;", "\u0396"}, // greek capital letter zeta
            {"&Eta;", "\u0397"}, // greek capital letter eta
            {"&Theta;", "\u0398"}, // greek capital letter theta
            {"&Iota;", "\u0399"}, // greek capital letter iota
            {"&Kappa;", "\u039A"}, // greek capital letter kappa
            {"&Lambda;", "\u039B"}, // greek capital letter lambda
            {"&Mu;", "\u039C"}, // greek capital letter mu
            {"&Nu;", "\u039D"}, // greek capital letter nu
            {"&Xi;", "\u039E"}, // greek capital letter xi
            {"&Omicron;", "\u039F"}, // greek capital letter omicron
            {"&Pi;", "\u03A0"}, // greek capital letter pi
            {"&Rho;", "\u03A1"}, // greek capital letter rho
            {"&Sigma;", "\u03A3"}, // greek capital letter sigma
            {"&Tau;", "\u03A4"}, // greek capital letter tau
            {"&Upsilon;", "\u03A5"}, // greek capital letter upsilon
            {"&Phi;", "\u03A6"}, // greek capital letter phi
            {"&Chi;", "\u03A7"}, // greek capital letter chi
            {"&Psi;", "\u03A8"}, // greek capital letter psi
            {"&Omega;", "\u03A9"}, // greek capital letter omega
            {"&alpha;", "\u03B1"}, // greek small letter alpha
            {"&beta;", "\u03B2"}, // greek small letter beta
            {"&gamma;", "\u03B3"}, // greek small letter gamma
            {"&delta;", "\u03B4"}, // greek small letter delta
            {"&epsilon;", "\u03B5"}, // greek small letter epsilon
            {"&zeta;", "\u03B6"}, // greek small letter zeta
            {"&eta;", "\u03B7"}, // greek small letter eta
            {"&theta;", "\u03B8"}, // greek small letter theta
            {"&iota;", "\u03B9"}, // greek small letter iota
            {"&kappa;", "\u03BA"}, // greek small letter kappa
            {"&lambda;", "\u03BB"}, // greek small letter lambda
            {"&mu;", "\u03BC"}, // greek small letter mu
            {"&nu;", "\u03BD"}, // greek small letter nu
            {"&xi;", "\u03BE"}, // greek small letter xi
            {"&omicron;", "\u03BF"}, // greek small letter omicron
            {"&pi;", "\u03C0"}, // greek small letter pi
            {"&rho;", "\u03C1"}, // greek small letter rho
            {"&sigmaf;", "\u03C2"}, // greek small letter final sigma
            {"&sigma;", "\u03C3"}, // greek small letter sigma
            {"&tau;", "\u03C4"}, // greek small letter tau
            {"&upsilon;", "\u03C5"}, // greek small letter upsilon
            {"&phi;", "\u03C6"}, // greek small letter phi
            {"&chi;", "\u03C7"}, // greek small letter chi
            {"&psi;", "\u03C8"}, // greek small letter psi
            {"&omega;", "\u03C9"}, // greek small letter omega
            {"&thetasym;", "\u03D1"}, // greek small letter theta symbol
            {"&upsih;", "\u03D2"}, // greek upsilon with hook symbol
            {"&piv;", "\u03D6"}, // greek pi symbol

            /* General Punctuation */
            {"&bull;", "\u2022"}, // bullet = black small circle (not the bullet operator)
            {"&hellip;", "\u2026"}, // horizontal ellipsis = three dot leader
            {"&prime;", "\u2032"}, // prime = minutes = feet
            {"&Prime;", "\u2033"}, // double prime = seconds = inches
            {"&oline;", "\u203E"}, // overline = spacing overscore
            {"&frasl;", "\u2044"}, // fraction slash

            /* Letterlike Symbols */
            {"&weierp;", "\u2118"}, // script capital P = power set = Weierstrass p
            {"&image;", "\u2111"}, // blackletter capital I = imaginary part
            {"&real;", "\u211C"}, // blackletter capital R = real part symbol
            {"&trade;", "\u2122"}, // trade mark sign
            {"&alefsym;", "\u2135"}, // alef symbol = first transfinite cardinal (not hebrew letter alef)

            /* Arrows */
            {"&larr;", "\u2190"}, // leftwards arrow
            {"&uarr;", "\u2191"}, // upwards arrow
            {"&rarr;", "\u2192"}, // rightwards arrow
            {"&darr;", "\u2193"}, // downwards arrow
            {"&harr;", "\u2194"}, // left right arrow
            {"&crarr;", "\u21B5"}, // downwards arrow with corner leftwards = carriage return
            {"&lArr;", "\u21D0"}, // leftwards double arrow (usable for 'is implied by')
            {"&uArr;", "\u21D1"}, // upwards double arrow
            {"&rArr;", "\u21D2"}, // rightwards double arrow (usable for 'implies')
            {"&dArr;", "\u21D3"}, // downwards double arrow
            {"&hArr;", "\u21D4"}, // left right double arrow

            /* Mathematical Operators */
            {"&forall;", "\u2200"}, // for all
            {"&part;", "\u2202"}, // partial differential
            {"&exist;", "\u2203"}, // there exists
            {"&empty;", "\u2205"}, // empty set = null set = diameter
            {"&nabla;", "\u2207"}, // nabla = backward difference
            {"&isin;", "\u2208"}, // element of
            {"&notin;", "\u2209"}, // not an element of
            {"&ni;", "\u220B"}, // contains as member
            {"&prod;", "\u220F"}, // n-ary product = product sign (not greek capital pi)
            {"&sum;", "\u2211"}, // n-ary summation (not greek capital sigma)
            {"&minus;", "\u2212"}, // minus sign
            {"&lowast;", "\u2217"}, // asterisk operator
            {"&radic;", "\u221A"}, // square root = radical sign
            {"&prop;", "\u221D"}, // proportional to
            {"&infin;", "\u221E"}, // infinity
            {"&ang;", "\u2220"}, // angle
            {"&and;", "\u2227"}, // logical and = wedge
            {"&or;", "\u2228"}, // logical or = vee
            {"&cap;", "\u2229"}, // intersection = cap
            {"&cup;", "\u222A"}, // union = cup
            {"&int;", "\u222B"}, // integral
            {"&there4;", "\u2234"}, // therefore
            {"&sim;", "\u223C"}, // tilde operator = varies with = similar to (not the ASCII tilde)
            {"&cong;", "\u2245"}, // approximately equal to
            {"&asymp;", "\u2248"}, // almost equal to = asymptotic to
            {"&ne;", "\u2260"}, // not equal to
            {"&equiv;", "\u2261"}, // identical to
            {"&le;", "\u2264"}, // less-than or equal to
            {"&ge;", "\u2265"}, // greater-than or equal to
            {"&sub;", "\u2282"}, // subset of
            {"&sup;", "\u2283"}, // superset of
            {"&sube;", "\u2286"}, // subset of or equal to
            {"&supe;", "\u2287"}, // superset of or equal to
            {"&oplus;", "\u2295"}, // circled plus = direct sum
            {"&otimes;", "\u2297"}, // circled times = vector product
            {"&perp;", "\u22A5"}, // up tack = orthogonal to = perpendicular
            {"&sdot;", "\u22C5"}, // dot operator (not the middle dot)

            /* Miscellaneous Technical */
            {"&lceil;", "\u2308"}, // left ceiling = apl upstile
            {"&rceil;", "\u2309"}, // right ceiling
            {"&lfloor;", "\u230A"}, // left floor = apl downstile
            {"&rfloor;", "\u230B"}, // right floor
            {"&lang;", "\u2329"}, // left-pointing angle bracket = bra (not the less-than sign)
            {"&rang;", "\u232A"}, // right-pointing angle bracket = ket (not the greater-than sign)

            /* Geometric Shapes */
            {"&loz;", "\u25CA"}, // lozenge

            /* Miscellaneous Symbols ("black" here means filled, not hollow) */
            {"&spades;", "\u2660"}, // black spade suit
            {"&clubs;", "\u2663"}, // black club suit = shamrock
            {"&hearts;", "\u2665"}, // black heart suit = valentine
            {"&diams;", "\u2666"}, // black diamond suit

            /* Markup-significant characters */
            {"&quot;", "\""}, // quotation mark = APL quote
            {"&amp;", "\u0026"}, // ampersand
            {"&lt;", "\u003C"}, // less-than sign
            {"&gt;", "\u003E"}, // greater-than sign

            /* Latin Extended-A */
            {"&OElig;", "\u0152"}, // latin capital ligature OE
            {"&oelig;", "\u0153"}, // latin small ligature oe
            {"&Scaron;", "\u0160"}, // latin capital letter S with caron
            {"&scaron;", "\u0161"}, // latin small letter s with caron
            {"&Yuml;", "\u0178"}, // latin capital letter Y with diaeresis

            /* Spacing Modifier Letters */
            {"&circ;", "\u02C6"}, // modifier letter circumflex accent
            {"&tilde;", "\u02DC"}, // small tilde

            /* General Punctuation */
            {"&ensp;", "\u2002"}, // en space
            {"&emsp;", "\u2003"}, // em space
            {"&thinsp;", "\u2009"}, // thin space
            {"&zwnj;", "\u200C"}, // zero width non-joiner
            {"&zwj;", "\u200D"}, // zero width joiner
            {"&lrm;", "\u200E"}, // left-to-right mark
            {"&rlm;", "\u200F"}, // right-to-left mark
            {"&ndash;", "\u2013"}, // en dash
            {"&mdash;", "\u2014"}, // em dash
            {"&lsquo;", "\u2018"}, // left single quotation mark
            {"&rsquo;", "\u2019"}, // right single quotation mark
            {"&sbquo;", "\u201A"}, // single low-9 quotation mark
            {"&ldquo;", "\u201C"}, // left double quotation mark
            {"&rdquo;", "\u201D"}, // right double quotation mark
            {"&bdquo;", "\u201E"}, // double low-9 quotation mark
            {"&dagger;", "\u2020"}, // dagger
            {"&Dagger;", "\u2021"}, // double dagger
            {"&permil;", "\u2030"}, // per mille sign
            {"&lsaquo;", "\u2039"}, // single left-pointing angle quotation mark
            {"&rsaquo;", "\u203A"}, // single right-pointing angle quotation mark
            {"&euro;", "\u20AC"} // euro sign
        };
        for (String[] entity : entities) {
            String named = entity[0];
            String character = entity[1];
            // Every table character is a single UTF-16 unit, so charAt(0) is its code point.
            String numeric = "&#" + (int) character.charAt(0) + ";";
            entityEscapeMap.put(character, named);
            escapeEntityMap.put(named, character);
            escapeEntityMap.put(numeric, character);
        }
    }
}