1 // todo....figure out what license changes need to be made here...
3 // HTMLParser Library v1_3_20030511 - A java-based parser for HTML
4 // Copyright (C) Dec 31, 2000 Somik Raha
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 2.1 of the License, or (at your option) any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // Lesser General Public License for more details.
16 // You should have received a copy of the GNU Lesser General Public
17 // License along with this library; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 // For any questions or suggestions, you can write to me at :
21 // Email :somik@industriallogic.com
25 // Extreme Programmer & Coach
26 // Industrial Logic Corporation
27 // 2583 Cedar Street, Berkeley,
29 // Website : http://www.industriallogic.com
31 // This class was contributed by
37 import java.util.HashMap;
38 import java.util.Iterator;
/**
 * Translate numeric character references and character entity references to unicode characters.
 * Based on tables found at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">
 * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
 * <p><b>Note: Do not edit! This class is created by the Generate class.</b>
 * <p>NOTE(review): the reference parsing below (hexadecimal and supplementary
 * character support) was changed by hand; presumably the same change has to be
 * mirrored in the generator template -- confirm before regenerating.
 * <p>Typical usage:
 * <pre>
 *      String s = Translate.decode (getTextFromHtmlPage ());
 * </pre>
 * <p>All state is built once in static initializers and only read afterwards.
 * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
 */
public class Translate
{
    /**
     * Table mapping entity reference kernel to character.
     * The kernel is the entity name without the leading ampersand and
     * trailing semi-colon, e.g. <code>"nbsp"</code>.
     * <p><code>String</code>-><code>Character</code>
     */
    protected static Map refChar;

    static
    {
        refChar = new HashMap (1000);

        // Portions (C) International Organization for Standardization 1986
        // Permission to copy in any form is granted for use with
        // conforming SGML systems and applications as defined in
        // ISO 8879, provided this notice is included in all copies.
        // Character entity set. Typical invocation:
        // <!ENTITY % HTMLlat1 PUBLIC
        // "-//W3C//ENTITIES Latin 1//EN//HTML">
        refChar.put ("nbsp", new Character ('\u00a0')); // no-break space = non-breaking space, U+00A0 ISOnum
        refChar.put ("iexcl", new Character ('\u00a1')); // inverted exclamation mark, U+00A1 ISOnum
        refChar.put ("cent", new Character ('\u00a2')); // cent sign, U+00A2 ISOnum
        refChar.put ("pound", new Character ('\u00a3')); // pound sign, U+00A3 ISOnum
        refChar.put ("curren", new Character ('\u00a4')); // currency sign, U+00A4 ISOnum
        refChar.put ("yen", new Character ('\u00a5')); // yen sign = yuan sign, U+00A5 ISOnum
        refChar.put ("brvbar", new Character ('\u00a6')); // broken bar = broken vertical bar, U+00A6 ISOnum
        refChar.put ("sect", new Character ('\u00a7')); // section sign, U+00A7 ISOnum
        refChar.put ("uml", new Character ('\u00a8')); // diaeresis = spacing diaeresis, U+00A8 ISOdia
        refChar.put ("copy", new Character ('\u00a9')); // copyright sign, U+00A9 ISOnum
        refChar.put ("ordf", new Character ('\u00aa')); // feminine ordinal indicator, U+00AA ISOnum
        refChar.put ("laquo", new Character ('\u00ab')); // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
        refChar.put ("not", new Character ('\u00ac')); // not sign, U+00AC ISOnum
        refChar.put ("shy", new Character ('\u00ad')); // soft hyphen = discretionary hyphen, U+00AD ISOnum
        refChar.put ("reg", new Character ('\u00ae')); // registered sign = registered trade mark sign, U+00AE ISOnum
        refChar.put ("macr", new Character ('\u00af')); // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
        refChar.put ("deg", new Character ('\u00b0')); // degree sign, U+00B0 ISOnum
        refChar.put ("plusmn", new Character ('\u00b1')); // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
        refChar.put ("sup2", new Character ('\u00b2')); // superscript two = superscript digit two = squared, U+00B2 ISOnum
        refChar.put ("sup3", new Character ('\u00b3')); // superscript three = superscript digit three = cubed, U+00B3 ISOnum
        refChar.put ("acute", new Character ('\u00b4')); // acute accent = spacing acute, U+00B4 ISOdia
        refChar.put ("micro", new Character ('\u00b5')); // micro sign, U+00B5 ISOnum
        refChar.put ("para", new Character ('\u00b6')); // pilcrow sign = paragraph sign, U+00B6 ISOnum
        refChar.put ("middot", new Character ('\u00b7')); // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
        refChar.put ("cedil", new Character ('\u00b8')); // cedilla = spacing cedilla, U+00B8 ISOdia
        refChar.put ("sup1", new Character ('\u00b9')); // superscript one = superscript digit one, U+00B9 ISOnum
        refChar.put ("ordm", new Character ('\u00ba')); // masculine ordinal indicator, U+00BA ISOnum
        refChar.put ("raquo", new Character ('\u00bb')); // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
        refChar.put ("frac14", new Character ('\u00bc')); // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
        refChar.put ("frac12", new Character ('\u00bd')); // vulgar fraction one half = fraction one half, U+00BD ISOnum
        refChar.put ("frac34", new Character ('\u00be')); // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
        refChar.put ("iquest", new Character ('\u00bf')); // inverted question mark = turned question mark, U+00BF ISOnum
        refChar.put ("Agrave", new Character ('\u00c0')); // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
        refChar.put ("Aacute", new Character ('\u00c1')); // latin capital letter A with acute, U+00C1 ISOlat1
        refChar.put ("Acirc", new Character ('\u00c2')); // latin capital letter A with circumflex, U+00C2 ISOlat1
        refChar.put ("Atilde", new Character ('\u00c3')); // latin capital letter A with tilde, U+00C3 ISOlat1
        refChar.put ("Auml", new Character ('\u00c4')); // latin capital letter A with diaeresis, U+00C4 ISOlat1
        refChar.put ("Aring", new Character ('\u00c5')); // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
        refChar.put ("AElig", new Character ('\u00c6')); // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
        refChar.put ("Ccedil", new Character ('\u00c7')); // latin capital letter C with cedilla, U+00C7 ISOlat1
        refChar.put ("Egrave", new Character ('\u00c8')); // latin capital letter E with grave, U+00C8 ISOlat1
        refChar.put ("Eacute", new Character ('\u00c9')); // latin capital letter E with acute, U+00C9 ISOlat1
        refChar.put ("Ecirc", new Character ('\u00ca')); // latin capital letter E with circumflex, U+00CA ISOlat1
        refChar.put ("Euml", new Character ('\u00cb')); // latin capital letter E with diaeresis, U+00CB ISOlat1
        refChar.put ("Igrave", new Character ('\u00cc')); // latin capital letter I with grave, U+00CC ISOlat1
        refChar.put ("Iacute", new Character ('\u00cd')); // latin capital letter I with acute, U+00CD ISOlat1
        refChar.put ("Icirc", new Character ('\u00ce')); // latin capital letter I with circumflex, U+00CE ISOlat1
        refChar.put ("Iuml", new Character ('\u00cf')); // latin capital letter I with diaeresis, U+00CF ISOlat1
        refChar.put ("ETH", new Character ('\u00d0')); // latin capital letter ETH, U+00D0 ISOlat1
        refChar.put ("Ntilde", new Character ('\u00d1')); // latin capital letter N with tilde, U+00D1 ISOlat1
        refChar.put ("Ograve", new Character ('\u00d2')); // latin capital letter O with grave, U+00D2 ISOlat1
        refChar.put ("Oacute", new Character ('\u00d3')); // latin capital letter O with acute, U+00D3 ISOlat1
        refChar.put ("Ocirc", new Character ('\u00d4')); // latin capital letter O with circumflex, U+00D4 ISOlat1
        refChar.put ("Otilde", new Character ('\u00d5')); // latin capital letter O with tilde, U+00D5 ISOlat1
        refChar.put ("Ouml", new Character ('\u00d6')); // latin capital letter O with diaeresis, U+00D6 ISOlat1
        refChar.put ("times", new Character ('\u00d7')); // multiplication sign, U+00D7 ISOnum
        refChar.put ("Oslash", new Character ('\u00d8')); // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
        refChar.put ("Ugrave", new Character ('\u00d9')); // latin capital letter U with grave, U+00D9 ISOlat1
        refChar.put ("Uacute", new Character ('\u00da')); // latin capital letter U with acute, U+00DA ISOlat1
        refChar.put ("Ucirc", new Character ('\u00db')); // latin capital letter U with circumflex, U+00DB ISOlat1
        refChar.put ("Uuml", new Character ('\u00dc')); // latin capital letter U with diaeresis, U+00DC ISOlat1
        refChar.put ("Yacute", new Character ('\u00dd')); // latin capital letter Y with acute, U+00DD ISOlat1
        refChar.put ("THORN", new Character ('\u00de')); // latin capital letter THORN, U+00DE ISOlat1
        refChar.put ("szlig", new Character ('\u00df')); // latin small letter sharp s = ess-zed, U+00DF ISOlat1
        refChar.put ("agrave", new Character ('\u00e0')); // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
        refChar.put ("aacute", new Character ('\u00e1')); // latin small letter a with acute, U+00E1 ISOlat1
        refChar.put ("acirc", new Character ('\u00e2')); // latin small letter a with circumflex, U+00E2 ISOlat1
        refChar.put ("atilde", new Character ('\u00e3')); // latin small letter a with tilde, U+00E3 ISOlat1
        refChar.put ("auml", new Character ('\u00e4')); // latin small letter a with diaeresis, U+00E4 ISOlat1
        refChar.put ("aring", new Character ('\u00e5')); // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
        refChar.put ("aelig", new Character ('\u00e6')); // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
        refChar.put ("ccedil", new Character ('\u00e7')); // latin small letter c with cedilla, U+00E7 ISOlat1
        refChar.put ("egrave", new Character ('\u00e8')); // latin small letter e with grave, U+00E8 ISOlat1
        refChar.put ("eacute", new Character ('\u00e9')); // latin small letter e with acute, U+00E9 ISOlat1
        refChar.put ("ecirc", new Character ('\u00ea')); // latin small letter e with circumflex, U+00EA ISOlat1
        refChar.put ("euml", new Character ('\u00eb')); // latin small letter e with diaeresis, U+00EB ISOlat1
        refChar.put ("igrave", new Character ('\u00ec')); // latin small letter i with grave, U+00EC ISOlat1
        refChar.put ("iacute", new Character ('\u00ed')); // latin small letter i with acute, U+00ED ISOlat1
        refChar.put ("icirc", new Character ('\u00ee')); // latin small letter i with circumflex, U+00EE ISOlat1
        refChar.put ("iuml", new Character ('\u00ef')); // latin small letter i with diaeresis, U+00EF ISOlat1
        refChar.put ("eth", new Character ('\u00f0')); // latin small letter eth, U+00F0 ISOlat1
        refChar.put ("ntilde", new Character ('\u00f1')); // latin small letter n with tilde, U+00F1 ISOlat1
        refChar.put ("ograve", new Character ('\u00f2')); // latin small letter o with grave, U+00F2 ISOlat1
        refChar.put ("oacute", new Character ('\u00f3')); // latin small letter o with acute, U+00F3 ISOlat1
        refChar.put ("ocirc", new Character ('\u00f4')); // latin small letter o with circumflex, U+00F4 ISOlat1
        refChar.put ("otilde", new Character ('\u00f5')); // latin small letter o with tilde, U+00F5 ISOlat1
        refChar.put ("ouml", new Character ('\u00f6')); // latin small letter o with diaeresis, U+00F6 ISOlat1
        refChar.put ("divide", new Character ('\u00f7')); // division sign, U+00F7 ISOnum
        refChar.put ("oslash", new Character ('\u00f8')); // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
        refChar.put ("ugrave", new Character ('\u00f9')); // latin small letter u with grave, U+00F9 ISOlat1
        refChar.put ("uacute", new Character ('\u00fa')); // latin small letter u with acute, U+00FA ISOlat1
        refChar.put ("ucirc", new Character ('\u00fb')); // latin small letter u with circumflex, U+00FB ISOlat1
        refChar.put ("uuml", new Character ('\u00fc')); // latin small letter u with diaeresis, U+00FC ISOlat1
        refChar.put ("yacute", new Character ('\u00fd')); // latin small letter y with acute, U+00FD ISOlat1
        refChar.put ("thorn", new Character ('\u00fe')); // latin small letter thorn, U+00FE ISOlat1
        refChar.put ("yuml", new Character ('\u00ff')); // latin small letter y with diaeresis, U+00FF ISOlat1
        // Mathematical, Greek and Symbolic characters for HTML
        // Character entity set. Typical invocation:
        // <!ENTITY % HTMLsymbol PUBLIC
        // "-//W3C//ENTITIES Symbols//EN//HTML">

        // Portions (C) International Organization for Standardization 1986:
        // Permission to copy in any form is granted for use with
        // conforming SGML systems and applications as defined in
        // ISO 8879, provided this notice is included in all copies.
        // Relevant ISO entity set is given unless names are newly introduced.
        // New names (i.e., not in ISO 8879 list) do not clash with any
        // existing ISO 8879 entity names. ISO 10646 character numbers
        // are given for each character, in hex. CDATA values are decimal
        // conversions of the ISO 10646 values and refer to the document
        // character set. Names are ISO 10646 names.
        // Latin Extended-B
        refChar.put ("fnof", new Character ('\u0192')); // latin small f with hook = function = florin, U+0192 ISOtech
        // Greek
        refChar.put ("Alpha", new Character ('\u0391')); // greek capital letter alpha, U+0391
        refChar.put ("Beta", new Character ('\u0392')); // greek capital letter beta, U+0392
        refChar.put ("Gamma", new Character ('\u0393')); // greek capital letter gamma, U+0393 ISOgrk3
        refChar.put ("Delta", new Character ('\u0394')); // greek capital letter delta, U+0394 ISOgrk3
        refChar.put ("Epsilon", new Character ('\u0395')); // greek capital letter epsilon, U+0395
        refChar.put ("Zeta", new Character ('\u0396')); // greek capital letter zeta, U+0396
        refChar.put ("Eta", new Character ('\u0397')); // greek capital letter eta, U+0397
        refChar.put ("Theta", new Character ('\u0398')); // greek capital letter theta, U+0398 ISOgrk3
        refChar.put ("Iota", new Character ('\u0399')); // greek capital letter iota, U+0399
        refChar.put ("Kappa", new Character ('\u039a')); // greek capital letter kappa, U+039A
        refChar.put ("Lambda", new Character ('\u039b')); // greek capital letter lambda, U+039B ISOgrk3
        refChar.put ("Mu", new Character ('\u039c')); // greek capital letter mu, U+039C
        refChar.put ("Nu", new Character ('\u039d')); // greek capital letter nu, U+039D
        refChar.put ("Xi", new Character ('\u039e')); // greek capital letter xi, U+039E ISOgrk3
        refChar.put ("Omicron", new Character ('\u039f')); // greek capital letter omicron, U+039F
        refChar.put ("Pi", new Character ('\u03a0')); // greek capital letter pi, U+03A0 ISOgrk3
        refChar.put ("Rho", new Character ('\u03a1')); // greek capital letter rho, U+03A1
        // there is no Sigmaf, and no U+03A2 character either
        refChar.put ("Sigma", new Character ('\u03a3')); // greek capital letter sigma, U+03A3 ISOgrk3
        refChar.put ("Tau", new Character ('\u03a4')); // greek capital letter tau, U+03A4
        refChar.put ("Upsilon", new Character ('\u03a5')); // greek capital letter upsilon, U+03A5 ISOgrk3
        refChar.put ("Phi", new Character ('\u03a6')); // greek capital letter phi, U+03A6 ISOgrk3
        refChar.put ("Chi", new Character ('\u03a7')); // greek capital letter chi, U+03A7
        refChar.put ("Psi", new Character ('\u03a8')); // greek capital letter psi, U+03A8 ISOgrk3
        refChar.put ("Omega", new Character ('\u03a9')); // greek capital letter omega, U+03A9 ISOgrk3
        refChar.put ("alpha", new Character ('\u03b1')); // greek small letter alpha, U+03B1 ISOgrk3
        refChar.put ("beta", new Character ('\u03b2')); // greek small letter beta, U+03B2 ISOgrk3
        refChar.put ("gamma", new Character ('\u03b3')); // greek small letter gamma, U+03B3 ISOgrk3
        refChar.put ("delta", new Character ('\u03b4')); // greek small letter delta, U+03B4 ISOgrk3
        refChar.put ("epsilon", new Character ('\u03b5')); // greek small letter epsilon, U+03B5 ISOgrk3
        refChar.put ("zeta", new Character ('\u03b6')); // greek small letter zeta, U+03B6 ISOgrk3
        refChar.put ("eta", new Character ('\u03b7')); // greek small letter eta, U+03B7 ISOgrk3
        refChar.put ("theta", new Character ('\u03b8')); // greek small letter theta, U+03B8 ISOgrk3
        refChar.put ("iota", new Character ('\u03b9')); // greek small letter iota, U+03B9 ISOgrk3
        refChar.put ("kappa", new Character ('\u03ba')); // greek small letter kappa, U+03BA ISOgrk3
        refChar.put ("lambda", new Character ('\u03bb')); // greek small letter lambda, U+03BB ISOgrk3
        refChar.put ("mu", new Character ('\u03bc')); // greek small letter mu, U+03BC ISOgrk3
        refChar.put ("nu", new Character ('\u03bd')); // greek small letter nu, U+03BD ISOgrk3
        refChar.put ("xi", new Character ('\u03be')); // greek small letter xi, U+03BE ISOgrk3
        refChar.put ("omicron", new Character ('\u03bf')); // greek small letter omicron, U+03BF NEW
        refChar.put ("pi", new Character ('\u03c0')); // greek small letter pi, U+03C0 ISOgrk3
        refChar.put ("rho", new Character ('\u03c1')); // greek small letter rho, U+03C1 ISOgrk3
        refChar.put ("sigmaf", new Character ('\u03c2')); // greek small letter final sigma, U+03C2 ISOgrk3
        refChar.put ("sigma", new Character ('\u03c3')); // greek small letter sigma, U+03C3 ISOgrk3
        refChar.put ("tau", new Character ('\u03c4')); // greek small letter tau, U+03C4 ISOgrk3
        refChar.put ("upsilon", new Character ('\u03c5')); // greek small letter upsilon, U+03C5 ISOgrk3
        refChar.put ("phi", new Character ('\u03c6')); // greek small letter phi, U+03C6 ISOgrk3
        refChar.put ("chi", new Character ('\u03c7')); // greek small letter chi, U+03C7 ISOgrk3
        refChar.put ("psi", new Character ('\u03c8')); // greek small letter psi, U+03C8 ISOgrk3
        refChar.put ("omega", new Character ('\u03c9')); // greek small letter omega, U+03C9 ISOgrk3
        refChar.put ("thetasym", new Character ('\u03d1')); // greek small letter theta symbol, U+03D1 NEW
        refChar.put ("upsih", new Character ('\u03d2')); // greek upsilon with hook symbol, U+03D2 NEW
        refChar.put ("piv", new Character ('\u03d6')); // greek pi symbol, U+03D6 ISOgrk3
        // General Punctuation
        refChar.put ("bull", new Character ('\u2022')); // bullet = black small circle, U+2022 ISOpub
        // bullet is NOT the same as bullet operator, U+2219
        refChar.put ("hellip", new Character ('\u2026')); // horizontal ellipsis = three dot leader, U+2026 ISOpub
        refChar.put ("prime", new Character ('\u2032')); // prime = minutes = feet, U+2032 ISOtech
        refChar.put ("Prime", new Character ('\u2033')); // double prime = seconds = inches, U+2033 ISOtech
        refChar.put ("oline", new Character ('\u203e')); // overline = spacing overscore, U+203E NEW
        refChar.put ("frasl", new Character ('\u2044')); // fraction slash, U+2044 NEW
        // Letterlike Symbols
        refChar.put ("weierp", new Character ('\u2118')); // script capital P = power set = Weierstrass p, U+2118 ISOamso
        refChar.put ("image", new Character ('\u2111')); // blackletter capital I = imaginary part, U+2111 ISOamso
        refChar.put ("real", new Character ('\u211c')); // blackletter capital R = real part symbol, U+211C ISOamso
        refChar.put ("trade", new Character ('\u2122')); // trade mark sign, U+2122 ISOnum
        refChar.put ("alefsym", new Character ('\u2135')); // alef symbol = first transfinite cardinal, U+2135 NEW
        // alef symbol is NOT the same as hebrew letter alef,
        // U+05D0 although the same glyph could be used to depict both characters
        // Arrows
        refChar.put ("larr", new Character ('\u2190')); // leftwards arrow, U+2190 ISOnum
        refChar.put ("uarr", new Character ('\u2191')); // upwards arrow, U+2191 ISOnum
        refChar.put ("rarr", new Character ('\u2192')); // rightwards arrow, U+2192 ISOnum
        refChar.put ("darr", new Character ('\u2193')); // downwards arrow, U+2193 ISOnum
        refChar.put ("harr", new Character ('\u2194')); // left right arrow, U+2194 ISOamsa
        refChar.put ("crarr", new Character ('\u21b5')); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
        refChar.put ("lArr", new Character ('\u21d0')); // leftwards double arrow, U+21D0 ISOtech
        // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
        // but also does not have any other character for that function. So ? lArr can
        // be used for 'is implied by' as ISOtech suggests
        refChar.put ("uArr", new Character ('\u21d1')); // upwards double arrow, U+21D1 ISOamsa
        refChar.put ("rArr", new Character ('\u21d2')); // rightwards double arrow, U+21D2 ISOtech
        // ISO 10646 does not say this is the 'implies' character but does not have
        // another character with this function so ?
        // rArr can be used for 'implies' as ISOtech suggests
        refChar.put ("dArr", new Character ('\u21d3')); // downwards double arrow, U+21D3 ISOamsa
        refChar.put ("hArr", new Character ('\u21d4')); // left right double arrow, U+21D4 ISOamsa
        // Mathematical Operators
        refChar.put ("forall", new Character ('\u2200')); // for all, U+2200 ISOtech
        refChar.put ("part", new Character ('\u2202')); // partial differential, U+2202 ISOtech
        refChar.put ("exist", new Character ('\u2203')); // there exists, U+2203 ISOtech
        refChar.put ("empty", new Character ('\u2205')); // empty set = null set = diameter, U+2205 ISOamso
        refChar.put ("nabla", new Character ('\u2207')); // nabla = backward difference, U+2207 ISOtech
        refChar.put ("isin", new Character ('\u2208')); // element of, U+2208 ISOtech
        refChar.put ("notin", new Character ('\u2209')); // not an element of, U+2209 ISOtech
        refChar.put ("ni", new Character ('\u220b')); // contains as member, U+220B ISOtech
        // should there be a more memorable name than 'ni'?
        refChar.put ("prod", new Character ('\u220f')); // n-ary product = product sign, U+220F ISOamsb
        // prod is NOT the same character as U+03A0 'greek capital letter pi' though
        // the same glyph might be used for both
        refChar.put ("sum", new Character ('\u2211')); // n-ary summation, U+2211 ISOamsb
        // sum is NOT the same character as U+03A3 'greek capital letter sigma'
        // though the same glyph might be used for both
        refChar.put ("minus", new Character ('\u2212')); // minus sign, U+2212 ISOtech
        refChar.put ("lowast", new Character ('\u2217')); // asterisk operator, U+2217 ISOtech
        refChar.put ("radic", new Character ('\u221a')); // square root = radical sign, U+221A ISOtech
        refChar.put ("prop", new Character ('\u221d')); // proportional to, U+221D ISOtech
        refChar.put ("infin", new Character ('\u221e')); // infinity, U+221E ISOtech
        refChar.put ("ang", new Character ('\u2220')); // angle, U+2220 ISOamso
        refChar.put ("and", new Character ('\u2227')); // logical and = wedge, U+2227 ISOtech
        refChar.put ("or", new Character ('\u2228')); // logical or = vee, U+2228 ISOtech
        refChar.put ("cap", new Character ('\u2229')); // intersection = cap, U+2229 ISOtech
        refChar.put ("cup", new Character ('\u222a')); // union = cup, U+222A ISOtech
        refChar.put ("int", new Character ('\u222b')); // integral, U+222B ISOtech
        refChar.put ("there4", new Character ('\u2234')); // therefore, U+2234 ISOtech
        refChar.put ("sim", new Character ('\u223c')); // tilde operator = varies with = similar to, U+223C ISOtech
        // tilde operator is NOT the same character as the tilde, U+007E,
        // although the same glyph might be used to represent both
        refChar.put ("cong", new Character ('\u2245')); // approximately equal to, U+2245 ISOtech
        refChar.put ("asymp", new Character ('\u2248')); // almost equal to = asymptotic to, U+2248 ISOamsr
        refChar.put ("ne", new Character ('\u2260')); // not equal to, U+2260 ISOtech
        refChar.put ("equiv", new Character ('\u2261')); // identical to, U+2261 ISOtech
        refChar.put ("le", new Character ('\u2264')); // less-than or equal to, U+2264 ISOtech
        refChar.put ("ge", new Character ('\u2265')); // greater-than or equal to, U+2265 ISOtech
        refChar.put ("sub", new Character ('\u2282')); // subset of, U+2282 ISOtech
        refChar.put ("sup", new Character ('\u2283')); // superset of, U+2283 ISOtech
        // note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
        // font encoding and is not included. Should it be, for symmetry?
        refChar.put ("nsub", new Character ('\u2284')); // not a subset of, U+2284 ISOamsn
        refChar.put ("sube", new Character ('\u2286')); // subset of or equal to, U+2286 ISOtech
        refChar.put ("supe", new Character ('\u2287')); // superset of or equal to, U+2287 ISOtech
        refChar.put ("oplus", new Character ('\u2295')); // circled plus = direct sum, U+2295 ISOamsb
        refChar.put ("otimes", new Character ('\u2297')); // circled times = vector product, U+2297 ISOamsb
        refChar.put ("perp", new Character ('\u22a5')); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
        refChar.put ("sdot", new Character ('\u22c5')); // dot operator, U+22C5 ISOamsb
        // dot operator is NOT the same character as U+00B7 middle dot
        // Miscellaneous Technical
        refChar.put ("lceil", new Character ('\u2308')); // left ceiling = apl upstile, U+2308 ISOamsc
        refChar.put ("rceil", new Character ('\u2309')); // right ceiling, U+2309 ISOamsc
        refChar.put ("lfloor", new Character ('\u230a')); // left floor = apl downstile, U+230A ISOamsc
        refChar.put ("rfloor", new Character ('\u230b')); // right floor, U+230B ISOamsc
        refChar.put ("lang", new Character ('\u2329')); // left-pointing angle bracket = bra, U+2329 ISOtech
        // lang is NOT the same character as U+003C 'less than'
        // or U+2039 'single left-pointing angle quotation mark'
        refChar.put ("rang", new Character ('\u232a')); // right-pointing angle bracket = ket, U+232A ISOtech
        // rang is NOT the same character as U+003E 'greater than'
        // or U+203A 'single right-pointing angle quotation mark'
        // Geometric Shapes
        refChar.put ("loz", new Character ('\u25ca')); // lozenge, U+25CA ISOpub
        // Miscellaneous Symbols
        refChar.put ("spades", new Character ('\u2660')); // black spade suit, U+2660 ISOpub
        // black here seems to mean filled as opposed to hollow
        refChar.put ("clubs", new Character ('\u2663')); // black club suit = shamrock, U+2663 ISOpub
        refChar.put ("hearts", new Character ('\u2665')); // black heart suit = valentine, U+2665 ISOpub
        refChar.put ("diams", new Character ('\u2666')); // black diamond suit, U+2666 ISOpub
        // Special characters for HTML
        // Character entity set. Typical invocation:
        // <!ENTITY % HTMLspecial PUBLIC
        // "-//W3C//ENTITIES Special//EN//HTML">

        // Portions (C) International Organization for Standardization 1986:
        // Permission to copy in any form is granted for use with
        // conforming SGML systems and applications as defined in
        // ISO 8879, provided this notice is included in all copies.
        // Relevant ISO entity set is given unless names are newly introduced.
        // New names (i.e., not in ISO 8879 list) do not clash with any
        // existing ISO 8879 entity names. ISO 10646 character numbers
        // are given for each character, in hex. CDATA values are decimal
        // conversions of the ISO 10646 values and refer to the document
        // character set. Names are ISO 10646 names.
        // C0 Controls and Basic Latin
        refChar.put ("quot", new Character ('\u0022')); // quotation mark = APL quote, U+0022 ISOnum
        refChar.put ("amp", new Character ('\u0026')); // ampersand, U+0026 ISOnum
        refChar.put ("lt", new Character ('\u003c')); // less-than sign, U+003C ISOnum
        refChar.put ("gt", new Character ('\u003e')); // greater-than sign, U+003E ISOnum
        // Latin Extended-A
        refChar.put ("OElig", new Character ('\u0152')); // latin capital ligature OE, U+0152 ISOlat2
        refChar.put ("oelig", new Character ('\u0153')); // latin small ligature oe, U+0153 ISOlat2
        // ligature is a misnomer, this is a separate character in some languages
        refChar.put ("Scaron", new Character ('\u0160')); // latin capital letter S with caron, U+0160 ISOlat2
        refChar.put ("scaron", new Character ('\u0161')); // latin small letter s with caron, U+0161 ISOlat2
        refChar.put ("Yuml", new Character ('\u0178')); // latin capital letter Y with diaeresis, U+0178 ISOlat2
        // Spacing Modifier Letters
        refChar.put ("circ", new Character ('\u02c6')); // modifier letter circumflex accent, U+02C6 ISOpub
        refChar.put ("tilde", new Character ('\u02dc')); // small tilde, U+02DC ISOdia
        // General Punctuation
        refChar.put ("ensp", new Character ('\u2002')); // en space, U+2002 ISOpub
        refChar.put ("emsp", new Character ('\u2003')); // em space, U+2003 ISOpub
        refChar.put ("thinsp", new Character ('\u2009')); // thin space, U+2009 ISOpub
        refChar.put ("zwnj", new Character ('\u200c')); // zero width non-joiner, U+200C NEW RFC 2070
        refChar.put ("zwj", new Character ('\u200d')); // zero width joiner, U+200D NEW RFC 2070
        refChar.put ("lrm", new Character ('\u200e')); // left-to-right mark, U+200E NEW RFC 2070
        refChar.put ("rlm", new Character ('\u200f')); // right-to-left mark, U+200F NEW RFC 2070
        refChar.put ("ndash", new Character ('\u2013')); // en dash, U+2013 ISOpub
        refChar.put ("mdash", new Character ('\u2014')); // em dash, U+2014 ISOpub
        refChar.put ("lsquo", new Character ('\u2018')); // left single quotation mark, U+2018 ISOnum
        refChar.put ("rsquo", new Character ('\u2019')); // right single quotation mark, U+2019 ISOnum
        refChar.put ("sbquo", new Character ('\u201a')); // single low-9 quotation mark, U+201A NEW
        refChar.put ("ldquo", new Character ('\u201c')); // left double quotation mark, U+201C ISOnum
        refChar.put ("rdquo", new Character ('\u201d')); // right double quotation mark, U+201D ISOnum
        refChar.put ("bdquo", new Character ('\u201e')); // double low-9 quotation mark, U+201E NEW
        refChar.put ("dagger", new Character ('\u2020')); // dagger, U+2020 ISOpub
        refChar.put ("Dagger", new Character ('\u2021')); // double dagger, U+2021 ISOpub
        refChar.put ("permil", new Character ('\u2030')); // per mille sign, U+2030 ISOtech
        refChar.put ("lsaquo", new Character ('\u2039')); // single left-pointing angle quotation mark, U+2039 ISO proposed
        // lsaquo is proposed but not yet ISO standardized
        refChar.put ("rsaquo", new Character ('\u203a')); // single right-pointing angle quotation mark, U+203A ISO proposed
        // rsaquo is proposed but not yet ISO standardized
        refChar.put ("euro", new Character ('\u20ac')); // euro sign, U+20AC NEW
    }

    /**
     * Table mapping character to entity reference kernel.
     * This is the inverse of {@link #refChar}.
     * <p><code>Character</code>-><code>String</code>
     */
    protected static Map charRefTable;

    static
    {
        // Sized so that inserting every entry never triggers a rehash
        // (HashMap's default load factor is 0.75).
        charRefTable = new HashMap (refChar.size () * 4 / 3 + 1);
        // Walk the entries directly rather than looking each key up again.
        Iterator iterator = refChar.entrySet ().iterator ();
        while (iterator.hasNext ())
        {
            Map.Entry entry = (Map.Entry)iterator.next ();
            charRefTable.put (entry.getValue (), entry.getKey ());
        }
    }

    /**
     * Private constructor.
     * This class is fully static and thread safe.
     */
    private Translate ()
    {
    }

    /**
     * Parse a single reference into a unicode code point.
     * Accepts character entity references (<code>&amp;name;</code>), decimal
     * numeric character references (<code>&amp;#ddd;</code>) and hexadecimal
     * numeric character references (<code>&amp;#xhhh;</code> or
     * <code>&amp;#Xhhh;</code>), each with or without the leading ampersand
     * or trailing semi-colon.
     * @param string The reference to parse.
     * @return The code point in the range 1 to 0x10FFFF, or 0 if the
     * reference is empty, malformed, unknown or out of range.
     */
    private static int parseReference (String string)
    {
        int start;
        int end;
        String kernel;
        String digits;
        int radix;
        int value;
        Character item;

        start = 0;
        end = string.length ();
        if ((start < end) && ('&' == string.charAt (start)))
        {
            start++;
        }
        if ((start < end) && (';' == string.charAt (end - 1)))
        {
            end--;
        }
        if (start >= end)
        {
            return (0);
        }

        kernel = string.substring (start, end);
        if ('#' != kernel.charAt (0))
        {
            item = (Character)refChar.get (kernel);
            return ((null == item) ? 0 : item.charValue ());
        }

        digits = kernel.substring (1);
        radix = 10;
        if ((0 < digits.length ())
            && (('x' == digits.charAt (0)) || ('X' == digits.charAt (0))))
        {
            radix = 16;
            digits = digits.substring (1);
        }
        try
        {
            value = Integer.parseInt (digits, radix);
        }
        catch (NumberFormatException nfe)
        {
            /* failed conversion, return 0 */
            return (0);
        }
        // Reject negative numbers and values beyond the unicode range
        // instead of letting a narrowing cast wrap them to another character.
        if ((value < 0) || (value > 0x10FFFF))
        {
            return (0);
        }

        return (value);
    }

    /**
     * Append a code point to a buffer.
     * Code points above 0xFFFF are written as a UTF-16 surrogate pair.
     * @param buffer The buffer to append to.
     * @param codePoint The code point, assumed to be in the range 0 to 0x10FFFF.
     */
    private static void appendCodePoint (StringBuffer buffer, int codePoint)
    {
        int offset;

        if (codePoint > 0xFFFF)
        {
            offset = codePoint - 0x10000;
            buffer.append ((char)(0xD800 + (offset >> 10)));
            buffer.append ((char)(0xDC00 + (offset & 0x3FF)));
        }
        else
        {
            buffer.append ((char)codePoint);
        }
    }

    /**
     * Convert a reference to a unicode character.
     * Convert a single numeric character reference or character entity reference
     * to a unicode character. Numeric references may be decimal
     * (<code>&amp;#169;</code>) or hexadecimal (<code>&amp;#xA9;</code>).
     * @param string The string to convert. Of the form &xxxx; or &#xxxx; with
     * or without the leading ampersand or trailing semi-colon.
     * @return The converted character or '\0' (zero) if the string is an
     * invalid reference. References to code points above 0xFFFF also yield
     * zero because they cannot be represented by a single <code>char</code>;
     * use {@link #decode} for those.
     * @throws NullPointerException if <code>string</code> is <code>null</code>.
     */
    public static char convertToChar (String string)
    {
        int codePoint;

        codePoint = parseReference (string);

        return ((codePoint > 0xFFFF) ? (char)0 : (char)codePoint);
    }

    /**
     * Decode a string containing references.
     * Change all numeric character reference and character entity references
     * to unicode characters. Anything that is not a valid reference,
     * including a lone ampersand, is copied through unchanged. Numeric
     * references above 0xFFFF are decoded to a surrogate pair.
     * @param string The string to translate.
     * @return The string with all valid references replaced.
     * @throws NullPointerException if <code>string</code> is <code>null</code>.
     */
    public static String decode (String string)
    {
        int index;
        int length;
        int amp;
        int semi;
        int codePoint;
        String code;
        StringBuffer ret;

        length = string.length ();
        ret = new StringBuffer (length);
        index = 0;
        while ((index < length) && (-1 != (amp = string.indexOf ('&', index))))
        {
            // copy the plain text in front of the ampersand
            ret.append (string.substring (index, amp));
            index = amp + 1;
            code = null;
            codePoint = 0;
            if (amp < length - 1)
            {
                // candidate reference runs to the next semi-colon, or to the
                // end of the string if there is none
                semi = string.indexOf (';', amp);
                if (-1 != semi)
                {
                    code = string.substring (amp, semi + 1);
                }
                else
                {
                    code = string.substring (amp);
                }
                codePoint = parseReference (code);
            }
            if (0 != codePoint)
            {
                // skip the rest of the reference; the ampersand is already consumed
                index += code.length () - 1;
                appendCodePoint (ret, codePoint);
            }
            else
            {
                // not a reference: keep the ampersand and rescan what follows it
                ret.append ('&');
            }
        }
        if (index < length)
        {
            ret.append (string.substring (index));
        }

        return (ret.toString ());
    }

    /**
     * Convert a character to a character entity reference.
     * Convert a unicode character to a character entity reference of
     * the form &xxxx;.
     * @param character The character to convert.
     * @return The converted character or <code>null</code> if the character
     * is not one of the known entity references.
     */
    public static String convertToString (Character character)
    {
        StringBuffer buffer;
        String ret;

        if (null != (ret = (String)charRefTable.get (character)))
        {
            buffer = new StringBuffer (ret.length () + 2);
            buffer.append ('&');
            buffer.append (ret);
            buffer.append (';');
            ret = buffer.toString ();
        }

        return (ret);
    }

    /**
     * Convert a character to a numeric character reference.
     * Convert a unicode character to a numeric character reference of
     * the form &#xxxx;.
     * @param character The character to convert.
     * @return The converted character.
     */
    public static String convertToString (int character)
    {
        StringBuffer ret;

        ret = new StringBuffer (13); /* &#2147483647; */
        ret.append ("&#");
        ret.append (character);
        ret.append (';');

        return (ret.toString ());
    }

    /**
     * Encode a string to use references.
     * Change all characters that are not ASCII to their numeric character
     * reference or character entity reference. Characters with a known
     * entity name (which includes the quotation mark, ampersand, less-than
     * and greater-than signs) use the named form; printable ASCII is copied
     * unchanged; everything else uses the decimal numeric form. A valid
     * surrogate pair is written as one reference to the combined code point.
     * This implementation is inefficient, allocating a new
     * <code>Character</code> for each character in the string,
     * but this class is primarily intended to decode strings
     * so efficiency and speed in the encoding was not a priority.
     * @param string The string to translate.
     * @return The encoded string.
     * @throws NullPointerException if <code>string</code> is <code>null</code>.
     */
    public static String encode (String string)
    {
        int length;
        char c;
        char next;
        String value;
        StringBuffer ret;

        ret = new StringBuffer (string.length () * 6);
        length = string.length ();
        for (int i = 0; i < length; i++)
        {
            c = string.charAt (i);
            if (null != (value = convertToString (new Character (c))))
            {
                ret.append (value);
            }
            else if ((c > 0x001F) && (c < 0x007F))
            {
                ret.append (c);
            }
            else if ((c >= 0xD800) && (c <= 0xDBFF) && (i + 1 < length)
                && ((next = string.charAt (i + 1)) >= 0xDC00) && (next <= 0xDFFF))
            {
                // high + low surrogate: emit the single supplementary code point
                ret.append (convertToString (0x10000 + ((c - 0xD800) << 10) + (next - 0xDC00)));
                i++;
            }
            else
            {
                ret.append (convertToString ((int)c));
            }
        }

        return (ret.toString ());
    }
}