2 * Copyright (C) 2001, 2002 The Mir-coders group
4 * This file is part of Mir.
6 * Mir is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * Mir is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with Mir; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * In addition, as a special exception, The Mir-coders gives permission to link
21 * the code of this program with any library licensed under the Apache Software License,
22 * The Sun (tm) Java Advanced Imaging library (JAI), The Sun JIMI library
23 * (or with modified versions of the above that use the same license as the above),
24 * and distribute linked combinations including the two. You must obey the
25 * GNU General Public License in all respects for all of the code used other than
26 * the above mentioned libraries. If you modify this file, you may extend this
27 * exception to your version of the file, but you are not obligated to do so.
28 * If you do not wish to do so, delete this exception statement from your version.
33 import java.net.URLEncoder;
34 import java.util.HashMap;
37 public class HTMLRoutines {
 * Encodes a URL: escapes reserved URL characters like &, = into % escape sequences
42 public static String encodeURL(String aString) {
43 return URLEncoder.encode(aString);
46 public static String encodeURL(String aString, String anEncoding) {
48 return URLEncoder.encode(aString);
51 throw new RuntimeException(t.getMessage());
55 public static String encodeHTML(String aText) {
56 final char[] CHARACTERS_TO_ESCAPE = { '&', '<', '>', '"' };
57 final String[] ESCAPE_CODES = { "&", "<", ">", """ };
59 return StringRoutines.replaceStringCharacters(aText, CHARACTERS_TO_ESCAPE, ESCAPE_CODES);
62 public static String prettyEncodeHTML(String aText) throws UtilExc {
64 StringRoutines.performRegularExpressionReplacement(encodeHTML(aText), "\\n", "<br>\n");
67 public static String encodeXML(String aText) {
68 //#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
69 final char[] CHARACTERS_TO_ESCAPE = { '&', '<', '>', '"', '\'',
70 '\u0000', '\u0001', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0000', '\u000B',
71 '\u000C', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016',
72 '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C', '\u001D', '\u001E' };
73 final String[] ESCAPE_CODES = { "&", "<", ">", """, "'",
74 "", "", "", "", "", "", "", "", "", "",
75 "", "", "", "", "", "", "", "", "", "",
76 "", "", "", "", "", "", "", ""};
78 return StringRoutines.replaceStringCharacters(aText, CHARACTERS_TO_ESCAPE, ESCAPE_CODES);
/**
 * Lookup table from entity name (without the leading ampersand and the
 * trailing semicolon) to the text the entity stands for. Filled once by
 * the static initialiser below and only read afterwards.
 */
private static final Map htmlEntities = new HashMap();

/**
 * Flat list of name/value pairs: the element at an even index is an entity
 * name, the element right after it is the replacement text. Contains the
 * HTML 4 character entities plus {@code apos}, which XML and XHTML
 * define and which {@code encodeXML} produces.
 */
private static final String[] HTML_ENTITY_TABLE = {
  "nbsp", "\u00a0", "iexcl", "\u00a1", "cent", "\u00a2", "pound", "\u00a3", "curren", "\u00a4", "yen", "\u00a5",
  "brvbar", "\u00a6", "sect", "\u00a7", "uml", "\u00a8", "copy", "\u00a9", "ordf", "\u00aa", "laquo", "\u00ab",
  "not", "\u00ac", "shy", "\u00ad", "reg", "\u00ae", "macr", "\u00af", "deg", "\u00b0", "plusmn", "\u00b1",
  "sup2", "\u00b2", "sup3", "\u00b3", "acute", "\u00b4", "micro", "\u00b5", "para", "\u00b6", "middot", "\u00b7",
  "cedil", "\u00b8", "sup1", "\u00b9", "ordm", "\u00ba", "raquo", "\u00bb", "frac14", "\u00bc", "frac12", "\u00bd",
  "frac34", "\u00be", "iquest", "\u00bf", "Agrave", "\u00c0", "Aacute", "\u00c1", "Acirc", "\u00c2", "Atilde", "\u00c3",
  "Auml", "\u00c4", "Aring", "\u00c5", "AElig", "\u00c6", "Ccedil", "\u00c7", "Egrave", "\u00c8", "Eacute", "\u00c9",
  "Ecirc", "\u00ca", "Euml", "\u00cb", "Igrave", "\u00cc", "Iacute", "\u00cd", "Icirc", "\u00ce", "Iuml", "\u00cf",
  "ETH", "\u00d0", "Ntilde", "\u00d1", "Ograve", "\u00d2", "Oacute", "\u00d3", "Ocirc", "\u00d4", "Otilde", "\u00d5",
  "Ouml", "\u00d6", "times", "\u00d7", "Oslash", "\u00d8", "Ugrave", "\u00d9", "Uacute", "\u00da", "Ucirc", "\u00db",
  "Uuml", "\u00dc", "Yacute", "\u00dd", "THORN", "\u00de", "szlig", "\u00df", "agrave", "\u00e0", "aacute", "\u00e1",
  "acirc", "\u00e2", "atilde", "\u00e3", "auml", "\u00e4", "aring", "\u00e5", "aelig", "\u00e6", "ccedil", "\u00e7",
  "egrave", "\u00e8", "eacute", "\u00e9", "ecirc", "\u00ea", "euml", "\u00eb", "igrave", "\u00ec", "iacute", "\u00ed",
  "icirc", "\u00ee", "iuml", "\u00ef", "eth", "\u00f0", "ntilde", "\u00f1", "ograve", "\u00f2", "oacute", "\u00f3",
  "ocirc", "\u00f4", "otilde", "\u00f5", "ouml", "\u00f6", "divide", "\u00f7", "oslash", "\u00f8", "ugrave", "\u00f9",
  "uacute", "\u00fa", "ucirc", "\u00fb", "uuml", "\u00fc", "yacute", "\u00fd", "thorn", "\u00fe", "yuml", "\u00ff",
  "fnof", "\u0192", "Alpha", "\u0391", "Beta", "\u0392", "Gamma", "\u0393", "Delta", "\u0394", "Epsilon", "\u0395",
  "Zeta", "\u0396", "Eta", "\u0397", "Theta", "\u0398", "Iota", "\u0399", "Kappa", "\u039a", "Lambda", "\u039b",
  "Mu", "\u039c", "Nu", "\u039d", "Xi", "\u039e", "Omicron", "\u039f", "Pi", "\u03a0", "Rho", "\u03a1",
  "Sigma", "\u03a3", "Tau", "\u03a4", "Upsilon", "\u03a5", "Phi", "\u03a6", "Chi", "\u03a7", "Psi", "\u03a8",
  "Omega", "\u03a9", "alpha", "\u03b1", "beta", "\u03b2", "gamma", "\u03b3", "delta", "\u03b4", "epsilon", "\u03b5",
  "zeta", "\u03b6", "eta", "\u03b7", "theta", "\u03b8", "iota", "\u03b9", "kappa", "\u03ba", "lambda", "\u03bb",
  "mu", "\u03bc", "nu", "\u03bd", "xi", "\u03be", "omicron", "\u03bf", "pi", "\u03c0", "rho", "\u03c1",
  "sigmaf", "\u03c2", "sigma", "\u03c3", "tau", "\u03c4", "upsilon", "\u03c5", "phi", "\u03c6", "chi", "\u03c7",
  "psi", "\u03c8", "omega", "\u03c9", "thetasym","\u03d1", "upsih", "\u03d2", "piv", "\u03d6", "bull", "\u2022",
  "hellip", "\u2026", "prime", "\u2032", "Prime", "\u2033", "oline", "\u203e", "frasl", "\u2044", "weierp", "\u2118",
  "image", "\u2111", "real", "\u211c", "trade", "\u2122", "alefsym", "\u2135", "larr", "\u2190", "uarr", "\u2191",
  "rarr", "\u2192", "darr", "\u2193", "harr", "\u2194", "crarr", "\u21b5", "lArr", "\u21d0", "uArr", "\u21d1",
  "rArr", "\u21d2", "dArr", "\u21d3", "hArr", "\u21d4", "forall", "\u2200", "part", "\u2202", "exist", "\u2203",
  "empty", "\u2205", "nabla", "\u2207", "isin", "\u2208", "notin", "\u2209", "ni", "\u220b", "prod", "\u220f",
  "sum", "\u2211", "minus", "\u2212", "lowast", "\u2217", "radic", "\u221a", "prop", "\u221d", "infin", "\u221e",
  "ang", "\u2220", "and", "\u2227", "or", "\u2228", "cap", "\u2229", "cup", "\u222a", "int", "\u222b",
  "there4", "\u2234", "sim", "\u223c", "cong", "\u2245", "asymp", "\u2248", "ne", "\u2260", "equiv", "\u2261",
  "le", "\u2264", "ge", "\u2265", "sub", "\u2282", "sup", "\u2283", "nsub", "\u2284", "sube", "\u2286",
  "supe", "\u2287", "oplus", "\u2295", "otimes", "\u2297", "perp", "\u22a5", "sdot", "\u22c5", "lceil", "\u2308",
  "rceil", "\u2309", "lfloor", "\u230a", "rfloor", "\u230b", "lang", "\u2329", "rang", "\u232a", "loz", "\u25ca",
  "spades", "\u2660", "clubs", "\u2663", "hearts", "\u2665", "diams", "\u2666", "quot", "\"", "amp", "\u0026",
  "lt", "\u003c", "gt", "\u003e", "OElig", "\u0152", "oelig", "\u0153", "Scaron", "\u0160", "scaron", "\u0161",
  "Yuml", "\u0178", "circ", "\u02c6", "tilde", "\u02dc", "ensp", "\u2002", "emsp", "\u2003", "thinsp", "\u2009",
  "zwnj", "\u200c", "zwj", "\u200d", "lrm", "\u200e", "rlm", "\u200f", "ndash", "\u2013", "mdash", "\u2014",
  "lsquo", "\u2018", "rsquo", "\u2019", "sbquo", "\u201a", "ldquo", "\u201c", "rdquo", "\u201d", "bdquo", "\u201e",
  "dagger", "\u2020", "Dagger", "\u2021", "permil", "\u2030", "lsaquo", "\u2039", "rsaquo", "\u203a", "euro", "\u20ac",
  // Not part of HTML 4, but predefined in XML/XHTML and emitted by encodeXML
  "apos", "'"
};

static {
  // Walk the table pairwise; the i+1 bound skips a dangling name
  // should the table ever end up with an odd number of elements.
  for (int i=0; i+1<HTML_ENTITY_TABLE.length; i+=2) {
    htmlEntities.put(HTML_ENTITY_TABLE[i], HTML_ENTITY_TABLE[i+1]);
  }
}
135 * Resolves an html entity if possible, returns the unresolved entity otherwise
138 * &#<decimal number>;
139 * &#x<hexadecimal number>;
142 public static String resolveHTMLEntity(String anEntity) {
143 if (anEntity.length()<3 || anEntity.length()>10 ||
144 anEntity.charAt(0)!='&' ||
145 anEntity.charAt(anEntity.length()-1)!=';')
148 if (anEntity.charAt(1)=='#') {
152 if (anEntity.charAt(2)=='x') {
153 number = Integer.parseInt(anEntity.substring(3,anEntity.length()-1), 16);
156 number = Integer.parseInt(anEntity.substring(2,anEntity.length()-1), 10);
159 if (number>=Character.MIN_VALUE && number<=Character.MAX_VALUE &&
160 Character.isDefined((char) number)) {
161 return new String(new char[]{(char) number});
164 catch (NumberFormatException e) {
168 String name = anEntity.substring(1,anEntity.length()-1);
170 String result = (String) htmlEntities.get(name);
180 * Resolve all HTML entities (&....;) in a text
182 public static String resolveHTMLEntites(String aText) {
183 StringBuffer result = new StringBuffer();
189 position = aText.indexOf("&", oldPosition);
191 position = aText.length();
193 result.append(aText.substring(oldPosition,position));
195 if (position<aText.length()) {
196 int position2 = aText.indexOf(";", position);
198 if (position2>position+1) {
199 result.append(resolveHTMLEntity(aText.substring(position, position2+1)));
200 oldPosition=position2+1;
203 result.append(aText.charAt(position));
204 oldPosition = position+1;
207 } while (position<aText.length());
209 return result.toString();