2 * Copyright (C) 2006 The Mir-coders group
\r
4 * This file is part of Mir.
\r
6 * Mir is free software; you can redistribute it and/or modify
\r
7 * it under the terms of the GNU General Public License as published by
\r
8 * the Free Software Foundation; either version 2 of the License, or
\r
9 * (at your option) any later version.
\r
11 * Mir is distributed in the hope that it will be useful,
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
14 * GNU General Public License for more details.
\r
16 * You should have received a copy of the GNU General Public License
\r
17 * along with Mir; if not, write to the Free Software
\r
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
20 * In addition, as a special exception, The Mir-coders gives permission to link
\r
21 * the code of this program with any library licensed under the Apache Software License,
\r
22 * The Sun (tm) Java Advanced Imaging library (JAI), The Sun JIMI library
\r
23 * (or with modified versions of the above that use the same license as the above),
\r
24 * and distribute linked combinations including the two. You must obey the
\r
25 * GNU General Public License in all respects for all of the code used other than
\r
26 * the above mentioned libraries. If you modify this file, you may extend this
\r
27 * exception to your version of the file, but you are not obligated to do so.
\r
28 * If you do not wish to do so, delete this exception statement from your version.
\r
32 import org.apache.oro.text.regex.Pattern;
\r
33 import org.apache.oro.text.regex.Perl5Compiler;
\r
34 import org.apache.oro.text.regex.Perl5Matcher;
\r
35 import org.apache.oro.text.regex.Perl5Substitution;
\r
36 import org.apache.oro.text.regex.Util;
\r
39 * Class used to enrich text-based content with HTML links
\r
40 * according to a set of rules
\r
42 public class HTMLStripper {
\r
44 private Pattern newLineExpression;
\r
45 private Pattern doubleBRExpression;
\r
46 private Pattern emailAddressExpression;
\r
47 private Pattern urlExpression;
\r
48 private Pattern htmlTagExpression;
\r
/**
 * Creates a stripper and compiles the regular expressions used by its methods.
 *
 * @throws RuntimeException if any of the patterns fails to compile; the
 *         original exception is attached as the cause
 */
public HTMLStripper() {
  Perl5Compiler compiler = new Perl5Compiler();

  try {
    // One line break: LF, optionally preceded by CR.
    newLineExpression =
        compiler.compile("(\r?\n){1}", Perl5Compiler.READ_ONLY_MASK);
    // One or more "<br>", line break, "<br>" sequences.
    // NOTE(review): convertNewline2Break inserts "<br />", which this literal
    // "<br>" does not match - confirm whether that is intended.
    doubleBRExpression =
        compiler.compile("(<br>\r?\n<br>){1,}", Perl5Compiler.READ_ONLY_MASK);
    // local-part "@" name "." rest, delimited by word boundaries.
    emailAddressExpression =
        compiler.compile("\\b([a-zA-Z0-9_.-]+)@([a-zA-Z0-9_-]+)\\.([a-zA-Z0-9_.-]+)\\b", Perl5Compiler.READ_ONLY_MASK);
    // https://, http:// or ftp:// followed by host and path characters; the
    // match may not end in '.' or ','. The '.' after the first host part is
    // unescaped and therefore matches any single character.
    urlExpression =
        compiler.compile("((https://)|(http://)|(ftp://)){1}([a-zA-Z0-9_-]+).([a-zA-Z0-9_.:-]+)/?([^ \t\r\n<>\\)\\]]+[^ \t\r\n.,<>\\)\\]])", Perl5Compiler.READ_ONLY_MASK);
    // Anything between '<' and the next '>'.
    htmlTagExpression =
        compiler.compile("<[^>]*>", Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK);
  }
  catch (Exception e) {
    // Same message as before, but the cause is chained so that the original
    // stack trace is not lost.
    throw new RuntimeException(e.getMessage(), e);
  }
}
\r
/**
 * Finds text in URL format in the given string and turns each URL into a
 * clickable {@code <a href>} link preceded by an icon; HTML tags are
 * removed from the link title.
 * @param haystack the text containing the URLs
 * @param title the link text; if null, the URL itself is used
 * @param imageRoot the place to find icons
 * @param extImage the icon to show next to the link, relative to imageRoot
 * @return the text with its URLs replaced by links
 */
\r
79 private String createURLLinks(String haystack, String title, String imageRoot, String extImage) {
\r
80 if (title == null) {
\r
81 return substituteAll(haystack, urlExpression,
\r
82 "<img src=\"" + imageRoot + "/" + extImage + "\" border=\"0\"/> <a href=\"$0\">$0</a>");
\r
85 title = removeHTMLTags(title);
\r
87 return substituteAll(haystack, urlExpression,
\r
88 "<img src=\"" + imageRoot + "/" + extImage + "\" border=\"0\"/> <a href=\"$0\">" + title + "</a>");
\r
91 private String substituteAll(String anInput, Pattern anExpression, String aReplacement) {
\r
92 Perl5Matcher matcher = new Perl5Matcher();
\r
94 return Util.substitute(
\r
95 matcher, anExpression,
\r
96 new Perl5Substitution(aReplacement), anInput,
\r
97 Util.SUBSTITUTE_ALL);
\r
101 * Remove all HTML tags
\r
103 public String removeHTMLTags(String haystack){
\r
104 return substituteAll(haystack, htmlTagExpression, "");
\r
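/**
 * convertNewline2P is a regex routine intended to convert two or more
 * newlines (\n) into an HTML paragraph break: every run of
 * {@code <br>}, line break, {@code <br>} is replaced by a line break
 * followed by {@code </p><p>}.
 * Only useful if the text was not entered in HTML format.
 */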
\r
113 private String convertNewline2P(String haystack) {
\r
114 return substituteAll(haystack, doubleBRExpression, "\n</p><p>");
\r
/**
 * convertNewline2Break is a regex routine that converts a single newline
 * (\n) into the HTML tag {@code <br />}; the newline itself is kept and
 * the tag is inserted after it.
 * Only useful if the text was not entered in HTML format.
 */
\r
122 private String convertNewline2Break(String haystack) {
\r
123 return substituteAll(haystack, newLineExpression, "$0<br />");
\r
/**
 * createMailLinks converts text in e-mail address format into a
 * clickable {@code mailto:} link preceded by an icon.
 * Only useful if the text was not entered in HTML format.
 */
\r
132 private String createMailLinks(String haystack, String imageRoot, String mailImage) {
\r
133 return substituteAll(haystack, emailAddressExpression,
\r
134 "<img src=\"" + imageRoot + "/" + mailImage + "\" border=\"0\"/> <a href=\"mailto:$0\">$0</a>");
\r
/**
 * Finds text in URL format in the given string and turns each URL into a
 * clickable {@code <a href>} link preceded by an icon; HTML tags are
 * removed from the link title.
 * @param haystack the text containing the URLs
 * @param title the link text; if null, the URL itself is used
 * @param imageRoot the place to find icons
 * @param extImage the icon to show next to the link, relative to imageRoot
 * @param intImage unused
 * @return the text with its URLs replaced by links
 */
\r
147 private String createURLLinks(String haystack, String title, String imageRoot,String extImage, String intImage) {
\r
148 return createURLLinks(haystack, title, imageRoot, extImage);
\r
153 public String createHTML(String content, String producerDocRoot, String mailImage, String extImage, String intImage){
\r
154 content = convertNewline2Break(content);
\r
155 content = convertNewline2P(content);
\r
156 content = createMailLinks(content, producerDocRoot, mailImage);
\r
157 content = createURLLinks(content, null, producerDocRoot,
\r
158 extImage, intImage);
\r