1 package mir.util.xml.html;
3 import java.io.IOException;
5 import java.util.HashMap;
6 import java.util.Iterator;
8 import java.util.Stack;
13 * [x] self-closing <br/> tags
14 * [ ] HTML-unescaping of CDATA, parameter values, etc.
15 * [ ] Smarter corrections
17 * [ ] case sensitivity optional
/**
 * HTML parser front end: holds the schema information that the inner
 * CoreParser consults when handling tags.
 *
 * NOTE(review): this excerpt is missing lines (e.g. the constructor header
 * and closing braces); comments describe only the statements shown.
 */
20 public class HTMLParser {
// Tag metadata looked up by lower-cased tag name (see lookupTag calls below).
21 private HTMLSchemaInformation schemaInformation;
// A fresh schema description is created here; presumably inside the
// constructor, whose header is not visible in this excerpt.
24 schemaInformation = new HTMLSchemaInformation();
/**
 * Parses HTML from a reader and reports the result to a receiver.
 *
 * NOTE(review): the local declarations and the call that actually starts
 * scanning are not visible in this excerpt.
 *
 * @param aReader source handed to the HTMLScanner
 * @param aReceiver receiver handed to the CoreParser
 * @throws HTMLParserExc declared by this method
 * @throws HTMLParserFailure declared by this method
 * @throws IOException declared by this method
 */
27 public void parse(Reader aReader, ParserReceiver aReceiver) throws HTMLParserExc, HTMLParserFailure, IOException {
// The CoreParser wraps the caller's receiver ...
30 parser = new CoreParser(aReceiver);
// ... and is itself passed to the scanner together with the input reader.
31 scanner = new HTMLScanner(parser, aReader);
/**
 * Scanner receiver that forwards scanner events to a ParserReceiver while
 * tracking tags on a stack.
 *
 * NOTE(review): parts of this class are not visible in this excerpt.
 */
35 private class CoreParser implements HTMLScanner.ScannerReceiver {
// Destination for the open/close/cdata/comment events emitted below.
36 private ParserReceiver receiver;
// Lower-cased tag names are pushed here by handleOpenTag.
37 private Stack tagStack;
/**
 * Creates a parser with an empty tag stack.
 *
 * @param aReceiver presumably stored in the receiver field; the assignment
 *        is not visible in this excerpt -- confirm
 */
39 public CoreParser(ParserReceiver aReceiver) {
41 tagStack = new Stack();
/**
 * Scanner callback for a DTD declaration.
 *
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * forwards to the receiver's dtd callback -- confirm.
 *
 * @param aDTD the DTD text delivered by the scanner
 * @throws HTMLParserExc declared by this method
 */
44 public void handleDTD(String aDTD) throws HTMLParserExc {
/**
 * Scanner callback for an opening tag. The tag name and all attribute keys
 * are lower-cased before anything is forwarded to the receiver.
 *
 * NOTE(review): several lines of this method (loop header, else branches,
 * closing braces) are not visible in this excerpt; the comments below
 * describe only the statements that are shown.
 *
 * @param aTag tag name as delivered by the scanner
 * @param anAttributes attribute map; its keys are cast to String below
 * @throws HTMLParserExc declared by this method
 */
48 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc {
49 String lowercaseTag = aTag.toLowerCase();
// Schema lookup uses the lower-cased name; the result is null-checked below.
51 HTMLSchemaInformation.HTMLTagInformation tagInformation =
52 schemaInformation.lookupTag(lowercaseTag);
// Build a copy of the attributes so the caller's map is not modified here.
54 Map attributes = new HashMap();
56 Iterator i = anAttributes.entrySet().iterator();
58 Map.Entry entry = (Map.Entry) i.next();
// Keys are lower-cased; values are copied unchanged.
60 attributes.put(((String) entry.getKey()).toLowerCase(), entry.getValue());
// Tag found in the schema.
63 if (tagInformation!=null) {
// Block-level tag; the statement guarded by this check is not visible
// (presumably closes open inline tags -- confirm).
64 if (tagInformation.getIsBlock()) {
68 closeAllAutoclosingTags(tagInformation);
70 receiver.openTag(lowercaseTag, attributes);
// Only tags that have a body are tracked on the stack.
71 if (tagInformation.getHasBody()) {
72 tagStack.push(lowercaseTag);
// Presumably the else branch for tags without a body: close immediately.
75 receiver.closeTag(lowercaseTag);
// Presumably the branch for tags not found in the schema: open and track.
79 receiver.openTag(lowercaseTag, attributes);
80 tagStack.push(lowercaseTag);
/**
 * Scanner callback for a closing tag. The tag name is lower-cased and then
 * searched for on the tag stack.
 *
 * NOTE(review): the statement guarded by the block check and the body of
 * the loop are not visible in this excerpt.
 *
 * @param aTag tag name as delivered by the scanner
 * @throws HTMLParserExc declared by this method
 */
84 public void handleClosingTag(String aTag) throws HTMLParserExc {
85 String lowercaseTag = aTag.toLowerCase();
87 HTMLSchemaInformation.HTMLTagInformation tagInformation =
88 schemaInformation.lookupTag(lowercaseTag);
90 if (tagInformation!=null) {
// Known block-level tag; the guarded statement is not visible
// (presumably closes open inline tags -- confirm).
91 if (tagInformation.getIsBlock()) {
// Stack.search returns the 1-based distance from the top of the stack,
// or -1 when the tag is not on the stack.
96 int index = tagStack.search(lowercaseTag);
// Act only when the tag is one of the top three stack entries.
98 if (index>-1 && index<4) {
// Runs once per stack entry down to and including the matching tag;
// the loop body is not visible (presumably closeUpmostTag -- confirm).
99 for (int i=0; i<index; i++) {
/**
 * Scanner callback for character data; forwards the data unchanged to the
 * receiver.
 *
 * @param aData character data delivered by the scanner
 * @throws HTMLParserExc declared by this method
 */
105 public void handleCData(String aData) throws HTMLParserExc {
106 receiver.cdata(aData);
/**
 * Scanner callback for a comment; forwards the argument unchanged to the
 * receiver.
 *
 * @param aTag the comment content delivered by the scanner (despite the
 *        parameter name, it is passed to the receiver's comment callback)
 * @throws HTMLParserExc declared by this method
 */
109 public void handleComment(String aTag) throws HTMLParserExc {
110 receiver.comment(aTag);
/**
 * Scanner callback for the end of the input; loops until the tag stack is
 * empty.
 *
 * NOTE(review): the loop body is not visible in this excerpt (presumably
 * closeUpmostTag -- confirm).
 *
 * @throws HTMLParserExc declared by this method
 */
113 public void handleEndOfStream() throws HTMLParserExc {
114 while (!tagStack.empty())
/**
 * Walks the tag stack from the top and asks the given tag information
 * whether each open tag is auto-closed by it.
 *
 * NOTE(review): the bodies of both branches are not visible in this
 * excerpt (presumably close the tag on a match and stop otherwise --
 * confirm).
 *
 * @param aTagInformation schema information of the tag being opened
 * @throws HTMLParserExc declared by this method
 */
118 private void closeAllAutoclosingTags(HTMLSchemaInformation.HTMLTagInformation aTagInformation) throws HTMLParserExc {
119 while (!tagStack.empty()) {
// Inspect the top of the stack without removing it.
120 String tag = (String) tagStack.peek();
122 if (aTagInformation.autoClose(tag)) {
/**
 * Walks the tag stack from the top, checking whether each open tag is a
 * known, non-block tag.
 *
 * NOTE(review): the bodies of both branches are not visible in this
 * excerpt (presumably close the tag on a match and stop otherwise --
 * confirm).
 *
 * @throws HTMLParserExc declared by this method
 */
131 private void closeAllInlineTags() throws HTMLParserExc {
132 while (!tagStack.empty()) {
// Look up schema information for the tag on top of the stack.
133 HTMLSchemaInformation.HTMLTagInformation tagInformation =
134 schemaInformation.lookupTag((String) tagStack.peek());
// Tags missing from the schema do not satisfy this condition.
136 if (tagInformation!=null && !tagInformation.getIsBlock()) {
/**
 * Reports a close event to the receiver for the tag on top of the stack.
 *
 * NOTE(review): peek does not remove the entry; any removal from the stack
 * is not visible in this excerpt (presumably a pop follows -- confirm).
 *
 * @throws HTMLParserExc declared by this method
 */
145 private void closeUpmostTag() throws HTMLParserExc {
146 receiver.closeTag((String) tagStack.peek());
/**
 * Callback interface through which parse results are delivered. Within the
 * visible code, CoreParser calls openTag and closeTag with lower-cased tag
 * names and passes cdata and comment content through unchanged.
 */
151 public interface ParserReceiver {
// DTD declaration text.
152 public void dtd(String aDTD) throws HTMLParserExc;
// Opening of a tag together with its attribute map.
153 public void openTag(String aTag, Map anAttributes) throws HTMLParserExc;
// Closing of a tag.
154 public void closeTag(String aTag) throws HTMLParserExc;
// Comment content.
155 public void comment(String aData) throws HTMLParserExc;
// Character data.
156 public void cdata(String aData) throws HTMLParserExc;