1 package mir.util.xml.html;
9 * [x] self-closing <br/> tags
10 * [ ] de-html-escaping of cdata, parameter values, etc.
11 * [ ] Smarter corrections
13 * [ ] case sensitivity optional
16 public class HTMLParser {
17 private HTMLSchemaInformation schemaInformation;
20 schemaInformation = new HTMLSchemaInformation();
/**
 * Parses the HTML supplied by a reader and reports it to a receiver.
 * A CoreParser is wrapped around the receiver and handed to an HTMLScanner
 * that reads from the given reader.
 *
 * @param aReader source of the HTML text
 * @param aReceiver receiver that is passed to the CoreParser
 * @throws HTMLParserExc declared by this method
 * @throws HTMLParserFailure declared by this method
 * @throws IOException declared by this method
 */
23 public void parse(Reader aReader, ParserReceiver aReceiver) throws HTMLParserExc, HTMLParserFailure, IOException {
// The core parser is the scanner's receiver: scanner -> CoreParser -> aReceiver.
26 parser = new CoreParser(aReceiver);
27 scanner = new HTMLScanner(parser, aReader);
/**
 * Scanner receiver that sits between the HTMLScanner and a ParserReceiver.
 * It keeps a stack with the names of the tags that have been opened and
 * still have to be closed.
 */
31 private class CoreParser implements HTMLScanner.ScannerReceiver {
// Target of the openTag/closeTag/cdata/comment calls made by this class.
32 private ParserReceiver receiver;
// Lower-cased names of the currently open tags; the most recently opened one is on top.
33 private Stack tagStack;
/** Creates a core parser for the given receiver, starting with an empty tag stack. */
35 public CoreParser(ParserReceiver aReceiver) {
37 tagStack = new Stack();
/**
 * Scanner callback for a DTD.
 * NOTE(review): presumably aDTD is the text of the declaration as delivered
 * by the scanner - confirm against HTMLScanner.
 */
40 public void handleDTD(String aDTD) throws HTMLParserExc {
/**
 * Scanner callback for an opening tag.
 * The tag name and the attribute names are lower-cased before they are
 * reported to the receiver; attribute values are passed on unchanged.
 *
 * @param aTag tag name as delivered by the scanner
 * @param anAttributes attributes of the tag; every key is cast to String
 * @throws HTMLParserExc declared by this method and by the receiver calls it makes
 */
44 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc {
45 String lowercaseTag = aTag.toLowerCase();
// The schema is queried with the lower-cased name; the result may be null (checked below).
47 HTMLSchemaInformation.HTMLTagInformation tagInformation =
48 schemaInformation.lookupTag(lowercaseTag);
50 if (tagInformation!=null) {
51 if (tagInformation.getIsBlock()) {
// The schema entry of the new tag decides which open tags get closed here.
55 closeAllAutoclosingTags(tagInformation);
// Copy the attributes into a fresh map whose keys are lower-cased.
57 Map attributes = new HashMap();
59 Iterator i = anAttributes.entrySet().iterator();
61 Map.Entry entry = (Map.Entry) i.next();
63 attributes.put(((String) entry.getKey()).toLowerCase(), entry.getValue());
66 receiver.openTag(lowercaseTag, attributes);
// Tags that have a body are remembered on the stack so they can be closed later.
67 if (tagInformation.getHasBody()) {
68 tagStack.push(lowercaseTag);
// NOTE(review): presumably the branch for tags without a body, closed right after opening - confirm.
71 receiver.closeTag(lowercaseTag);
/**
 * Scanner callback for a closing tag.
 * The tag name is lower-cased, looked up in the schema, and then searched
 * for on the stack of open tags.
 *
 * @param aTag tag name as delivered by the scanner
 * @throws HTMLParserExc declared by this method
 */
79 public void handleClosingTag(String aTag) throws HTMLParserExc {
80 String lowercaseTag = aTag.toLowerCase();
83 HTMLSchemaInformation.HTMLTagInformation tagInformation =
84 schemaInformation.lookupTag(lowercaseTag);
86 if (tagInformation!=null) {
88 if (tagInformation.getIsBlock()) {
// NOTE(review): confirm that `tag` refers to the lower-cased tag name computed above.
// Assuming tagStack is a java.util.Stack, search() returns the 1-based distance
// from the top of the stack, or -1 when the tag is not on the stack.
93 int index = tagStack.search(tag);
// Only act when the tag is among the top three stack entries.
95 if (index>-1 && index<4) {
// One iteration per stack entry from the top down to and including the match.
96 for (int i=0; i<index; i++) {
/**
 * Scanner callback for character data; the data is handed to the receiver unchanged.
 *
 * @param aData the character data
 * @throws HTMLParserExc if the receiver throws it
 */
102 public void handleCData(String aData) throws HTMLParserExc {
103 receiver.cdata(aData);
/**
 * Scanner callback for a comment; the argument is handed to the receiver unchanged.
 *
 * @param aTag the comment data (named aTag here, aData in ParserReceiver.comment)
 * @throws HTMLParserExc if the receiver throws it
 */
106 public void handleComment(String aTag) throws HTMLParserExc {
107 receiver.comment(aTag);
/**
 * Scanner callback for the end of the input.
 * Loops for as long as the stack of open tags is not empty.
 *
 * @throws HTMLParserExc declared by this method
 */
110 public void handleEndOfStream() throws HTMLParserExc {
// NOTE(review): presumably each iteration closes the upmost open tag - confirm.
111 while (!tagStack.empty())
/**
 * Examines the open tags from the top of the stack, asking the given tag
 * information whether the tag on top is one it auto-closes.
 *
 * @param aTagInformation schema information whose autoClose() is consulted
 * @throws HTMLParserExc declared by this method
 */
115 private void closeAllAutoclosingTags(HTMLSchemaInformation.HTMLTagInformation aTagInformation) throws HTMLParserExc {
116 while (!tagStack.empty()) {
// Only the tag on top of the stack is inspected in each iteration.
117 String tag = (String) tagStack.peek();
119 if (aTagInformation.autoClose(tag)) {
/**
 * Examines the open tags from the top of the stack, looking up the schema
 * information of the tag on top and testing whether it is a known non-block tag.
 *
 * @throws HTMLParserExc declared by this method
 */
128 private void closeAllInlineTags() throws HTMLParserExc {
129 while (!tagStack.empty()) {
130 HTMLSchemaInformation.HTMLTagInformation tagInformation =
131 schemaInformation.lookupTag((String) tagStack.peek());
// Tags without schema information (null) do not satisfy this condition.
133 if (tagInformation!=null && !tagInformation.getIsBlock()) {
/**
 * Reports to the receiver that the tag on top of the stack is closed.
 * Stack.peek() is used without an emptiness check here, so callers are
 * expected to call this only while the stack is not empty.
 *
 * @throws HTMLParserExc if the receiver throws it
 */
142 private void closeUpmostTag() throws HTMLParserExc {
143 receiver.closeTag((String) tagStack.peek());
/**
 * Callback interface through which the parser reports what it found.
 * An implementation is passed to parse().
 */
148 public interface ParserReceiver {
/** Receives a DTD. NOTE(review): presumably the declaration text - confirm against the caller. */
149 public void dtd(String aDTD) throws HTMLParserExc;
/** Receives an opening tag; the core parser passes a lower-cased tag name and lower-cased attribute names. */
150 public void openTag(String aTag, Map anAttributes) throws HTMLParserExc;
/** Receives the lower-cased name of a tag that is being closed. */
151 public void closeTag(String aTag) throws HTMLParserExc;
/** Receives the data of a comment, as delivered by the scanner. */
152 public void comment(String aData) throws HTMLParserExc;
/** Receives character data, as delivered by the scanner. */
153 public void cdata(String aData) throws HTMLParserExc;