1 package mir.util.xml.html;
9 * [x] self-closing <br/> tags
10 * [ ] de-html-escaping of cdata, parameter values etc
11 * [ ] Smarter corrections
13 * [ ] case sensitivity optional
16 public class HTMLParser {
17 private HTMLSchemaInformation schemaInformation;
20 schemaInformation = new HTMLSchemaInformation();
23 public void parse(Reader aReader, ParserReceiver aReceiver) throws HTMLParserExc, HTMLParserFailure, IOException {
26 parser = new CoreParser(aReceiver);
27 scanner = new HTMLScanner(parser, aReader);
31 private class CoreParser implements HTMLScanner.ScannerReceiver {
32 private ParserReceiver receiver;
33 private Stack tagStack;
35 public CoreParser(ParserReceiver aReceiver) {
37 tagStack = new Stack();
40 public void handleDTD(String aDTD) throws HTMLParserExc {
44 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc {
45 String lowercaseTag = aTag.toLowerCase();
47 HTMLSchemaInformation.HTMLTagInformation tagInformation =
48 schemaInformation.lookupTag(lowercaseTag);
50 Map attributes = new HashMap();
52 Iterator i = anAttributes.entrySet().iterator();
54 Map.Entry entry = (Map.Entry) i.next();
56 attributes.put(((String) entry.getKey()).toLowerCase(), entry.getValue());
59 if (tagInformation!=null) {
60 if (tagInformation.getIsBlock()) {
64 closeAllAutoclosingTags(tagInformation);
66 receiver.openTag(lowercaseTag, attributes);
67 if (tagInformation.getHasBody()) {
68 tagStack.push(lowercaseTag);
71 receiver.closeTag(lowercaseTag);
75 receiver.openTag(lowercaseTag, attributes);
76 tagStack.push(lowercaseTag);
80 public void handleClosingTag(String aTag) throws HTMLParserExc {
81 String lowercaseTag = aTag.toLowerCase();
83 HTMLSchemaInformation.HTMLTagInformation tagInformation =
84 schemaInformation.lookupTag(lowercaseTag);
86 if (tagInformation!=null) {
87 if (tagInformation.getIsBlock()) {
92 int index = tagStack.search(lowercaseTag);
94 if (index>-1 && index<4) {
95 for (int i=0; i<index; i++) {
101 public void handleCData(String aData) throws HTMLParserExc {
102 receiver.cdata(aData);
105 public void handleComment(String aTag) throws HTMLParserExc {
106 receiver.comment(aTag);
109 public void handleEndOfStream() throws HTMLParserExc {
110 while (!tagStack.empty())
114 private void closeAllAutoclosingTags(HTMLSchemaInformation.HTMLTagInformation aTagInformation) throws HTMLParserExc {
115 while (!tagStack.empty()) {
116 String tag = (String) tagStack.peek();
118 if (aTagInformation.autoClose(tag)) {
127 private void closeAllInlineTags() throws HTMLParserExc {
128 while (!tagStack.empty()) {
129 HTMLSchemaInformation.HTMLTagInformation tagInformation =
130 schemaInformation.lookupTag((String) tagStack.peek());
132 if (tagInformation!=null && !tagInformation.getIsBlock()) {
141 private void closeUpmostTag() throws HTMLParserExc {
142 receiver.closeTag((String) tagStack.peek());
147 public interface ParserReceiver {
148 public void dtd(String aDTD) throws HTMLParserExc;
149 public void openTag(String aTag, Map anAttributes) throws HTMLParserExc;
150 public void closeTag(String aTag) throws HTMLParserExc;
151 public void comment(String aData) throws HTMLParserExc;
152 public void cdata(String aData) throws HTMLParserExc;