--- /dev/null
+package mir.util.xml.html;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ *
+ * TODO
+ * [x] self-closing <br/> tags
+ * [ ] de-HTML-escaping of CDATA, parameter values etc.
+ * [ ] Smarter corrections
+ * [ ]
+ * [ ] case sensitivity optional
+ */
+
+/**
+ * Lenient HTML parser front end.
+ *
+ * <p>Runs an {@link HTMLScanner} over a character stream and forwards the
+ * scanner events to a {@link ParserReceiver}. On the way, tag and attribute
+ * names are lower-cased and tags that were left open are closed, based on
+ * what {@link HTMLSchemaInformation} reports for each tag.
+ */
+public class HTMLParser {
+  /**
+   * How many entries from the top of the open-tag stack are inspected when
+   * looking for the partner of a closing tag. A closing tag whose opening tag
+   * lies deeper than this is ignored.
+   */
+  private static final int MAX_CLOSE_DEPTH = 3;
+
+  private final HTMLSchemaInformation schemaInformation;
+
+  public HTMLParser() {
+    schemaInformation = new HTMLSchemaInformation();
+  }
+
+  /**
+   * Scans the given character stream and reports the resulting events to the
+   * given receiver.
+   *
+   * @param aReader source of the HTML text, handed to the scanner
+   * @param aReceiver receives the dtd, tag, comment and cdata events
+   * @throws HTMLParserExc propagated from the scanner or the receiver
+   * @throws HTMLParserFailure propagated from the scanner
+   * @throws IOException propagated from the scanner
+   */
+  public void parse(Reader aReader, ParserReceiver aReceiver) throws HTMLParserExc, HTMLParserFailure, IOException {
+    CoreParser parser = new CoreParser(aReceiver);
+    HTMLScanner scanner = new HTMLScanner(parser, aReader);
+    scanner.run();
+  }
+
+  /**
+   * Lower-cases a tag or attribute name independently of the JVM's default
+   * locale. The no-argument {@code toLowerCase()} would, for instance, turn
+   * "TITLE" into "t\u0131tle" (dotless i) under a Turkish default locale,
+   * which breaks schema lookups and closing-tag matching.
+   */
+  private static String normalizeName(String aName) {
+    return aName.toLowerCase(Locale.ENGLISH);
+  }
+
+  /**
+   * Returns a new map holding the entries of the given map with every key
+   * lower-cased; values are taken over unchanged. A {@code null} map is
+   * treated as empty.
+   */
+  private static Map lowercaseKeys(Map aMap) {
+    Map result = new HashMap();
+
+    if (aMap != null) {
+      Iterator i = aMap.entrySet().iterator();
+      while (i.hasNext()) {
+        Map.Entry entry = (Map.Entry) i.next();
+
+        result.put(normalizeName((String) entry.getKey()), entry.getValue());
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * Translates scanner events into receiver events while keeping a stack of
+   * the tags that are currently open.
+   */
+  private class CoreParser implements HTMLScanner.ScannerReceiver {
+    private final ParserReceiver receiver;
+    /** Lower-cased names of the currently open tags, innermost on top. */
+    private final Stack tagStack;
+
+    public CoreParser(ParserReceiver aReceiver) {
+      receiver = aReceiver;
+      tagStack = new Stack();
+    }
+
+    /** Passes the DTD on to the receiver unchanged. */
+    public void handleDTD(String aDTD) throws HTMLParserExc {
+      receiver.dtd(aDTD);
+    }
+
+    /**
+     * Reports an opening tag to the receiver, with the tag name and all
+     * attribute names lower-cased.
+     *
+     * <p>For a tag known to the schema: if the schema flags it as a block,
+     * all open inline tags are closed first; then every open tag the new tag
+     * auto-closes is closed. If the schema says the tag has no body it is
+     * closed again immediately, otherwise it is pushed on the open-tag stack.
+     * A tag unknown to the schema is simply opened and pushed.
+     *
+     * @param aTag tag name as found in the input
+     * @param anAttributes attribute name to value map; {@code null} is
+     *     treated as empty
+     */
+    public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc {
+      String lowercaseTag = normalizeName(aTag);
+
+      HTMLSchemaInformation.HTMLTagInformation tagInformation =
+          schemaInformation.lookupTag(lowercaseTag);
+
+      Map attributes = lowercaseKeys(anAttributes);
+
+      if (tagInformation == null) {
+        // Nothing known about this tag: treat it as a tag with a body.
+        receiver.openTag(lowercaseTag, attributes);
+        tagStack.push(lowercaseTag);
+        return;
+      }
+
+      if (tagInformation.getIsBlock()) {
+        closeAllInlineTags();
+      }
+
+      closeAllAutoclosingTags(tagInformation);
+
+      receiver.openTag(lowercaseTag, attributes);
+      if (tagInformation.getHasBody()) {
+        tagStack.push(lowercaseTag);
+      }
+      else {
+        // Body-less tag: report the close right away, never keep it open.
+        receiver.closeTag(lowercaseTag);
+      }
+    }
+
+    /**
+     * Handles a closing tag.
+     *
+     * <p>If the schema flags the tag as a block, all open inline tags are
+     * closed first (this happens even when the closing tag itself turns out
+     * to have no partner). Then, if the tag is found among the topmost
+     * {@link #MAX_CLOSE_DEPTH} open tags, every tag from the top of the stack
+     * down to and including it is closed. Otherwise the closing tag is
+     * ignored and nothing is reported for it.
+     *
+     * @param aTag tag name as found in the input
+     */
+    public void handleClosingTag(String aTag) throws HTMLParserExc {
+      String lowercaseTag = normalizeName(aTag);
+
+      HTMLSchemaInformation.HTMLTagInformation tagInformation =
+          schemaInformation.lookupTag(lowercaseTag);
+
+      if (tagInformation != null && tagInformation.getIsBlock()) {
+        closeAllInlineTags();
+      }
+
+      // Stack.search yields the 1-based distance from the top of the stack,
+      // or -1 when the tag is not open at all; so the distance is also the
+      // number of tags that have to be closed.
+      int distanceFromTop = tagStack.search(lowercaseTag);
+
+      if (distanceFromTop >= 1 && distanceFromTop <= MAX_CLOSE_DEPTH) {
+        for (int i = 0; i < distanceFromTop; i++) {
+          closeUpmostTag();
+        }
+      }
+    }
+
+    /** Passes character data on to the receiver unchanged. */
+    public void handleCData(String aData) throws HTMLParserExc {
+      receiver.cdata(aData);
+    }
+
+    /** Passes a comment on to the receiver unchanged. */
+    public void handleComment(String aTag) throws HTMLParserExc {
+      receiver.comment(aTag);
+    }
+
+    /** Closes every tag that is still open, innermost first. */
+    public void handleEndOfStream() throws HTMLParserExc {
+      while (!tagStack.empty()) {
+        closeUpmostTag();
+      }
+    }
+
+    /**
+     * Closes open tags from the top of the stack for as long as the given
+     * tag information reports, via {@code autoClose}, that the topmost open
+     * tag is to be closed. Stops at the first tag for which it does not.
+     *
+     * @param aTagInformation schema entry of the tag about to be opened
+     */
+    private void closeAllAutoclosingTags(HTMLSchemaInformation.HTMLTagInformation aTagInformation) throws HTMLParserExc {
+      while (!tagStack.empty()) {
+        String tag = (String) tagStack.peek();
+
+        if (!aTagInformation.autoClose(tag)) {
+          break;
+        }
+
+        closeUpmostTag();
+      }
+    }
+
+    /**
+     * Closes open tags from the top of the stack for as long as the topmost
+     * one is known to the schema and not flagged as a block. Stops at the
+     * first block tag or tag unknown to the schema.
+     */
+    private void closeAllInlineTags() throws HTMLParserExc {
+      while (!tagStack.empty()) {
+        HTMLSchemaInformation.HTMLTagInformation tagInformation =
+            schemaInformation.lookupTag((String) tagStack.peek());
+
+        if (tagInformation == null || tagInformation.getIsBlock()) {
+          break;
+        }
+
+        closeUpmostTag();
+      }
+    }
+
+    /**
+     * Reports the topmost open tag as closed and removes it from the stack.
+     * The tag is only removed after the receiver call returned, so it stays
+     * on the stack if the receiver throws.
+     */
+    private void closeUpmostTag() throws HTMLParserExc {
+      receiver.closeTag((String) tagStack.peek());
+      tagStack.pop();
+    }
+  }
+
+  /**
+   * Callback interface through which the parser reports what it found.
+   * Tag names and attribute names are reported in lower case.
+   */
+  public interface ParserReceiver {
+    /** Called with the DTD as delivered by the scanner. */
+    public void dtd(String aDTD) throws HTMLParserExc;
+
+    /**
+     * Called when a tag is opened.
+     *
+     * @param aTag lower-cased tag name
+     * @param anAttributes map from lower-cased attribute name to the value
+     *     delivered by the scanner
+     */
+    public void openTag(String aTag, Map anAttributes) throws HTMLParserExc;
+
+    /**
+     * Called when a tag is closed, either because of a closing tag in the
+     * input or because the parser closed it on its own.
+     *
+     * @param aTag lower-cased tag name
+     */
+    public void closeTag(String aTag) throws HTMLParserExc;
+
+    /** Called with a comment as delivered by the scanner. */
+    public void comment(String aData) throws HTMLParserExc;
+
+    /** Called with character data as delivered by the scanner. */
+    public void cdata(String aData) throws HTMLParserExc;
+  }
+}
\ No newline at end of file