rebuilding head
[mir.git] / source / mir / util / xml / html / HTMLParser.java
diff --git a/source/mir/util/xml/html/HTMLParser.java b/source/mir/util/xml/html/HTMLParser.java
new file mode 100755 (executable)
index 0000000..669a9bd
--- /dev/null
@@ -0,0 +1,154 @@
+package mir.util.xml.html;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ *
+ * TODO
+ *   [x] self-closing <br/> tags
+ *   [ ] HTML-unescaping of cdata, attribute values etc.
+ *   [ ] Smarter corrections
+ *       [ ]
+ *   [ ] case sensitivity optional
+ */
+
+public class HTMLParser {
+  private HTMLSchemaInformation schemaInformation;
+
+  public HTMLParser() {
+    schemaInformation = new HTMLSchemaInformation();
+  }
+
+  public void parse(Reader aReader, ParserReceiver aReceiver) throws HTMLParserExc, HTMLParserFailure, IOException {
+    HTMLScanner scanner;
+    CoreParser parser;
+    parser = new CoreParser(aReceiver);
+    scanner = new HTMLScanner(parser, aReader);
+    scanner.run();
+  }
+
+  private class CoreParser implements HTMLScanner.ScannerReceiver {
+    private ParserReceiver receiver;
+    private Stack tagStack;
+
+    public CoreParser(ParserReceiver aReceiver) {
+      receiver = aReceiver;
+      tagStack = new Stack();
+    }
+
+    public void handleDTD(String aDTD) throws HTMLParserExc  {
+      receiver.dtd(aDTD);
+    }
+
+    public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc  {
+      String lowercaseTag = aTag.toLowerCase();
+
+      HTMLSchemaInformation.HTMLTagInformation tagInformation =
+          schemaInformation.lookupTag(lowercaseTag);
+
+      Map attributes = new HashMap();
+
+      Iterator i = anAttributes.entrySet().iterator();
+      while (i.hasNext()) {
+        Map.Entry entry = (Map.Entry) i.next();
+
+        attributes.put(((String) entry.getKey()).toLowerCase(), entry.getValue());
+      }
+
+      if (tagInformation!=null) {
+        if (tagInformation.getIsBlock()) {
+          closeAllInlineTags();
+        }
+
+        closeAllAutoclosingTags(tagInformation);
+
+        receiver.openTag(lowercaseTag, attributes);
+        if (tagInformation.getHasBody()) {
+          tagStack.push(lowercaseTag);
+        }
+        else {
+          receiver.closeTag(lowercaseTag);
+        }
+      }
+      else {
+        receiver.openTag(lowercaseTag, attributes);
+        tagStack.push(lowercaseTag);
+      }
+    }
+
+    public void handleClosingTag(String aTag) throws HTMLParserExc {
+      String lowercaseTag = aTag.toLowerCase();
+
+      HTMLSchemaInformation.HTMLTagInformation tagInformation =
+          schemaInformation.lookupTag(lowercaseTag);
+
+      if (tagInformation!=null) {
+        if (tagInformation.getIsBlock()) {
+          closeAllInlineTags();
+        }
+      }
+
+      int index = tagStack.search(lowercaseTag);
+
+      if (index>-1 && index<4) {
+        for (int i=0; i<index; i++) {
+          closeUpmostTag();
+        }
+      }
+    }
+
+    public void handleCData(String aData)  throws HTMLParserExc {
+      receiver.cdata(aData);
+    }
+
+    public void handleComment(String aTag) throws HTMLParserExc  {
+      receiver.comment(aTag);
+    }
+
+    public void handleEndOfStream() throws HTMLParserExc {
+      while (!tagStack.empty())
+        closeUpmostTag();
+    }
+
+    private void closeAllAutoclosingTags(HTMLSchemaInformation.HTMLTagInformation aTagInformation) throws HTMLParserExc {
+      while (!tagStack.empty()) {
+        String tag = (String) tagStack.peek();
+
+        if (aTagInformation.autoClose(tag)) {
+          closeUpmostTag();
+        }
+        else {
+          break;
+        }
+      }
+    }
+
+    private void closeAllInlineTags() throws HTMLParserExc {
+      while (!tagStack.empty()) {
+        HTMLSchemaInformation.HTMLTagInformation tagInformation =
+            schemaInformation.lookupTag((String) tagStack.peek());
+
+        if (tagInformation!=null && !tagInformation.getIsBlock()) {
+          closeUpmostTag();
+        }
+        else {
+          break;
+        }
+      }
+    }
+
+    private void closeUpmostTag() throws HTMLParserExc {
+      receiver.closeTag((String) tagStack.peek());
+      tagStack.pop();
+    }
+  }
+
+  public interface ParserReceiver {
+    public void dtd(String aDTD) throws HTMLParserExc;
+    public void openTag(String aTag, Map anAttributes) throws HTMLParserExc;
+    public void closeTag(String aTag) throws HTMLParserExc;
+    public void comment(String aData) throws HTMLParserExc;
+    public void cdata(String aData) throws HTMLParserExc;
+  }
+}
\ No newline at end of file