--- /dev/null
+package mir.util.xml.html;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import mir.util.HTMLRoutines;
+
+/**
+ * Forgiving, single-pass scanner for HTML-like markup.
+ *
+ * The scanner reads characters from a {@link Reader} with one character of
+ * look-ahead and reports what it finds (document type declarations, opening
+ * and closing tags, character data and comments) to a
+ * {@link ScannerReceiver}. Malformed input is never rejected by the scanner
+ * itself: anything that cannot be recognised as markup is passed on as
+ * character data.
+ */
+public class HTMLScanner {
+  private final ReaderWrapper reader;
+  private final ScannerReceiver receiver;
+
+  /**
+   * @param aReceiver the object that is notified of every scanned item
+   * @param aReader the source of the characters to scan
+   */
+  public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
+    reader = new ReaderWrapper(aReader);
+    receiver = aReceiver;
+  }
+
+  /**
+   * Callback interface through which the scanner reports its findings.
+   */
+  public interface ScannerReceiver {
+    /** Called with the full text (<code>&lt;!</code> ... <code>&gt;</code>) of a declaration. */
+    public void handleDTD(String aDTD) throws HTMLParserExc;
+
+    /**
+     * Called for an opening tag. The map holds attribute names (String) as
+     * keys; the value is the attribute's value (String), or <code>null</code>
+     * if the attribute was written without a value.
+     */
+    public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
+
+    /**
+     * Called for a closing tag, and directly after
+     * {@link #handleOpenTag(String, Map)} for a tag written as
+     * <code>&lt;tag/&gt;</code>.
+     */
+    public void handleClosingTag(String aTag) throws HTMLParserExc;
+
+    /** Called with a run of character data. */
+    public void handleCData(String aData) throws HTMLParserExc;
+
+    /** Called with the text between <code>&lt;!--</code> and <code>--&gt;</code>. */
+    public void handleComment(String aTag) throws HTMLParserExc;
+
+    /** Called exactly once, after the last character has been processed. */
+    public void handleEndOfStream() throws HTMLParserExc;
+  }
+
+  /**
+   * Scans the complete input and reports every item to the receiver,
+   * finishing with {@link ScannerReceiver#handleEndOfStream()}.
+   *
+   * @throws IOException if reading from the underlying reader fails
+   * @throws HTMLParserExc if the receiver rejects an item
+   */
+  public void run() throws IOException, HTMLParserExc {
+    while (!reader.isAtEnd()) {
+      if (reader.peek() != '<') {
+        readCData();
+      }
+      else {
+        reader.get();
+
+        if (reader.isAtEnd()) {
+          // peek() may not be called at the end of the stream; a lone '<' as
+          // the very last character cannot start markup, so it is plain text.
+          receiver.handleCData("<");
+        }
+        else {
+          switch (reader.peek()) {
+            case '!':
+              reader.get();
+              readSpecial();
+              break;
+            case '/':
+              reader.get();
+              readEndTag();
+              break;
+            default:
+              readTag();
+          }
+        }
+      }
+    }
+
+    receiver.handleEndOfStream();
+  }
+
+  /**
+   * Tells whether the character may be part of a tag or attribute name:
+   * upper/lowercase letters, decimal digits and <code>. - _ :</code>
+   */
+  private boolean isValidTagNameCharacter(char aCharacter) {
+    int type = Character.getType(aCharacter);
+
+    return
+        (type == Character.UPPERCASE_LETTER) ||
+        (type == Character.LOWERCASE_LETTER) ||
+        (type == Character.DECIMAL_DIGIT_NUMBER) ||
+        (aCharacter == '.') ||
+        (aCharacter == '-') ||
+        (aCharacter == '_') ||
+        (aCharacter == ':');
+  }
+
+  /**
+   * Tells whether the character may be part of an attribute value that is
+   * not enclosed in quotes: every name character, plus <code>#</code>.
+   */
+  private boolean isValidUnQuotedAttributeCharacter(char aCharacter) {
+    return (aCharacter == '#') || isValidTagNameCharacter(aCharacter);
+  }
+
+  /**
+   * Consumes whitespace up to the next non-whitespace character or the end
+   * of the stream.
+   */
+  private void skipWhiteSpace() throws IOException {
+    while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
+      reader.get();
+    }
+  }
+
+  /**
+   * Like {@link #skipWhiteSpace()}, but returns the whitespace that was
+   * consumed so that the caller can still report it.
+   */
+  private String readWhiteSpace() throws IOException {
+    StringBuffer result = new StringBuffer();
+
+    while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
+      result.append(reader.get());
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Reads the remainder of a closing tag; the leading <code>&lt;/</code> has
+   * already been consumed. A missing name is reported as the empty string.
+   * The terminating <code>&gt;</code> is consumed only if it directly follows
+   * the name (and optional whitespace).
+   */
+  private void readEndTag() throws IOException, HTMLParserExc {
+    String name = getName();
+
+    skipWhiteSpace();
+
+    if (!reader.isAtEnd() && reader.peek() == '>') {
+      reader.get();
+    }
+
+    receiver.handleClosingTag(name == null ? "" : name);
+  }
+
+  /**
+   * Skips whitespace and then reads a tag or attribute name.
+   *
+   * @return the name, or <code>null</code> if the next character cannot
+   *         start a name (or the stream has ended)
+   */
+  private String getName() throws IOException {
+    StringBuffer result = new StringBuffer();
+
+    skipWhiteSpace();
+
+    while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
+      result.append(reader.get());
+    }
+
+    if (result.length() == 0) {
+      return null;
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Reads an attribute value; the <code>=</code> has already been consumed.
+   * The value is either enclosed in single or double quotes, or consists of
+   * the longest run of characters accepted by
+   * {@link #isValidUnQuotedAttributeCharacter(char)} (possibly empty).
+   *
+   * @return the raw value passed through
+   *         <code>HTMLRoutines.resolveHTMLEntites</code>
+   */
+  private String getAttributeValue() throws IOException {
+    StringBuffer result = new StringBuffer();
+
+    skipWhiteSpace();
+
+    if (!reader.isAtEnd()) {
+      char first = reader.peek();
+
+      if (first == '\'' || first == '\"') {
+        reader.get();
+
+        while (!reader.isAtEnd() && reader.peek() != first) {
+          result.append(reader.get());
+        }
+
+        // Not at the end means the loop stopped on the closing quote.
+        if (!reader.isAtEnd()) {
+          reader.get();
+        }
+      }
+      else {
+        while (!reader.isAtEnd() && isValidUnQuotedAttributeCharacter(reader.peek())) {
+          result.append(reader.get());
+        }
+      }
+    }
+
+    return HTMLRoutines.resolveHTMLEntites(result.toString());
+  }
+
+  /**
+   * Reads the remainder of an opening tag; the leading <code>&lt;</code> has
+   * already been consumed.
+   *
+   * If no tag name follows, the <code>&lt;</code> (together with any
+   * whitespace skipped while looking for the name) is reported as character
+   * data, and scanning resumes at the character that could not start a name.
+   * A tag ending in <code>/&gt;</code> is reported as an opening tag
+   * immediately followed by a closing tag.
+   */
+  private void readTag() throws IOException, HTMLParserExc {
+    String leadingWhiteSpace = readWhiteSpace();
+    String tagName = getName();
+
+    if (tagName == null) {
+      // Not a tag after all (e.g. "<3" or "<>"): hand the text back instead
+      // of reporting an opening tag without a name.
+      receiver.handleCData("<" + leadingWhiteSpace);
+      return;
+    }
+
+    Map attributes = new HashMap();
+    String attributeName = getName();
+
+    while (attributeName != null) {
+      // Scoped per attribute, so that an attribute without a value does not
+      // pick up the value of the attribute before it.
+      String attributeValue = null;
+
+      skipWhiteSpace();
+
+      if (!reader.isAtEnd() && reader.peek() == '=') {
+        reader.get();
+        attributeValue = getAttributeValue();
+      }
+
+      attributes.put(attributeName, attributeValue);
+
+      attributeName = getName();
+    }
+
+    boolean isClosed = false;
+
+    skipWhiteSpace();
+    if (!reader.isAtEnd() && reader.peek() == '/') {
+      isClosed = true;
+      reader.get();
+    }
+
+    skipWhiteSpace();
+    if (!reader.isAtEnd() && reader.peek() == '>') {
+      reader.get();
+    }
+
+    receiver.handleOpenTag(tagName, attributes);
+    if (isClosed) {
+      receiver.handleClosingTag(tagName);
+    }
+  }
+
+  /**
+   * Reads the remainder of a construct starting with <code>&lt;!</code>,
+   * which has already been consumed. A construct starting with
+   * <code>&lt;!--</code> is reported as a comment; everything else is read up
+   * to the first <code>&gt;</code> and reported as a declaration.
+   */
+  private void readSpecial() throws IOException, HTMLParserExc {
+    StringBuffer result = new StringBuffer();
+
+    if (!reader.isAtEnd() && reader.peek() == '-') {
+      reader.get();
+
+      if (!reader.isAtEnd() && reader.peek() == '-') {
+        reader.get();
+        receiver.handleComment(readCommentBody());
+
+        return;
+      }
+
+      // A single dash: not a comment, the dash belongs to the declaration.
+      result.append('-');
+    }
+
+    while (!reader.isAtEnd() && reader.peek() != '>') {
+      result.append(reader.get());
+    }
+
+    // Not at the end means the loop stopped on the terminating '>'.
+    if (!reader.isAtEnd()) {
+      reader.get();
+    }
+
+    receiver.handleDTD("<!" + result.toString() + ">");
+  }
+
+  /**
+   * Reads the body of a comment; the opening <code>&lt;!--</code> has already
+   * been consumed. The comment ends at the first <code>&gt;</code> that is
+   * preceded by at least two dashes inside the body; those two dashes and
+   * the <code>&gt;</code> are not part of the returned text. Nothing beyond
+   * the terminator is consumed. An unterminated comment runs to the end of
+   * the stream.
+   */
+  private String readCommentBody() throws IOException {
+    StringBuffer result = new StringBuffer();
+    int trailingDashes = 0;
+
+    while (!reader.isAtEnd()) {
+      char c = reader.get();
+
+      if (c == '>' && trailingDashes >= 2) {
+        // Drop the "--" of the terminator, keep any dashes before it.
+        result.setLength(result.length() - 2);
+        break;
+      }
+
+      result.append(c);
+      trailingDashes = (c == '-') ? trailingDashes + 1 : 0;
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Reads character data up to the next <code>&lt;</code> or the end of the
+   * stream and reports it, passed through
+   * <code>HTMLRoutines.resolveHTMLEntites</code>.
+   */
+  private void readCData() throws IOException, HTMLParserExc {
+    StringBuffer result = new StringBuffer();
+
+    while (!reader.isAtEnd() && reader.peek() != '<') {
+      result.append(reader.get());
+    }
+
+    receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
+  }
+
+  /**
+   * Class to provide for a 1 character look-ahead on a reader
+   */
+  public static class ReaderWrapper {
+    private Reader reader;
+    private char buffer;
+    private boolean haveBuffer;
+
+    public ReaderWrapper(Reader aReader) {
+      reader = aReader;
+      haveBuffer = false;
+    }
+
+    /**
+     * Returns <code>true</code> if the stream contains no more characters.
+     */
+    public boolean isAtEnd() throws IOException {
+      fillBuffer();
+
+      return !haveBuffer;
+    }
+
+    /**
+     * Gets the next character from the reader but will not remove it from the
+     * stream.
+     * {@link #isAtEnd()} must return <code>false</code> before calling this
+     * routine; at the end of the stream the returned character is whatever
+     * was last held in the look-ahead buffer.
+     */
+    public char peek() throws IOException {
+      fillBuffer();
+
+      return buffer;
+    }
+
+    /**
+     * Gets the next character from the reader and removes it from the stream.
+     * {@link #isAtEnd()} must return <code>false</code> before calling this
+     * routine; at the end of the stream the returned character is whatever
+     * was last held in the look-ahead buffer.
+     */
+    public char get() throws IOException {
+      fillBuffer();
+      haveBuffer = false;
+
+      return buffer;
+    }
+
+    /**
+     * If the reader is not at its end, then upon return, the buffer will
+     * be filled. If the buffer was already filled, then this method won't
+     * do anything.
+     */
+    public void fillBuffer() throws IOException {
+      if (!haveBuffer) {
+        int c = reader.read();
+
+        if (c != -1) {
+          buffer = (char) c;
+          haveBuffer = true;
+        }
+      }
+    }
+  }
+}
\ No newline at end of file