source/mir/util/xml/html/HTMLScanner.java

   1 package mir.util.xml.html;
   2
   3 import java.io.IOException;
   4 import java.io.Reader;
   5 import java.util.HashMap;
   6 import java.util.Map;
   7
   8 import mir.util.HTMLRoutines;
   9
  10 public class HTMLScanner {
  11   private ReaderWrapper reader;
  12   private ScannerReceiver receiver;
  13
  14   public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
  15     reader = new ReaderWrapper(aReader);
  16     receiver = aReceiver;
  17   }
  18
  19   public interface ScannerReceiver {
  20     public void handleDTD(String aDTD) throws HTMLParserExc;
  21     public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
  22     public void handleClosingTag(String aTag) throws HTMLParserExc;
  23     public void handleCData(String aData) throws HTMLParserExc;
  24     public void handleComment(String aTag) throws HTMLParserExc;
  25     public void handleEndOfStream() throws HTMLParserExc;
  26   }
  27
  28   public void run() throws IOException, HTMLParserExc {
  29
  30     while (!reader.isAtEnd()) {
  31       char c = reader.peek();
  32
  33       if (c != '<')
  34         readCData();
  35       else {
  36         reader.get();
  37         c = reader.peek();
  38
  39         switch (c) {
  40           case '!':
  41             reader.get();
  42             readSpecial();
  43             break;
  44           case '/':
  45             reader.get();
  46             readEndTag();
  47             break;
  48           default:
  49             readTag();
  50         }
  51       }
  52     }
  53
  54     receiver.handleEndOfStream();
  55   }
  56
  57   private boolean isValidTagNameCharacter(char aCharacter) {
  58     int type = Character.getType(aCharacter);
  59
  60     return
  61         (type == Character.UPPERCASE_LETTER)  ||
  62         (type == Character.LOWERCASE_LETTER)  ||
  63         (type == Character.DECIMAL_DIGIT_NUMBER)  ||
  64         (aCharacter == '.') ||
  65         (aCharacter == '-') ||
  66         (aCharacter == '_') ||
  67         (aCharacter == ':');
  68   }
  69
  70   private boolean isValidUnQuotedAttributeCharacter(char aCharacter) {
  71     int type = Character.getType(aCharacter);
  72
  73     return
  74         (type == Character.UPPERCASE_LETTER)  ||
  75         (type == Character.LOWERCASE_LETTER)  ||
  76         (type == Character.DECIMAL_DIGIT_NUMBER)  ||
  77         (aCharacter == '.') ||
  78         (aCharacter == '#') ||
  79         (aCharacter == '-') ||
  80         (aCharacter == '_') ||
  81         (aCharacter == ':');
  82   }
  83
  84   private void skipWhiteSpace() throws IOException {
  85     while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
  86       reader.get();
  87     }
  88   }
  89
  90   private void readEndTag() throws IOException, HTMLParserExc {
  91     StringBuffer result = new StringBuffer();
  92
  93     skipWhiteSpace();
  94
  95     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
  96       result.append(reader.get());
  97     }
  98
  99     skipWhiteSpace();
 100
 101     if (!reader.isAtEnd() && reader.peek()=='>')
 102       reader.get();
 103
 104     receiver.handleClosingTag(result.toString());
 105   }
 106
 107   private String getName() throws IOException {
 108     StringBuffer result = new StringBuffer();
 109
 110     skipWhiteSpace();
 111
 112     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
 113       result.append(reader.get());
 114     }
 115
 116     if (result.length()==0)
 117       return null;
 118     else
 119       return result.toString();
 120   }
 121
 122   private String getAttributeValue() throws IOException {
 123     StringBuffer result = new StringBuffer();
 124
 125     skipWhiteSpace();
 126
 127     if (!reader.isAtEnd()) {
 128       if (reader.peek() == '\'' || reader.peek() == '\"') {
 129         char boundary = reader.get();
 130
 131         while (!reader.isAtEnd() && reader.peek()!=boundary) {
 132           result.append(reader.get());
 133         }
 134
 135         if (!reader.isAtEnd() && reader.peek()==boundary)
 136           reader.get();
 137       }
 138       else {
 139         while (!reader.isAtEnd() && isValidUnQuotedAttributeCharacter(reader.peek())) {
 140           result.append(reader.get());
 141         }
 142       }
 143     }
 144
 145     return HTMLRoutines.resolveHTMLEntites(result.toString());
 146   }
 147
 148   private void readTag() throws IOException, HTMLParserExc {
 149     String tagName = getName();
 150
 151     Map attributes = new HashMap();
 152
 153     String attributeName = getName();
 154     String attributeValue = null;
 155
 156     while (attributeName!=null) {
 157       skipWhiteSpace();
 158
 159       if (!reader.isAtEnd() && reader.peek()=='=') {
 160         reader.get();
 161         attributeValue = getAttributeValue();
 162       }
 163
 164       attributes.put(attributeName, attributeValue);
 165
 166       attributeName = getName();
 167     }
 168
 169     boolean isClosed=false;
 170
 171     skipWhiteSpace();
 172     if (!reader.isAtEnd() && reader.peek()=='/') {
 173       isClosed = true;
 174       reader.get();
 175     }
 176
 177     skipWhiteSpace();
 178     if (!reader.isAtEnd() && reader.peek()=='>') {
 179       reader.get();
 180     }
 181
 182     receiver.handleOpenTag(tagName, attributes);
 183     if (isClosed)
 184       receiver.handleClosingTag(tagName);
 185   }
 186
 187   private void readSpecial() throws IOException, HTMLParserExc  {
 188     StringBuffer result = new StringBuffer();
 189
 190     if (!reader.isAtEnd() && reader.peek()=='-') {
 191       reader.get();
 192       if (!reader.isAtEnd() && reader.peek()=='-') {
 193         reader.get();
 194
 195         while (!reader.isAtEnd()) {
 196           if (reader.peek()=='-') {
 197             reader.get();
 198             if (!reader.isAtEnd() && reader.peek()=='-') {
 199               reader.get();
 200               if (!reader.isAtEnd() && reader.peek()=='>') {
 201                 reader.get();
 202                 break;
 203               }
 204               result.append('-');
 205             }
 206             result.append('-');
 207           }
 208           if (!reader.isAtEnd())
 209             result.append(reader.get());
 210         }
 211
 212         skipWhiteSpace();
 213
 214         if (!reader.isAtEnd() && reader.peek()=='>')
 215           reader.get();
 216
 217         receiver.handleComment(result.toString());
 218
 219         return;
 220       }
 221       else {
 222         result.append('-');
 223       }
 224     }
 225
 226     while (!reader.isAtEnd() && reader.peek()!='>') {
 227       result.append(reader.get());
 228     }
 229
 230     if (!reader.isAtEnd() && reader.peek()=='>')
 231       reader.get();
 232
 233     receiver.handleDTD("<!"+result.toString()+">");
 234   }
 235
 236   private void readCData() throws IOException, HTMLParserExc {
 237     StringBuffer result = new StringBuffer();
 238
 239     while (!reader.isAtEnd() && reader.peek()!='<') {
 240       result.append(reader.get());
 241     }
 242
 243
 244
 245     receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
 246   }
 247
 248   /**
 249    * Class to provide for a 1 character look-ahead on a reader
 250    */
 251   public static class ReaderWrapper {
 252     private Reader reader;
 253     private char buffer;
 254     private boolean haveBuffer;
 255
 256     public ReaderWrapper(Reader aReader) {
 257       reader = aReader;
 258       haveBuffer = false;
 259     }
 260
 261     /**
 262      * Returns <code>true</code> if the stream contains no more characters.
 263      */
 264     public boolean isAtEnd() throws IOException {
 265       fillBuffer();
 266
 267       return !haveBuffer;
 268     }
 269
 270     /**
 271      * Gets the next character from the reader but will not remove it from the
 272      *    stream.
 273      *    {@link #isAtEnd()} must return <code>false</code> before call this
 274      *    routine.
 275      */
 276     public char peek() throws IOException {
 277       fillBuffer();
 278
 279       return buffer;
 280     }
 281
 282     /**
 283      * Gets the next character from the reader and removes it from the stream.
 284      *    {@link #isAtEnd()} must return <code>false</code> before call this
 285      *    routine.
 286      */
 287     public char get() throws IOException {
 288       fillBuffer();
 289       haveBuffer = false;
 290
 291       return buffer;
 292     }
 293
 294     /**
 295      * If the reader is not at it's end, then upon return, the buffer will
 296      *    be filled. If the buffer was already filled, then this method won't
 297      *    do anything.
 298      */
 299     public void fillBuffer() throws IOException {
 300       if (!haveBuffer) {
 301         int c = reader.read();
 302
 303         if (c!=-1) {
 304           buffer = (char) c;
 305           haveBuffer=true;
 306         }
 307       }
 308     }
 309   }
 310 }