source/mir/util/xml/html/HTMLScanner.java

   1 package mir.util.xml.html;
   2
   3 import java.io.IOException;
   4 import java.io.Reader;
   5 import java.util.HashMap;
   6 import java.util.Map;
   7
   8 import mir.util.HTMLRoutines;
   9
  10 public class HTMLScanner {
  11   private ReaderWrapper reader;
  12   private ScannerReceiver receiver;
  13
  14   public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
  15     reader = new ReaderWrapper(aReader);
  16     receiver = aReceiver;
  17   }
  18
  19   public interface ScannerReceiver {
  20     public void handleDTD(String aDTD) throws HTMLParserExc;
  21     public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
  22     public void handleClosingTag(String aTag) throws HTMLParserExc;
  23     public void handleCData(String aData) throws HTMLParserExc;
  24     public void handleComment(String aTag) throws HTMLParserExc;
  25     public void handleEndOfStream() throws HTMLParserExc;
  26   }
  27
  28   public void run() throws IOException, HTMLParserExc {
  29
  30     while (!reader.isAtEnd()) {
  31       char c = reader.peek();
  32
  33       if (c != '<')
  34         readCData();
  35       else {
  36         reader.get();
  37         c = reader.peek();
  38
  39         switch (c) {
  40           case '!':
  41             reader.get();
  42             readSpecial();
  43             break;
  44           case '/':
  45             reader.get();
  46             readEndTag();
  47             break;
  48           case '<':
  49             receiver.handleCData(new String(new char[] {c }));
  50             break;
  51           default:
  52             readTag();
  53         }
  54       }
  55     }
  56
  57     receiver.handleEndOfStream();
  58   }
  59
  60   private boolean isValidTagNameCharacter(char aCharacter) {
  61     int type = Character.getType(aCharacter);
  62
  63     return
  64         (type == Character.UPPERCASE_LETTER)  ||
  65         (type == Character.LOWERCASE_LETTER)  ||
  66         (type == Character.DECIMAL_DIGIT_NUMBER)  ||
  67         (aCharacter == '.') ||
  68         (aCharacter == '-') ||
  69         (aCharacter == '_') ||
  70         (aCharacter == ':');
  71   }
  72
  73   private boolean isValidUnQuotedAttributeCharacter(char aCharacter) {
  74     int type = Character.getType(aCharacter);
  75
  76     return
  77         (type == Character.UPPERCASE_LETTER)  ||
  78         (type == Character.LOWERCASE_LETTER)  ||
  79         (type == Character.DECIMAL_DIGIT_NUMBER)  ||
  80         (aCharacter == '.') ||
  81         (aCharacter == '#') ||
  82         (aCharacter == '-') ||
  83         (aCharacter == '_') ||
  84         (aCharacter == ':');
  85   }
  86
  87   private void skipWhiteSpace() throws IOException {
  88     while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
  89       reader.get();
  90     }
  91   }
  92
  93   private void readEndTag() throws IOException, HTMLParserExc {
  94     StringBuffer result = new StringBuffer();
  95
  96     skipWhiteSpace();
  97
  98     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
  99       result.append(reader.get());
 100     }
 101
 102     skipWhiteSpace();
 103
 104     if (!reader.isAtEnd() && reader.peek()=='>')
 105       reader.get();
 106
 107     receiver.handleClosingTag(result.toString());
 108   }
 109
 110   private String getName() throws IOException {
 111     StringBuffer result = new StringBuffer();
 112
 113     skipWhiteSpace();
 114
 115     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
 116       result.append(reader.get());
 117     }
 118
 119     if (result.length()==0)
 120       return null;
 121                 return result.toString();
 122   }
 123
 124   private String getAttributeValue() throws IOException {
 125     StringBuffer result = new StringBuffer();
 126
 127     skipWhiteSpace();
 128
 129     if (!reader.isAtEnd()) {
 130       if (reader.peek() == '\'' || reader.peek() == '\"') {
 131         char boundary = reader.get();
 132
 133         while (!reader.isAtEnd() && reader.peek()!=boundary) {
 134           result.append(reader.get());
 135         }
 136
 137         if (!reader.isAtEnd() && reader.peek()==boundary)
 138           reader.get();
 139       }
 140       else {
 141         while (!reader.isAtEnd() && isValidUnQuotedAttributeCharacter(reader.peek())) {
 142           result.append(reader.get());
 143         }
 144       }
 145     }
 146
 147     return HTMLRoutines.resolveHTMLEntites(result.toString());
 148   }
 149
 150   private void readTag() throws IOException, HTMLParserExc {
 151     String tagName = getName();
 152
 153     Map attributes = new HashMap();
 154
 155     String attributeName = getName();
 156     String attributeValue = null;
 157
 158     while (attributeName!=null) {
 159       skipWhiteSpace();
 160
 161       if (!reader.isAtEnd() && reader.peek()=='=') {
 162         reader.get();
 163         attributeValue = getAttributeValue();
 164       }
 165
 166       attributes.put(attributeName, attributeValue);
 167
 168       attributeName = getName();
 169     }
 170
 171     boolean isClosed=false;
 172
 173     skipWhiteSpace();
 174     if (!reader.isAtEnd() && reader.peek()=='/') {
 175       isClosed = true;
 176       reader.get();
 177     }
 178
 179     skipWhiteSpace();
 180     if (!reader.isAtEnd() && reader.peek()=='>') {
 181       reader.get();
 182     }
 183
 184     receiver.handleOpenTag(tagName, attributes);
 185     if (isClosed)
 186       receiver.handleClosingTag(tagName);
 187   }
 188
 189   private void readSpecial() throws IOException, HTMLParserExc  {
 190     StringBuffer result = new StringBuffer();
 191
 192     if (!reader.isAtEnd() && reader.peek()=='-') {
 193       reader.get();
 194       if (!reader.isAtEnd() && reader.peek()=='-') {
 195         reader.get();
 196
 197         while (!reader.isAtEnd()) {
 198           if (reader.peek()=='-') {
 199             reader.get();
 200             if (!reader.isAtEnd() && reader.peek()=='-') {
 201               reader.get();
 202               if (!reader.isAtEnd() && reader.peek()=='>') {
 203                 reader.get();
 204                 break;
 205               }
 206               result.append('-');
 207             }
 208             result.append('-');
 209           }
 210           if (!reader.isAtEnd())
 211             result.append(reader.get());
 212         }
 213
 214         skipWhiteSpace();
 215
 216         if (!reader.isAtEnd() && reader.peek()=='>')
 217           reader.get();
 218
 219         receiver.handleComment(result.toString());
 220
 221         return;
 222       }
 223                         result.append('-');
 224     }
 225
 226     while (!reader.isAtEnd() && reader.peek()!='>') {
 227       result.append(reader.get());
 228     }
 229
 230     if (!reader.isAtEnd() && reader.peek()=='>')
 231       reader.get();
 232
 233     receiver.handleDTD("<!"+result.toString()+">");
 234   }
 235
 236   private void readCData() throws IOException, HTMLParserExc {
 237     StringBuffer result = new StringBuffer();
 238
 239     while (!reader.isAtEnd() && reader.peek()!='<') {
 240       result.append(reader.get());
 241     }
 242
 243     receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
 244   }
 245
 246   /**
 247    * Class to provide for a 1 character look-ahead on a reader
 248    */
 249   public static class ReaderWrapper {
 250     private Reader reader;
 251     private char buffer;
 252     private boolean haveBuffer;
 253
 254     public ReaderWrapper(Reader aReader) {
 255       reader = aReader;
 256       haveBuffer = false;
 257     }
 258
 259     /**
 260      * Returns <code>true</code> if the stream contains no more characters.
 261      */
 262     public boolean isAtEnd() throws IOException {
 263       fillBuffer();
 264
 265       return !haveBuffer;
 266     }
 267
 268     /**
 269      * Gets the next character from the reader but will not remove it from the
 270      *    stream.
 271      *    {@link #isAtEnd()} must return <code>false</code> before call this
 272      *    routine.
 273      */
 274     public char peek() throws IOException {
 275       fillBuffer();
 276
 277       return buffer;
 278     }
 279
 280     /**
 281      * Gets the next character from the reader and removes it from the stream.
 282      *    {@link #isAtEnd()} must return <code>false</code> before call this
 283      *    routine.
 284      */
 285     public char get() throws IOException {
 286       fillBuffer();
 287       haveBuffer = false;
 288
 289       return buffer;
 290     }
 291
 292     /**
 293      * If the reader is not at it's end, then upon return, the buffer will
 294      *    be filled. If the buffer was already filled, then this method won't
 295      *    do anything.
 296      */
 297     public void fillBuffer() throws IOException {
 298       if (!haveBuffer) {
 299         int c = reader.read();
 300
 301         if (c!=-1) {
 302           buffer = (char) c;
 303           haveBuffer=true;
 304         }
 305       }
 306     }
 307   }
 308 }