source/mir/util/xml/html/HTMLScanner.java

   1 package mir.util.xml.html;
   2
   3 import java.io.IOException;
   4 import java.io.Reader;
   5 import java.util.HashMap;
   6 import java.util.Map;
   7
   8 import mir.util.HTMLRoutines;
   9
  10 public class HTMLScanner {
  11   private ReaderWrapper reader;
  12   private ScannerReceiver receiver;
  13
  14   public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
  15     reader = new ReaderWrapper(aReader);
  16     receiver = aReceiver;
  17   }
  18
  19   public interface ScannerReceiver {
  20     public void handleDTD(String aDTD) throws HTMLParserExc;
  21     public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
  22     public void handleClosingTag(String aTag) throws HTMLParserExc;
  23     public void handleCData(String aData) throws HTMLParserExc;
  24     public void handleComment(String aTag) throws HTMLParserExc;
  25     public void handleEndOfStream() throws HTMLParserExc;
  26   }
  27
  28   public void run() throws IOException, HTMLParserExc {
  29
  30     while (!reader.isAtEnd()) {
  31       char c = reader.peek();
  32
  33       if (c != '<')
  34         readCData();
  35       else {
  36         reader.get();
  37         c = reader.peek();
  38
  39         switch (c) {
  40           case '!':
  41             reader.get();
  42             readSpecial();
  43             break;
  44           case '/':
  45             reader.get();
  46             readEndTag();
  47             break;
  48           default:
  49             readTag();
  50         }
  51       }
  52     }
  53
  54     receiver.handleEndOfStream();
  55   }
  56
  57   private boolean isValidTagNameCharacter(char aCharacter) {
  58     int type = Character.getType(aCharacter);
  59
  60     return
  61         (type == Character.UPPERCASE_LETTER)  ||
  62         (type == Character.LOWERCASE_LETTER)  ||
  63         (type == Character.DECIMAL_DIGIT_NUMBER)  ||
  64         (aCharacter == '.') ||
  65         (aCharacter == '-') ||
  66         (aCharacter == '_') ||
  67         (aCharacter == ':');
  68   }
  69
  70   private void skipWhiteSpace() throws IOException {
  71     while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
  72       reader.get();
  73     }
  74   }
  75
  76   private void readEndTag() throws IOException, HTMLParserExc {
  77     StringBuffer result = new StringBuffer();
  78
  79     skipWhiteSpace();
  80
  81     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
  82       result.append(reader.get());
  83     }
  84
  85     skipWhiteSpace();
  86
  87     if (!reader.isAtEnd() && reader.peek()=='>')
  88       reader.get();
  89
  90     receiver.handleClosingTag(result.toString());
  91   }
  92
  93   private String getName() throws IOException {
  94     StringBuffer result = new StringBuffer();
  95
  96     skipWhiteSpace();
  97
  98     while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
  99       result.append(reader.get());
 100     }
 101
 102     if (result.length()==0)
 103       return null;
 104     else
 105       return result.toString();
 106   }
 107
 108   private String getAttributeValue() throws IOException {
 109     StringBuffer result = new StringBuffer();
 110
 111     skipWhiteSpace();
 112
 113     if (!reader.isAtEnd()) {
 114       if (reader.peek() == '\'' || reader.peek() == '\"') {
 115         char boundary = reader.get();
 116
 117         while (!reader.isAtEnd() && reader.peek()!=boundary) {
 118           result.append(reader.get());
 119         }
 120
 121         if (!reader.isAtEnd() && reader.peek()==boundary)
 122           reader.get();
 123       }
 124       else {
 125         while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
 126           result.append(reader.get());
 127         }
 128       }
 129     }
 130
 131     return HTMLRoutines.resolveHTMLEntites(result.toString());
 132   }
 133
 134   private void readTag() throws IOException, HTMLParserExc {
 135     String tagName = getName();
 136
 137     Map attributes = new HashMap();
 138
 139     String attributeName = getName();
 140     String attributeValue = null;
 141
 142     while (attributeName!=null) {
 143       skipWhiteSpace();
 144
 145       if (!reader.isAtEnd() && reader.peek()=='=') {
 146         reader.get();
 147         attributeValue = getAttributeValue();
 148       }
 149
 150       attributes.put(attributeName, attributeValue);
 151
 152       attributeName = getName();
 153     }
 154
 155     boolean isClosed=false;
 156
 157     skipWhiteSpace();
 158     if (!reader.isAtEnd() && reader.peek()=='/') {
 159       isClosed = true;
 160       reader.get();
 161     }
 162
 163     skipWhiteSpace();
 164     if (!reader.isAtEnd() && reader.peek()=='>') {
 165       reader.get();
 166     }
 167
 168     receiver.handleOpenTag(tagName, attributes);
 169     if (isClosed)
 170       receiver.handleClosingTag(tagName);
 171   }
 172
 173   private void readSpecial() throws IOException, HTMLParserExc  {
 174     StringBuffer result = new StringBuffer();
 175
 176     if (!reader.isAtEnd() && reader.peek()=='-') {
 177       reader.get();
 178       if (!reader.isAtEnd() && reader.peek()=='-') {
 179         reader.get();
 180
 181         while (!reader.isAtEnd()) {
 182           if (reader.peek()=='-') {
 183             reader.get();
 184             if (!reader.isAtEnd() && reader.peek()=='-') {
 185               reader.get();
 186               break;
 187             }
 188             result.append('-');
 189           }
 190           if (!reader.isAtEnd())
 191             result.append(reader.get());
 192         }
 193
 194         skipWhiteSpace();
 195
 196         if (!reader.isAtEnd() && reader.peek()=='>')
 197           reader.get();
 198
 199         receiver.handleComment(result.toString());
 200
 201         return;
 202       }
 203       else {
 204         result.append('-');
 205       }
 206     }
 207
 208     while (!reader.isAtEnd() && reader.peek()!='>') {
 209       result.append(reader.get());
 210     }
 211
 212     if (!reader.isAtEnd() && reader.peek()=='>')
 213       reader.get();
 214
 215     receiver.handleDTD("<!"+result.toString()+">");
 216   }
 217
 218   private void readCData() throws IOException, HTMLParserExc {
 219     StringBuffer result = new StringBuffer();
 220
 221     while (!reader.isAtEnd() && reader.peek()!='<') {
 222       result.append(reader.get());
 223     }
 224
 225
 226
 227     receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
 228   }
 229
 230   /**
 231    * Class to provide for a 1 character look-ahead on a reader
 232    */
 233   public static class ReaderWrapper {
 234     private Reader reader;
 235     private char buffer;
 236     private boolean haveBuffer;
 237
 238     public ReaderWrapper(Reader aReader) {
 239       reader = aReader;
 240       haveBuffer = false;
 241     }
 242
 243     /**
 244      * Returns <code>true</code> if the stream contains no more characters.
 245      */
 246     public boolean isAtEnd() throws IOException {
 247       fillBuffer();
 248
 249       return !haveBuffer;
 250     }
 251
 252     /**
 253      * Gets the next character from the reader but will not remove it from the
 254      *    stream.
 255      *    {@link #isAtEnd()} must return <code>false</code> before call this
 256      *    routine.
 257      */
 258     public char peek() throws IOException {
 259       fillBuffer();
 260
 261       return buffer;
 262     }
 263
 264     /**
 265      * Gets the next character from the reader and removes it from the stream.
 266      *    {@link #isAtEnd()} must return <code>false</code> before call this
 267      *    routine.
 268      */
 269     public char get() throws IOException {
 270       fillBuffer();
 271       haveBuffer = false;
 272
 273       return buffer;
 274     }
 275
 276     /**
 277      * If the reader is not at it's end, then upon return, the buffer will
 278      *    be filled. If the buffer was already filled, then this method won't
 279      *    do anything.
 280      */
 281     public void fillBuffer() throws IOException {
 282       if (!haveBuffer) {
 283         int c = reader.read();
 284
 285         if (c!=-1) {
 286           buffer = (char) c;
 287           haveBuffer=true;
 288         }
 289       }
 290     }
 291   }
 292 }