1 package mir.util.xml.html;
3 import java.io.IOException;
5 import java.util.HashMap;
8 import mir.util.HTMLRoutines;
10 public class HTMLScanner {
11 private ReaderWrapper reader;
12 private ScannerReceiver receiver;
14 public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
15 reader = new ReaderWrapper(aReader);
19 public interface ScannerReceiver {
20 public void handleDTD(String aDTD) throws HTMLParserExc;
21 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
22 public void handleClosingTag(String aTag) throws HTMLParserExc;
23 public void handleCData(String aData) throws HTMLParserExc;
24 public void handleComment(String aTag) throws HTMLParserExc;
25 public void handleEndOfStream() throws HTMLParserExc;
28 public void run() throws IOException, HTMLParserExc {
30 while (!reader.isAtEnd()) {
31 char c = reader.peek();
54 receiver.handleEndOfStream();
57 private boolean isValidTagNameCharacter(char aCharacter) {
58 int type = Character.getType(aCharacter);
61 (type == Character.UPPERCASE_LETTER) ||
62 (type == Character.LOWERCASE_LETTER) ||
63 (type == Character.DECIMAL_DIGIT_NUMBER) ||
64 (aCharacter == '.') ||
65 (aCharacter == '-') ||
66 (aCharacter == '_') ||
70 private boolean isValidUnQuotedAttributeCharacter(char aCharacter) {
71 int type = Character.getType(aCharacter);
74 (type == Character.UPPERCASE_LETTER) ||
75 (type == Character.LOWERCASE_LETTER) ||
76 (type == Character.DECIMAL_DIGIT_NUMBER) ||
77 (aCharacter == '.') ||
78 (aCharacter == '#') ||
79 (aCharacter == '-') ||
80 (aCharacter == '_') ||
84 private void skipWhiteSpace() throws IOException {
85 while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
90 private void readEndTag() throws IOException, HTMLParserExc {
91 StringBuffer result = new StringBuffer();
95 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
96 result.append(reader.get());
101 if (!reader.isAtEnd() && reader.peek()=='>')
104 receiver.handleClosingTag(result.toString());
107 private String getName() throws IOException {
108 StringBuffer result = new StringBuffer();
112 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
113 result.append(reader.get());
116 if (result.length()==0)
119 return result.toString();
122 private String getAttributeValue() throws IOException {
123 StringBuffer result = new StringBuffer();
127 if (!reader.isAtEnd()) {
128 if (reader.peek() == '\'' || reader.peek() == '\"') {
129 char boundary = reader.get();
131 while (!reader.isAtEnd() && reader.peek()!=boundary) {
132 result.append(reader.get());
135 if (!reader.isAtEnd() && reader.peek()==boundary)
139 while (!reader.isAtEnd() && isValidUnQuotedAttributeCharacter(reader.peek())) {
140 result.append(reader.get());
145 return HTMLRoutines.resolveHTMLEntites(result.toString());
148 private void readTag() throws IOException, HTMLParserExc {
149 String tagName = getName();
151 Map attributes = new HashMap();
153 String attributeName = getName();
154 String attributeValue = null;
156 while (attributeName!=null) {
159 if (!reader.isAtEnd() && reader.peek()=='=') {
161 attributeValue = getAttributeValue();
164 attributes.put(attributeName, attributeValue);
166 attributeName = getName();
169 boolean isClosed=false;
172 if (!reader.isAtEnd() && reader.peek()=='/') {
178 if (!reader.isAtEnd() && reader.peek()=='>') {
182 receiver.handleOpenTag(tagName, attributes);
184 receiver.handleClosingTag(tagName);
187 private void readSpecial() throws IOException, HTMLParserExc {
188 StringBuffer result = new StringBuffer();
190 if (!reader.isAtEnd() && reader.peek()=='-') {
192 if (!reader.isAtEnd() && reader.peek()=='-') {
195 while (!reader.isAtEnd()) {
196 if (reader.peek()=='-') {
198 if (!reader.isAtEnd() && reader.peek()=='-') {
200 if (!reader.isAtEnd() && reader.peek()=='>') {
208 if (!reader.isAtEnd())
209 result.append(reader.get());
214 if (!reader.isAtEnd() && reader.peek()=='>')
217 receiver.handleComment(result.toString());
226 while (!reader.isAtEnd() && reader.peek()!='>') {
227 result.append(reader.get());
230 if (!reader.isAtEnd() && reader.peek()=='>')
233 receiver.handleDTD("<!"+result.toString()+">");
236 private void readCData() throws IOException, HTMLParserExc {
237 StringBuffer result = new StringBuffer();
239 while (!reader.isAtEnd() && reader.peek()!='<') {
240 result.append(reader.get());
245 receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
249 * Class that provides a one-character look-ahead on a reader
251 public static class ReaderWrapper {
252 private Reader reader;
254 private boolean haveBuffer;
256 public ReaderWrapper(Reader aReader) {
262 * Returns <code>true</code> if the stream contains no more characters.
264 public boolean isAtEnd() throws IOException {
271 * Gets the next character from the reader but will not remove it from the
273 * {@link #isAtEnd()} must return <code>false</code> before calling this
276 public char peek() throws IOException {
283 * Gets the next character from the reader and removes it from the stream.
284 * {@link #isAtEnd()} must return <code>false</code> before calling this
287 public char get() throws IOException {
295 * If the reader is not at its end, then upon return, the buffer will
296 * be filled. If the buffer was already filled, then this method won't
299 public void fillBuffer() throws IOException {
301 int c = reader.read();