1 package mir.util.xml.html;
3 import java.io.IOException;
5 import java.util.HashMap;
8 import mir.util.HTMLRoutines;
10 public class HTMLScanner {
11 private ReaderWrapper reader;
12 private ScannerReceiver receiver;
/**
 * Creates a scanner over the given reader. The reader is wrapped in a
 * {@link ReaderWrapper}, which the scanning methods use to peek at and
 * consume characters.
 *
 * @param aReceiver the receiver of the scanner's callbacks
 *        (NOTE(review): its assignment to the field is not visible here - confirm)
 * @param aReader the character source to scan
 */
14 public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
15 reader = new ReaderWrapper(aReader);
/**
 * Callback interface through which the scanner reports what it encounters
 * in the input. Every callback may throw {@link HTMLParserExc}.
 */
19 public interface ScannerReceiver {
// Receives a declaration, passed as the collected text wrapped in "<!" and ">".
20 public void handleDTD(String aDTD) throws HTMLParserExc;
// Receives a tag name together with the map of attribute names to values.
21 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
// Receives the name of a closing tag.
22 public void handleClosingTag(String aTag) throws HTMLParserExc;
// Receives character data found between tags.
23 public void handleCData(String aData) throws HTMLParserExc;
// Receives the text collected for a comment.
24 public void handleComment(String aTag) throws HTMLParserExc;
// Signals that the scanner has finished with the stream.
25 public void handleEndOfStream() throws HTMLParserExc;
/**
 * Scans the input and reports what is found to the receiver.
 * Loops while the reader is not at its end, peeking at the next character
 * to decide how to proceed, and notifies the receiver of the end of the
 * stream.
 *
 * @throws IOException if reading from the underlying reader fails
 * @throws HTMLParserExc if the receiver rejects an event
 */
28 public void run() throws IOException, HTMLParserExc {
30 while (!reader.isAtEnd()) {
31 char c = reader.peek();
// In one of the branches the character is handed over, as is, as a
// one-character piece of character data.
// NOTE(review): the condition selecting this branch is not visible here.
49 receiver.handleCData(new String(new char[] {c }));
57 receiver.handleEndOfStream();
/**
 * Tests whether a character is acceptable inside a name; {@link #getName()}
 * and {@link #readEndTag()} use it to decide where a name stops.
 * The visible alternatives accept upper case letters, lower case letters,
 * decimal digits and the characters '.', '-' and '_'.
 * NOTE(review): the expression continues after the last visible
 * alternative - confirm the complete set of accepted characters.
 *
 * @param aCharacter the character to test
 */
60 private boolean isValidTagNameCharacter(char aCharacter) {
// Unicode general category of the character, compared against the
// Character.* category constants below.
61 int type = Character.getType(aCharacter);
64 (type == Character.UPPERCASE_LETTER) ||
65 (type == Character.LOWERCASE_LETTER) ||
66 (type == Character.DECIMAL_DIGIT_NUMBER) ||
67 (aCharacter == '.') ||
68 (aCharacter == '-') ||
69 (aCharacter == '_') ||
/**
 * Tests whether a character is acceptable inside an attribute value that
 * is not enclosed in quotes; used by {@link #getAttributeValue()}.
 * The visible alternatives accept upper case letters, lower case letters,
 * decimal digits and the characters '.', '#', '-' and '_'.
 * NOTE(review): the expression continues after the last visible
 * alternative - confirm the complete set of accepted characters.
 *
 * @param aCharacter the character to test
 */
73 private boolean isValidUnQuotedAttributeCharacter(char aCharacter) {
// Unicode general category of the character, compared against the
// Character.* category constants below.
74 int type = Character.getType(aCharacter);
77 (type == Character.UPPERCASE_LETTER) ||
78 (type == Character.LOWERCASE_LETTER) ||
79 (type == Character.DECIMAL_DIGIT_NUMBER) ||
80 (aCharacter == '.') ||
81 (aCharacter == '#') ||
82 (aCharacter == '-') ||
83 (aCharacter == '_') ||
/**
 * Loops for as long as the reader has a next character and that character
 * is whitespace according to {@link Character#isWhitespace(char)}.
 * NOTE(review): the loop body is not visible here; presumably it consumes
 * the whitespace character - confirm.
 *
 * @throws IOException if reading from the underlying reader fails
 */
87 private void skipWhiteSpace() throws IOException {
88 while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
/**
 * Reads the name of a closing tag and reports it through
 * {@link ScannerReceiver#handleClosingTag(String)}.
 *
 * @throws IOException if reading from the underlying reader fails
 * @throws HTMLParserExc if the receiver rejects the closing tag
 */
93 private void readEndTag() throws IOException, HTMLParserExc {
94 StringBuffer result = new StringBuffer();
// Collect the tag name: every consecutive valid name character.
98 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
99 result.append(reader.get());
// A '>' directly ahead gets special treatment (presumably it is consumed;
// the statement itself is not visible here). The end-of-input check means
// a tag cut off by the end of the stream is tolerated.
104 if (!reader.isAtEnd() && reader.peek()=='>')
107 receiver.handleClosingTag(result.toString());
/**
 * Reads a name - a run of characters accepted by
 * {@link #isValidTagNameCharacter(char)} - from the reader. Used for tag
 * names as well as attribute names.
 *
 * @return the collected name.
 *         NOTE(review): the case where no character was collected is handled
 *         separately by a statement not visible here; readTag() compares the
 *         result with null, so presumably null is returned - confirm.
 * @throws IOException if reading from the underlying reader fails
 */
110 private String getName() throws IOException {
111 StringBuffer result = new StringBuffer();
// Consume characters for as long as they are valid in a name.
115 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
116 result.append(reader.get());
// Nothing collected: no name starts at the current position.
119 if (result.length()==0)
121 return result.toString();
/**
 * Reads an attribute value from the reader. Two forms are handled: a value
 * enclosed in single or double quotes, and a run of characters accepted by
 * {@link #isValidUnQuotedAttributeCharacter(char)}.
 *
 * @return the collected value after it has been passed through
 *         HTMLRoutines.resolveHTMLEntites; built from an empty buffer when
 *         the reader is already at its end
 * @throws IOException if reading from the underlying reader fails
 */
124 private String getAttributeValue() throws IOException {
125 StringBuffer result = new StringBuffer();
129 if (!reader.isAtEnd()) {
130 if (reader.peek() == '\'' || reader.peek() == '\"') {
// The opening quote is consumed and remembered: the value runs up to the
// next occurrence of that same quote character.
131 char boundary = reader.get();
133 while (!reader.isAtEnd() && reader.peek()!=boundary) {
134 result.append(reader.get());
// The closing quote, when present, gets special treatment (presumably it
// is consumed; the statement is not visible here). A value cut off by the
// end of the stream is tolerated.
137 if (!reader.isAtEnd() && reader.peek()==boundary)
// Value without quotes: take every consecutive acceptable character.
141 while (!reader.isAtEnd() && isValidUnQuotedAttributeCharacter(reader.peek())) {
142 result.append(reader.get());
147 return HTMLRoutines.resolveHTMLEntites(result.toString());
/**
 * Reads a tag: its name followed by any number of attributes, each
 * optionally followed by '=' and a value. The tag is reported through
 * {@link ScannerReceiver#handleOpenTag(String, Map)}; a
 * {@link ScannerReceiver#handleClosingTag(String)} call for the same name
 * is issued as well (NOTE(review): presumably only when the tag was closed
 * with '/', tracked in isClosed - the condition is not visible here).
 *
 * @throws IOException if reading from the underlying reader fails
 * @throws HTMLParserExc if the receiver rejects the tag
 */
150 private void readTag() throws IOException, HTMLParserExc {
151 String tagName = getName();
// Attribute name -> attribute value.
153 Map attributes = new HashMap();
155 String attributeName = getName();
// NOTE(review): this variable is declared outside the attribute loop; unless
// it is reset inside the loop (not visible here), an attribute without a
// value would be stored with the value of a preceding attribute - verify.
156 String attributeValue = null;
// getName() yielding null ends the attribute list.
158 while (attributeName!=null) {
// A value is only read when an '=' follows the attribute name.
161 if (!reader.isAtEnd() && reader.peek()=='=') {
163 attributeValue = getAttributeValue();
166 attributes.put(attributeName, attributeValue);
168 attributeName = getName();
171 boolean isClosed=false;
// A '/' ahead: presumably marks the tag as closed in place, as in <br/>.
174 if (!reader.isAtEnd() && reader.peek()=='/') {
// End of the tag.
180 if (!reader.isAtEnd() && reader.peek()=='>') {
184 receiver.handleOpenTag(tagName, attributes);
186 receiver.handleClosingTag(tagName);
/**
 * Reads a construct introduced by "&lt;!" (presumably the caller has
 * already consumed those two characters - confirm). Text following two
 * '-' characters is collected and reported through
 * {@link ScannerReceiver#handleComment(String)}; other text is collected
 * up to the next '>' and reported through
 * {@link ScannerReceiver#handleDTD(String)}, wrapped in "&lt;!" and ">"
 * again.
 * NOTE(review): several branches of this method are not visible here; the
 * description above follows the visible checks only.
 *
 * @throws IOException if reading from the underlying reader fails
 * @throws HTMLParserExc if the receiver rejects the comment or declaration
 */
189 private void readSpecial() throws IOException, HTMLParserExc {
190 StringBuffer result = new StringBuffer();
// Two '-' characters in a row open a comment.
192 if (!reader.isAtEnd() && reader.peek()=='-') {
194 if (!reader.isAtEnd() && reader.peek()=='-') {
// Inside the comment: watch for the closing sequence '-', '-', '>'.
197 while (!reader.isAtEnd()) {
198 if (reader.peek()=='-') {
200 if (!reader.isAtEnd() && reader.peek()=='-') {
202 if (!reader.isAtEnd() && reader.peek()=='>') {
// Characters that are not part of the closing sequence become comment text.
210 if (!reader.isAtEnd())
211 result.append(reader.get());
216 if (!reader.isAtEnd() && reader.peek()=='>')
219 receiver.handleComment(result.toString());
// Not a comment: collect everything up to the next '>'.
226 while (!reader.isAtEnd() && reader.peek()!='>') {
227 result.append(reader.get());
230 if (!reader.isAtEnd() && reader.peek()=='>')
233 receiver.handleDTD("<!"+result.toString()+">");
/**
 * Reads character data: everything up to, but not including, the next
 * {@code '<'} or the end of the input. The collected text is passed through
 * HTMLRoutines.resolveHTMLEntites and then reported through
 * {@link ScannerReceiver#handleCData(String)}.
 *
 * @throws IOException if reading from the underlying reader fails
 * @throws HTMLParserExc if the receiver rejects the character data
 */
236 private void readCData() throws IOException, HTMLParserExc {
237 StringBuffer result = new StringBuffer();
// The '<' itself is only peeked at, never consumed here.
239 while (!reader.isAtEnd() && reader.peek()!='<') {
240 result.append(reader.get());
243 receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
247 * Class to provide for a 1 character look-ahead on a reader
249 public static class ReaderWrapper {
250 private Reader reader;
252 private boolean haveBuffer;
254 public ReaderWrapper(Reader aReader) {
260 * Returns <code>true</code> if the stream contains no more characters.
262 public boolean isAtEnd() throws IOException {
269 * Gets the next character from the reader but will not remove it from the
271 * {@link #isAtEnd()} must return <code>false</code> before calling this
274 public char peek() throws IOException {
281 * Gets the next character from the reader and removes it from the stream.
282 * {@link #isAtEnd()} must return <code>false</code> before calling this
285 public char get() throws IOException {
293 * If the reader is not at its end, then upon return, the buffer will
294 * be filled. If the buffer was already filled, then this method won't
297 public void fillBuffer() throws IOException {
299 int c = reader.read();