1 package mir.util.xml.html;
3 import java.io.IOException;
5 import java.util.HashMap;
8 import mir.util.HTMLRoutines;
// Character-level scanner for HTML-like input. It pulls characters from a
// ReaderWrapper and reports what it recognizes (tags, character data,
// comments, "<!" declarations, end of stream) to a ScannerReceiver.
10 public class HTMLScanner {
// Look-ahead wrapper around the input reader; all scanning goes through it.
11 private ReaderWrapper reader;
// Callback object that is notified of every scanned item.
12 private ScannerReceiver receiver;
// Creates a scanner reading from aReader, which is wrapped in a ReaderWrapper.
// NOTE(review): the assignment of aReceiver to the receiver field is not
// visible here — presumably it happens on an adjacent line; confirm.
14 public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
15 reader = new ReaderWrapper(aReader);
// Callback interface through which the scanner reports what it has read.
// Every callback may throw HTMLParserExc.
19 public interface ScannerReceiver {
// Called by readSpecial for a "<!" construct that is not a comment; the
// argument is rebuilt as "<!" + collected text + ">".
20 public void handleDTD(String aDTD) throws HTMLParserExc;
// Called by readTag with the tag name and a map of attribute name to
// attribute value.
21 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
// Called by readEndTag with the scanned name, and by readTag with the name
// of the tag it just reported (presumably for self-closing tags — confirm).
22 public void handleClosingTag(String aTag) throws HTMLParserExc;
// Called by readCData with the text read up to the next '<', after it has
// been passed through HTMLRoutines.resolveHTMLEntites.
23 public void handleCData(String aData) throws HTMLParserExc;
// Called by readSpecial with the text collected for a "--" comment.
24 public void handleComment(String aTag) throws HTMLParserExc;
// Called by run() as its last visible statement.
25 public void handleEndOfStream() throws HTMLParserExc;
// Main scanning loop: while the reader has input left, peeks at the next
// character without consuming it; the last visible statement notifies the
// receiver that the stream has ended.
// NOTE(review): the per-character dispatch is not visible here — presumably
// it routes to readTag / readEndTag / readSpecial / readCData; confirm.
28 public void run() throws IOException, HTMLParserExc {
30 while (!reader.isAtEnd()) {
31 char c = reader.peek();
54 receiver.handleEndOfStream();
// Decides whether a character may be part of a name. Used for tag names,
// attribute names and also for unquoted attribute values.
// Visible accepted alternatives: characters whose Character.getType is
// UPPERCASE_LETTER, LOWERCASE_LETTER or DECIMAL_DIGIT_NUMBER, plus the
// literal characters '.', '-' and '_'.
// NOTE(review): the OR chain continues past the last visible line, so at
// least one more alternative exists that is not shown here.
57 private boolean isValidTagNameCharacter(char aCharacter) {
58 int type = Character.getType(aCharacter);
61 (type == Character.UPPERCASE_LETTER) ||
62 (type == Character.LOWERCASE_LETTER) ||
63 (type == Character.DECIMAL_DIGIT_NUMBER) ||
64 (aCharacter == '.') ||
65 (aCharacter == '-') ||
66 (aCharacter == '_') ||
// Loops while input remains and the next character is whitespace according
// to Character.isWhitespace.
// NOTE(review): the loop body is not visible — presumably it consumes the
// character with reader.get(); confirm.
70 private void skipWhiteSpace() throws IOException {
71 while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
// Scans a closing tag and reports its name via receiver.handleClosingTag.
// NOTE(review): how the leading "</" is consumed is not visible here.
76 private void readEndTag() throws IOException, HTMLParserExc {
77 StringBuffer result = new StringBuffer();
// Collect the tag name: consume characters while they are valid name characters.
81 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
82 result.append(reader.get());
// A terminating '>' is optional: it is only acted on when actually present,
// so end of input or another character here is not treated as an error.
87 if (!reader.isAtEnd() && reader.peek()=='>')
90 receiver.handleClosingTag(result.toString());
// Reads a name (tag or attribute name) made of consecutive valid name
// characters and returns it as a string.
// NOTE(review): the result for an empty name is not visible — presumably
// null, since readTag loops while the returned attribute name is non-null;
// confirm.
93 private String getName() throws IOException {
94 StringBuffer result = new StringBuffer();
98 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
99 result.append(reader.get());
// Special-case: no name characters were found at the current position.
102 if (result.length()==0)
105 return result.toString();
// Reads an attribute value and returns it with HTML entities resolved by
// HTMLRoutines.resolveHTMLEntites. At end of input the collected value is
// empty.
108 private String getAttributeValue() throws IOException {
109 StringBuffer result = new StringBuffer();
113 if (!reader.isAtEnd()) {
// Quoted form: the opening quote character (single or double) becomes the
// boundary that ends the value.
114 if (reader.peek() == '\'' || reader.peek() == '\"') {
115 char boundary = reader.get();
// Take everything up to, but not including, the matching quote.
117 while (!reader.isAtEnd() && reader.peek()!=boundary) {
118 result.append(reader.get());
// The closing quote is only acted on if present (input may end first).
121 if (!reader.isAtEnd() && reader.peek()==boundary)
// Presumably the unquoted branch (the else is not visible — confirm):
// the value is limited to valid name characters.
125 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
126 result.append(reader.get());
131 return HTMLRoutines.resolveHTMLEntites(result.toString());
// Scans an opening tag: reads the tag name, then attribute name/value
// pairs into a map, and reports the tag via receiver.handleOpenTag.
// A receiver.handleClosingTag call for the same tag name is also visible,
// presumably guarded by isClosed for tags ending in "/>" (guard not visible).
134 private void readTag() throws IOException, HTMLParserExc {
135 String tagName = getName();
137 Map attributes = new HashMap();
139 String attributeName = getName();
// NOTE(review): attributeValue is declared once, outside the loop. Unless
// it is reset on a line not visible here, an attribute without "=" would be
// stored with the previous attribute's value — verify.
140 String attributeValue = null;
// Keep reading attributes until getName() yields no further name.
142 while (attributeName!=null) {
// Only an attribute followed by '=' gets a value read for it.
145 if (!reader.isAtEnd() && reader.peek()=='=') {
147 attributeValue = getAttributeValue();
// A repeated attribute name overwrites the earlier entry (Map.put semantics).
150 attributes.put(attributeName, attributeValue);
152 attributeName = getName();
155 boolean isClosed=false;
// Optional '/' before the end of the tag.
158 if (!reader.isAtEnd() && reader.peek()=='/') {
// Optional terminating '>'; absence is not treated as an error here.
164 if (!reader.isAtEnd() && reader.peek()=='>') {
168 receiver.handleOpenTag(tagName, attributes);
170 receiver.handleClosingTag(tagName);
// Scans a "<!" construct. Two outcomes are visible: text collected after a
// leading "--" is reported via receiver.handleComment, and text collected
// up to '>' is reported via receiver.handleDTD.
// NOTE(review): the branching between the two and the consumption of
// individual characters happen on lines not visible here.
173 private void readSpecial() throws IOException, HTMLParserExc {
174 StringBuffer result = new StringBuffer();
// Two successive '-' checks: detects the "--" that opens a comment.
176 if (!reader.isAtEnd() && reader.peek()=='-') {
178 if (!reader.isAtEnd() && reader.peek()=='-') {
// Comment body: scan until input ends, watching for a '-' followed by
// another '-' (presumably the "--" that closes the comment — confirm).
181 while (!reader.isAtEnd()) {
182 if (reader.peek()=='-') {
184 if (!reader.isAtEnd() && reader.peek()=='-') {
// Characters are appended to the collected comment text.
190 if (!reader.isAtEnd())
191 result.append(reader.get());
// Optional '>' after the comment.
196 if (!reader.isAtEnd() && reader.peek()=='>')
199 receiver.handleComment(result.toString());
// Non-comment case: collect everything up to the next '>'.
208 while (!reader.isAtEnd() && reader.peek()!='>') {
209 result.append(reader.get());
212 if (!reader.isAtEnd() && reader.peek()=='>')
// The receiver gets the declaration with its "<!" and ">" delimiters restored.
215 receiver.handleDTD("<!"+result.toString()+">");
// Reads character data: consumes everything up to (not including) the next
// '<' or the end of input, resolves HTML entities with
// HTMLRoutines.resolveHTMLEntites and passes the result to
// receiver.handleCData.
218 private void readCData() throws IOException, HTMLParserExc {
219 StringBuffer result = new StringBuffer();
221 while (!reader.isAtEnd() && reader.peek()!='<') {
222 result.append(reader.get());
227 receiver.handleCData(HTMLRoutines.resolveHTMLEntites(result.toString()));
231 * Class that provides a one-character look-ahead on a reader
233 public static class ReaderWrapper {
234 private Reader reader;
236 private boolean haveBuffer;
238 public ReaderWrapper(Reader aReader) {
244 * Returns <code>true</code> if the stream contains no more characters.
246 public boolean isAtEnd() throws IOException {
253 * Gets the next character from the reader but will not remove it from the
255 * {@link #isAtEnd()} must return <code>false</code> before calling this
258 public char peek() throws IOException {
265 * Gets the next character from the reader and removes it from the stream.
266 * {@link #isAtEnd()} must return <code>false</code> before calling this
269 public char get() throws IOException {
277 * If the reader is not at its end, then upon return, the buffer will
278 * be filled. If the buffer was already filled, then this method won't
281 public void fillBuffer() throws IOException {
283 int c = reader.read();