1 package mir.util.xml.html;
3 import java.io.IOException;
5 import java.util.HashMap;
8 public class HTMLScanner {
9 private ReaderWrapper reader;
10 private ScannerReceiver receiver;
/**
 * Creates a scanner that reads characters from the given reader.
 *
 * @param aReceiver callback object of type {@link ScannerReceiver}.
 *        NOTE(review): its assignment is not visible in this excerpt;
 *        presumably it is stored in the <code>receiver</code> field.
 * @param aReader character source; wrapped in a {@link ReaderWrapper},
 *        which the rest of the scanner uses for peek/get access
 */
12 public HTMLScanner(ScannerReceiver aReceiver, Reader aReader) {
13 reader = new ReaderWrapper(aReader);
/**
 * Callback interface through which the scanner reports what it has read.
 * Every method may throw <code>HTMLParserExc</code>; the scanner methods that
 * invoke these callbacks declare the same exception.
 */
17 public interface ScannerReceiver {
// Invoked by readSpecial() with the collected declaration text wrapped in "<!" and ">".
18 public void handleDTD(String aDTD) throws HTMLParserExc;
// Invoked by readTag() with the tag name and the map of attribute names to values.
19 public void handleOpenTag(String aTag, Map anAttributes) throws HTMLParserExc;
// Invoked by readEndTag() with the closing tag's name, and also by readTag()
// right after handleOpenTag (presumably only for self-closed tags; the guard
// is not visible in this excerpt).
20 public void handleClosingTag(String aTag) throws HTMLParserExc;
// Invoked by readCData() with the characters read up to (not including) the next '<'.
21 public void handleCData(String aData) throws HTMLParserExc;
// Invoked by readSpecial() with the accumulated comment text.
22 public void handleComment(String aTag) throws HTMLParserExc;
// Invoked by run(); appears to be the final notification once the input is exhausted.
23 public void handleEndOfStream() throws HTMLParserExc;
/**
 * Drives the scan over the whole input.
 * While the reader has characters left, the next character is inspected with
 * <code>peek()</code> (without consuming it). The dispatch on that character
 * is not visible in this excerpt. The receiver is notified through
 * <code>handleEndOfStream()</code>.
 *
 * @throws IOException declared by the underlying reader operations
 * @throws HTMLParserExc declared by the receiver callbacks
 */
26 public void run() throws IOException, HTMLParserExc {
28 while (!reader.isAtEnd()) {
// Look ahead only; the character stays in the stream for whichever handler is chosen.
29 char c = reader.peek();
52 receiver.handleEndOfStream();
/**
 * Tests whether a character is accepted as part of a name.
 * The visible conditions accept the Unicode general categories uppercase
 * letter, lowercase letter and decimal digit, plus the literal characters
 * '-' and '_'.
 * NOTE(review): parts of the boolean expression are elided in this excerpt,
 * so further characters may be accepted as well.
 * Used for tag names, attribute names and unquoted attribute values.
 *
 * @param aCharacter the character to classify
 */
55 private boolean isValidTagNameCharacter(char aCharacter) {
// Character.getType yields the Unicode general category constant for the character.
56 int type = Character.getType(aCharacter);
59 (type == Character.UPPERCASE_LETTER) ||
60 (type == Character.LOWERCASE_LETTER) ||
61 (type == Character.DECIMAL_DIGIT_NUMBER) ||
62 (aCharacter == '-') ||
63 (aCharacter == '_') ||
/**
 * Loops while the reader has input and the next character is whitespace
 * according to <code>Character.isWhitespace</code>.
 * NOTE(review): the loop body is not visible in this excerpt; presumably it
 * consumes the whitespace character.
 *
 * @throws IOException declared by the underlying reader operations
 */
67 private void skipWhiteSpace() throws IOException {
68 while (!reader.isAtEnd() && Character.isWhitespace(reader.peek())) {
/**
 * Reads the name of a closing tag and reports it to the receiver through
 * <code>handleClosingTag</code>.
 *
 * @throws IOException declared by the underlying reader operations
 * @throws HTMLParserExc declared by the receiver callback
 */
73 private void readEndTag() throws IOException, HTMLParserExc {
74 StringBuffer result = new StringBuffer();
// Collect the run of consecutive valid name characters.
78 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
79 result.append(reader.get());
// A '>' is only looked for, not required: end of input or another character
// does not stop the callback below. (The action taken on '>' is elided here;
// presumably it is consumed.)
84 if (!reader.isAtEnd() && reader.peek()=='>')
87 receiver.handleClosingTag(result.toString());
/**
 * Reads a run of consecutive valid name characters from the reader and
 * returns it as a string.
 * NOTE(review): the statement executed when no character was read
 * (<code>result.length()==0</code>) is elided in this excerpt; readTag() loops
 * until this method returns <code>null</code>, so that case presumably
 * returns <code>null</code>.
 *
 * @throws IOException declared by the underlying reader operations
 */
90 private String getName() throws IOException {
91 StringBuffer result = new StringBuffer();
95 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
96 result.append(reader.get());
// Empty name: special-cased (see note above).
99 if (result.length()==0)
102 return result.toString();
/**
 * Reads an attribute value and returns it without surrounding quotes.
 * If the next character is a single or double quote, everything up to the
 * matching quote character is collected. The other visible loop collects a
 * run of valid name characters.
 * NOTE(review): the lines joining the two loops are elided; presumably the
 * second loop is the <code>else</code> branch for unquoted values.
 * At end of input the result is the empty string.
 *
 * @throws IOException declared by the underlying reader operations
 */
105 private String getAttributeValue() throws IOException {
106 StringBuffer result = new StringBuffer();
110 if (!reader.isAtEnd()) {
111 if (reader.peek() == '\'' || reader.peek() == '\"') {
// Remember which quote opened the value so that only the same quote closes it.
112 char boundary = reader.get();
114 while (!reader.isAtEnd() && reader.peek()!=boundary) {
115 result.append(reader.get());
// The closing quote is optional: input may end before it appears.
118 if (!reader.isAtEnd() && reader.peek()==boundary)
122 while (!reader.isAtEnd() && isValidTagNameCharacter(reader.peek())) {
123 result.append(reader.get());
128 return result.toString();
/**
 * Reads an opening tag (a tag name followed by attributes) and reports it to
 * the receiver.
 * Attribute names are read with getName() until it returns
 * <code>null</code>. When a name is followed by '=', its value is read with
 * getAttributeValue(). Each name/value pair is stored in the attribute map.
 * The receiver gets <code>handleOpenTag</code> and, for the same tag name,
 * <code>handleClosingTag</code>.
 * NOTE(review): the guard around the closing-tag callback is elided in this
 * excerpt; presumably it depends on <code>isClosed</code>.
 *
 * @throws IOException declared by the underlying reader operations
 * @throws HTMLParserExc declared by the receiver callbacks
 */
131 private void readTag() throws IOException, HTMLParserExc {
132 String tagName = getName();
134 Map attributes = new HashMap();
136 String attributeName = getName();
137 String attributeValue = null;
// A null name from getName() ends the attribute list.
139 while (attributeName!=null) {
142 if (!reader.isAtEnd() && reader.peek()=='=') {
144 attributeValue = getAttributeValue();
// A later attribute with the same name overwrites the earlier entry (Map.put).
147 attributes.put(attributeName, attributeValue);
149 attributeName = getName();
152 boolean isClosed=false;
// A '/' at this point presumably marks a self-closed tag (body elided in this excerpt).
155 if (!reader.isAtEnd() && reader.peek()=='/') {
161 if (!reader.isAtEnd() && reader.peek()=='>') {
165 receiver.handleOpenTag(tagName, attributes);
167 receiver.handleClosingTag(tagName);
/**
 * Reads a comment or another declaration and reports it to the receiver.
 * If the next two characters are both '-', the input is treated as a comment:
 * its text is accumulated and passed to <code>handleComment</code>.
 * The other visible path collects everything up to the next '>' and passes it
 * to <code>handleDTD</code>, re-wrapped in <code>"&lt;!"</code> and
 * <code>"&gt;"</code>.
 * NOTE(review): the branch structure and the exact comment-termination logic
 * are partly elided in this excerpt; confirm against the full source.
 *
 * @throws IOException declared by the underlying reader operations
 * @throws HTMLParserExc declared by the receiver callbacks
 */
170 private void readSpecial() throws IOException, HTMLParserExc {
171 StringBuffer result = new StringBuffer();
// Two consecutive '-' characters select the comment path.
173 if (!reader.isAtEnd() && reader.peek()=='-') {
175 if (!reader.isAtEnd() && reader.peek()=='-') {
178 while (!reader.isAtEnd()) {
// Inside the comment, a '-' followed by another '-' is checked for
// (presumably the start of the comment terminator).
179 if (reader.peek()=='-') {
181 if (!reader.isAtEnd() && reader.peek()=='-') {
187 if (!reader.isAtEnd())
188 result.append(reader.get());
193 if (!reader.isAtEnd() && reader.peek()=='>')
196 receiver.handleComment(result.toString());
// Declaration path: gather the raw text up to, but not including, '>'.
205 while (!reader.isAtEnd() && reader.peek()!='>') {
206 result.append(reader.get());
209 if (!reader.isAtEnd() && reader.peek()=='>')
212 receiver.handleDTD("<!"+result.toString()+">");
/**
 * Reads character data up to, but not including, the next '<' or the end of
 * input, and passes it to the receiver's <code>handleCData</code>.
 * The '<' itself is left in the reader.
 *
 * @throws IOException declared by the underlying reader operations
 * @throws HTMLParserExc declared by the receiver callback
 */
215 private void readCData() throws IOException, HTMLParserExc {
216 StringBuffer result = new StringBuffer();
218 while (!reader.isAtEnd() && reader.peek()!='<') {
219 result.append(reader.get());
// The callback is made even if no character was collected (empty string).
222 receiver.handleCData(result.toString());
226 * Class to provide for a 1 character look-ahead on a reader
228 public static class ReaderWrapper {
229 private Reader reader;
231 private boolean haveBuffer;
233 public ReaderWrapper(Reader aReader) {
239 * Returns <code>true</code> if the stream contains no more characters.
241 public boolean isAtEnd() throws IOException {
248 * Gets the next character from the reader but will not remove it from the
250 * {@link #isAtEnd()} must return <code>false</code> before calling this
253 public char peek() throws IOException {
260 * Gets the next character from the reader and removes it from the stream.
261 * {@link #isAtEnd()} must return <code>false</code> before calling this
264 public char get() throws IOException {
272 * If the reader is not at its end, then upon return, the buffer will
273 * be filled. If the buffer was already filled, then this method won't
276 public void fillBuffer() throws IOException {
278 int c = reader.read();