Skip to main content
Tweeted twitter.com/#!/StackCodeReview/status/539756391716896768
edited tags
Link
200_success
  • 145.7k
  • 22
  • 191
  • 481
description
Source Link
rolfl
  • 98.1k
  • 17
  • 220
  • 419

I am sorry if I write something wrong; English is not my native language and I am still learning it. I am studying Java and trying to write an HTML parser, which should parse tag names and attributes. I wrote a class (code below) using the State pattern.

This is necessary for my training project, where I currently use JSoup. JSoup is too slow for me though, so I want better performance. Suggestions about following conventions and best practices are also welcome. Additionally, comments on the interface / API of my class would be appreciated too.

I am sorry, if I write something wrong. English not my native language and I only study it.

I am study java and try to write html parser, what should be parsing tag name and attribute. I write class (code below) with using State pattern. This is necessary for my training project, where I use Jsoup now. But jsoup too slow for me. So, I want to make so good performance, as it is impossible. Although, will be good, if this code will be conventional write and follow best practise. If you send me remark about interface of my class, I will be very grateful for you.

I am studying Java and trying to write an HTML parser, which should parse tag names and attributes. I wrote a class (code below) using the State pattern.

This is necessary for my training project, where I currently use JSoup. JSoup is too slow for me though, so I want better performance. Although suggestions about following conventions and best practise are also good. Additionally, comments on the interface / API of my class would be appreciated too.

Source Link
Weres
  • 43
  • 4

Simple attribute parser for HTML

I am sorry, if I write something wrong. English not my native language and I only study it.

I am studying Java and trying to write an HTML parser that should parse tag names and attributes. I wrote a class (code below) using the State pattern. This is necessary for my training project, where I currently use Jsoup, but Jsoup is too slow for me, so I want the best performance possible. It would also be good if this code followed conventions and best practices. If you send me remarks about the interface of my class, I will be very grateful.

import java.io.BufferedReader;
import java.io.IOException;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;

/**
 * Minimal streaming parser that extracts tag names and attributes from HTML-like text.
 *
 * <p>Typical use is to alternate the two public calls: {@link #tag()} skips forward to the next
 * {@code '<'} and returns the tag name, then {@link #attribute()} consumes the rest of that tag
 * (up to and including {@code '>'}) and returns its attributes.
 *
 * <p>Attribute parsing is a small state machine; each {@link State} subclass decides what a
 * character means in its context and which state comes next. Instances hold mutable parsing
 * state and perform no synchronization.
 */
public class AttributeParser {

    /** Identifiers of the attribute-parsing states. */
    enum AttrStat {NAME, VALUE, VALUE_QUOTES, AFTER_NAME, NEW_ATTR, NEW_VALUE}

    /** Tag-name prefix that makes {@link #tag()} stop immediately (start of a comment). */
    private static final String COMMENT_OPENER = "!--";

    private final BufferedReader reader;
    /** Name of the attribute currently being read. */
    private final StringBuilder name = new StringBuilder();
    /** Value of the attribute currently being read. */
    private final StringBuilder value = new StringBuilder();
    /** One shared instance per state, so transitions never allocate. */
    private final Map<AttrStat, State> states = new EnumMap<AttrStat, State>(AttrStat.class);

    private State current;
    /** Quote character that opened the value currently being read in VALUE_QUOTES. */
    private char quotes = ' ';
    /** Attributes collected by the {@link #attribute()} call in progress. */
    private HashMap<String, String> attr;

    /**
     * Creates a parser reading from the given reader.
     *
     * @param reader source of the markup; must not be {@code null}
     * @throws NullPointerException if {@code reader} is {@code null}
     */
    public AttributeParser(BufferedReader reader) {
        if (reader == null) {
            throw new NullPointerException("reader");
        }
        this.reader = reader;
        states.put(AttrStat.NAME, new NameState());
        states.put(AttrStat.VALUE, new ValueState());
        states.put(AttrStat.VALUE_QUOTES, new ValueQuotesState());
        states.put(AttrStat.AFTER_NAME, new AfterNameState());
        states.put(AttrStat.NEW_ATTR, new NewAttrState());
        states.put(AttrStat.NEW_VALUE, new NewValueState());
        current = states.get(AttrStat.NEW_ATTR);
    }

    /**
     * Skips input up to the next {@code '<'} and reads the tag name that follows it.
     *
     * <p>The name ends at whitespace, {@code '>'} or end of input. A terminating whitespace
     * character is consumed; a terminating {@code '>'} is pushed back so that the following
     * {@link #attribute()} call sees it and returns at once. Reading also stops as soon as the
     * name equals {@code "!--"}.
     *
     * @return the tag name (possibly empty, e.g. for {@code "<>"}), or {@code null} if the input
     *     ends before another {@code '<'} is found
     * @throws IOException if the underlying reader fails
     */
    public String tag() throws IOException {
        int ch = reader.read();
        while (ch != -1 && ch != '<') {
            ch = reader.read();
        }
        if (ch == -1) {
            return null;
        }

        StringBuilder tagName = new StringBuilder();
        // Mark before every look-ahead read so that reset() below always rewinds exactly one
        // character, even when the name is empty ("<>") and no name character was ever read.
        reader.mark(1);
        ch = reader.read();
        while (ch != -1 && !isTagNameEnd(ch)) {
            tagName.append((char) ch);
            if (tagName.length() == COMMENT_OPENER.length() && COMMENT_OPENER.contentEquals(tagName)) {
                break;
            }
            reader.mark(1);
            ch = reader.read();
        }
        if (ch == '>') {
            // Leave '>' unread for attribute().
            reader.reset();
        }
        return tagName.toString();
    }

    /**
     * Reads attributes up to and including the next {@code '>'} (outside a quoted value) or to
     * end of input.
     *
     * <p>When a name occurs more than once, the first value is kept. A name without a value maps
     * to the empty string, and a lone {@code "/"} is ignored.
     *
     * @return a new map of attribute names to values for this call
     * @throws IOException if the underlying reader fails
     */
    public HashMap<String, String> attribute() throws IOException {
        attr = new HashMap<>();
        // Every call starts a fresh tag: without this reset, a previous tag that ended in an
        // unquoted value would leave the machine in VALUE and this tag's text would be lost.
        transitionTo(AttrStat.NEW_ATTR);
        name.setLength(0);
        value.setLength(0);

        boolean more = current.read(reader.read());
        while (more) {
            more = current.read(reader.read());
        }
        // Commits an attribute that was still pending when the input ended.
        addAttribute();
        return attr;
    }

    /** Stores the pending name/value pair if it is valid and new, then clears both buffers. */
    private void addAttribute() {
        if (name.length() > 0) {
            String key = name.toString();
            if (!"/".equals(key) && !attr.containsKey(key)) {
                attr.put(key, value.toString());
            }
        }
        name.setLength(0);
        value.setLength(0);
    }

    /** Makes the shared instance registered for {@code next} the current state. */
    private void transitionTo(AttrStat next) {
        current = states.get(next);
    }

    /** Returns whether {@code ch} terminates a tag name. */
    private static boolean isTagNameEnd(int ch) {
        return ch == ' ' || ch == '>' || ch == '\n' || ch == '\t' || ch == '\r';
    }

    /**
     * Base class of all parsing states: classifies a character and dispatches it to a handler.
     * Every handler returns {@code true} to keep reading and {@code false} to stop.
     */
    abstract class State {
        /**
         * Dispatches one character to the handler for its class.
         *
         * @param ch the character read, or {@code -1} at end of input
         * @return {@code false} at end of input or when the handler ends the tag
         */
        final public boolean read(int ch) {
            if (ch == -1) {
                return false;
            }
            switch (ch) {
                case '>':
                    return readAngleBracket((char) ch);
                case '=':
                    return readEqual((char) ch);
                case '\n':
                case '\r':
                case ' ':
                case '\t':
                    return readBreaker((char) ch);
                case '"':
                case '\'':
                    return readSequence((char) ch);
                default:
                    return readChar((char) ch);
            }
        }

        /** Handles an ordinary character. */
        protected abstract boolean readChar(char ch);

        /** Handles a single or double quote. */
        protected abstract boolean readSequence(char ch);

        /** Handles a whitespace character. */
        protected abstract boolean readBreaker(char ch);

        /** Handles {@code '='}. */
        protected abstract boolean readEqual(char ch);

        /** Handles {@code '>'}; by default commits the pending attribute and stops. */
        protected boolean readAngleBracket(char ch) {
            addAttribute();
            return false;
        }
    }

    /** Inside an attribute name. */
    private class NameState extends State {
        @Override
        protected boolean readChar(char ch) {
            name.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            name.append(ch);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            transitionTo(AttrStat.AFTER_NAME);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            transitionTo(AttrStat.NEW_VALUE);
            return true;
        }
    }

    /** Inside an unquoted value, which ends at whitespace or {@code '>'}. */
    private class ValueState extends State {
        @Override
        protected boolean readChar(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            addAttribute();
            transitionTo(AttrStat.NEW_ATTR);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            value.append(ch);
            return true;
        }
    }

    /** Inside a quoted value; only the matching quote character ends it. */
    private class ValueQuotesState extends State {
        @Override
        protected boolean readChar(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            if (quotes == ch) {
                addAttribute();
                transitionTo(AttrStat.NEW_ATTR);
            } else {
                // The other kind of quote is ordinary text inside the value.
                value.append(ch);
            }
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readAngleBracket(char ch) {
            // '>' inside quotes belongs to the value and does not end the tag.
            value.append(ch);
            return true;
        }
    }

    /** Whitespace seen after a name; waiting for {@code '='} or the start of the next name. */
    private class AfterNameState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startNextName(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            return startNextName(ch);
        }

        @Override
        protected boolean readBreaker(char ch) {
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            transitionTo(AttrStat.NEW_VALUE);
            return true;
        }

        /**
         * Commits the previous, value-less attribute and starts a new name with {@code ch}.
         * Moves to NAME (not NEW_ATTR) so that a one-character name followed by whitespace and
         * {@code '='} still receives its value.
         */
        private boolean startNextName(char ch) {
            addAttribute();
            name.append(ch);
            transitionTo(AttrStat.NAME);
            return true;
        }
    }

    /** Between attributes; the next non-whitespace character starts a name. */
    private class NewAttrState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startName(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            return startName(ch);
        }

        @Override
        protected boolean readBreaker(char ch) {
            addAttribute();
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            return startName(ch);
        }

        /** Starts a new name with {@code ch} and moves to NAME. */
        private boolean startName(char ch) {
            name.append(ch);
            transitionTo(AttrStat.NAME);
            return true;
        }
    }

    /** Just after {@code '='}; waiting for the first character of the value. */
    private class NewValueState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startUnquotedValue(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            // Remember which quote opened the value so only the same one closes it.
            quotes = ch;
            transitionTo(AttrStat.VALUE_QUOTES);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            return startUnquotedValue(ch);
        }

        /** Starts an unquoted value with {@code ch} and moves to VALUE. */
        private boolean startUnquotedValue(char ch) {
            value.append(ch);
            transitionTo(AttrStat.VALUE);
            return true;
        }
    }
}