java-commons/commons-serialize-xml/src/main/java/io/gitlab/jfronny/commons/serialize/xml/NativeXmlReader.java

package io.gitlab.jfronny.commons.serialize.xml;

import io.gitlab.jfronny.commons.serialize.MalformedDataException;
import io.gitlab.jfronny.commons.serialize.StringEscapeUtil;
import io.gitlab.jfronny.commons.serialize.xml.impl.XmlScope;

import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Objects;

public class NativeXmlReader implements Closeable {
    // Values of the 'peeked' field: the kind of token doPeek() found last, or PEEKED_NONE when the
    // next token has not been classified yet. peek() maps these onto XmlToken constants.
    private static final int PEEKED_NONE = 0;
    private static final int PEEKED_BEGIN_TAG = 1;
    private static final int PEEKED_END_TAG = 2;
    // A self-closing "/>" found while reading a tag head; reported to callers as END_TAG.
    private static final int PEEKED_END_TAG_CONCISE = 3;
    private static final int PEEKED_TEXT = 4;
    private static final int PEEKED_CDATA = 5;
    private static final int PEEKED_ATTRIBUTE_NAME = 6;
    private static final int PEEKED_ATTRIBUTE_VALUE = 7;
    private static final int PEEKED_EOF = 8;

    /** The input XML. */
    private final Reader in;

    static final int BUFFER_SIZE = 1024;
    /**
     * Use a manual buffer to easily read and unread upcoming characters, and also so we can create
     * strings without an intermediate StringBuilder. Names and text are decoded directly out of
     * this buffer; fixed look-aheads such as {@code <![CDATA[} are requested through
     * {@code fillBuffer}, so the buffer must be at least as long as the longest such look-ahead.
     */
    private final char[] buffer = new char[BUFFER_SIZE];

    /** Index in {@link #buffer} of the next character to read. */
    private int pos = 0;
    /** Number of valid characters in {@link #buffer}; indices at or beyond it hold stale data. */
    private int limit = 0;

    /** Zero-based count of newlines seen so far; reported one-based by locationString(). */
    private int lineNumber = 0;
    /** Buffer index of the first character of the current line; may go negative after compaction. */
    private int lineStart = 0;

    /** The token kind found by the last doPeek(), one of the PEEKED_* constants. */
    int peeked = PEEKED_NONE;

    /**
     * Not referenced by any code in this file.
     * NOTE(review): looks like a leftover from a JSON reader this class was derived from; candidate
     * for removal.
     */
    private int peekedNumberLength;

    /**
     * Not referenced by any code in this file.
     * NOTE(review): looks like a leftover from a JSON reader this class was derived from; candidate
     * for removal.
     */
    private String peekedString;

    /*
     * The nesting stack of XmlScope values. Using a manual array rather than an ArrayList saves 20%.
     */
    private int[] stack = new int[32];
    private int stackSize = 0;

    // Every reader starts out in the EMPTY_DOCUMENT scope.
    {
        stack[stackSize++] = XmlScope.EMPTY_DOCUMENT;
    }

    /*
     * The path members, indexed in parallel with stack. beginTag() stores the element name in the
     * slot of the enclosing scope before pushing the new TAG_HEAD scope, so endTag() finds the
     * expected name at stackSize - 2. nextAttributeName() stores the attribute name in the slot of
     * the element's own scope. pathIndices is incremented for the enclosing scope each time an
     * element is closed; nothing in this file reads it back.
     */
    private String[] pathNames = new String[32];
    private int[] pathIndices = new int[32];

    /** Whether malformed input is tolerated where the parser is able to recover. */
    private boolean lenient = false;
    /** Whether whitespace between tokens is skipped and text content is trimmed. */
    private boolean skipWhitespace = true;
    /**
     * Creates a reader that pulls XML tokens from {@code in}.
     *
     * @param in the character source; closed by {@link #close()}
     * @throws NullPointerException if {@code in} is null
     */
    public NativeXmlReader(Reader in) {
        this.in = Objects.requireNonNull(in, "in == null");
    }

    /**
     * Configures how strictly the input is checked. In lenient mode this reader tolerates
     * mismatched closing tag names, content following the root element, whitespace around the
     * {@code =} of an attribute and control characters inside attribute values.
     *
     * @param lenient true to accept such input
     * @return this reader, for chaining
     */
    public NativeXmlReader setLenient(boolean lenient) {
        this.lenient = lenient;
        return this;
    }

    /** Returns whether this reader is in lenient mode; see {@link #setLenient(boolean)}. */
    public boolean isLenient() {
        return lenient;
    }

    /**
     * Configures whitespace handling. When enabled (the default), whitespace and comments between
     * tokens are skipped while peeking and the result of {@link #nextText()} is trimmed.
     *
     * @param skipWhitespace true to skip insignificant whitespace
     * @return this reader, for chaining
     */
    public NativeXmlReader setSkipWhitespace(boolean skipWhitespace) {
        this.skipWhitespace = skipWhitespace;
        return this;
    }

    /** Returns whether whitespace is skipped; see {@link #setSkipWhitespace(boolean)}. */
    public boolean isSkipWhitespace() {
        return skipWhitespace;
    }

    /**
     * Pushes {@code newTop} onto the scope stack, doubling the capacity of the stack and of the
     * parallel path arrays when the stack is full.
     */
    private void push(int newTop) {
        final int capacity = stack.length;
        if (stackSize == capacity) {
            final int grown = capacity * 2;
            pathNames = Arrays.copyOf(pathNames, grown);
            pathIndices = Arrays.copyOf(pathIndices, grown);
            stack = Arrays.copyOf(stack, grown);
        }
        stack[stackSize] = newTop;
        stackSize++;
    }

    /**
     * Discards the already consumed part of the buffer and reads more input until at least
     * {@code minimum} unread characters are available.
     *
     * <p>The unread characters are moved to the front of the buffer, so {@code pos} is reset and
     * any index into the buffer obtained before this call is invalid afterwards. A byte order mark
     * at the very start of the input is skipped.
     *
     * @param minimum the number of characters that must be available starting at {@code pos}
     * @return true once {@code limit - pos >= minimum}; false if the input is exhausted first
     */
    private boolean fillBuffer(int minimum) throws IOException {
        final char[] buf = this.buffer;

        // Compact: shift the unread tail [pos, limit) down to index 0.
        lineStart -= pos;
        if (limit == pos) {
            limit = 0;
        } else {
            limit -= pos;
            System.arraycopy(buf, pos, buf, 0, limit);
        }
        pos = 0;

        for (int read = in.read(buf, limit, buf.length - limit);
             read != -1;
             read = in.read(buf, limit, buf.length - limit)) {
            limit += read;

            // if this is the first read, consume an optional byte order mark (BOM) if it exists
            final boolean atDocumentStart = lineNumber == 0 && lineStart == 0;
            if (atDocumentStart && limit > 0 && buf[0] == '\ufeff') {
                pos++;
                lineStart++;
                minimum++;
            }

            if (limit >= minimum) {
                return true;
            }
        }
        return false;
    }

    /**
     * Consumes the {@code <name} part of an opening tag and enters the tag's head, where
     * attributes may follow.
     *
     * @return the element name
     * @throws IllegalStateException if the next token is not the start of an element
     */
    public String beginTag() throws IOException {
        final int token = peeked != PEEKED_NONE ? peeked : doPeek();
        if (token != PEEKED_BEGIN_TAG) {
            throw unexpectedTokenError("BEGIN_TAG");
        }
        final String tagName = nextName();
        // The name is recorded in the enclosing scope's slot; endTag() looks it up at stackSize - 2.
        pathNames[stackSize - 1] = tagName;
        push(XmlScope.TAG_HEAD);
        peeked = PEEKED_NONE;
        return tagName;
    }

    /**
     * Consumes the end of the current element, either an explicit {@code </name>} or the
     * {@code />} of a self-closing tag, and leaves the element's scope.
     *
     * @return the name found in the closing tag; for a self-closing tag, the element's own name
     * @throws IllegalStateException if the next token is not the end of an element
     * @throws MalformedDataException if there is no open element to close, if the closing tag is
     *     not terminated by {@code >}, or (unless lenient) if its name does not match the element
     *     being closed
     */
    public String endTag() throws IOException {
        int p = peeked;
        if (p == PEEKED_NONE) {
            p = doPeek();
        }
        if (p != PEEKED_END_TAG && p != PEEKED_END_TAG_CONCISE) {
            throw unexpectedTokenError("END_TAG");
        }
        if (stackSize < 2) {
            // A closing tag at document level: there is no slot holding an expected name.
            throw syntaxError("Closing tag without a matching opening tag");
        }
        // beginTag() stored the element name in the slot of the enclosing scope.
        final String expected = pathNames[stackSize - 2];
        final String name;
        if (p == PEEKED_END_TAG) {
            name = nextName();
            if (pos == limit && !fillBuffer(1)) {
                throw syntaxError("Unterminated closing tag");
            }
            if (buffer[pos] != '>') {
                throw syntaxError("Expected > but was " + buffer[pos]);
            }
            pos++;
        } else {
            name = expected;
        }
        if (!lenient && !name.equals(expected)) {
            throw syntaxError("Mismatched closing tag: Expected " + expected + " but was " + name);
        }
        stackSize--;
        pathNames[stackSize] = null; // Free the last path name so that it can be garbage collected!
        pathIndices[stackSize - 1]++;
        peeked = PEEKED_NONE;
        return name;
    }

    /**
     * Returns true if another token is available in the current scope, i.e. the next token is
     * neither the end of the enclosing element nor the end of the input.
     */
    public boolean hasNext() throws IOException {
        final int token = peeked != PEEKED_NONE ? peeked : doPeek();
        final boolean atEnd = token == PEEKED_EOF
                || token == PEEKED_END_TAG
                || token == PEEKED_END_TAG_CONCISE;
        return !atEnd;
    }

    /**
     * Returns the type of the next token without consuming it. Both explicit closing tags and the
     * {@code />} of a self-closing tag are reported as {@code END_TAG}.
     */
    public XmlToken peek() throws IOException {
        final int token = peeked != PEEKED_NONE ? peeked : doPeek();
        switch (token) {
            case PEEKED_BEGIN_TAG:
                return XmlToken.BEGIN_TAG;
            case PEEKED_END_TAG:
            case PEEKED_END_TAG_CONCISE:
                return XmlToken.END_TAG;
            case PEEKED_TEXT:
                return XmlToken.TEXT;
            case PEEKED_CDATA:
                return XmlToken.CDATA;
            case PEEKED_ATTRIBUTE_NAME:
                return XmlToken.ATTRIBUTE_NAME;
            case PEEKED_ATTRIBUTE_VALUE:
                return XmlToken.ATTRIBUTE_VALUE;
            case PEEKED_EOF:
                return XmlToken.EOF;
            default:
                throw new AssertionError();
        }
    }

    /**
     * Classifies the next token based on the scope on top of the stack, stores the result in
     * {@link #peeked} and returns it.
     *
     * <p>On return {@code pos} is positioned so that the matching {@code next*}/{@code *Tag}
     * method can read the token: at the first character of a name or of text, at the opening quote
     * of an attribute value, after {@code </} for a closing tag, after {@code />} for a
     * self-closing tag and after {@code <![CDATA[} for a CDATA section.
     *
     * @return one of the {@code PEEKED_*} constants other than {@code PEEKED_NONE}
     * @throws MalformedDataException if the input cannot start a valid token in the current scope
     * @throws IllegalStateException if the reader is closed
     */
    int doPeek() throws IOException {
        int peekStack = stack[stackSize - 1];
        if (peekStack == XmlScope.TAG_HEAD) {
            stack[stackSize - 1] = XmlScope.DANGLING_NAME;
            int c = nextNonWhitespace(true);
            if (c == -1) {
                throw syntaxError("Unterminated tag");
            }
            if (c == '/') {
                if (pos < limit || fillBuffer(1)) {
                    char chNext = buffer[pos++];
                    if (chNext == '>') {
                        stack[stackSize - 1] = XmlScope.TAG_BODY;
                        return peeked = PEEKED_END_TAG_CONCISE;
                    }
                    throw syntaxError("Expected /> but was /" + chNext);
                }
                throw syntaxError("Unterminated tag at " + (char) c);
            } else if (c == '>') {
                stack[stackSize - 1] = XmlScope.TAG_BODY;
                // fall through: the tag body starts here
            } else {
                // Push c back BEFORE asking for look-ahead: fillBuffer() compacts the buffer from
                // 'pos', so a character that was already consumed would be lost.
                pos--;
                if (pos + 1 < limit || fillBuffer(2)) {
                    if (isNameStart((char) c, buffer[pos + 1]) != NameCheck.NONE) {
                        return peeked = PEEKED_ATTRIBUTE_NAME;
                    }
                    // Report directly instead of via unexpectedTokenError(): that helper calls
                    // peek(), which would re-enter this method while 'peeked' is still unset.
                    throw syntaxError("Expected attribute name but was " + (char) c);
                }
                throw syntaxError("Unterminated tag at " + (char) c);
            }
        } else if (peekStack == XmlScope.DANGLING_NAME) {
            stack[stackSize - 1] = XmlScope.TAG_HEAD;
            // Look for an equals sign before the value; whitespace around it is only lenient syntax
            int c = lenient ? nextNonWhitespace(true) : readRawChar();
            if (c == -1) {
                throw syntaxError("Unterminated tag");
            }
            if (c != '=') {
                throw syntaxError("Expected '='");
            }
            c = lenient ? nextNonWhitespace(true) : readRawChar();
            if (c == -1) {
                throw syntaxError("Unterminated tag");
            }
            pos--; // leave the quote for nextAttributeValue()
            if (c == '\'' || c == '"') {
                return peeked = PEEKED_ATTRIBUTE_VALUE;
            }
            throw syntaxError("Expected a value but was " + (char) c);
        } else if (peekStack == XmlScope.TAG_BODY) {
            // fall through: a new element is starting
        } else if (peekStack == XmlScope.EMPTY_DOCUMENT) {
            stack[stackSize - 1] = XmlScope.NONEMPTY_DOCUMENT;
            // fall through: a new element is starting
        } else if (peekStack == XmlScope.NONEMPTY_DOCUMENT) {
            int c = skipWhitespace ? nextNonWhitespace(false) : readRawChar();
            if (c == -1) {
                return peeked = PEEKED_EOF;
            }
            // Anything after the root element is only accepted in lenient mode.
            checkLenient();
            pos--;
            // fall through: a new element is starting
        } else if (peekStack == XmlScope.CLOSED) {
            throw new IllegalStateException("NativeXmlReader is closed");
        }

        int c = skipWhitespace ? nextNonWhitespace(true) : readRawChar();
        if (c == -1) {
            throw syntaxError("Unterminated tag");
        }
        if (c != '<') {
            pos--;
            return peeked = PEEKED_TEXT;
        }
        if (pos < limit || fillBuffer(1)) {
            char chNext = buffer[pos];
            if (chNext == '/') {
                pos++;
                return peeked = PEEKED_END_TAG;
            } else if (chNext == '!') {
                if (pos + 8 <= limit || fillBuffer(8)) {
                    if (buffer[pos + 1] == '[' && buffer[pos + 2] == 'C' && buffer[pos + 3] == 'D' && buffer[pos + 4] == 'A' && buffer[pos + 5] == 'T' && buffer[pos + 6] == 'A' && buffer[pos + 7] == '[') {
                        pos += 8;
                        return peeked = PEEKED_CDATA;
                    }
                    throw syntaxError("Expected <![CDATA[ but was <" + new String(buffer, pos, 8));
                }
            } else if (pos + 2 <= limit || fillBuffer(2)) {
                // chNext stays valid across fillBuffer(): compaction keeps buffer[pos] in place.
                if (isNameStart(chNext, buffer[pos + 1]) != NameCheck.NONE) {
                    return peeked = PEEKED_BEGIN_TAG;
                }
            }
        }
        throw syntaxError("Unterminated tag");
    }

    /**
     * Reads the next character without skipping whitespace or tracking line numbers, refilling
     * the buffer if necessary. On success the character is at {@code buffer[pos - 1]}.
     *
     * @return the character, or -1 if the input is exhausted
     */
    private int readRawChar() throws IOException {
        if (pos == limit && !fillBuffer(1)) {
            return -1;
        }
        return buffer[pos++];
    }

    /**
     * Result of a name character check: {@code FIRST} if the tested character alone is a name
     * character, {@code BOTH} if it forms one together with the following character, {@code NONE}
     * if it is not a name character. Callers in this file only compare against {@code NONE}.
     */
    private enum NameCheck { FIRST, BOTH, NONE }

    /**
     * Tests whether {@code ch} may start an XML name, following the {@code NameStartChar}
     * production of XML 1.0 for characters in the Basic Multilingual Plane.
     *
     * <p>The production lists character <em>ranges</em> (for example {@code #x2070-#x218F}); every
     * character inside such a range is accepted. Supplementary characters
     * ({@code #x10000-#xEFFFF}), which arrive as surrogate pairs, are not recognised because
     * callers test one {@code char} at a time.
     *
     * @param ch the character to test
     * @param chNext the character following {@code ch}; currently unused
     * @return {@code FIRST} if {@code ch} is a name start character, otherwise {@code NONE}
     */
    private NameCheck isNameStart(char ch, char chNext) {
        boolean nameStart = ch == ':' || ch == '_'
                || ('A' <= ch && ch <= 'Z')
                || ('a' <= ch && ch <= 'z')
                || ('\u00C0' <= ch && ch <= '\u00D6')
                || ('\u00D8' <= ch && ch <= '\u00F6')
                || ('\u00F8' <= ch && ch <= '\u02FF')
                || ('\u0370' <= ch && ch <= '\u037D')
                || ('\u037F' <= ch && ch <= '\u1FFF')
                || ('\u200C' <= ch && ch <= '\u200D')
                || ('\u2070' <= ch && ch <= '\u218F')
                || ('\u2C00' <= ch && ch <= '\u2FEF')
                || ('\u3001' <= ch && ch <= '\uD7FF')
                || ('\uF900' <= ch && ch <= '\uFDCF')
                || ('\uFDF0' <= ch && ch <= '\uFFFD');
        return nameStart ? NameCheck.FIRST : NameCheck.NONE;
    }

    /**
     * Tests whether {@code ch} may appear inside an XML name, following the {@code NameChar}
     * production of XML 1.0: every name start character plus {@code -}, {@code .}, the digits,
     * {@code #xB7} and the ranges {@code #x0300-#x036F} and {@code #x203F-#x2040}.
     *
     * @param ch the character to test
     * @param chNext the character following {@code ch}; only forwarded to {@code isNameStart}
     * @return the result of {@code isNameStart} if that accepts {@code ch}, {@code FIRST} for the
     *     additional name characters, otherwise {@code NONE}
     */
    private NameCheck isName(char ch, char chNext) {
        var nameStart = isNameStart(ch, chNext);
        if (nameStart != NameCheck.NONE) return nameStart;
        boolean nameChar = ch == '-' || ch == '.' || ch == '\u00B7'
                || ('0' <= ch && ch <= '9')
                || ('\u0300' <= ch && ch <= '\u036F')
                || ('\u203F' <= ch && ch <= '\u2040');
        return nameChar ? NameCheck.FIRST : NameCheck.NONE;
    }

    /**
     * Consumes and returns the name of the next attribute of the current tag.
     *
     * @throws IllegalStateException if the next token is not an attribute name
     */
    public String nextAttributeName() throws IOException {
        final int token = peeked != PEEKED_NONE ? peeked : doPeek();
        if (token != PEEKED_ATTRIBUTE_NAME) {
            throw unexpectedTokenError("ATTRIBUTE_NAME");
        }
        final String attributeName = nextName();
        peeked = PEEKED_NONE;
        // Recorded in the element's own slot so that getPath() includes the attribute.
        pathNames[stackSize - 1] = attributeName;
        return attributeName;
    }

    /**
     * Consumes and returns the quoted value of the current attribute, with references resolved.
     *
     * @return the value without its surrounding quotes
     * @throws IllegalStateException if the next token is not an attribute value
     * @throws MalformedDataException if the value contains {@code <}, contains a control character
     *     other than tab (unless lenient), or is not closed by a matching quote before the input
     *     ends
     */
    public String nextAttributeValue() throws IOException {
        int p = peeked;
        if (p == PEEKED_NONE) {
            p = doPeek();
        }
        if (p != PEEKED_ATTRIBUTE_VALUE) {
            throw unexpectedTokenError("ATTRIBUTE_VALUE");
        }
        // doPeek() left pos on the opening quote; the same character must close the value.
        char quote = buffer[pos++];
        String result = readUntil((c, i) -> {
            if (!lenient && c < 0x20 && c != 0x09) throw syntaxError("Control character in attribute value");
            if (c == '<') throw syntaxError("Expected " + quote + " but was '<'");
            return c == quote;
        }, true);
        // readUntil() stops ON the terminator; reaching the limit means the input ended first.
        if (pos >= limit) {
            throw syntaxError("Unterminated attribute value");
        }
        pos++; // consume the closing quote
        peeked = PEEKED_NONE;
        return result;
    }

    /**
     * Reads a character or entity reference and returns its replacement text.
     *
     * <p>Must be called with {@code pos} on the first character after the {@code &}. On return
     * {@code pos} is just behind the terminating {@code ;}. Decimal ({@code &#65;}) and
     * hexadecimal ({@code &#x41;}) character references are decoded to the referenced code point.
     * Of the entity references only the five predefined ones ({@code apos}, {@code quot},
     * {@code amp}, {@code lt}, {@code gt}) are resolved; any other is returned unchanged,
     * including its {@code &} and {@code ;}.
     *
     * @throws MalformedDataException if the reference is unterminated or not well-formed
     */
    private String readReference() throws IOException {
        if (pos == limit && !fillBuffer(1)) {
            throw syntaxError("Unterminated escape sequence");
        }
        if (buffer[pos] == '#') {
            // read the character reference
            pos++;
            if (pos == limit && !fillBuffer(1)) {
                throw syntaxError("Unterminated escape sequence");
            }
            boolean isHex = buffer[pos] == 'x' || buffer[pos] == 'X';
            if (isHex) pos++;
            String digits = readUntil((c, i) -> {
                if (c == ';') return true;
                if ('0' <= c && c <= '9') return false;
                if (isHex && ('a' <= c && c <= 'f' || 'A' <= c && c <= 'F')) return false;
                throw syntaxError("Malformed character reference");
            }, false);
            // readUntil() stops ON the ';' without including it in the result.
            if (pos >= limit) throw syntaxError("Missing ';' in character reference");
            pos++;
            int codePoint;
            try {
                codePoint = Integer.parseInt(digits, isHex ? 16 : 10);
            } catch (NumberFormatException e) {
                // No digits at all, or too many to fit an int; syntaxError() cannot carry a cause.
                throw syntaxError("Malformed character reference");
            }
            if (!Character.isValidCodePoint(codePoint)) {
                throw syntaxError("Malformed character reference");
            }
            // Code points above U+FFFF need two chars, so a plain (char) cast would truncate them.
            return Character.toString(codePoint);
        } else {
            // read the entity reference
            // custom entities are not supported, so unknown ones are passed through verbatim
            String name = nextName();
            if (pos >= limit || buffer[pos] != ';') throw syntaxError("Missing ';' in entity reference");
            pos++;
            return switch (name) {
                case "apos" -> "'";
                case "quot" -> "\"";
                case "amp" -> "&";
                case "lt" -> "<";
                case "gt" -> ">";
                default -> "&" + name + ";";
            };
        }
    }

    /**
     * Consumes and returns the text content up to the next {@code <}, with references resolved.
     * The text is trimmed when whitespace skipping is enabled.
     *
     * @throws IllegalStateException if the next token is not text
     */
    public String nextText() throws IOException {
        final int token = peeked != PEEKED_NONE ? peeked : doPeek();
        if (token != PEEKED_TEXT) {
            throw unexpectedTokenError("TEXT");
        }
        final String raw = readUntil((c, i) -> c == '<', true);
        peeked = PEEKED_NONE;
        return skipWhitespace ? raw.trim() : raw;
    }

    /**
     * Consumes a CDATA section and returns its content verbatim (no reference resolution). The
     * opening {@code <![CDATA[} has already been consumed by {@code doPeek()}; this reads up to
     * and including the closing {@code ]]>}.
     *
     * @throws IllegalStateException if the next token is not a CDATA section
     * @throws MalformedDataException if the input ends before {@code ]]>}
     */
    public String nextCData() throws IOException {
        int p = peeked;
        if (p == PEEKED_NONE) {
            p = doPeek();
        }
        if (p != PEEKED_CDATA) {
            throw unexpectedTokenError("CDATA");
        }
        StringBuilder sb = new StringBuilder();
        while (true) {
            sb.append(readUntil((c, i) -> c == ']', false));
            if (pos + 2 < limit || fillBuffer(3)) {
                if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') {
                    pos += 3;
                    peeked = PEEKED_NONE;
                    return sb.toString();
                }
                // A ']' that does not start the terminator is ordinary content. It must be
                // consumed here, otherwise readUntil() would stop on it again forever.
                sb.append(buffer[pos++]);
            } else {
                throw syntaxError("Unterminated CDATA");
            }
        }
    }

    /**
     * Reads an XML name starting at {@code pos}, stopping in front of the first character that is
     * not a name character.
     */
    private String nextName() throws IOException {
        return readUntil((c, i) -> {
            final int following = pos + i + 1;
            // '\0' stands in for the look-ahead when it is not in the buffer yet
            final char lookahead = following < limit ? buffer[following] : '\0';
            return isName(c, lookahead) == NameCheck.NONE;
        }, false);
    }

    /** Decides where {@code readUntil} stops; may also reject illegal characters by throwing. */
    @FunctionalInterface
    private interface EndPredicate {
        /**
         * @param c the character under test
         * @param i the offset of {@code c} relative to the current {@code pos}
         * @return true if {@code c} terminates the token (it is then left unconsumed)
         * @throws MalformedDataException if {@code c} is not allowed at this point
         */
        boolean test(char c, int i) throws MalformedDataException;
    }

    /**
     * Reads characters starting at {@code pos} until {@code character} accepts one or the input
     * ends, and returns them.
     *
     * <p>The terminating character is neither consumed nor part of the result: if a terminator was
     * found, {@code pos} is on it afterwards; if the input ended first, {@code pos == limit}.
     * Line bookkeeping is updated for every newline that is passed.
     *
     * @param character tells which character ends the token
     * @param handleReferences whether {@code &...;} references are replaced through
     *     {@link #readReference()}
     * @return the characters read, with references replaced if requested
     */
    private String readUntil(EndPredicate character, boolean handleReferences) throws IOException {
        StringBuilder builder = null;
        int i = 0;
        findEnd:
        while (true) {
            for (; pos + i < limit; i++) {
                char c = buffer[pos + i];
                if (character.test(c, i)) {
                    break findEnd;
                } else if (handleReferences && c == '&') {
                    if (builder == null) {
                        builder = new StringBuilder(Math.max(i, 16));
                    }
                    builder.append(buffer, pos, i);
                    // Consume the text so far AND the '&': readReference() starts behind it.
                    pos += i + 1;
                    builder.append(readReference());
                    // Scanning resumes at the new pos; -1 compensates for the loop's i++, which
                    // would otherwise skip the first character behind the reference.
                    i = -1;
                } else if (c == '\n') {
                    lineNumber++;
                    lineStart = pos + i + 1;
                }
            }

            // Attempt to load the entire token into the buffer at once.
            if (i < buffer.length) {
                if (fillBuffer(i + 1)) {
                    continue;
                } else {
                    break;
                }
            }

            // use a StringBuilder when the token is too long.
            if (builder == null) {
                builder = new StringBuilder(Math.max(i, 16));
            }
            builder.append(buffer, pos, i);
            pos += i;
            i = 0;
            if (!fillBuffer(1)) {
                break;
            }
        }

        String result = builder != null
                ? builder.append(buffer, pos, i).toString()
                : new String(buffer, pos, i);
        pos += i;
        return result;
    }

    /**
     * Returns the next character in the stream that is neither whitespace nor a part of a comment.
     * When this returns, the returned character is always at {@code buffer[pos-1]}; this means the
     * caller can always push back the returned character by decrementing {@code pos}.
     *
     * @param throwOnEof whether running out of input is an error
     * @return the character, or -1 at the end of the input if {@code throwOnEof} is false
     * @throws EOFException at the end of the input if {@code throwOnEof} is true
     * @throws MalformedDataException if a comment is not terminated
     */
    private int nextNonWhitespace(boolean throwOnEof) throws IOException {
        /*
         * This code uses ugly local variables 'p' and 'l' representing the 'pos'
         * and 'limit' fields respectively. Using locals rather than fields saves
         * a few field reads for each whitespace character in a pretty-printed
         * document, resulting in a 5% speedup. We need to flush 'p' to its field
         * before any (potentially indirect) call to fillBuffer() and reread both
         * 'p' and 'l' after any (potentially indirect) call to the same method.
         */
        char[] buffer = this.buffer;
        int p = pos;
        int l = limit;
        while (true) {
            if (p == l) {
                pos = p;
                if (!fillBuffer(1)) {
                    break;
                }
                p = pos;
                l = limit;
            }

            int c = buffer[p++];
            if (c == '\n') {
                lineNumber++;
                lineStart = p;
                continue;
            } else if (c == ' ' || c == '\r' || c == '\t') {
                continue;
            }

            pos = p;
            if (c == '<') {
                // Recognising "<!--" needs the three characters behind the '<'. Refill whenever
                // they are not all buffered, not just when the buffer is completely exhausted.
                if (p + 3 > l) {
                    pos--; // push back '<' so it's still in the buffer when this method returns
                    boolean charsLoaded = fillBuffer(4);
                    pos++; // consume the '<' again
                    if (!charsLoaded) {
                        // Too little input left for a comment, so this '<' starts a tag.
                        return c;
                    }
                }

                if (buffer[pos] == '!' && buffer[pos + 1] == '-' && buffer[pos + 2] == '-') {
                    pos += 3;
                    if (!skipTo("-->")) {
                        throw syntaxError("Unterminated comment");
                    }
                    // skipTo() leaves pos on the terminator; continue behind it
                    p = pos + 3;
                    l = limit;
                    continue;
                }
            }
            return c;
        }
        if (throwOnEof) {
            throw new EOFException("End of input" + locationString());
        } else {
            return -1;
        }
    }

    /**
     * Rejects input that is only acceptable in lenient mode.
     *
     * @throws MalformedDataException if this reader is not lenient
     */
    private void checkLenient() throws MalformedDataException {
        if (!lenient) {
            throw syntaxError("Use NativeXmlReader.setLenient(true) to accept malformed XML");
        }
    }

    /**
     * Advances {@code pos} to the next occurrence of {@code toFind}, keeping the line bookkeeping
     * up to date. On success {@code pos} is on the first character of the match.
     *
     * @param toFind a string to search for. Must not contain a newline.
     * @return true if the string was found, false if the input ended first
     */
    private boolean skipTo(String toFind) throws IOException {
        final int length = toFind.length();
        while (pos + length <= limit || fillBuffer(length)) {
            if (buffer[pos] == '\n') {
                lineNumber++;
                lineStart = pos + 1;
            } else {
                boolean matches = true;
                for (int offset = 0; matches && offset < length; offset++) {
                    matches = buffer[pos + offset] == toFind.charAt(offset);
                }
                if (matches) {
                    return true;
                }
            }
            pos++;
        }
        return false;
    }

    /**
     * Describes the current read position for use in error messages: one-based line and column,
     * the character at {@code pos} if there is one, and the element path.
     *
     * @return a string starting with {@code " at line "}
     */
    protected String locationString() {
        int line = lineNumber + 1;
        int column = pos - lineStart + 1;
        String charInterjection = "";
        // Only look at the buffer where it holds valid data: at the end of the input pos == limit,
        // which may even equal buffer.length.
        if (pos >= 0 && pos < limit) {
            char current = buffer[pos];
            String replacement = StringEscapeUtil.getReplacement(current);
            if (replacement == null) {
                replacement = String.valueOf(current);
            }
            charInterjection = " (char '" + replacement + "')";
        }
        return " at line " + line + " column " + column + charInterjection + " path " + getPath();
    }

    /**
     * Returns the dot-separated names recorded for the scopes that are currently open. A scope
     * without a recorded name contributes an empty segment.
     */
    public String getPath() {
        final StringBuilder path = new StringBuilder();
        boolean needsSeparator = false;
        for (int depth = 0; depth < stackSize; depth++) {
            final int scope = stack[depth];
            switch (scope) {
                case XmlScope.TAG_HEAD, XmlScope.TAG_BODY, XmlScope.DANGLING_NAME, XmlScope.NONEMPTY_DOCUMENT -> {
                    if (needsSeparator) {
                        path.append('.');
                    }
                    needsSeparator = true;
                    final String segment = pathNames[depth];
                    if (segment != null) {
                        path.append(segment);
                    }
                }
                case XmlScope.EMPTY_DOCUMENT, XmlScope.CLOSED -> {
                    // these scopes do not contribute to the path
                }
                default -> throw new AssertionError("Unknown scope value: " + scope);
            }
        }
        return path.toString();
    }

    /**
     * Decodes a backslash escape. The backslash itself must already have been consumed; this reads
     * the character(s) behind it: a four digit Unicode escape such as {@code u000A} or one of the
     * single character escapes such as {@code n}.
     *
     * <p>Nothing in this file calls this method.
     *
     * @return the character the escape stands for
     * @throws MalformedDataException if the escape sequence is malformed
     */
    private char readEscapeCharacter() throws IOException {
        if (pos == limit && !fillBuffer(1)) {
            throw syntaxError("Unterminated escape sequence");
        }

        final char escaped = buffer[pos++];
        return switch (escaped) {
            case 'u' -> {
                if (pos + 4 > limit && !fillBuffer(4)) {
                    throw syntaxError("Unterminated escape sequence");
                }
                // Equivalent to Integer.parseInt(new String(buffer, pos, 4), 16) for hex digits
                int codeUnit = 0;
                final int end = pos + 4;
                for (int index = pos; index < end; index++) {
                    final char digit = buffer[index];
                    final int nibble;
                    if ('0' <= digit && digit <= '9') {
                        nibble = digit - '0';
                    } else if ('a' <= digit && digit <= 'f') {
                        nibble = digit - 'a' + 10;
                    } else if ('A' <= digit && digit <= 'F') {
                        nibble = digit - 'A' + 10;
                    } else {
                        throw syntaxError("Malformed Unicode escape \\u" + new String(buffer, pos, 4));
                    }
                    codeUnit = (codeUnit << 4) + nibble;
                }
                pos += 4;
                yield (char) codeUnit;
            }
            case 't' -> '\t';
            case 'b' -> '\b';
            case 'n' -> '\n';
            case 'r' -> '\r';
            case 'f' -> '\f';
            case '\n' -> {
                // an escaped line break is only tolerated in lenient mode
                if (!lenient) {
                    throw syntaxError("Cannot escape a newline character in strict mode");
                }
                lineNumber++;
                lineStart = pos;
                yield escaped;
            }
            case '\'' -> {
                if (!lenient) {
                    throw syntaxError("Invalid escaped character \"'\" in strict mode");
                }
                yield escaped;
            }
            case '"', '\\', '/' -> escaped;
            // throw error when none of the above cases are matched
            default -> throw syntaxError("Invalid escape sequence");
        };
    }

    /**
     * Always throws a {@link MalformedDataException} made of {@code message} followed by the
     * current location. The declared return type merely lets callers write
     * {@code throw syntaxError(...)}, which keeps the compiler's flow analysis happy.
     */
    private MalformedDataException syntaxError(String message) throws MalformedDataException {
        final String located = message + locationString();
        throw new MalformedDataException(located);
    }

    /**
     * Builds (but does not throw) the exception for a caller that asked for {@code expected} while
     * a different token is next. Uses {@link #peek()} to name the actual token.
     */
    private IllegalStateException unexpectedTokenError(String expected) throws IOException {
        final XmlToken actual = peek();
        final String where = locationString();
        return new IllegalStateException("Expected " + expected + " but was " + actual + where);
    }

    /**
     * Consumes the header, i.e. a leading {@code <?xml ... ?>}, if it exists. Leading whitespace
     * is skipped either way; without a header {@code pos} is left on the first other character.
     *
     * <p>Nothing in this file calls this method.
     *
     * @throws MalformedDataException if a header is started but never closed by {@code ?>}
     */
    private void consumeHeader() throws IOException {
        // fast-forward through the leading whitespace; an empty input simply has no header
        if (nextNonWhitespace(false) == -1) {
            return;
        }
        pos--;

        if (pos + 5 > limit && !fillBuffer(5)) {
            return;
        }

        int p = pos;
        char[] buf = buffer;
        if (buf[p] != '<'
                || buf[p + 1] != '?'
                || buf[p + 2] != 'x'
                || buf[p + 3] != 'm'
                || buf[p + 4] != 'l') {
            return; // not a header!
        }

        // we found a header, consume it
        pos += 5;
        if (!skipTo("?>")) {
            throw syntaxError("Unterminated XML declaration");
        }
        pos += 2; // skipTo() stops ON the terminator, so step over the "?>" itself
    }

    /**
     * Marks this reader as closed, so that further peeking fails, and closes the underlying
     * {@link Reader}.
     */
    @Override
    public void close() throws IOException {
        // Collapse the scope stack to a single CLOSED entry and forget any peeked token first,
        // so the reader is unusable even if closing the source fails.
        stackSize = 1;
        stack[0] = XmlScope.CLOSED;
        peeked = PEEKED_NONE;
        in.close();
    }
}