package io.gitlab.jfronny.commons.serialize.xml; import io.gitlab.jfronny.commons.serialize.MalformedDataException; import io.gitlab.jfronny.commons.serialize.StringEscapeUtil; import io.gitlab.jfronny.commons.serialize.xml.impl.NameCheck; import io.gitlab.jfronny.commons.serialize.xml.impl.XmlScope; import java.io.Closeable; import java.io.EOFException; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.Objects;
/**
 * Pull-style reader that tokenizes XML from a {@link Reader} through a fixed 1024-char buffer.
 *
 * Callers inspect the upcoming token with {@link #peek()} or {@link #hasNext()} and consume it with
 * the matching method (beginTag, endTag, nextText, nextCData, skipValue). Nesting is tracked in the
 * stack array of XmlScope values and mirrored by pathNames, which {@link #getPath()} joins with dots.
 *
 * Lenient mode (off by default) tolerates a closing tag whose name differs from the open tag,
 * whitespace around the attribute equals sign, control characters in attribute values and content
 * after the first top-level element. skipWhitespace (on by default) skips whitespace before tokens
 * and trims the text returned by nextText.
 *
 * NOTE(review): this copy of the file looks damaged and should be restored from version control
 * before the code is changed: (1) line breaks were collapsed, so every line comment now extends over
 * the code that follows it on the same physical line; (2) inside doPeek the text between the CDATA
 * error message and the attribute value lambda appears to be missing, which presumably removed the
 * end of doPeek and the attribute accessor methods. TODO confirm against the original file.
 */
public class NativeXmlReader implements Closeable { /* Token kinds cached in the peeked field by doPeek(); peek() translates them to XmlToken values. */ private static final int PEEKED_NONE = 0; private static final int PEEKED_BEGIN_TAG = 1; private static final int PEEKED_END_TAG = 2; private static final int PEEKED_END_TAG_CONCISE = 3; private static final int PEEKED_TEXT = 4; private static final int PEEKED_CDATA = 5; private static final int PEEKED_ATTRIBUTE_NAME = 6; private static final int PEEKED_ATTRIBUTE_VALUE = 7; private static final int PEEKED_EOF = 8; /** The input XML. */ private final Reader in; static final int BUFFER_SIZE = 1024; /** * Use a manual buffer to easily read and unread upcoming characters, and also so we can create * strings without an intermediate StringBuilder. We decode literals directly out of this buffer, * so it must be at least as long as the longest token that can be reported as a number. */ private final char[] buffer = new char[BUFFER_SIZE]; private int pos = 0; private int limit = 0; private int lineNumber = 0; private int lineStart = 0; int peeked = PEEKED_NONE; /** * The number of characters in a peeked number literal. Increment 'pos' by this after reading a * number. */ private int peekedNumberLength; /** * A peeked string that should be parsed on the next double, long or string. This is populated * before a numeric value is parsed and used if that parsing fails. */ private String peekedString; /* NOTE(review): peekedNumberLength and peekedString are not used anywhere in the visible code; presumably leftovers from a JSON reader - confirm before removing. */ /* * The nesting stack. Using a manual array rather than an ArrayList saves 20%. 
*/ private int[] stack = new int[32]; private int stackSize = 0; { stack[stackSize++] = XmlScope.EMPTY_DOCUMENT; } /* * The path members. It corresponds directly to stack: At indices where the * stack contains an object (EMPTY_OBJECT, DANGLING_NAME or NONEMPTY_OBJECT), * pathNames contains the name at this scope. Where it contains an array * (EMPTY_ARRAY, NONEMPTY_ARRAY) pathIndices contains the current index in * that array. Otherwise the value is undefined, and we take advantage of that * by incrementing pathIndices when doing so isn't useful. */ /* NOTE(review): the object and array scope names in the comment above are JSON terms; the visible code writes pathNames in beginTag(), endTag() and skipValue() and increments pathIndices in endTag() and skipValue(). */ private String[] pathNames = new String[32]; private int[] pathIndices = new int[32]; private boolean lenient = false; private boolean skipWhitespace = true; /** Creates a reader that pulls XML from the given reader; throws NullPointerException if it is null. */ public NativeXmlReader(Reader in) { this.in = Objects.requireNonNull(in, "in == null"); } /** Enables or disables lenient parsing and returns this reader for chaining. */ public NativeXmlReader setLenient(boolean lenient) { this.lenient = lenient; return this; } /** Returns whether lenient parsing is enabled. */ public boolean isLenient() { return lenient; } /** Sets whether whitespace is skipped before tokens and trimmed from text; returns this reader for chaining. */ public NativeXmlReader setSkipWhitespace(boolean skipWhitespace) { this.skipWhitespace = skipWhitespace; return this; } /** Returns whether whitespace skipping is enabled. */ public boolean isSkipWhitespace() { return skipWhitespace; } /** Pushes a scope onto the stack, doubling stack, pathIndices and pathNames together when the stack is full. */ private void push(int newTop) { if (stackSize == stack.length) { int newLength = stackSize * 2; stack = Arrays.copyOf(stack, newLength); pathIndices = Arrays.copyOf(pathIndices, newLength); pathNames = Arrays.copyOf(pathNames, newLength); } stack[stackSize++] = newTop; } /** * Returns true once {@code limit - pos >= minimum}. If the data is exhausted before that many * characters are available, this returns false. 
*/ private boolean fillBuffer(int minimum) throws IOException { /* Moves the unread chars to the front of the buffer, rebases lineStart, then reads until at least minimum chars are buffered. */ char[] buffer = this.buffer; lineStart -= pos; if (limit != pos) { limit -= pos; System.arraycopy(buffer, pos, buffer, 0, limit); } else { limit = 0; } pos = 0; int total; while ((total = in.read(buffer, limit, buffer.length - limit)) != -1) { limit += total; // if this is the first read, consume an optional byte order mark (BOM) if it exists if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') { pos++; lineStart++; minimum++; } if (limit >= minimum) { return true; } } return false; } /** Consumes a BEGIN_TAG token: reads the tag name, stores it in pathNames, pushes TAG_HEAD and returns the name. Throws IllegalStateException if the next token is not BEGIN_TAG. */ public String beginTag() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } if (p != PEEKED_BEGIN_TAG) { throw unexpectedTokenError("BEGIN_TAG"); } String name = nextName(); pathNames[stackSize - 1] = name; push(XmlScope.TAG_HEAD); peeked = PEEKED_NONE; return name; } /** Consumes an END_TAG token (explicit closing tag or the concise slash form) and pops the scope. Unless lenient, a closing name that differs from the open tag name is a syntax error. Returns the closing name. */ public String endTag() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } String name; if (p == PEEKED_END_TAG) { name = nextName(); /* NOTE(review): buffer[pos] is read here without checking pos against limit or refilling - confirm this cannot hit the end of the buffered data. */ if (buffer[pos] != '>') { throw syntaxError("Expected > but was " + buffer[pos]); } pos++; } else if (p == PEEKED_END_TAG_CONCISE) { name = pathNames[stackSize - 2]; } else throw unexpectedTokenError("END_TAG"); if (!name.equals(pathNames[stackSize - 2])) { if (!lenient) throw syntaxError("Mismatched closing tag: Expected " + pathNames[stackSize - 2] + " but was " + name); } stackSize--; pathNames[stackSize] = null; // Free the last path name so that it can be garbage collected! 
pathIndices[stackSize - 1]++; peeked = PEEKED_NONE; return name; } /** Returns true unless the next token is EOF or an END_TAG. */ public boolean hasNext() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } return p != PEEKED_EOF && p != PEEKED_END_TAG && p != PEEKED_END_TAG_CONCISE; } /** Returns the type of the next token without consuming it. */ public XmlToken peek() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } return switch (p) { case PEEKED_BEGIN_TAG -> XmlToken.BEGIN_TAG; case PEEKED_END_TAG, PEEKED_END_TAG_CONCISE -> XmlToken.END_TAG; case PEEKED_TEXT -> XmlToken.TEXT; case PEEKED_CDATA -> XmlToken.CDATA; case PEEKED_ATTRIBUTE_NAME -> XmlToken.ATTRIBUTE_NAME; case PEEKED_ATTRIBUTE_VALUE -> XmlToken.ATTRIBUTE_VALUE; case PEEKED_EOF -> XmlToken.EOF; default -> throw new AssertionError(); }; } /** Determines the next token from the scope on top of the stack, stores it in the peeked field and returns it. */ int doPeek() throws IOException { int peekStack = stack[stackSize - 1]; if (peekStack == XmlScope.TAG_HEAD) { stack[stackSize - 1] = XmlScope.DANGLING_NAME; int c = nextNonWhitespace(true); /* NOTE(review): nextNonWhitespace(true) throws EOFException at end of input, so the -1 check below looks unreachable. */ if (c == -1) { throw syntaxError("Unterminated tag"); } if (c == '/') { if (pos < limit || fillBuffer(1)) { char chNext = buffer[pos++]; if (chNext == '>') { stack[stackSize - 1] = XmlScope.TAG_BODY; return peeked = PEEKED_END_TAG_CONCISE; } else { throw syntaxError("Expected /> but was /" + chNext); } } else { throw syntaxError("Unterminated tag at " + c); } } else if (c == '>') { stack[stackSize - 1] = XmlScope.TAG_BODY; // fall through } else if (pos < limit || fillBuffer(1)) { /* NOTE(review): c sits at buffer[pos - 1], so the char after it is presumably buffer[pos]; buffer[pos + 1] may also lie beyond limit - confirm. */ char chNext = buffer[pos + 1]; var check = NameCheck.isNameStart((char) c, chNext); pos--; if (check != NameCheck.NONE) { return peeked = PEEKED_ATTRIBUTE_NAME; } else { throw unexpectedTokenError("attribute name"); } } else throw syntaxError("Unterminated tag at " + c); } else if (peekStack == XmlScope.DANGLING_NAME) { stack[stackSize - 1] = XmlScope.TAG_HEAD; // Look for an equals sign before the value /* NOTE(review): in strict mode buffer[pos++] is read without refilling the buffer - confirm. */ int c = lenient ? nextNonWhitespace(true) : buffer[pos++]; if (c == '=') { c = lenient ? 
nextNonWhitespace(true) : buffer[pos++]; pos--; if (c == '\'' || c == '"') { return peeked = PEEKED_ATTRIBUTE_VALUE; } else { throw syntaxError("Expected a value but was " + (char) c); } } else { throw syntaxError("Expected '='"); } } else if (peekStack == XmlScope.TAG_BODY) { // fall through: a new element is starting } else if (peekStack == XmlScope.EMPTY_DOCUMENT) { stack[stackSize - 1] = XmlScope.NONEMPTY_DOCUMENT; // fall through: a new element is starting } else if (peekStack == XmlScope.NONEMPTY_DOCUMENT) { int c = skipWhitespace ? nextNonWhitespace(false) : buffer[pos++]; if (c == -1) { return peeked = PEEKED_EOF; } else { checkLenient(); pos--; // fall through: a new element is starting } } else if (peekStack == XmlScope.CLOSED) { throw new IllegalStateException("BaseXmlReader is closed"); } int c = skipWhitespace ? nextNonWhitespace(true) : buffer[pos++]; if (c == -1) { throw syntaxError("Unterminated tag"); } else if (c == '<') { if (pos + 1 <= limit || fillBuffer(1)) { char chNext = buffer[pos]; if (chNext == '/') { pos++; return peeked = PEEKED_END_TAG; } else if (chNext == '!') { if (pos + 8 <= limit || fillBuffer(8)) { if (buffer[pos + 1] == '[' && buffer[pos + 2] == 'C' && buffer[pos + 3] == 'D' && buffer[pos + 4] == 'A' && buffer[pos + 5] == 'T' && buffer[pos + 6] == 'A' && buffer[pos + 7] == '[') { pos += 8; return peeked = PEEKED_CDATA; } else { /* NOTE(review): source text appears to be missing inside the next string literal; what follows it looks like the body of the lambda of an attribute value accessor, so the rest of doPeek() and at least one method seem lost - restore from version control. */ throw syntaxError("Expected { if (!lenient && c < 0x20 && c != 0x09) throw syntaxError("Control character in attribute value"); if (c == '<') throw syntaxError("Expected " + quote + " but was '<'"); return c == quote; }, true); pos++; peeked = PEEKED_NONE; return result; } /** Decodes the reference that follows an ampersand: a numeric character reference or one of the five predefined entity names; any other entity name is returned unchanged in ampersand-name-semicolon form. */ private String readReference() throws IOException { if (pos == limit && !fillBuffer(1)) { throw syntaxError("Unterminated escape sequence"); } if (buffer[pos] == '#') { // read the character reference pos++; if (pos == limit && !fillBuffer(1)) { throw syntaxError("Unterminated escape sequence"); } boolean isHex = buffer[pos] == 'x' || 
buffer[pos] == 'X'; if (isHex) pos++; String result = readUntil((c, i) -> { if (c == ';') return true; if ('0' <= c && c <= '9') return false; if (isHex && ('a' <= c && c <= 'f' || 'A' <= c && c <= 'F')) return false; throw syntaxError("Malformed character reference"); }, false); /* NOTE(review): readUntil() stops in front of the char that satisfied the predicate, so result does not contain the semicolon and this check looks like it always throws; the semicolon also stays unconsumed - confirm. */ if (!result.endsWith(";")) throw syntaxError("Missing ';' in character reference"); result = result.substring(0, result.length() - 1); /* NOTE(review): the char cast truncates code points above 0xFFFF. */ return String.valueOf((char) Integer.parseInt(result, isHex ? 16 : 10)); } else { // read the entity reference // we don't support these, so just handle them like a normal string String result = nextName(); if (buffer[pos] != ';') throw syntaxError("Missing ';' in entity reference"); pos++; if (result.equals("apos")) return "'"; if (result.equals("quot")) return "\""; if (result.equals("amp")) return "&"; if (result.equals("lt")) return "<"; if (result.equals("gt")) return ">"; return "&" + result + ";"; } } /** Consumes a TEXT token and returns the text up to the next opening angle bracket with references decoded; trimmed when skipWhitespace is set. */ public String nextText() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } if (p != PEEKED_TEXT) { throw unexpectedTokenError("TEXT"); } String result = readUntil((c, i) -> c == '<', true); if (skipWhitespace) { result = result.trim(); } peeked = PEEKED_NONE; return result; } /** Consumes a CDATA token and returns the raw content in front of the closing bracket-bracket-angle terminator, which is consumed as well. */ public String nextCData() throws IOException { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } if (p != PEEKED_CDATA) { throw unexpectedTokenError("CDATA"); } StringBuilder sb = new StringBuilder(); while (true) { /* NOTE(review): when a closing bracket is found that does not start the terminator, pos is not advanced, so the next readUntil() call stops at the same char and the loop does not appear to make progress - confirm. */ sb.append(readUntil((c, i) -> c == ']', false)); if (pos + 2 < limit || fillBuffer(3)) { if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') { pos += 3; peeked = PEEKED_NONE; return sb.toString(); } } else { throw syntaxError("Unterminated CDATA"); } } } /** Skips the next token; when that token is BEGIN_TAG, keeps skipping until the nesting counter returns to zero at the matching END_TAG. */ public void skipValue() throws IOException { int count = 0; do { int p = peeked; if (p == PEEKED_NONE) { p = doPeek(); } switch (p) { case PEEKED_BEGIN_TAG -> { if (count == 0) { pathNames[stackSize - 1] = ""; } push(XmlScope.TAG_HEAD); count++; } case 
PEEKED_END_TAG, PEEKED_END_TAG_CONCISE -> { if (count == 0) { pathNames[stackSize - 1] = null; } stackSize--; count--; } case PEEKED_TEXT -> skipUntil((c, i) -> c == '<'); case PEEKED_CDATA -> { /* NOTE(review): the return below leaves skipValue() even when count is above zero and skips the pathIndices increment; a closing bracket that does not start the terminator is not handled either - confirm. */ skipUntil((c, i) -> c == ']'); if (pos + 2 < limit || fillBuffer(3)) { if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') { pos += 3; peeked = PEEKED_NONE; return; } } else { throw syntaxError("Unterminated CDATA"); } } case PEEKED_ATTRIBUTE_NAME -> { skipUntil((c, i) -> NameCheck.isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE); if (count == 0) pathNames[stackSize - 1] = ""; peeked = PEEKED_NONE; } case PEEKED_ATTRIBUTE_VALUE -> { char quote = buffer[pos++]; skipUntil((c, i) -> c == quote); pos++; peeked = PEEKED_NONE; } case PEEKED_EOF -> throw new IllegalStateException("Attempt to skip led outside the document"); default -> {} } peeked = PEEKED_NONE; } while (count > 0); pathIndices[stackSize - 1]++; if (count < 0) throw new IllegalStateException("Attempt to skip led outside its parent"); } /** Reads chars until NameCheck.isName reports NONE and returns them. */ private String nextName() throws IOException { return readUntil((c, i) -> NameCheck.isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE, false); } /** Tells readUntil and skipUntil where to stop; c is the candidate char and i its offset from pos. */ @FunctionalInterface private interface EndPredicate { boolean test(char c, int i) throws MalformedDataException; } /** Returns the chars from pos up to, but not including, the first char accepted by the predicate (or up to end of input) and leaves pos on that char; decodes references when handleReferences is set. */ private String readUntil(EndPredicate character, boolean handleReferences) throws IOException { StringBuilder builder = null; int i = 0; findEnd: while (true) { for (; pos + i < limit; i++) { char c = buffer[pos + i]; if (character.test(c, i)) { break findEnd; } else if (handleReferences && c == '&') { if (builder == null) { builder = new StringBuilder(Math.max(i, 16)); } builder.append(buffer, pos, i); pos += i; i = 0; /* NOTE(review): pos still points at the ampersand when readReference() runs, and the loop increment afterwards skips the predicate for the first char after the reference - confirm. */ builder.append(readReference()); } else if (c == '\n') { lineNumber++; lineStart = pos + i + 1; } } // Attempt to load the entire name into the buffer at once. 
if (i < buffer.length) { if (fillBuffer(i + 1)) { continue; } else { break; } } // use a StringBuilder when the name is too long. if (builder == null) { builder = new StringBuilder(Math.max(i, 16)); } builder.append(buffer, pos, i); pos += i; i = 0; if (!fillBuffer(1)) { break; } } String result = builder != null ? builder.append(buffer, pos, i).toString() : new String(buffer, pos, i); pos += i; return result; } /** Same scan as readUntil without building a string and without reference handling. */ private void skipUntil(EndPredicate character) throws IOException { int i = 0; findEnd: while (true) { for (; pos + i < limit; i++) { char c = buffer[pos + i]; if (character.test(c, i)) { break findEnd; } else if (c == '\n') { lineNumber++; lineStart = pos + i + 1; } } // Attempt to load the entire name into the buffer at once. if (i < buffer.length) { if (fillBuffer(i + 1)) { continue; } else { break; } } pos += i; i = 0; if (!fillBuffer(1)) { break; } } pos += i; } /** * Returns the next character in the stream that is neither whitespace nor a part of a comment. * When this returns, the returned character is always at {@code buffer[pos-1]}; this means the * caller can always push back the returned character by decrementing {@code pos}. */ private int nextNonWhitespace(boolean throwOnEof) throws IOException { /* * This code uses ugly local variables 'p' and 'l' representing the 'pos' * and 'limit' fields respectively. Using locals rather than fields saves * a few field reads for each whitespace character in a pretty-printed * document, resulting in a 5% speedup. We need to flush 'p' to its field * before any (potentially indirect) call to fillBuffer() and reread both * 'p' and 'l' after any (potentially indirect) call to the same method. 
*/ char[] buffer = this.buffer; int p = pos; int l = limit; while (true) { if (p == l) { pos = p; if (!fillBuffer(1)) { break; } p = pos; l = limit; } int c = buffer[p++]; if (c == '\n') { lineNumber++; lineStart = p; continue; } else if (c == ' ' || c == '\r' || c == '\t') { continue; } pos = p; if (c == '<') { if (p == l) { pos--; // push back '/' so it's still in the buffer when this method returns boolean charsLoaded = fillBuffer(4); pos++; // consume the '/' again if (!charsLoaded) { return c; } } if (buffer[pos] == '!' && buffer[pos + 1] == '-' && buffer[pos + 2] == '-') { pos += 3; if (!skipTo("-->")) { throw syntaxError("Unterminated comment"); } p = pos + 3; l = limit; continue; } } return c; } if (throwOnEof) { throw new EOFException("End of input" + locationString()); } else { return -1; } } /* NOTE(review): the message below names JsonReader and JSON although this is an XML reader. */ private void checkLenient() throws MalformedDataException { if (!lenient) { throw syntaxError("Use JsonReader.setLenient(true) to accept malformed JSON"); } } /** * @param toFind a string to search for. Must not contain a newline. */ private boolean skipTo(String toFind) throws IOException { int length = toFind.length(); outer: for (; pos + length <= limit || fillBuffer(length); pos++) { if (buffer[pos] == '\n') { lineNumber++; lineStart = pos + 1; continue; } for (int c = 0; c < length; c++) { if (buffer[pos + c] != toFind.charAt(c)) { continue outer; } } return true; } return false; } /** Builds the line, column, current char and path suffix appended to error messages. NOTE(review): buffer[pos] is read before the pos bound is tested, and the bound used is buffer.length rather than limit - confirm. */ protected String locationString() { int line = lineNumber + 1; int column = pos - lineStart + 1; String replacement = StringEscapeUtil.getReplacement(buffer[pos]); if (replacement == null) { replacement = String.valueOf(buffer[pos]); } String charInterjection = pos < buffer.length ? 
" (char '" + replacement + "')" : ""; return " at line " + line + " column " + column + charInterjection + " path " + getPath(); } /** Returns the non-null pathNames entries of the current stack joined with dots. */ public String getPath() { StringBuilder result = new StringBuilder(); boolean first = true; for (int i = 0; i < stackSize; i++) { int scope = stack[i]; switch (scope) { case XmlScope.TAG_HEAD: case XmlScope.TAG_BODY: case XmlScope.DANGLING_NAME: case XmlScope.NONEMPTY_DOCUMENT: if (pathNames[i] != null) { if (first) first = false; else result.append('.'); result.append(pathNames[i]); } break; case XmlScope.EMPTY_DOCUMENT: case XmlScope.CLOSED: break; default: throw new AssertionError("Unknown scope value: " + scope); } } return result.toString(); } /* NOTE(review): readEscapeCharacter() has no caller in the visible code; it decodes backslash escapes, which looks like a JSON leftover - confirm. */ /** * Unescapes the character identified by the character or characters that immediately follow a * backslash. The backslash '\' should have already been read. This supports both Unicode escapes * "u000A" and two-character escapes "\n". * * @throws MalformedDataException if the escape sequence is malformed */ @SuppressWarnings("fallthrough") private char readEscapeCharacter() throws IOException { if (pos == limit && !fillBuffer(1)) { throw syntaxError("Unterminated escape sequence"); } char escaped = buffer[pos++]; switch (escaped) { case 'u': if (pos + 4 > limit && !fillBuffer(4)) { throw syntaxError("Unterminated escape sequence"); } // Equivalent to Integer.parseInt(stringPool.get(buffer, pos, 4), 16); int result = 0; for (int i = pos, end = i + 4; i < end; i++) { char c = buffer[i]; result <<= 4; if (c >= '0' && c <= '9') { result += (c - '0'); } else if (c >= 'a' && c <= 'f') { result += (c - 'a' + 10); } else if (c >= 'A' && c <= 'F') { result += (c - 'A' + 10); } else { throw syntaxError("Malformed Unicode escape \\u" + new String(buffer, pos, 4)); } } pos += 4; return (char) result; case 't': return '\t'; case 'b': return '\b'; case 'n': return '\n'; case 'r': return '\r'; case 'f': return '\f'; case '\n': if (!lenient) { throw syntaxError("Cannot escape a newline character in strict 
mode"); } lineNumber++; lineStart = pos; // fall-through case '\'': if (!lenient) { throw syntaxError("Invalid escaped character \"'\" in strict mode"); } case '"': case '\\': case '/': return escaped; default: // throw error when none of the above cases are matched throw syntaxError("Invalid escape sequence"); } } /* NOTE(review): syntaxError() is declared to return the exception but always throws it itself. */ /** * Throws a new {@link MalformedDataException} with the given message and information about the * current location. */ private MalformedDataException syntaxError(String message) throws MalformedDataException { throw new MalformedDataException(message + locationString()); } /** Creates (does not throw) the IllegalStateException for an unexpected token; calls peek() to name the actual token. */ private IllegalStateException unexpectedTokenError(String expected) throws IOException { return new IllegalStateException("Expected " + expected + " but was " + peek() + locationString()); } /* NOTE(review): consumeHeader() has no caller in the visible code. */ /** Consumes the header if it exists. */ private void consumeHeader() throws IOException { // fast-forward through the leading whitespace int unused = nextNonWhitespace(true); pos--; if (pos + 5 > limit && !fillBuffer(5)) { return; } int p = pos; char[] buf = buffer; if (buf[p] != '<' || buf[p + 1] != '?' || buf[p + 2] != 'x' || buf[p + 3] != 'm' || buf[p + 4] != 'l') { return; // not a header! } // we found a header, consume it pos += 5; skipTo("?>"); } /** Resets the state to a single CLOSED scope and closes the underlying reader. */ @Override public void close() throws IOException { peeked = PEEKED_NONE; stack[0] = XmlScope.CLOSED; stackSize = 1; in.close(); } }