package io.gitlab.jfronny.commons.serialize.xml;
import io.gitlab.jfronny.commons.serialize.MalformedDataException;
import io.gitlab.jfronny.commons.serialize.StringEscapeUtil;
import io.gitlab.jfronny.commons.serialize.xml.impl.XmlScope;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Objects;
/**
 * Pull-style XML reader over a {@link Reader}. Callers inspect the upcoming token with
 * {@link #peek()} / {@link #hasNext()} and consume it with {@link #beginTag()},
 * {@link #endTag()}, {@link #nextAttributeName()}, {@link #nextAttributeValue()},
 * {@link #nextText()} or {@link #nextCData()}.
 */
public class NativeXmlReader implements Closeable {
// Token kinds that doPeek() stores in the 'peeked' field.
// Nothing is cached; the next accessor has to call doPeek().
private static final int PEEKED_NONE = 0;
// "<" followed by a name-start character.
private static final int PEEKED_BEGIN_TAG = 1;
// "<" followed by "/".
private static final int PEEKED_END_TAG = 2;
// "/>" found while still inside a tag head.
private static final int PEEKED_END_TAG_CONCISE = 3;
// Character data that does not start with "<".
private static final int PEEKED_TEXT = 4;
// A "<![CDATA[" opener (already consumed by doPeek()).
private static final int PEEKED_CDATA = 5;
// A name-start character inside a tag head.
private static final int PEEKED_ATTRIBUTE_NAME = 6;
// The value that follows "=" after an attribute name.
private static final int PEEKED_ATTRIBUTE_VALUE = 7;
// End of input after the document element.
private static final int PEEKED_EOF = 8;
/** The input XML. */
private final Reader in;
static final int BUFFER_SIZE = 1024;
/**
 * Use a manual buffer to easily read and unread upcoming characters, and also so we can create
 * strings without an intermediate StringBuilder. We decode literals directly out of this buffer,
 * so it must be at least as long as the longest token that can be reported as a number.
 */
private final char[] buffer = new char[BUFFER_SIZE];
private int pos = 0;
private int limit = 0;
private int lineNumber = 0;
private int lineStart = 0;
int peeked = PEEKED_NONE;
/**
 * The number of characters in a peeked number literal. Increment 'pos' by this after reading a
 * number.
 */
private int peekedNumberLength;
/**
 * A peeked string that should be parsed on the next double, long or string. This is populated
 * before a numeric value is parsed and used if that parsing fails.
 */
private String peekedString;
/** The nesting stack. Using a manual array rather than an ArrayList saves 20%. */
private int[] stack = new int[32];
/** Number of live entries in {@code stack}; the current scope is {@code stack[stackSize - 1]}. */
private int stackSize = 0;

// Instance initializer: every reader starts out in the empty-document scope.
{
    stack[stackSize++] = XmlScope.EMPTY_DOCUMENT;
}
/*
 * The path members. It corresponds directly to stack: At indices where the
 * stack contains an object (EMPTY_OBJECT, DANGLING_NAME or NONEMPTY_OBJECT),
 * pathNames contains the name at this scope. Where it contains an array
 * (EMPTY_ARRAY, NONEMPTY_ARRAY) pathIndices contains the current index in
 * that array. Otherwise the value is undefined, and we take advantage of that
 * by incrementing pathIndices when doing so isn't useful.
 */
private String[] pathNames = new String[32];
private int[] pathIndices = new int[32];
private boolean lenient = false;
private boolean skipWhitespace = true;
/**
 * Creates a reader that pulls its characters from {@code in}.
 *
 * @param in the character source; must not be {@code null}
 * @throws NullPointerException if {@code in} is {@code null}
 */
public NativeXmlReader(Reader in) {
    this.in = Objects.requireNonNull(in, "in == null");
}
public NativeXmlReader setLenient(boolean lenient) {
this.lenient = lenient;
return this;
public boolean isLenient() {
return lenient;
public NativeXmlReader setSkipWhitespace(boolean skipWhitespace) {
this.skipWhitespace = skipWhitespace;
return this;
public boolean isSkipWhitespace() {
return skipWhitespace;
private void push(int newTop) {
if (stackSize == stack.length) {
int newLength = stackSize * 2;
stack = Arrays.copyOf(stack, newLength);
pathIndices = Arrays.copyOf(pathIndices, newLength);
pathNames = Arrays.copyOf(pathNames, newLength);
stack[stackSize++] = newTop;
* Returns true once {@code limit - pos >= minimum}. If the data is exhausted before that many
* characters are available, this returns false.
private boolean fillBuffer(int minimum) throws IOException {
char[] buffer = this.buffer;
lineStart -= pos;
if (limit != pos) {
limit -= pos;
System.arraycopy(buffer, pos, buffer, 0, limit);
} else {
limit = 0;
pos = 0;
int total;
while ((total =, limit, buffer.length - limit)) != -1) {
limit += total;
// if this is the first read, consume an optional byte order mark (BOM) if it exists
if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') {
if (limit >= minimum) {
return true;
return false;
public String beginTag() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
if (p != PEEKED_BEGIN_TAG) {
throw unexpectedTokenError("BEGIN_TAG");
String name = nextName();
pathNames[stackSize - 1] = name;
peeked = PEEKED_NONE;
return name;
public String endTag() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
String name;
if (p == PEEKED_END_TAG) {
name = nextName();
if (buffer[pos] != '>') {
throw syntaxError("Expected > but was " + buffer[pos]);
} else if (p == PEEKED_END_TAG_CONCISE) {
name = pathNames[stackSize - 2];
} else throw unexpectedTokenError("END_TAG");
if (!name.equals(pathNames[stackSize - 2])) {
if (!lenient) throw syntaxError("Mismatched closing tag: Expected " + pathNames[stackSize - 1] + " but was " + name);
pathNames[stackSize] = null; // Free the last path name so that it can be garbage collected!
pathIndices[stackSize - 1]++;
peeked = PEEKED_NONE;
return name;
public boolean hasNext() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
// Returns the type of the next token without consuming it.
// Uses the cached 'peeked' value when present, otherwise asks doPeek().
// NOTE(review): only TEXT, CDATA and EOF are mapped below; every other peeked kind (begin/end
// tag, attribute name/value) reaches the default branch and throws AssertionError — confirm
// the full mapping against the XmlToken enum.
public XmlToken peek() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
return switch (p) {
case PEEKED_TEXT -> XmlToken.TEXT;
case PEEKED_CDATA -> XmlToken.CDATA;
case PEEKED_EOF -> XmlToken.EOF;
default -> throw new AssertionError();
// Works out what kind of token comes next, stores it in 'peeked' and returns it.
// The decision depends on the scope on top of the stack: inside a tag head it looks for
// "/>", ">" or an attribute name; after an attribute name it expects '=' and a quote;
// otherwise it classifies what follows as end tag, CDATA, begin tag or text.
// NOTE(review): several branches below read as incomplete (for example the quote check after
// '=' has an empty body) — compare with the reference implementation before relying on them.
int doPeek() throws IOException {
int peekStack = stack[stackSize - 1];
if (peekStack == XmlScope.TAG_HEAD) {
// Tentatively assume an attribute name follows; the branches below overwrite this when
// the head is closed instead.
stack[stackSize - 1] = XmlScope.DANGLING_NAME;
int c = nextNonWhitespace(true);
if (c == -1) {
throw syntaxError("Unterminated tag");
} if (c == '/') {
// '/' inside a head must be immediately followed by '>' (concise end tag).
if (pos < limit || fillBuffer(1)) {
char chNext = buffer[pos++];
if (chNext == '>') {
stack[stackSize - 1] = XmlScope.TAG_BODY;
return peeked = PEEKED_END_TAG_CONCISE;
} else {
throw syntaxError("Expected /> but was /" + chNext);
} else {
throw syntaxError("Unterminated tag at " + c);
} else if (c == '>') {
// Head finished; continue below and classify the first token of the body.
stack[stackSize - 1] = XmlScope.TAG_BODY;
// fall through
} else if (pos < limit || fillBuffer(1)) {
// NOTE(review): c was consumed, so the character after it is buffer[pos]; this reads
// buffer[pos + 1], and only one character was guaranteed by the check above — confirm.
char chNext = buffer[pos + 1];
var check = isNameStart((char) c, chNext);
if (check != NameCheck.NONE) {
return peeked = PEEKED_ATTRIBUTE_NAME;
} else {
throw unexpectedTokenError("attribute name");
} else throw syntaxError("Unterminated tag at " + c);
} else if (peekStack == XmlScope.DANGLING_NAME) {
stack[stackSize - 1] = XmlScope.TAG_HEAD;
// Look for an equals sign before the value
// Lenient mode skips whitespace first; strict mode takes the very next buffered character
// (without checking pos against limit).
int c = lenient ? nextNonWhitespace(true) : buffer[pos++];
if (c == '=') {
c = lenient ? nextNonWhitespace(true) : buffer[pos++];
// NOTE(review): nothing happens when an opening quote is found — presumably this is where
// PEEKED_ATTRIBUTE_VALUE is produced; confirm.
if (c == '\'' || c == '"') {
} else {
throw syntaxError("Expected a value but was " + (char) c);
} else {
throw syntaxError("Expected '='");
} else if (peekStack == XmlScope.TAG_BODY) {
// fall through: a new element is starting
} else if (peekStack == XmlScope.EMPTY_DOCUMENT) {
stack[stackSize - 1] = XmlScope.NONEMPTY_DOCUMENT;
// fall through: a new element is starting
} else if (peekStack == XmlScope.NONEMPTY_DOCUMENT) {
// After the first token of the document, running out of input is a regular EOF.
int c = skipWhitespace ? nextNonWhitespace(false) : buffer[pos++];
if (c == -1) {
return peeked = PEEKED_EOF;
} else {
// fall through: a new element is starting
} else if (peekStack == XmlScope.CLOSED) {
throw new IllegalStateException("BaseXmlReader is closed");
// Common classification of body/document content by its first characters.
int c = skipWhitespace ? nextNonWhitespace(true) : buffer[pos++];
if (c == -1) {
throw syntaxError("Unterminated tag");
} else if (c == '<') {
if (pos + 1 <= limit || fillBuffer(1)) {
char chNext = buffer[pos];
if (chNext == '/') {
// pos is left on the '/' here.
return peeked = PEEKED_END_TAG;
} else if (chNext == '!') {
// "<!" is only accepted as the start of "<![CDATA["; the 8 characters after '<' are
// checked and skipped so pos ends up on the first CDATA content character.
if (pos + 8 <= limit || fillBuffer(8)) {
if (buffer[pos + 1] == '[' && buffer[pos + 2] == 'C' && buffer[pos + 3] == 'D' && buffer[pos + 4] == 'A' && buffer[pos + 5] == 'T' && buffer[pos + 6] == 'A' && buffer[pos + 7] == '[') {
pos += 8;
return peeked = PEEKED_CDATA;
} else {
throw syntaxError("Expected <![CDATA[ but was <![" + new String(buffer, pos, 5));
} else if (pos + 2 <= limit || fillBuffer(2)) {
// '<' followed by a name-start character opens an element; pos stays on the name.
var check = isNameStart(chNext, buffer[pos + 1]);
if (check != NameCheck.NONE) {
return peeked = PEEKED_BEGIN_TAG;
throw syntaxError("Unterminated tag");
} else {
// Anything that does not start with '<' is character data.
return peeked = PEEKED_TEXT;
/** Result of testing a character (together with the one that follows it) as part of a name. */
private enum NameCheck {
    /** The first character is acceptable on its own. */
    FIRST,
    /** The first character and its successor were accepted as a pair. */
    BOTH,
    /** The character is not acceptable. */
    NONE
}
private NameCheck isNameStart(char ch, char chNext) {
if ('A' <= ch && ch <= 'Z') return NameCheck.FIRST;
if ('a' <= ch && ch <= 'z') return NameCheck.FIRST;
return switch (ch) {
case ':', '_' -> NameCheck.FIRST;
case '\u2070' -> chNext == '\u218F' ? NameCheck.BOTH : NameCheck.NONE;
case '\u2C00' -> chNext == '\u2FEF' ? NameCheck.BOTH : NameCheck.NONE;
case '\u3001' -> chNext == '\uD7FF' ? NameCheck.BOTH : NameCheck.NONE;
case '\uF900' -> chNext == '\uFDCF' ? NameCheck.BOTH : NameCheck.NONE;
case '\uFDF0' -> chNext == '\uFFFD' ? NameCheck.BOTH : NameCheck.NONE;
default -> NameCheck.NONE;
private NameCheck isName(char ch, char chNext) {
var nameStart = isNameStart(ch, chNext);
if (nameStart != NameCheck.NONE) return nameStart;
if ('0' <= ch && ch <= '9') return NameCheck.FIRST;
return switch (ch) {
case '-', '.', '\u00B7' -> NameCheck.FIRST;
case '\u0300' -> chNext == '\u036F' ? NameCheck.BOTH : NameCheck.NONE;
case '\u203F' -> chNext == '\u2040' ? NameCheck.BOTH : NameCheck.NONE;
default -> NameCheck.NONE;
public String nextAttributeName() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
throw unexpectedTokenError("ATTRIBUTE_NAME");
String result = nextName();
peeked = PEEKED_NONE;
pathNames[stackSize - 1] = result;
return result;
public String nextAttributeValue() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
throw unexpectedTokenError("ATTRIBUTE_VALUE");
char quote = buffer[pos++];
String result = readUntil((c, i) -> {
if (!lenient && c < 0x20 && c != 0x09) throw syntaxError("Control character in attribute value");
if (c == '<') throw syntaxError("Expected " + quote + " but was '<'");
return c == quote;
}, true);
peeked = PEEKED_NONE;
return result;
// Resolves the reference that starts at pos: a numeric character reference when the next
// character is '#', otherwise a named entity reference.
// Returns the replacement text; unknown entity names are returned verbatim as "&name;".
// NOTE(review): presumably the leading '&' has already been consumed by the caller — confirm.
private String readReference() throws IOException {
if (pos == limit && !fillBuffer(1)) {
throw syntaxError("Unterminated escape sequence");
if (buffer[pos] == '#') {
// read the character reference
// NOTE(review): pos is not advanced past '#' before the next check, so the hex test below
// still looks at the '#' itself — confirm whether a pos++ belongs here.
if (pos == limit && !fillBuffer(1)) {
throw syntaxError("Unterminated escape sequence");
boolean isHex = buffer[pos] == 'x' || buffer[pos] == 'X';
if (isHex) pos++;
// Accept digits (and hex letters for the "x" form) up to the terminating ';'.
String result = readUntil((c, i) -> {
if (c == ';') return true;
if ('0' <= c && c <= '9') return false;
if (isHex && ('a' <= c && c <= 'f' || 'A' <= c && c <= 'F')) return false;
throw syntaxError("Malformed character reference");
}, false);
if (!result.endsWith(";")) throw syntaxError("Missing ';' in character reference");
result = result.substring(0, result.length() - 1);
// NOTE(review): the cast to char keeps only the low 16 bits, so code points above U+FFFF
// would be truncated; parseInt can also throw NumberFormatException on empty/oversized input.
return String.valueOf((char) Integer.parseInt(result, isHex ? 16 : 10));
} else {
// read the entity reference
// we don't support these, so just handle them like a normal string
String result = nextName();
if (buffer[pos] != ';') throw syntaxError("Missing ';' in entity reference");
// The five predefined XML entities are expanded; anything else is passed through.
if (result.equals("apos")) return "'";
if (result.equals("quot")) return "\"";
if (result.equals("amp")) return "&";
if (result.equals("lt")) return "<";
if (result.equals("gt")) return ">";
return "&" + result + ";";
public String nextText() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
if (p != PEEKED_TEXT) {
throw unexpectedTokenError("TEXT");
String result = readUntil((c, i) -> c == '<', true);
if (skipWhitespace) {
result = result.trim();
peeked = PEEKED_NONE;
return result;
public String nextCData() throws IOException {
int p = peeked;
if (p == PEEKED_NONE) {
p = doPeek();
if (p != PEEKED_CDATA) {
throw unexpectedTokenError("CDATA");
StringBuilder sb = new StringBuilder();
while (true) {
sb.append(readUntil((c, i) -> c == ']', false));
if (pos + 2 < limit || fillBuffer(3)) {
if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') {
pos += 3;
peeked = PEEKED_NONE;
return sb.toString();
} else {
throw syntaxError("Unterminated CDATA");
private String nextName() throws IOException {
return readUntil((c, i) -> isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE, false);
private interface EndPredicate {
boolean test(char c, int i) throws MalformedDataException;
// Reads characters starting at pos until 'character' reports a stop or the input ends, and
// returns them as a string. Data is taken straight from the buffer when it fits; a
// StringBuilder is only allocated when a reference is met or the run outgrows the buffer.
// When handleReferences is true, '&' switches to the StringBuilder path.
// NOTE(review): the target of 'break findEnd' and the handling after '&' are not visible in
// this listing — confirm the loop structure against the reference implementation.
private String readUntil(EndPredicate character, boolean handleReferences) throws IOException {
StringBuilder builder = null;
// i counts characters accepted so far, relative to pos.
int i = 0;
while (true) {
for (; pos + i < limit; i++) {
char c = buffer[pos + i];
if (character.test(c, i)) {
break findEnd;
} else if (handleReferences && c == '&') {
// Flush what was scanned so far into the builder and restart the window at the '&'.
if (builder == null) {
builder = new StringBuilder(Math.max(i, 16));
builder.append(buffer, pos, i);
pos += i;
i = 0;
} else if (c == '\n') {
// Track the start of the current line for column reporting.
lineStart = pos + i + 1;
// Attempt to load the entire name into the buffer at once.
if (i < buffer.length) {
if (fillBuffer(i + 1)) {
} else {
// use a StringBuilder when the name is too long.
if (builder == null) {
builder = new StringBuilder(Math.max(i, 16));
builder.append(buffer, pos, i);
pos += i;
i = 0;
if (!fillBuffer(1)) {
// Build the result from the builder (if any) plus the still-buffered tail, then consume it.
String result = builder != null
? builder.append(buffer, pos, i).toString()
: new String(buffer, pos, i);
pos += i;
return result;
/**
 * Returns the next character in the stream that is neither whitespace nor a part of a comment.
 * When this returns, the returned character is always at {@code buffer[pos-1]}; this means the
 * caller can always push back the returned character by decrementing {@code pos}.
 */
// Skips spaces, tabs, carriage returns, newlines and "<!-- ... -->" comments and returns the
// first other character, which has been consumed (pos points just past it).
// At end of input this throws EOFException when throwOnEof is true and returns -1 otherwise.
// NOTE(review): the loop exits/continues after the whitespace and comment branches are not
// visible in this listing — confirm against the reference implementation.
private int nextNonWhitespace(boolean throwOnEof) throws IOException {
* This code uses ugly local variables 'p' and 'l' representing the 'pos'
* and 'limit' fields respectively. Using locals rather than fields saves
* a few field reads for each whitespace character in a pretty-printed
* document, resulting in a 5% speedup. We need to flush 'p' to its field
* before any (potentially indirect) call to fillBuffer() and reread both
* 'p' and 'l' after any (potentially indirect) call to the same method.
char[] buffer = this.buffer;
int p = pos;
int l = limit;
while (true) {
if (p == l) {
// Local window exhausted: publish p, refill, then reload the rebased pos/limit.
pos = p;
if (!fillBuffer(1)) {
p = pos;
l = limit;
int c = buffer[p++];
if (c == '\n') {
lineStart = p;
} else if (c == ' ' || c == '\r' || c == '\t') {
pos = p;
if (c == '<') {
// A '<' may open a comment; make sure the characters after it are buffered.
if (p == l) {
pos--; // push back '<' so it's still in the buffer when this method returns
boolean charsLoaded = fillBuffer(4);
pos++; // consume the '<' again
if (!charsLoaded) {
return c;
// NOTE(review): when p != l, fewer than three characters may be buffered after '<' —
// confirm that buffer[pos + 1] and buffer[pos + 2] are always valid here.
if (buffer[pos] == '!' && buffer[pos + 1] == '-' && buffer[pos + 2] == '-') {
pos += 3;
if (!skipTo("-->")) {
throw syntaxError("Unterminated comment");
// skipTo leaves pos on the terminator; resume right after its three characters.
p = pos + 3;
l = limit;
return c;
if (throwOnEof) {
throw new EOFException("End of input" + locationString());
} else {
return -1;
private void checkLenient() throws MalformedDataException {
if (!lenient) {
throw syntaxError("Use JsonReader.setLenient(true) to accept malformed JSON");
* @param toFind a string to search for. Must not contain a newline.
private boolean skipTo(String toFind) throws IOException {
int length = toFind.length();
for (; pos + length <= limit || fillBuffer(length); pos++) {
if (buffer[pos] == '\n') {
lineStart = pos + 1;
for (int c = 0; c < length; c++) {
if (buffer[pos + c] != toFind.charAt(c)) {
continue outer;
return true;
return false;
protected String locationString() {
int line = lineNumber + 1;
int column = pos - lineStart + 1;
String replacement = StringEscapeUtil.getReplacement(buffer[pos]);
if (replacement == null) {
replacement = String.valueOf(buffer[pos]);
String charInterjection = pos < buffer.length ? " (char '" + replacement + "')" : "";
return " at line " + line + " column " + column + charInterjection + " path " + getPath();
// Builds a dot-separated description of the current position from the scope stack: for every
// tag scope (head, body or dangling attribute name) a '.' separator is emitted between
// entries and the recorded name for that slot is consulted.
// NOTE(review): the statement that appends pathNames[i] and the remaining case labels are not
// visible in this listing — confirm against the reference implementation.
public String getPath() {
StringBuilder result = new StringBuilder();
// No separator in front of the first element.
boolean first = true;
for (int i = 0; i < stackSize; i++) {
int scope = stack[i];
switch (scope) {
case XmlScope.TAG_HEAD:
case XmlScope.TAG_BODY:
case XmlScope.DANGLING_NAME:
if (first) first = false;
else result.append('.');
if (pathNames[i] != null) {
case XmlScope.CLOSED:
throw new AssertionError("Unknown scope value: " + scope);
return result.toString();
/**
 * Unescapes the character identified by the character or characters that immediately follow a
 * backslash. The backslash '\' should have already been read. This supports both Unicode escapes
 * "u000A" and two-character escapes "\n".
 *
 * @throws MalformedDataException if the escape sequence is malformed
 */
// Decodes a backslash escape whose backslash has already been consumed and returns the
// character it stands for.
// NOTE(review): backslash escapes are not part of XML and nothing in the visible code calls
// this method — it looks like a leftover from a JSON reader; confirm whether it is needed.
private char readEscapeCharacter() throws IOException {
if (pos == limit && !fillBuffer(1)) {
throw syntaxError("Unterminated escape sequence");
char escaped = buffer[pos++];
switch (escaped) {
case 'u':
// Exactly four hex digits must follow.
if (pos + 4 > limit && !fillBuffer(4)) {
throw syntaxError("Unterminated escape sequence");
// Equivalent to Integer.parseInt(stringPool.get(buffer, pos, 4), 16);
int result = 0;
for (int i = pos, end = i + 4; i < end; i++) {
char c = buffer[i];
// Shift in one hex digit per iteration.
result <<= 4;
if (c >= '0' && c <= '9') {
result += (c - '0');
} else if (c >= 'a' && c <= 'f') {
result += (c - 'a' + 10);
} else if (c >= 'A' && c <= 'F') {
result += (c - 'A' + 10);
} else {
throw syntaxError("Malformed Unicode escape \\u" + new String(buffer, pos, 4));
pos += 4;
return (char) result;
case 't':
return '\t';
case 'b':
return '\b';
case 'n':
return '\n';
case 'r':
return '\r';
case 'f':
return '\f';
case '\n':
// An escaped literal newline is only tolerated in lenient mode.
if (!lenient) {
throw syntaxError("Cannot escape a newline character in strict mode");
lineStart = pos;
// fall-through
case '\'':
if (!lenient) {
throw syntaxError("Invalid escaped character \"'\" in strict mode");
case '"':
case '\\':
case '/':
// These characters stand for themselves.
return escaped;
// throw error when none of the above cases are matched
throw syntaxError("Invalid escape sequence");
* Throws a new {@link MalformedDataException} with the given message and information about the
* current location.
private MalformedDataException syntaxError(String message) throws MalformedDataException {
throw new MalformedDataException(message + locationString());
private IllegalStateException unexpectedTokenError(String expected) throws IOException {
return new IllegalStateException("Expected " + expected + " but was " + peek() + locationString());
/** Consumes the header if it exists. */
// The header is recognised by the five characters "<?xml" at the current position.
// NOTE(review): nextNonWhitespace() consumes the character it returns, so pos is already past
// the first non-whitespace character when the comparison below runs; the body of the
// buffer-size check and whatever skips the rest of the header are also not visible in this
// listing — confirm against the reference implementation.
private void consumeHeader() throws IOException {
// fast-forward through the leading whitespace
int unused = nextNonWhitespace(true);
// Need five characters to compare against "<?xml".
if (pos + 5 > limit && !fillBuffer(5)) {
int p = pos;
char[] buf = buffer;
if (buf[p] != '<'
|| buf[p + 1] != '?'
|| buf[p + 2] != 'x'
|| buf[p + 3] != 'm'
|| buf[p + 4] != 'l') {
return; // not a header!
// we found a header, consume it
pos += 5;
public void close() throws IOException {
peeked = PEEKED_NONE;
stack[0] = XmlScope.CLOSED;
stackSize = 1;