java-commons/commons-serialize-xml/src/main/java/io/gitlab/jfronny/commons/serialize/xml/NativeXmlReader.java

855 lines
30 KiB
Java

package io.gitlab.jfronny.commons.serialize.xml;
import io.gitlab.jfronny.commons.serialize.MalformedDataException;
import io.gitlab.jfronny.commons.serialize.StringEscapeUtil;
import io.gitlab.jfronny.commons.serialize.xml.impl.NameCheck;
import io.gitlab.jfronny.commons.serialize.xml.impl.XmlScope;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Objects;
public class NativeXmlReader implements Closeable {
/*
* Token kinds cached in the 'peeked' field by doPeek(). PEEKED_NONE means that no token is
* currently cached, so the next read has to call doPeek() first.
*/
private static final int PEEKED_NONE = 0;
private static final int PEEKED_BEGIN_TAG = 1;
private static final int PEEKED_END_TAG = 2;
/* A tag that was closed in its own head ("/>"); reported to callers as an END_TAG. */
private static final int PEEKED_END_TAG_CONCISE = 3;
private static final int PEEKED_TEXT = 4;
private static final int PEEKED_CDATA = 5;
private static final int PEEKED_ATTRIBUTE_NAME = 6;
private static final int PEEKED_ATTRIBUTE_VALUE = 7;
private static final int PEEKED_EOF = 8;
/** The character stream the XML document is read from. */
private final Reader in;
static final int BUFFER_SIZE = 1024;
/**
* Use a manual buffer to easily read and unread upcoming characters, and also so we can create
* strings without an intermediate StringBuilder whenever a token fits into the buffer.
*/
private final char[] buffer = new char[BUFFER_SIZE];
/* Index of the next character to read from 'buffer'. */
private int pos = 0;
/* Index one past the last valid character in 'buffer'. */
private int limit = 0;
/* Number of newlines consumed so far; reported one-based by locationString(). */
private int lineNumber = 0;
/* Buffer index of the first character of the current line (shifted by fillBuffer(), may be negative). */
private int lineStart = 0;
/* The cached token kind (one of the PEEKED_* constants). */
int peeked = PEEKED_NONE;
/**
* The number of characters in a peeked number literal.
* NOTE(review): nothing in this class reads or writes this field; it looks like a leftover
* from the JSON reader this class was derived from and is a candidate for removal.
*/
private int peekedNumberLength;
/**
* A peeked string kept for a later numeric or string read.
* NOTE(review): nothing in this class reads or writes this field; it looks like a leftover
* from the JSON reader this class was derived from and is a candidate for removal.
*/
private String peekedString;
/*
* The nesting stack of XmlScope values. Using a manual array rather than an ArrayList saves 20%.
*/
private int[] stack = new int[32];
private int stackSize = 0;
/* Every reader starts out in the EMPTY_DOCUMENT scope. */
{
stack[stackSize++] = XmlScope.EMPTY_DOCUMENT;
}
/*
* The path members. Both arrays are indexed like 'stack'. beginTag() stores the tag name in the
* slot of the enclosing scope before pushing the scope of the new tag, and nextAttributeName()
* stores the attribute name in the slot of the current scope. getPath() joins the non-null
* names with '.'. pathIndices is incremented whenever an element is closed or a value is
* skipped; getPath() does not read it.
*/
private String[] pathNames = new String[32];
private int[] pathIndices = new int[32];
/* Lenient mode tolerates some malformed input, e.g. mismatched closing tag names. */
private boolean lenient = false;
/* When set, whitespace between tokens is skipped and text content is trimmed. */
private boolean skipWhitespace = true;
/**
 * Creates a reader that tokenizes the XML supplied by {@code in}.
 *
 * @param in the character source, must not be {@code null}
 * @throws NullPointerException if {@code in} is {@code null}
 */
public NativeXmlReader(Reader in) {
    if (in == null) {
        throw new NullPointerException("in == null");
    }
    this.in = in;
}
/**
 * Switches lenient parsing on or off.
 *
 * @param lenient whether malformed input should be tolerated where possible
 * @return this reader, to allow call chaining
 */
public NativeXmlReader setLenient(boolean lenient) {
    final NativeXmlReader self = this;
    self.lenient = lenient;
    return self;
}
/**
 * @return {@code true} if this reader is in lenient mode
 */
public boolean isLenient() {
    return this.lenient;
}
/**
 * Configures whether whitespace between tokens is skipped and text content is trimmed.
 *
 * @param skipWhitespace the new setting
 * @return this reader, to allow call chaining
 */
public NativeXmlReader setSkipWhitespace(boolean skipWhitespace) {
    final NativeXmlReader self = this;
    self.skipWhitespace = skipWhitespace;
    return self;
}
/**
 * @return {@code true} if whitespace between tokens is skipped and text content is trimmed
 */
public boolean isSkipWhitespace() {
    return this.skipWhitespace;
}
/**
 * Pushes a scope onto the nesting stack, doubling the capacity of the stack and of the two
 * path arrays when the stack is full.
 *
 * @param newTop the XmlScope value that becomes the new top of the stack
 */
private void push(int newTop) {
    final int capacity = stack.length;
    if (stackSize == capacity) {
        final int grown = capacity * 2;
        stack = Arrays.copyOf(stack, grown);
        pathIndices = Arrays.copyOf(pathIndices, grown);
        pathNames = Arrays.copyOf(pathNames, grown);
    }
    stack[stackSize] = newTop;
    stackSize++;
}
/**
 * Moves the unread characters to the front of the buffer and reads more input behind them.
 * Returns true once {@code limit - pos >= minimum}. If the data is exhausted before that many
 * characters are available, this returns false.
 *
 * @param minimum the number of characters that must be available starting at {@code pos}
 * @throws IOException if the underlying reader fails
 */
private boolean fillBuffer(int minimum) throws IOException {
    final char[] buf = this.buffer;
    // keep lineStart relative to the data that is about to be shifted to index 0
    lineStart -= pos;
    if (limit == pos) {
        limit = 0;
    } else {
        limit -= pos;
        System.arraycopy(buf, pos, buf, 0, limit);
    }
    pos = 0;
    while (true) {
        final int read = in.read(buf, limit, buf.length - limit);
        if (read == -1) {
            return false;
        }
        limit += read;
        // if this is the first read, consume an optional byte order mark (BOM) if it exists
        final boolean atDocumentStart = lineNumber == 0 && lineStart == 0;
        if (atDocumentStart && limit > 0 && buf[0] == '\ufeff') {
            pos++;
            lineStart++;
            minimum++;
        }
        if (limit >= minimum) {
            return true;
        }
    }
}
/**
 * Consumes the start of an element and returns its name. The name is recorded in the path
 * slot of the enclosing scope and a TAG_HEAD scope is pushed for the new element.
 *
 * @return the name of the opened tag
 * @throws IllegalStateException if the next token is not a BEGIN_TAG
 * @throws IOException if reading fails or the input is malformed
 */
public String beginTag() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    if (token != PEEKED_BEGIN_TAG) {
        throw unexpectedTokenError("BEGIN_TAG");
    }
    final String tagName = nextName();
    pathNames[stackSize - 1] = tagName;
    push(XmlScope.TAG_HEAD);
    peeked = PEEKED_NONE;
    return tagName;
}
/**
 * Consumes the end of the current element, either an explicit {@code </name>} or the
 * {@code />} of a self-closing tag, and pops its scope.
 *
 * <p>Whitespace between the name and the closing {@code >} is accepted, as allowed by the
 * XML ETag production.
 *
 * @return the name found in the closing tag, or the name of the opening tag for a
 *     self-closing element
 * @throws IllegalStateException if the next token is not an END_TAG
 * @throws MalformedDataException if the tag is unterminated, has no matching opening tag, or
 *     (in strict mode) its name differs from the opening tag
 * @throws IOException if reading fails
 */
public String endTag() throws IOException {
    int p = peeked;
    if (p == PEEKED_NONE) {
        p = doPeek();
    }
    if (p != PEEKED_END_TAG && p != PEEKED_END_TAG_CONCISE) {
        throw unexpectedTokenError("END_TAG");
    }
    // A stray closing tag at document level has no enclosing slot to compare against.
    if (stackSize < 2) {
        throw syntaxError("Closing tag without a matching opening tag");
    }
    final String expected = pathNames[stackSize - 2];
    final String name;
    if (p == PEEKED_END_TAG) {
        name = nextName();
        // Skips optional whitespace and refills the buffer; returns -1 at the end of input.
        int c = nextNonWhitespace(false);
        if (c == -1) {
            throw syntaxError("Unterminated end tag");
        }
        if (c != '>') {
            pos--; // point the error location at the offending character
            throw syntaxError("Expected > but was " + (char) c);
        }
    } else {
        name = expected;
    }
    if (!lenient && !name.equals(expected)) {
        throw syntaxError("Mismatched closing tag: Expected " + expected + " but was " + name);
    }
    stackSize--;
    pathNames[stackSize] = null; // Free the last path name so that it can be garbage collected!
    pathIndices[stackSize - 1]++;
    peeked = PEEKED_NONE;
    return name;
}
/**
 * Reports whether the current element (or the document) has another token before it ends.
 *
 * @return {@code false} if the next token is an end tag or the end of the input
 * @throws IOException if reading fails or the input is malformed
 */
public boolean hasNext() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    switch (token) {
        case PEEKED_EOF:
        case PEEKED_END_TAG:
        case PEEKED_END_TAG_CONCISE:
            return false;
        default:
            return true;
    }
}
/**
 * Distinguishes the two forms of the upcoming end tag.
 *
 * @return {@code true} if the element was closed in its head ({@code />}), {@code false} for
 *     an explicit closing tag
 * @throws IllegalStateException if the next token is not an END_TAG
 * @throws IOException if reading fails or the input is malformed
 */
public boolean isConciseEndTag() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    switch (token) {
        case PEEKED_END_TAG_CONCISE:
            return true;
        case PEEKED_END_TAG:
            return false;
        default:
            throw unexpectedTokenError("END_TAG");
    }
}
/**
 * Returns the type of the next token without consuming it.
 *
 * @return the upcoming token; both end tag forms are reported as {@code END_TAG}
 * @throws IOException if reading fails or the input is malformed
 */
public XmlToken peek() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    switch (token) {
        case PEEKED_BEGIN_TAG:
            return XmlToken.BEGIN_TAG;
        case PEEKED_END_TAG:
        case PEEKED_END_TAG_CONCISE:
            return XmlToken.END_TAG;
        case PEEKED_TEXT:
            return XmlToken.TEXT;
        case PEEKED_CDATA:
            return XmlToken.CDATA;
        case PEEKED_ATTRIBUTE_NAME:
            return XmlToken.ATTRIBUTE_NAME;
        case PEEKED_ATTRIBUTE_VALUE:
            return XmlToken.ATTRIBUTE_VALUE;
        case PEEKED_EOF:
            return XmlToken.EOF;
        default:
            throw new AssertionError();
    }
}
/**
 * Determines the kind of the next token, stores it in {@code peeked} and returns it.
 *
 * <p>On return {@code pos} is positioned so that the matching {@code next*}/{@code *Tag}
 * method can consume the token: on the first name character for BEGIN_TAG, END_TAG and
 * ATTRIBUTE_NAME, on the opening quote for ATTRIBUTE_VALUE, on the first content character
 * for TEXT and CDATA, and behind {@code />} for a self-closing tag.
 *
 * @return one of the {@code PEEKED_*} constants, never {@code PEEKED_NONE}
 * @throws MalformedDataException if the input is malformed
 * @throws IllegalStateException if the reader is closed
 * @throws IOException if reading fails
 */
int doPeek() throws IOException {
    int peekStack = stack[stackSize - 1];
    if (peekStack == XmlScope.TAG_HEAD) {
        stack[stackSize - 1] = XmlScope.DANGLING_NAME;
        int c = nextNonWhitespace(true);
        if (c == -1) {
            throw syntaxError("Unterminated tag");
        } else if (c == '/') {
            if (pos < limit || fillBuffer(1)) {
                char chNext = buffer[pos++];
                if (chNext == '>') {
                    stack[stackSize - 1] = XmlScope.TAG_BODY;
                    return peeked = PEEKED_END_TAG_CONCISE;
                }
                throw syntaxError("Expected /> but was /" + chNext);
            }
            throw syntaxError("Unterminated tag at " + (char) c);
        } else if (c == '>') {
            stack[stackSize - 1] = XmlScope.TAG_BODY;
            // fall through: the body of the element starts here
        } else {
            // Push the character back BEFORE refilling: fillBuffer() discards everything in
            // front of pos, so the pushed-back character must already be part of the unread data.
            pos--;
            if (pos + 1 < limit || fillBuffer(2)) {
                // buffer[pos] is c, buffer[pos + 1] is the character following it
                if (NameCheck.isNameStart((char) c, buffer[pos + 1]) != NameCheck.NONE) {
                    return peeked = PEEKED_ATTRIBUTE_NAME;
                }
                // Reported as a syntax error; going through peek() here would re-enter doPeek().
                throw syntaxError("Expected attribute name but was " + (char) c);
            }
            throw syntaxError("Unterminated tag at " + (char) c);
        }
    } else if (peekStack == XmlScope.DANGLING_NAME) {
        stack[stackSize - 1] = XmlScope.TAG_HEAD;
        // Look for an equals sign before the value
        int c = lenient ? nextNonWhitespace(true) : nextRawChar();
        if (c != '=') {
            throw syntaxError("Expected '='");
        }
        c = lenient ? nextNonWhitespace(true) : nextRawChar();
        if (c == -1) {
            throw syntaxError("Unterminated attribute");
        }
        pos--; // leave the quote for nextAttributeValue()
        if (c == '\'' || c == '"') {
            return peeked = PEEKED_ATTRIBUTE_VALUE;
        }
        throw syntaxError("Expected a value but was " + (char) c);
    } else if (peekStack == XmlScope.TAG_BODY) {
        // fall through: a new element is starting
    } else if (peekStack == XmlScope.EMPTY_DOCUMENT) {
        stack[stackSize - 1] = XmlScope.NONEMPTY_DOCUMENT;
        // fall through: a new element is starting
    } else if (peekStack == XmlScope.NONEMPTY_DOCUMENT) {
        int c = skipWhitespace ? nextNonWhitespace(false) : nextRawChar();
        if (c == -1) {
            return peeked = PEEKED_EOF;
        }
        checkLenient();
        pos--;
        // fall through: a new element is starting
    } else if (peekStack == XmlScope.CLOSED) {
        throw new IllegalStateException("BaseXmlReader is closed");
    }
    int c = skipWhitespace ? nextNonWhitespace(true) : nextRawChar();
    if (c == -1) {
        throw syntaxError("Unterminated tag");
    }
    if (c != '<') {
        pos--;
        return peeked = PEEKED_TEXT;
    }
    if (pos + 1 <= limit || fillBuffer(1)) {
        char chNext = buffer[pos];
        if (chNext == '/') {
            pos++;
            return peeked = PEEKED_END_TAG;
        } else if (chNext == '!') {
            if (pos + 8 <= limit || fillBuffer(8)) {
                if (buffer[pos + 1] == '['
                        && buffer[pos + 2] == 'C'
                        && buffer[pos + 3] == 'D'
                        && buffer[pos + 4] == 'A'
                        && buffer[pos + 5] == 'T'
                        && buffer[pos + 6] == 'A'
                        && buffer[pos + 7] == '[') {
                    pos += 8;
                    return peeked = PEEKED_CDATA;
                }
                throw syntaxError("Expected <![CDATA[ but was <" + new String(buffer, pos, 8));
            }
        } else if (pos + 2 <= limit || fillBuffer(2)) {
            if (NameCheck.isNameStart(chNext, buffer[pos + 1]) != NameCheck.NONE) {
                return peeked = PEEKED_BEGIN_TAG;
            }
        }
    }
    throw syntaxError("Unterminated tag");
}

/**
 * Reads the next character verbatim, refilling the buffer when it is exhausted.
 *
 * @return the character, or -1 if the end of the input has been reached; when a character is
 *     returned it is at {@code buffer[pos - 1]}, so it can be pushed back with {@code pos--}
 */
private int nextRawChar() throws IOException {
    if (pos == limit && !fillBuffer(1)) {
        return -1;
    }
    return buffer[pos++];
}
/**
 * Consumes the next token, which must be an attribute name, and records it in the path slot
 * of the current scope.
 *
 * @return the attribute name
 * @throws IllegalStateException if the next token is not an ATTRIBUTE_NAME
 * @throws IOException if reading fails or the input is malformed
 */
public String nextAttributeName() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    if (token != PEEKED_ATTRIBUTE_NAME) {
        throw unexpectedTokenError("ATTRIBUTE_NAME");
    }
    final String attributeName = nextName();
    peeked = PEEKED_NONE;
    pathNames[stackSize - 1] = attributeName;
    return attributeName;
}
/**
 * Consumes the next token, which must be a quoted attribute value, and returns its content
 * with references resolved.
 *
 * @return the attribute value without the surrounding quotes
 * @throws IllegalStateException if the next token is not an ATTRIBUTE_VALUE
 * @throws MalformedDataException if the value contains {@code <}, contains a control
 *     character other than tab in strict mode, or is not terminated by its quote
 * @throws IOException if reading fails
 */
public String nextAttributeValue() throws IOException {
    int p = peeked;
    if (p == PEEKED_NONE) {
        p = doPeek();
    }
    if (p != PEEKED_ATTRIBUTE_VALUE) {
        throw unexpectedTokenError("ATTRIBUTE_VALUE");
    }
    char quote = buffer[pos++];
    String result = readUntil((c, i) -> {
        if (!lenient && c < 0x20 && c != 0x09) throw syntaxError("Control character in attribute value");
        if (c == '<') throw syntaxError("Expected " + quote + " but was '<'");
        return c == quote;
    }, true);
    // readUntil() stops either on the closing quote or at the end of the input. Without this
    // check an unterminated value would move pos beyond limit and corrupt later buffer refills.
    if (pos >= limit) {
        throw syntaxError("Unterminated attribute value");
    }
    pos++; // consume the closing quote
    peeked = PEEKED_NONE;
    return result;
}
/**
 * Reads a character reference ({@code &#38;}, {@code &#x26;}) or an entity reference
 * ({@code &amp;}) and returns its replacement text.
 *
 * <p>Contract: {@code pos} must be positioned on the character directly behind the
 * {@code &}. On return {@code pos} is behind the terminating {@code ;}. The five predefined
 * entities are resolved; any other entity reference is returned verbatim including
 * {@code &} and {@code ;}.
 *
 * @return the text the reference stands for
 * @throws MalformedDataException if the reference is unterminated or malformed, or names a
 *     value beyond the Unicode code point range
 * @throws IOException if reading fails
 */
private String readReference() throws IOException {
    if (pos == limit && !fillBuffer(1)) {
        throw syntaxError("Unterminated escape sequence");
    }
    if (buffer[pos] != '#') {
        // read the entity reference
        String name = nextName();
        if (pos >= limit || buffer[pos] != ';') {
            throw syntaxError("Missing ';' in entity reference");
        }
        pos++;
        return switch (name) {
            case "apos" -> "'";
            case "quot" -> "\"";
            case "amp" -> "&";
            case "lt" -> "<";
            case "gt" -> ">";
            // we don't support other entities, so just handle them like a normal string
            default -> "&" + name + ";";
        };
    }
    // read the character reference
    pos++;
    if (pos == limit && !fillBuffer(1)) {
        throw syntaxError("Unterminated escape sequence");
    }
    boolean isHex = buffer[pos] == 'x' || buffer[pos] == 'X';
    if (isHex) pos++;
    // readUntil() returns the digits only; the terminating ';' stays at buffer[pos]
    String digits = readUntil((c, i) -> {
        if (c == ';') return true;
        if ('0' <= c && c <= '9') return false;
        if (isHex && ('a' <= c && c <= 'f' || 'A' <= c && c <= 'F')) return false;
        throw syntaxError("Malformed character reference");
    }, false);
    if (pos >= limit || buffer[pos] != ';') {
        throw syntaxError("Missing ';' in character reference");
    }
    pos++;
    // The predicate admitted ASCII (hex) digits only, so at most ten of them always fit into
    // a long and parsing cannot fail.
    if (digits.isEmpty() || digits.length() > 10) {
        throw syntaxError("Malformed character reference");
    }
    long codePoint = Long.parseLong(digits, isHex ? 16 : 10);
    if (codePoint > Character.MAX_CODE_POINT) {
        throw syntaxError("Malformed character reference");
    }
    // Character.toString(int) emits a surrogate pair for supplementary code points
    return Character.toString((int) codePoint);
}
/**
 * Consumes the next token, which must be text content, up to (not including) the next
 * {@code <} and resolves references in it.
 *
 * @return the text, trimmed if whitespace skipping is enabled
 * @throws IllegalStateException if the next token is not TEXT
 * @throws IOException if reading fails or the input is malformed
 */
public String nextText() throws IOException {
    final int token = (peeked == PEEKED_NONE) ? doPeek() : peeked;
    if (token != PEEKED_TEXT) {
        throw unexpectedTokenError("TEXT");
    }
    final String raw = readUntil((c, i) -> c == '<', true);
    final String text = skipWhitespace ? raw.trim() : raw;
    peeked = PEEKED_NONE;
    return text;
}
/**
 * Consumes the next token, which must be a CDATA section, and returns its raw content.
 * References are not resolved inside CDATA.
 *
 * @return everything between {@code <![CDATA[} and the terminating {@code ]]>}
 * @throws IllegalStateException if the next token is not CDATA
 * @throws MalformedDataException if the section is not terminated
 * @throws IOException if reading fails
 */
public String nextCData() throws IOException {
    int p = peeked;
    if (p == PEEKED_NONE) {
        p = doPeek();
    }
    if (p != PEEKED_CDATA) {
        throw unexpectedTokenError("CDATA");
    }
    StringBuilder sb = new StringBuilder();
    while (true) {
        sb.append(readUntil((c, i) -> c == ']', false));
        // Three characters are needed to recognize the terminator. If they cannot be loaded the
        // input ended inside the section.
        if (pos + 2 >= limit && !fillBuffer(3)) {
            throw syntaxError("Unterminated CDATA");
        }
        if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') {
            pos += 3;
            peeked = PEEKED_NONE;
            return sb.toString();
        }
        // A ']' that does not start the terminator is ordinary content. It has to be consumed
        // here, otherwise the next readUntil() would stop on it again and loop forever.
        sb.append(buffer[pos++]);
    }
}
/**
 * Skips the next token. If that token opens an element, the whole element including its
 * attributes and children is skipped up to and including its end tag.
 *
 * @throws IllegalStateException if skipping would leave the enclosing element or the document
 * @throws MalformedDataException if the skipped input is malformed
 * @throws IOException if reading fails
 */
public void skipValue() throws IOException {
    int count = 0;
    do {
        int p = peeked;
        if (p == PEEKED_NONE) {
            p = doPeek();
        }
        switch (p) {
            case PEEKED_BEGIN_TAG -> {
                // doPeek() leaves pos on the first character of the tag name; it has to be
                // consumed, otherwise it would be mistaken for an attribute name.
                skipUntil((c, i) -> NameCheck.isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE);
                if (count == 0) {
                    pathNames[stackSize - 1] = "<skipped>";
                }
                push(XmlScope.TAG_HEAD);
                count++;
            }
            case PEEKED_END_TAG -> {
                // doPeek() leaves pos behind "</": consume the name and the closing '>'
                skipUntil((c, i) -> NameCheck.isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE);
                int c = nextNonWhitespace(false);
                if (c == -1) {
                    throw syntaxError("Unterminated end tag");
                }
                if (c != '>') {
                    pos--; // point the error location at the offending character
                    throw syntaxError("Expected > but was " + (char) c);
                }
                if (count == 0) {
                    pathNames[stackSize - 1] = null;
                }
                stackSize--;
                count--;
            }
            case PEEKED_END_TAG_CONCISE -> {
                // "/>" has already been consumed by doPeek()
                if (count == 0) {
                    pathNames[stackSize - 1] = null;
                }
                stackSize--;
                count--;
            }
            case PEEKED_TEXT -> skipUntil((c, i) -> c == '<');
            case PEEKED_CDATA -> {
                // Keep scanning until the real terminator: a lone ']' is ordinary content. There
                // is deliberately no early return, so an enclosing skip continues afterwards.
                while (true) {
                    skipUntil((c, i) -> c == ']');
                    if (pos + 2 >= limit && !fillBuffer(3)) {
                        throw syntaxError("Unterminated CDATA");
                    }
                    if (buffer[pos] == ']' && buffer[pos + 1] == ']' && buffer[pos + 2] == '>') {
                        pos += 3;
                        break;
                    }
                    pos++;
                }
            }
            case PEEKED_ATTRIBUTE_NAME -> {
                skipUntil((c, i) -> NameCheck.isName(c, pos + i + 1 < limit ? buffer[pos + i + 1] : '\0') == NameCheck.NONE);
                if (count == 0) pathNames[stackSize - 1] = "<skipped>";
            }
            case PEEKED_ATTRIBUTE_VALUE -> {
                char quote = buffer[pos++];
                skipUntil((c, i) -> c == quote);
                // skipUntil() also stops at the end of the input; never step beyond limit
                if (pos >= limit) {
                    throw syntaxError("Unterminated attribute value");
                }
                pos++; // consume the closing quote
            }
            case PEEKED_EOF -> throw new IllegalStateException("Attempt to skip led outside the document");
            default -> {}
        }
        peeked = PEEKED_NONE;
    } while (count > 0);
    pathIndices[stackSize - 1]++;
    if (count < 0) throw new IllegalStateException("Attempt to skip led outside its parent");
}
/**
 * Reads a name starting at {@code pos}. The name ends in front of the first character that
 * {@code NameCheck.isName} rejects.
 *
 * @return the name, possibly empty
 * @throws IOException if reading fails
 */
private String nextName() throws IOException {
    final EndPredicate endOfName = (c, i) -> {
        final int following = pos + i + 1;
        final char next = following < limit ? buffer[following] : '\0';
        return NameCheck.isName(c, next) == NameCheck.NONE;
    };
    return readUntil(endOfName, false);
}
/**
* Decides where readUntil() and skipUntil() stop scanning.
*/
@FunctionalInterface
private interface EndPredicate {
/**
* @param c the character being examined
* @param i the offset of {@code c} relative to the current {@code pos}
* @return true if scanning must stop in front of {@code c}
* @throws MalformedDataException if {@code c} is not allowed at this point
*/
boolean test(char c, int i) throws MalformedDataException;
}
/**
 * Reads characters starting at {@code pos} until {@code character} accepts one or the input
 * ends. The accepted character is not consumed: on return it is at {@code buffer[pos]}, or
 * {@code pos == limit} if the input ended first.
 *
 * @param character decides where to stop; it is called with each character and its offset
 *     relative to the current {@code pos}
 * @param handleReferences whether {@code &...;} references are replaced by their text
 * @return the characters read, with references resolved if requested
 * @throws MalformedDataException if the predicate rejects a character or a reference is
 *     malformed
 * @throws IOException if reading fails
 */
private String readUntil(EndPredicate character, boolean handleReferences) throws IOException {
    StringBuilder builder = null;
    int i = 0;
    findEnd:
    while (true) {
        while (pos + i < limit) {
            char c = buffer[pos + i];
            if (character.test(c, i)) {
                break findEnd;
            }
            if (handleReferences && c == '&') {
                if (builder == null) {
                    builder = new StringBuilder(Math.max(i, 16));
                }
                builder.append(buffer, pos, i);
                // Step over the '&' itself: readReference() expects pos directly behind it.
                pos += i + 1;
                i = 0;
                builder.append(readReference());
                // Do not advance i: the character behind the reference still has to be tested.
                continue;
            }
            if (c == '\n') {
                lineNumber++;
                lineStart = pos + i + 1;
            }
            i++;
        }
        // Attempt to load the entire name into the buffer at once.
        if (i < buffer.length) {
            if (fillBuffer(i + 1)) {
                continue;
            } else {
                break;
            }
        }
        // use a StringBuilder when the name is too long.
        if (builder == null) {
            builder = new StringBuilder(Math.max(i, 16));
        }
        builder.append(buffer, pos, i);
        pos += i;
        i = 0;
        if (!fillBuffer(1)) {
            break;
        }
    }
    String result = builder != null
            ? builder.append(buffer, pos, i).toString()
            : new String(buffer, pos, i);
    pos += i;
    return result;
}
/**
 * Advances {@code pos} until {@code character} accepts a character or the input ends, without
 * building a string. The accepted character is not consumed. References are not interpreted.
 *
 * @param character decides where to stop; it is called with each character and its offset
 *     relative to the current {@code pos}
 * @throws IOException if reading fails or the predicate rejects a character
 */
private void skipUntil(EndPredicate character) throws IOException {
    int offset = 0;
    scan:
    for (;;) {
        while (pos + offset < limit) {
            final char current = buffer[pos + offset];
            if (character.test(current, offset)) {
                break scan;
            }
            if (current == '\n') {
                lineNumber++;
                lineStart = pos + offset + 1;
            }
            offset++;
        }
        // Attempt to load the entire name into the buffer at once.
        if (offset < buffer.length) {
            if (!fillBuffer(offset + 1)) {
                break;
            }
            continue;
        }
        // the buffer is full of skipped characters: drop them and start over
        pos += offset;
        offset = 0;
        if (!fillBuffer(1)) {
            break;
        }
    }
    pos += offset;
}
/**
 * Returns the next character in the stream that is neither whitespace nor a part of a comment.
 * When this returns, the returned character is always at {@code buffer[pos-1]}; this means the
 * caller can always push back the returned character by decrementing {@code pos}, provided it
 * does so before the buffer is refilled.
 *
 * @param throwOnEof whether reaching the end of the input is an error
 * @return the character, or -1 at the end of the input if {@code throwOnEof} is false
 * @throws EOFException at the end of the input if {@code throwOnEof} is true
 * @throws MalformedDataException if a comment is not terminated
 * @throws IOException if reading fails
 */
private int nextNonWhitespace(boolean throwOnEof) throws IOException {
    /*
     * This code uses ugly local variables 'p' and 'l' representing the 'pos'
     * and 'limit' fields respectively. Using locals rather than fields saves
     * a few field reads for each whitespace character in a pretty-printed
     * document, resulting in a 5% speedup. We need to flush 'p' to its field
     * before any (potentially indirect) call to fillBuffer() and reread both
     * 'p' and 'l' after any (potentially indirect) call to the same method.
     */
    char[] buffer = this.buffer;
    int p = pos;
    int l = limit;
    while (true) {
        if (p == l) {
            pos = p;
            if (!fillBuffer(1)) {
                break;
            }
            p = pos;
            l = limit;
        }
        int c = buffer[p++];
        if (c == '\n') {
            lineNumber++;
            lineStart = p;
            continue;
        } else if (c == ' ' || c == '\r' || c == '\t') {
            continue;
        }
        pos = p;
        if (c == '<') {
            // Recognizing "<!--" inspects the three characters behind '<'. Make sure all of
            // them are loaded; otherwise stale buffer content (or an index beyond the buffer)
            // would be examined.
            if (p + 3 > l) {
                pos--; // push back '<' so it's still in the buffer when this method returns
                boolean charsLoaded = fillBuffer(4);
                pos++; // consume the '<' again
                if (!charsLoaded) {
                    // fewer than three characters follow, so this cannot be a comment
                    return c;
                }
            }
            if (buffer[pos] == '!' && buffer[pos + 1] == '-' && buffer[pos + 2] == '-') {
                pos += 3;
                if (!skipTo("-->")) {
                    throw syntaxError("Unterminated comment");
                }
                // skipTo() stops in front of the terminator; continue behind it
                p = pos + 3;
                l = limit;
                continue;
            }
        }
        return c;
    }
    if (throwOnEof) {
        throw new EOFException("End of input" + locationString());
    } else {
        return -1;
    }
}
/**
 * Rejects input that is only acceptable in lenient mode.
 *
 * @throws MalformedDataException if this reader is not lenient
 */
private void checkLenient() throws MalformedDataException {
    if (!lenient) {
        // The message used to refer to JsonReader and JSON, which this class does not parse.
        throw syntaxError("Use setLenient(true) to accept malformed XML");
    }
}
/**
 * Advances {@code pos} to the start of the next occurrence of {@code toFind}. The occurrence
 * itself is not consumed.
 *
 * @param toFind a string to search for. Must not contain a newline.
 * @return true if the string was found, false if the input ended first
 * @throws IOException if reading fails
 */
private boolean skipTo(String toFind) throws IOException {
    final int length = toFind.length();
    while (pos + length <= limit || fillBuffer(length)) {
        if (buffer[pos] == '\n') {
            lineNumber++;
            lineStart = pos + 1;
        } else {
            int matched = 0;
            while (matched < length && buffer[pos + matched] == toFind.charAt(matched)) {
                matched++;
            }
            if (matched == length) {
                return true;
            }
        }
        pos++;
    }
    return false;
}
/**
 * Describes the current read position for use in error messages.
 *
 * @return a string of the form {@code " at line L column C (char 'x') path P"}; the character
 *     part is omitted when no unread character is available, e.g. at the end of the input
 */
protected String locationString() {
    int line = lineNumber + 1;
    int column = pos - lineStart + 1;
    String charInterjection = "";
    // Only look at the buffer when pos refers to loaded data. At the end of the input pos
    // equals limit, which may be buffer.length, and anything behind limit is stale.
    if (pos >= 0 && pos < limit) {
        String replacement = StringEscapeUtil.getReplacement(buffer[pos]);
        if (replacement == null) {
            replacement = String.valueOf(buffer[pos]);
        }
        charInterjection = " (char '" + replacement + "')";
    }
    return " at line " + line + " column " + column + charInterjection + " path " + getPath();
}
/**
 * Builds a dot-separated path from the names recorded for the scopes on the stack.
 *
 * @return the path; empty if no name has been recorded
 */
public String getPath() {
    final StringBuilder path = new StringBuilder();
    boolean needsSeparator = false;
    for (int depth = 0; depth < stackSize; depth++) {
        final int scope = stack[depth];
        switch (scope) {
            case XmlScope.EMPTY_DOCUMENT, XmlScope.CLOSED -> {
                // these scopes never contribute a name
            }
            case XmlScope.TAG_HEAD, XmlScope.TAG_BODY, XmlScope.DANGLING_NAME, XmlScope.NONEMPTY_DOCUMENT -> {
                final String segment = pathNames[depth];
                if (segment != null) {
                    if (needsSeparator) {
                        path.append('.');
                    }
                    needsSeparator = true;
                    path.append(segment);
                }
            }
            default -> throw new AssertionError("Unknown scope value: " + scope);
        }
    }
    return path.toString();
}
/**
 * Unescapes the character identified by the character or characters that immediately follow a
 * backslash. The backslash '\' should have already been read. This supports both Unicode
 * escapes "u000A" and two-character escapes "\n".
 *
 * <p>No call to this method is visible in this class.
 *
 * @return the unescaped character
 * @throws MalformedDataException if the escape sequence is malformed
 */
private char readEscapeCharacter() throws IOException {
    if (pos == limit && !fillBuffer(1)) {
        throw syntaxError("Unterminated escape sequence");
    }
    final char escaped = buffer[pos++];
    switch (escaped) {
        case 'u' -> {
            if (pos + 4 > limit && !fillBuffer(4)) {
                throw syntaxError("Unterminated escape sequence");
            }
            // Equivalent to Integer.parseInt(stringPool.get(buffer, pos, 4), 16);
            int codeUnit = 0;
            for (int offset = 0; offset < 4; offset++) {
                final char digit = buffer[pos + offset];
                final int value;
                if ('0' <= digit && digit <= '9') {
                    value = digit - '0';
                } else if ('a' <= digit && digit <= 'f') {
                    value = digit - 'a' + 10;
                } else if ('A' <= digit && digit <= 'F') {
                    value = digit - 'A' + 10;
                } else {
                    throw syntaxError("Malformed Unicode escape \\u" + new String(buffer, pos, 4));
                }
                codeUnit = (codeUnit << 4) + value;
            }
            pos += 4;
            return (char) codeUnit;
        }
        case 't' -> {
            return '\t';
        }
        case 'b' -> {
            return '\b';
        }
        case 'n' -> {
            return '\n';
        }
        case 'r' -> {
            return '\r';
        }
        case 'f' -> {
            return '\f';
        }
        case '\n' -> {
            if (!lenient) {
                throw syntaxError("Cannot escape a newline character in strict mode");
            }
            lineNumber++;
            lineStart = pos;
            return escaped;
        }
        case '\'' -> {
            if (!lenient) {
                throw syntaxError("Invalid escaped character \"'\" in strict mode");
            }
            return escaped;
        }
        case '"', '\\', '/' -> {
            return escaped;
        }
        default ->
            // throw error when none of the above cases are matched
            throw syntaxError("Invalid escape sequence");
    }
}
/**
 * Throws a new {@link MalformedDataException} with the given message and information about the
 * current location. The declared return type only exists so that call sites can be written as
 * {@code throw syntaxError(...)}; this method never returns normally.
 *
 * @param message the description of the problem
 */
private MalformedDataException syntaxError(String message) throws MalformedDataException {
    final String located = message + locationString();
    throw new MalformedDataException(located);
}
/**
 * Creates (but does not throw) the exception that reports a token of the wrong kind.
 *
 * @param expected a description of the token the caller asked for
 * @return the exception to throw
 * @throws IOException if peeking the actual token fails
 */
private IllegalStateException unexpectedTokenError(String expected) throws IOException {
    final XmlToken actual = peek();
    final String where = locationString();
    return new IllegalStateException("Expected " + expected + " but was " + actual + where);
}
/**
 * Consumes the XML declaration ({@code <?xml ... ?>}) if the input starts with one, leaving
 * {@code pos} directly behind its terminating {@code ?>}. If there is no declaration,
 * {@code pos} is left on the first non-whitespace character.
 *
 * @throws MalformedDataException if a declaration is started but never terminated
 * @throws IOException if reading fails
 */
private void consumeHeader() throws IOException {
    // fast-forward through the leading whitespace; an empty input simply has no header
    int first = nextNonWhitespace(false);
    if (first == -1) {
        return;
    }
    pos--;
    if (pos + 5 > limit && !fillBuffer(5)) {
        return;
    }
    int p = pos;
    char[] buf = buffer;
    if (buf[p] != '<'
            || buf[p + 1] != '?'
            || buf[p + 2] != 'x'
            || buf[p + 3] != 'm'
            || buf[p + 4] != 'l') {
        return; // not a header!
    }
    // we found a header, consume it
    pos += 5;
    if (!skipTo("?>")) {
        throw syntaxError("Unterminated XML declaration");
    }
    // skipTo() stops in front of the terminator; step over it as well
    pos += 2;
}
/**
 * Closes this reader and the underlying {@link Reader}. The nesting stack is reset to a single
 * CLOSED scope before the underlying reader is closed.
 *
 * @throws IOException if closing the underlying reader fails
 */
@Override
public void close() throws IOException {
    stackSize = 1;
    stack[0] = XmlScope.CLOSED;
    peeked = PEEKED_NONE;
    in.close();
}
/**
 * Copies the next token to the writer. If that token opens an element, the whole element
 * including its attributes and children is copied up to and including its end tag.
 *
 * @param writer the writer to copy to
 * @throws IllegalStateException if the next token is a standalone end tag or the end of the
 *     document
 * @throws IOException if an error occurs
 */
public void copyTo(NativeXmlWriter writer) throws IOException {
    switch (peek()) {
        case BEGIN_TAG -> {
            // Use the name returned by beginTag(): it has already pushed a new scope, so
            // pathNames[stackSize - 1] no longer refers to the slot holding the tag name.
            writer.beginTag(beginTag());
            while (hasNext()) {
                copyTo(writer);
            }
            endTag();
            writer.endTag();
        }
        case END_TAG -> throw new IllegalStateException("Cannot copy standalone END_TAG");
        case TEXT -> writer.text(nextText());
        case CDATA -> writer.cdata(nextCData());
        case ATTRIBUTE_NAME -> {
            String name = nextAttributeName();
            writer.attribute(name, nextAttributeValue());
        }
        case ATTRIBUTE_VALUE -> writer.attributeValue(nextAttributeValue());
        case EOF -> throw new IllegalStateException("Cannot copy END_DOCUMENT");
    }
}
}