Skip to content

Commit

Permalink
Parsing Unicode escape sequences (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrrzysko authored Sep 12, 2023
1 parent 8c4c689 commit dd6d5b5
Show file tree
Hide file tree
Showing 12 changed files with 677 additions and 259 deletions.
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON pars
based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318)
by Geoff Langdale and Daniel Lemire.

This implementation is still missing several features available in simdjson. For example:

* Support for Unicode characters
* UTF-8 validation
* Support for 512-bit vectors

## Code Sample

```java
Expand Down Expand Up @@ -73,8 +67,8 @@ This section presents a performance comparison of different JSON parsers availab
the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing
and finding all unique users with a default profile.

**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
may not reflect its real performance.**
**Note that simdjson-java is still missing several features (see [GitHub Issues](https://github.com/simdjson/simdjson-java/issues)),
so the following results may not reflect its real performance.**

Environment:
* CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
Expand Down
9 changes: 7 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,18 @@ java {

ext {
junitVersion = '5.9.1'
jsoniterScalaVersion = '2.23.2'
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: '2.23.2'
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: '2.23.2'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
Expand All @@ -74,6 +76,9 @@ test {
'--add-modules', 'jdk.incubator.vector',
'-Xmx2g'
]
testLogging {
events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR'
}
}

tasks.withType(JmhBytecodeGeneratorTask).configureEach {
Expand Down
2 changes: 1 addition & 1 deletion src/jmh/java/org/simdjson/ParseBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseBenchmark {

@Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
@Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"})
String fileName;

private final SimdJsonParser simdJsonParser = new SimdJsonParser();
Expand Down
252 changes: 252 additions & 0 deletions src/main/java/org/simdjson/CharacterUtils.java

Large diffs are not rendered by default.

49 changes: 0 additions & 49 deletions src/main/java/org/simdjson/JsonCharUtils.java

This file was deleted.

2 changes: 1 addition & 1 deletion src/main/java/org/simdjson/NumberParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import static java.lang.Long.remainderUnsigned;
import static java.lang.Math.abs;
import static java.lang.Math.unsignedMultiplyHigh;
import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
import static org.simdjson.NumberParserTables.NUMBER_OF_ADDITIONAL_DIGITS_AFTER_LEFT_SHIFT;
import static org.simdjson.NumberParserTables.POWERS_OF_FIVE;
import static org.simdjson.NumberParserTables.POWER_OF_FIVE_DIGITS;
Expand Down
127 changes: 127 additions & 0 deletions src/main/java/org/simdjson/StringParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;

import static org.simdjson.CharacterUtils.escape;
import static org.simdjson.CharacterUtils.hexToInt;
import static org.simdjson.Tape.STRING;

/**
 * Copies JSON string literals from the input buffer into a shared string buffer,
 * resolving backslash escapes (including {@code \}{@code uXXXX} sequences and surrogate pairs)
 * on the way, and registers every parsed string on the tape.
 *
 * <p>Each string occupies {@code Integer.BYTES} bytes holding its length (written via
 * {@code IntegerUtils.toBytes}) followed by the decoded bytes. The instance keeps a running
 * write position, so it is stateful between calls until {@link #reset()} is invoked.
 */
class StringParser {

    private static final byte BACKSLASH = '\\';
    private static final byte QUOTE = '"';
    // Number of input bytes examined per loop iteration (one vector).
    private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
    private static final int MIN_HIGH_SURROGATE = 0xD800;
    private static final int MAX_HIGH_SURROGATE = 0xDBFF;
    private static final int MIN_LOW_SURROGATE = 0xDC00;
    private static final int MAX_LOW_SURROGATE = 0xDFFF;
    // A unicode escape spans six input bytes: the backslash, 'u' and the four bytes read by hexToInt.
    private static final int UNICODE_ESCAPE_LENGTH = 6;

    private final Tape tape;
    private final byte[] stringBuffer;

    // Position in stringBuffer where the next string (length prefix first) will be written.
    private int stringBufferIdx;

    StringParser(Tape tape, byte[] stringBuffer) {
        this.tape = tape;
        this.stringBuffer = stringBuffer;
    }

    /**
     * Parses the string literal whose opening quote is located at {@code buffer[idx]}.
     *
     * <p>The input is consumed one vector at a time; every chunk is copied verbatim into the
     * string buffer first and then patched wherever an escape sequence is found.
     * NOTE(review): the vector load/store always touches a full vector, so both arrays are
     * presumably padded by the caller — confirm.
     *
     * @param buffer the JSON input
     * @param idx    index of the opening quote of the string
     * @throws JsonParsingException if a unicode escape is malformed or forms an invalid surrogate
     *                              sequence (other escapes are validated by {@code escape})
     */
    void parseString(byte[] buffer, int idx) {
        final int lengthSlot = stringBufferIdx;
        tape.append(lengthSlot, STRING);

        int readPos = idx + 1;
        // Leave room for the length prefix, which is filled in once the string has been decoded.
        int writePos = lengthSlot + Integer.BYTES;

        for (;;) {
            ByteVector chunk = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, readPos);
            chunk.intoArray(stringBuffer, writePos);
            long backslashMask = chunk.eq(BACKSLASH).toLong();
            long quoteMask = chunk.eq(QUOTE).toLong();

            if (hasQuoteFirst(backslashMask, quoteMask)) {
                // Closing quote reached before any escape: the string ends here.
                writePos += Long.numberOfTrailingZeros(quoteMask);
                break;
            }

            if (!hasBackslash(backslashMask, quoteMask)) {
                // Plain chunk without quotes or escapes; it has already been copied.
                readPos += BYTES_PROCESSED;
                writePos += BYTES_PROCESSED;
                continue;
            }

            int backslashOffset = Long.numberOfTrailingZeros(backslashMask);
            byte escaped = buffer[readPos + backslashOffset + 1];

            if (escaped != 'u') {
                // Simple escape: overwrite the copied backslash with the decoded byte.
                stringBuffer[writePos + backslashOffset] = escape(escaped);
                readPos += backslashOffset + 2;
                writePos += backslashOffset + 1;
                continue;
            }

            // Unicode escape: move both cursors onto the backslash, then decode.
            readPos += backslashOffset;
            writePos += backslashOffset;
            int codePoint = hexToInt(buffer, readPos + 2);
            readPos += UNICODE_ESCAPE_LENGTH;

            if (MIN_HIGH_SURROGATE <= codePoint && codePoint <= MAX_HIGH_SURROGATE) {
                // A high surrogate must be followed directly by an escaped low surrogate.
                codePoint = parseLowSurrogate(buffer, readPos, codePoint);
                readPos += UNICODE_ESCAPE_LENGTH;
            } else if (MIN_LOW_SURROGATE <= codePoint && codePoint <= MAX_LOW_SURROGATE) {
                throw new JsonParsingException("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate.");
            }
            writePos += storeCodePointInStringBuffer(codePoint, writePos);
        }

        int length = writePos - lengthSlot - Integer.BYTES;
        IntegerUtils.toBytes(length, stringBuffer, lengthSlot);
        stringBufferIdx = writePos;
    }

    /**
     * Decodes the low half of a surrogate pair starting at {@code buffer[src]} and combines it
     * with the already parsed high surrogate.
     *
     * @param buffer    the JSON input
     * @param src       index at which the second escape sequence is expected to start
     * @param codePoint the high surrogate read from the preceding escape
     * @return the supplementary code point encoded by the pair
     * @throws JsonParsingException if the bytes at {@code src} do not start with a backslash
     *                              followed by 'u', or the value is not in U+DC00..U+DFFF
     */
    private int parseLowSurrogate(byte[] buffer, int src, int codePoint) {
        // Compare both prefix bytes at once.
        int prefix = buffer[src] << 8 | buffer[src + 1];
        if (prefix != ('\\' << 8 | 'u')) {
            throw new JsonParsingException("Low surrogate should start with '\\u'");
        }

        int lowBits = hexToInt(buffer, src + 2) - MIN_LOW_SURROGATE;
        // Only values 0..0x3FF survive this check; anything below U+DC00 becomes negative.
        if (lowBits >> 10 != 0) {
            throw new JsonParsingException("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF.");
        }

        int highBits = (codePoint - MIN_HIGH_SURROGATE) << 10;
        return (highBits | lowBits) + 0x10000;
    }

    /**
     * Writes {@code codePoint} as UTF-8 into the string buffer starting at {@code dst}.
     *
     * @param codePoint the code point to encode
     * @param dst       index in the string buffer of the first byte to write
     * @return the number of bytes written (1 to 4)
     * @throws JsonParsingException  if {@code codePoint} is negative
     *                               (NOTE(review): presumably how {@code hexToInt} signals
     *                               malformed hex digits — confirm in CharacterUtils)
     * @throws IllegalStateException if {@code codePoint} exceeds 0x10FFFF
     */
    private int storeCodePointInStringBuffer(int codePoint, int dst) {
        if (codePoint < 0) {
            throw new JsonParsingException("Invalid unicode escape sequence.");
        }

        if (codePoint <= 0x7F) {
            // Single byte: 0xxxxxxx
            stringBuffer[dst] = (byte) codePoint;
            return 1;
        } else if (codePoint <= 0x7FF) {
            // Two bytes: 110xxxxx 10xxxxxx
            stringBuffer[dst] = (byte) (0xC0 | (codePoint >> 6));
            stringBuffer[dst + 1] = (byte) (0x80 | (codePoint & 0x3F));
            return 2;
        } else if (codePoint <= 0xFFFF) {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            stringBuffer[dst] = (byte) (0xE0 | (codePoint >> 12));
            stringBuffer[dst + 1] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
            stringBuffer[dst + 2] = (byte) (0x80 | (codePoint & 0x3F));
            return 3;
        } else if (codePoint <= 0x10FFFF) {
            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            stringBuffer[dst] = (byte) (0xF0 | (codePoint >> 18));
            stringBuffer[dst + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
            stringBuffer[dst + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
            stringBuffer[dst + 3] = (byte) (0x80 | (codePoint & 0x3F));
            return 4;
        }
        throw new IllegalStateException("Code point is greater than 0x110000.");
    }

    /**
     * Tells whether the chunk contains a quote located before its first backslash.
     * Subtracting one from the backslash mask sets exactly the bits below its lowest set bit
     * (or every bit when the mask is zero).
     */
    private static boolean hasQuoteFirst(long backslashMask, long quoteMask) {
        long beforeFirstBackslash = backslashMask - 1;
        return (beforeFirstBackslash & quoteMask) != 0;
    }

    /**
     * Tells whether the chunk contains a backslash located before its first quote
     * (or anywhere in the chunk when it holds no quote).
     */
    private static boolean hasBackslash(long backslashMask, long quoteMask) {
        long beforeFirstQuote = quoteMask - 1;
        return (beforeFirstQuote & backslashMask) != 0;
    }

    /** Rewinds the write position to the beginning of the string buffer. */
    void reset() {
        stringBufferIdx = 0;
    }
}
86 changes: 5 additions & 81 deletions src/main/java/org/simdjson/TapeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,55 +1,27 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;

import java.util.Arrays;

import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
import static org.simdjson.Tape.END_ARRAY;
import static org.simdjson.Tape.END_OBJECT;
import static org.simdjson.Tape.FALSE_VALUE;
import static org.simdjson.Tape.NULL_VALUE;
import static org.simdjson.Tape.ROOT;
import static org.simdjson.Tape.START_ARRAY;
import static org.simdjson.Tape.START_OBJECT;
import static org.simdjson.Tape.STRING;
import static org.simdjson.Tape.TRUE_VALUE;

class TapeBuilder {

private static final byte SPACE = 0x20;
private static final byte BACKSLASH = '\\';
private static final byte QUOTE = '"';
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
private static final byte[] ESCAPE_MAP = new byte[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

private final Tape tape;
private final byte[] stringBuffer;
private final OpenContainer[] openContainers;
private final int padding;
private final NumberParser numberParser;

private int stringBufferIdx;
private final StringParser stringParser;

TapeBuilder(int capacity, int depth, int padding) {
this.tape = new Tape(capacity);
Expand All @@ -60,6 +32,7 @@ class TapeBuilder {
}
this.stringBuffer = new byte[capacity];
this.numberParser = new NumberParser(tape);
this.stringParser = new StringParser(tape, stringBuffer);
}

void visitDocumentStart() {
Expand Down Expand Up @@ -193,56 +166,7 @@ void visitKey(byte[] buffer, int idx) {
}

private void visitString(byte[] buffer, int idx) {
tape.append(stringBufferIdx, STRING);
int src = idx + 1;
int dst = stringBufferIdx + Integer.BYTES;
while (true) {
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
srcVec.intoArray(stringBuffer, dst);
long backslashBits = srcVec.eq(BACKSLASH).toLong();
long quoteBits = srcVec.eq(QUOTE).toLong();

if (hasQuoteFirst(backslashBits, quoteBits)) {
dst += Long.numberOfTrailingZeros(quoteBits);
break;
}
if (hasBackslash(backslashBits, quoteBits)) {
int backslashDist = Long.numberOfTrailingZeros(backslashBits);
byte escapeChar = buffer[src + backslashDist + 1];
if (escapeChar == 'u') {
throw new UnsupportedOperationException("Support for unicode characters is not implemented yet.");
} else {
stringBuffer[dst + backslashDist] = escape(escapeChar);
src += backslashDist + 2;
dst += backslashDist + 1;
}
} else {
src += BYTES_PROCESSED;
dst += BYTES_PROCESSED;
}
}
int len = dst - stringBufferIdx - Integer.BYTES;
IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
stringBufferIdx = dst;
}

private byte escape(byte escapeChar) {
if (escapeChar < 0) {
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
}
byte escapeResult = ESCAPE_MAP[escapeChar];
if (escapeResult == 0) {
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
}
return escapeResult;
}

private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
return ((backslashBits - 1) & quoteBits) != 0;
}

private boolean hasBackslash(long backslashBits, long quoteBits) {
return ((quoteBits - 1) & backslashBits) != 0;
stringParser.parseString(buffer, idx);
}

private void visitNumber(byte[] buffer, int idx) {
Expand Down Expand Up @@ -278,7 +202,7 @@ private void emptyContainer(char start, char end) {

void reset() {
tape.reset();
stringBufferIdx = 0;
stringParser.reset();
}

JsonValue createJsonValue(byte[] buffer) {
Expand Down
Loading

0 comments on commit dd6d5b5

Please sign in to comment.