Parsing Unicode escape sequences (#21)

simdjson · Sep 12, 2023 · dd6d5b5 · dd6d5b5
1 parent 8c4c689
commit dd6d5b5
Show file tree

Hide file tree

Showing 12 changed files with 677 additions and 259 deletions.
diff --git a/README.md b/README.md
@@ -8,12 +8,6 @@ A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON pars
 based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318) 
 by Geoff Langdale and Daniel Lemire.
 
-This implementation is still missing several features available in simdsjon. For example:
-
-* Support for Unicode characters
-* UTF-8 validation
-* Support for 512-bit vectors
-
 ## Code Sample
 
 ```java
@@ -73,8 +67,8 @@ This section presents a performance comparison of different JSON parsers availab
 the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing 
 and finding all unique users with a default profile.
 
-**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
-may not reflect its real performance.**
+**Note that simdjson-java is still missing several features (see [GitHub Issues](https://github.com/simdjson/simdjson-java/issues)), 
+so the following results may not reflect its real performance.**
 
 Environment:
 * CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz

diff --git a/build.gradle b/build.gradle
@@ -38,16 +38,18 @@ java {
 
 ext {
     junitVersion = '5.9.1'
+    jsoniterScalaVersion = '2.23.2'
 }
 
 dependencies {
     jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
     jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
     jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
-    jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: '2.23.2'
-    compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: '2.23.2'
+    jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
+    compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
 
     testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
+    testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
     testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
     testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
     testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
@@ -74,6 +76,9 @@ test {
             '--add-modules', 'jdk.incubator.vector',
             '-Xmx2g'
     ]
+    testLogging {
+        events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR'
+    }
 }
 
 tasks.withType(JmhBytecodeGeneratorTask).configureEach {

diff --git a/src/jmh/java/org/simdjson/ParseBenchmark.java b/src/jmh/java/org/simdjson/ParseBenchmark.java
@@ -21,7 +21,7 @@
 @OutputTimeUnit(TimeUnit.SECONDS)
 public class ParseBenchmark {
 
-    @Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
+    @Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"})
     String fileName;
 
     private final SimdJsonParser simdJsonParser = new SimdJsonParser();

diff --git a/src/main/java/org/simdjson/CharacterUtils.java b/src/main/java/org/simdjson/CharacterUtils.java
diff --git a/src/main/java/org/simdjson/JsonCharUtils.java b/src/main/java/org/simdjson/JsonCharUtils.java
diff --git a/src/main/java/org/simdjson/NumberParser.java b/src/main/java/org/simdjson/NumberParser.java
@@ -9,7 +9,7 @@
 import static java.lang.Long.remainderUnsigned;
 import static java.lang.Math.abs;
 import static java.lang.Math.unsignedMultiplyHigh;
-import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
+import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
 import static org.simdjson.NumberParserTables.NUMBER_OF_ADDITIONAL_DIGITS_AFTER_LEFT_SHIFT;
 import static org.simdjson.NumberParserTables.POWERS_OF_FIVE;
 import static org.simdjson.NumberParserTables.POWER_OF_FIVE_DIGITS;

diff --git a/src/main/java/org/simdjson/StringParser.java b/src/main/java/org/simdjson/StringParser.java
@@ -0,0 +1,127 @@
+package org.simdjson;
+
+import jdk.incubator.vector.ByteVector;
+
+import static org.simdjson.CharacterUtils.escape;
+import static org.simdjson.CharacterUtils.hexToInt;
+import static org.simdjson.Tape.STRING;
+
+/**
+ * Decodes JSON string literals from the input buffer into a shared string buffer.
+ *
+ * <p>For every parsed string a {@code STRING} entry holding the string's offset in the string buffer is
+ * appended to the tape. At that offset the string is stored as a length field of {@code Integer.BYTES}
+ * bytes followed by the content, in which escape sequences have been replaced by the bytes they denote
+ * (Unicode escapes are written as UTF-8).
+ */
+class StringParser {
+
+    private static final byte BACKSLASH = '\\';
+    private static final byte QUOTE = '"';
+    // Number of input bytes examined per loop iteration: one full vector of the indexer's species.
+    private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
+    // Bounds of the UTF-16 surrogate ranges that may appear in Unicode escapes.
+    private static final int MIN_HIGH_SURROGATE = 0xD800;
+    private static final int MAX_HIGH_SURROGATE = 0xDBFF;
+    private static final int MIN_LOW_SURROGATE = 0xDC00;
+    private static final int MAX_LOW_SURROGATE = 0xDFFF;
+
+    private final Tape tape;
+    private final byte[] stringBuffer;
+
+    // Offset in stringBuffer at which the next parsed string will be stored.
+    private int stringBufferIdx;
+
+    /**
+     * @param tape         tape that receives one {@code STRING} entry per parsed string
+     * @param stringBuffer destination for the decoded string bytes; it is written to directly, not copied
+     */
+    StringParser(Tape tape, byte[] stringBuffer) {
+        this.tape = tape;
+        this.stringBuffer = stringBuffer;
+    }
+
+    /**
+     * Parses the string literal whose opening quote is at {@code buffer[idx]}.
+     *
+     * <p>The input is scanned one vector at a time. Each vector is copied eagerly into the string buffer and
+     * then inspected for the closing quote and for backslashes; escape sequences are resolved one at a time,
+     * after which scanning resumes right behind the escape.
+     *
+     * @param buffer input bytes. NOTE(review): presumably padded so that a full vector can be loaded at any
+     *               position inside the string - confirm against the caller.
+     * @param idx    index of the opening quote in {@code buffer}
+     * @throws JsonParsingException if a Unicode escape is not valid hexadecimal, or a surrogate is unpaired
+     *                              or out of range. NOTE(review): {@code escape} presumably also throws for
+     *                              unknown escape characters - confirm in {@code CharacterUtils}.
+     */
+    void parseString(byte[] buffer, int idx) {
+        tape.append(stringBufferIdx, STRING);
+        // Skip the opening quote in the input and leave room for the length field in the output.
+        int src = idx + 1;
+        int dst = stringBufferIdx + Integer.BYTES;
+        while (true) {
+            ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
+            // Copy the whole vector up front; only the bytes before 'dst' are ever counted as content,
+            // so anything copied beyond the final position is harmless.
+            srcVec.intoArray(stringBuffer, dst);
+            long backslashBits = srcVec.eq(BACKSLASH).toLong();
+            long quoteBits = srcVec.eq(QUOTE).toLong();
+
+            if (hasQuoteFirst(backslashBits, quoteBits)) {
+                // The closing quote comes before any backslash: the string ends inside this vector.
+                dst += Long.numberOfTrailingZeros(quoteBits);
+                break;
+            }
+            if (hasBackslash(backslashBits, quoteBits)) {
+                // A backslash comes before any quote: resolve exactly one escape sequence.
+                int backslashDist = Long.numberOfTrailingZeros(backslashBits);
+                byte escapeChar = buffer[src + backslashDist + 1];
+                if (escapeChar == 'u') {
+                    // Keep the plain bytes preceding the backslash, then decode the four hex digits.
+                    src += backslashDist;
+                    dst += backslashDist;
+                    int codePoint = hexToInt(buffer, src + 2);
+                    // Backslash + 'u' + four hex digits.
+                    src += 6;
+                    if (codePoint >= MIN_HIGH_SURROGATE && codePoint <= MAX_HIGH_SURROGATE) {
+                        // A high surrogate must be followed immediately by an escaped low surrogate.
+                        codePoint = parseLowSurrogate(buffer, src, codePoint);
+                        src += 6;
+                    } else if (codePoint >= MIN_LOW_SURROGATE && codePoint <= MAX_LOW_SURROGATE) {
+                        throw new JsonParsingException("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate.");
+                    }
+                    dst += storeCodePointInStringBuffer(codePoint, dst);
+                } else {
+                    // Two input bytes (backslash + escape character) collapse into one output byte.
+                    stringBuffer[dst + backslashDist] = escape(escapeChar);
+                    src += backslashDist + 2;
+                    dst += backslashDist + 1;
+                }
+            } else {
+                // Neither a quote nor a backslash in this vector: everything copied is plain content.
+                src += BYTES_PROCESSED;
+                dst += BYTES_PROCESSED;
+            }
+        }
+        // Write the content length into the space reserved in front of the content.
+        int len = dst - stringBufferIdx - Integer.BYTES;
+        IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
+        stringBufferIdx = dst;
+    }
+
+    /**
+     * Reads the escaped low surrogate expected at {@code buffer[src]} and combines it with the given high
+     * surrogate.
+     *
+     * @param buffer    input bytes
+     * @param src       index at which the second escape (backslash, 'u', four hex digits) must start
+     * @param codePoint the high surrogate that was just parsed
+     * @return the supplementary code point encoded by the surrogate pair
+     * @throws JsonParsingException if the next two bytes are not a backslash followed by 'u', or the decoded
+     *                              value is outside the low surrogate range
+     */
+    private int parseLowSurrogate(byte[] buffer, int src, int codePoint) {
+        // Compare both prefix bytes at once by packing them into a single int.
+        if ((buffer[src] << 8 | buffer[src + 1]) != ('\\' << 8 | 'u')) {
+            throw new JsonParsingException("Low surrogate should start with '\\u'");
+        } else {
+            int codePoint2 = hexToInt(buffer, src + 2);
+            int lowBit = codePoint2 - MIN_LOW_SURROGATE;
+            // Zero only when 0 <= lowBit < 0x400, i.e. codePoint2 lies in the low surrogate range;
+            // a negative lowBit stays non-zero because the shift is arithmetic.
+            if (lowBit >> 10 == 0) {
+                return (((codePoint - MIN_HIGH_SURROGATE) << 10) | lowBit) + 0x10000;
+            } else {
+                throw new JsonParsingException("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF.");
+            }
+        }
+    }
+
+    /**
+     * Writes {@code codePoint} into the string buffer as UTF-8, starting at {@code dst}.
+     *
+     * @param codePoint code point to encode
+     * @param dst       index in the string buffer of the first byte to write
+     * @return the number of bytes written (1 to 4)
+     * @throws JsonParsingException  if {@code codePoint} is negative. NOTE(review): presumably the value
+     *                               {@code hexToInt} yields for non-hexadecimal input - confirm in
+     *                               {@code CharacterUtils}.
+     * @throws IllegalStateException if {@code codePoint} is above U+10FFFF
+     */
+    private int storeCodePointInStringBuffer(int codePoint, int dst) {
+        if (codePoint < 0) {
+            throw new JsonParsingException("Invalid unicode escape sequence.");
+        }
+        if (codePoint <= 0x7F) {
+            stringBuffer[dst] = (byte) codePoint;
+            return 1;
+        }
+        if (codePoint <= 0x7FF) {
+            stringBuffer[dst] = (byte) ((codePoint >> 6) + 192);
+            stringBuffer[dst + 1] = (byte) ((codePoint & 63) + 128);
+            return 2;
+        }
+        if (codePoint <= 0xFFFF) {
+            stringBuffer[dst] = (byte) ((codePoint >> 12) + 224);
+            stringBuffer[dst + 1] = (byte) (((codePoint >> 6) & 63) + 128);
+            stringBuffer[dst + 2] = (byte) ((codePoint & 63) + 128);
+            return 3;
+        }
+        if (codePoint <= 0x10FFFF) {
+            stringBuffer[dst] = (byte) ((codePoint >> 18) + 240);
+            stringBuffer[dst + 1] = (byte) (((codePoint >> 12) & 63) + 128);
+            stringBuffer[dst + 2] = (byte) (((codePoint >> 6) & 63) + 128);
+            stringBuffer[dst + 3] = (byte) ((codePoint & 63) + 128);
+            return 4;
+        }
+        // The largest valid code point is U+10FFFF, so the limit named here must match the check above.
+        throw new IllegalStateException("Code point is greater than 0x10FFFF.");
+    }
+
+    /**
+     * Tells whether a quote occurs before the first backslash.
+     *
+     * <p>{@code backslashBits - 1} has every bit below the lowest backslash bit set (all bits when there is
+     * no backslash), so the AND is non-zero exactly when some quote sits at a lower position.
+     */
+    private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
+        return ((backslashBits - 1) & quoteBits) != 0;
+    }
+
+    /**
+     * Tells whether a backslash occurs before the first quote; the mirror image of
+     * {@link #hasQuoteFirst(long, long)}.
+     */
+    private boolean hasBackslash(long backslashBits, long quoteBits) {
+        return ((quoteBits - 1) & backslashBits) != 0;
+    }
+
+    /**
+     * Rewinds the write position so that the next parsed string is stored at the start of the string buffer.
+     */
+    void reset() {
+        stringBufferIdx = 0;
+    }
+}
diff --git a/src/main/java/org/simdjson/TapeBuilder.java b/src/main/java/org/simdjson/TapeBuilder.java
@@ -1,55 +1,27 @@
 package org.simdjson;
 
-import jdk.incubator.vector.ByteVector;
-
 import java.util.Arrays;
 
-import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
+import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
 import static org.simdjson.Tape.END_ARRAY;
 import static org.simdjson.Tape.END_OBJECT;
 import static org.simdjson.Tape.FALSE_VALUE;
 import static org.simdjson.Tape.NULL_VALUE;
 import static org.simdjson.Tape.ROOT;
 import static org.simdjson.Tape.START_ARRAY;
 import static org.simdjson.Tape.START_OBJECT;
-import static org.simdjson.Tape.STRING;
 import static org.simdjson.Tape.TRUE_VALUE;
 
 class TapeBuilder {
 
     private static final byte SPACE = 0x20;
-    private static final byte BACKSLASH = '\\';
-    private static final byte QUOTE = '"';
-    private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
-    private static final byte[] ESCAPE_MAP = new byte[]{
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
-            0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
-            0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
-
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
 
     private final Tape tape;
     private final byte[] stringBuffer;
     private final OpenContainer[] openContainers;
     private final int padding;
     private final NumberParser numberParser;
-
-    private int stringBufferIdx;
+    private final StringParser stringParser;
 
     TapeBuilder(int capacity, int depth, int padding) {
         this.tape = new Tape(capacity);
@@ -60,6 +32,7 @@ class TapeBuilder {
         }
         this.stringBuffer = new byte[capacity];
         this.numberParser = new NumberParser(tape);
+        this.stringParser = new StringParser(tape, stringBuffer);
     }
 
     void visitDocumentStart() {
@@ -193,56 +166,7 @@ void visitKey(byte[] buffer, int idx) {
     }
 
     private void visitString(byte[] buffer, int idx) {
-        tape.append(stringBufferIdx, STRING);
-        int src = idx + 1;
-        int dst = stringBufferIdx + Integer.BYTES;
-        while (true) {
-            ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
-            srcVec.intoArray(stringBuffer, dst);
-            long backslashBits = srcVec.eq(BACKSLASH).toLong();
-            long quoteBits = srcVec.eq(QUOTE).toLong();
-
-            if (hasQuoteFirst(backslashBits, quoteBits)) {
-                dst += Long.numberOfTrailingZeros(quoteBits);
-                break;
-            }
-            if (hasBackslash(backslashBits, quoteBits)) {
-                int backslashDist = Long.numberOfTrailingZeros(backslashBits);
-                byte escapeChar = buffer[src + backslashDist + 1];
-                if (escapeChar == 'u') {
-                    throw new UnsupportedOperationException("Support for unicode characters is not implemented yet.");
-                } else {
-                    stringBuffer[dst + backslashDist] = escape(escapeChar);
-                    src += backslashDist + 2;
-                    dst += backslashDist + 1;
-                }
-            } else {
-                src += BYTES_PROCESSED;
-                dst += BYTES_PROCESSED;
-            }
-        }
-        int len = dst - stringBufferIdx - Integer.BYTES;
-        IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
-        stringBufferIdx = dst;
-    }
-
-    private byte escape(byte escapeChar) {
-        if (escapeChar < 0) {
-            throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
-        }
-        byte escapeResult = ESCAPE_MAP[escapeChar];
-        if (escapeResult == 0) {
-            throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
-        }
-        return escapeResult;
-    }
-
-    private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
-        return ((backslashBits - 1) & quoteBits) != 0;
-    }
-
-    private boolean hasBackslash(long backslashBits, long quoteBits) {
-        return ((quoteBits - 1) & backslashBits) != 0;
+        stringParser.parseString(buffer, idx);
     }
 
     private void visitNumber(byte[] buffer, int idx) {
@@ -278,7 +202,7 @@ private void emptyContainer(char start, char end) {
 
     void reset() {
         tape.reset();
-        stringBufferIdx = 0;
+        stringParser.reset();
     }
 
     JsonValue createJsonValue(byte[] buffer) {