foxglove · jtbandes · Jan 22, 2024 · Jan 19, 2024 · Jan 22, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,7 +1,7 @@
 // -*- jsonc -*-
 {
   "editor.codeActionsOnSave": {
-    "source.fixAll.eslint": true
+    "source.fixAll.eslint": "explicit"
   },
   "editor.defaultFormatter": "esbenp.prettier-vscode",
   "editor.formatOnSave": true,

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@foxglove/rosmsg-serialization",
-  "version": "2.0.2",
+  "version": "2.0.3",
   "description": "ROS1 (Robot Operating System) message serialization, for reading and writing bags and network messages",
   "license": "MIT",
   "keywords": [

diff --git a/src/MessageWriter.test.ts b/src/MessageWriter.test.ts
@@ -9,6 +9,7 @@
 
 import { parse as parseMessageDefinition } from "@foxglove/rosmsg";
 
+import { MessageReader } from "./MessageReader";
 import { MessageWriter } from "./MessageWriter";
 
 const getStringBytes = (str: string): Uint8Array => {
@@ -421,4 +422,46 @@ describe("MessageWriter", () => {
       expect(writer.calculateByteSize(message)).toEqual(108);
     });
   });
+
+  it.each([
+    "",
+    "a",
+    "ab",
+    "abc",
+    "abcd",
+    "béta",
+    "\xE9",
+    "\u0000",
+    "\u007f",
+    "\u0080",
+    "\u07ff",
+    "\u0800",
+    "\ud800\udc00", // surrogate pair, equivalent to "𐀀" or "\u{10000}"
+    "\udbff\udfff", // surrogate pair, equivalent to "\u{10ffff}"
+    "\u7fff",
+    "\u8000",
+    "\u8001",
+    "\uffff",
+    "\u{10000}",
+    "\u{fffff}",
+    "\u{100000}",
+    "\u{10ffff}",
+  ])("handles non-ascii strings", (str) => {
+    const defs = parseMessageDefinition("string data");
+    const writer = new MessageWriter(defs);
+    const reader = new MessageReader(defs);
+    const msg = { data: str };
+    expect(reader.readMessage(writer.writeMessage(msg))).toEqual(msg);
+  });
+
+  it.each([
+    "\ud800", // lone high surrogate
+    "\udc00", // lone low surrogate
+  ])("replaces lone surrogates with replacement character", (str) => {
+    const defs = parseMessageDefinition("string data");
+    const writer = new MessageWriter(defs);
+    const reader = new MessageReader(defs);
+    const msg = { data: str };
+    expect(reader.readMessage(writer.writeMessage(msg))).toEqual({ data: "\ufffd" });
+  });
 });
diff --git a/src/MessageWriter.ts b/src/MessageWriter.ts
@@ -9,6 +9,8 @@
 
 import { MessageDefinition, MessageDefinitionField } from "@foxglove/message-definition";
 
+import { stringLengthUtf8 } from "./stringLengthUtf8";
+
 export interface Time {
   // whole seconds
   sec: number;
@@ -48,7 +50,7 @@ class StandardTypeOffsetCalculator {
     if (typeof value !== "string") {
       throw new Error(`Expected string but got ${typeof value}`);
     }
-    const length = 4 + value.length;
+    const length = 4 + stringLengthUtf8(value);
     return this._incrementAndReturn(length);
   }
 
@@ -129,8 +131,19 @@ class StandardTypeWriter {
       this.textEncoder = new TextEncoder();
     }
     const stringOffset = this.offsetCalculator.string(value);
-    this.view.setUint32(stringOffset, value.length, true);
-    this.textEncoder.encodeInto(value, this.data.subarray(stringOffset + 4));
+    const stringLength = this.offsetCalculator.offset - stringOffset - 4;
+    this.view.setUint32(stringOffset, stringLength, true);
+    const { read, written } = this.textEncoder.encodeInto(
+      value,
+      this.data.subarray(stringOffset + 4),
+    );
+    if (read !== value.length) {
+      throw new Error(
+        `Not enough space to encode string into subarray (wrote ${read!} of ${
+          value.length
+        } code units into ${written!} of ${this.data.subarray(stringOffset + 4).length} bytes)`,
+      );
+    }
   }
 
   // eslint-disable-next-line @foxglove/no-boolean-parameters

diff --git a/src/stringLengthUtf8.test.ts b/src/stringLengthUtf8.test.ts
@@ -0,0 +1,36 @@
+import { stringLengthUtf8 } from "./stringLengthUtf8";
+
+describe("stringLengthUtf8", () => {
+  it.each([
+    "", // empty string
+    "a",
+    "ab",
+    "abc",
+    "abcd",
+    "béta", // contains a 2-byte character
+    "\xE9", // "é", 2 bytes in UTF-8
+    "\u0000",
+    "\u007f", // largest 1-byte code point
+    "\u0080", // smallest 2-byte code point
+    "\u07ff", // largest 2-byte code point
+    "\u0800", // smallest 3-byte code point
+    "\ud800", // lone high surrogate
+    "\ud800x", // lone high surrogate followed by another character
+    "x\ud800", // lone high surrogate at end of string
+    "\ud800\udc00", // surrogate pair, equivalent to "𐀀" or "\u{10000}"
+    "\udbff\udfff", // surrogate pair, equivalent to "\u{10ffff}"
+    "\udc00", // lone low surrogate
+    "\udc00x", // lone low surrogate followed by another character
+    "x\udc00", // lone low surrogate at end of string
+    "\u7fff",
+    "\u8000",
+    "\u8001",
+    "\uffff", // largest code point that fits in a single UTF-16 code unit
+    "\u{10000}", // smallest code point that requires a surrogate pair
+    "\u{fffff}",
+    "\u{100000}",
+    "\u{10ffff}", // largest Unicode code point
+  ])("agrees with TextEncoder", (str) => {
+    expect(stringLengthUtf8(str)).toEqual(new TextEncoder().encode(str).length);
+  });
+});
diff --git a/src/stringLengthUtf8.ts b/src/stringLengthUtf8.ts
@@ -0,0 +1,32 @@
+/**
+ * Returns the number of bytes that would be used when encoding the string as UTF-8, effectively the
+ * same as `new TextEncoder().encode(str).length` but faster. Unpaired surrogates count as 3 bytes.
+ * https://jsbench.me/nzlrkwmeiq/1
+ */
+export function stringLengthUtf8(str: string): number {
+  let byteLength = 0;
+  const numCodeUnits = str.length;
+  for (let i = 0; i < numCodeUnits; i++) {
+    const codeUnit = str.charCodeAt(i); // UTF-16 code unit, 0x0000-0xFFFF
+    if (codeUnit <= 0x7f) {
+      byteLength += 1; // 0b0xxxxxxx
+    } else if (codeUnit <= 0x7ff) {
+      byteLength += 2; // 0b110xxxxx 0b10xxxxxx
+    } else if (0xd800 <= codeUnit && codeUnit <= 0xdbff) {
+      // A high surrogate followed by a low surrogate forms a pair that encodes one code point in
+      // the range 0x10000-0x10ffff, which takes 4 bytes in UTF-8. A high surrogate that is not
+      // followed by a low surrogate is counted as 3 bytes instead.
+      const nextCodeUnit = str.charCodeAt(i + 1); // NaN when i is the last index
+      if (0xdc00 <= nextCodeUnit && nextCodeUnit <= 0xdfff) {
+        byteLength += 4; // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
+        i++; // the low surrogate has been accounted for, so skip it
+      } else {
+        byteLength += 3; // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
+      }
+    } else {
+      // 0x0800-0xFFFF, including unpaired low surrogates (0xDC00-0xDFFF)
+      byteLength += 3; // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
+    }
+  }
+  return byteLength;
+}