diff --git a/.claude/skills/typed-arrays/SKILL.md b/.claude/skills/typed-arrays/SKILL.md new file mode 100644 index 0000000..89ceb8f --- /dev/null +++ b/.claude/skills/typed-arrays/SKILL.md @@ -0,0 +1,115 @@ +--- +name: typed-arrays +description: | + Modern TypedArray and ArrayBuffer features including resizable buffers, + transfer operations, Float16Array, and Uint8Array base64/hex encoding. +compatibility: Node.js 20+ and all the modern browsers +--- + +# Modern Typed Arrays + +## ES2023: Change Array by Copy + +Immutable operations returning new arrays: + +```typescript +const arr = new Uint8Array([3, 1, 2]); + +arr.toReversed(); // Uint8Array [2, 1, 3] +arr.toSorted((a, b) => a - b); // Uint8Array [1, 2, 3] +arr.with(0, 99); // Uint8Array [99, 1, 2] +``` + +## ES2023: findLast / findLastIndex + +```typescript +const arr = new Uint8Array([1, 2, 3, 2, 1]); + +arr.findLast(x => x === 2); // 2 +arr.findLastIndex(x => x === 2); // 3 +``` + +## ES2024: Resizable ArrayBuffer + +```typescript +const buffer = new ArrayBuffer(16, { maxByteLength: 1024 }); + +buffer.resizable; // true +buffer.maxByteLength; // 1024 +buffer.resize(64); // grow +buffer.resize(8); // shrink +``` + +### Growable SharedArrayBuffer + +```typescript +const shared = new SharedArrayBuffer(16, { maxByteLength: 1024 }); +shared.growable; // true +shared.grow(64); // can only grow, not shrink +``` + +### TypedArray tracks resizable buffer + +```typescript +const buffer = new ArrayBuffer(16, { maxByteLength: 64 }); +const view = new Uint8Array(buffer); +view.length; // 16 +buffer.resize(32); +view.length; // 32 (auto-tracks) +``` + +## ES2024: ArrayBuffer Transfer + +```typescript +const buffer = new ArrayBuffer(16); +const arr = new Uint8Array(buffer); +arr[0] = 42; + +const newBuffer = buffer.transfer(); // zero-copy transfer +buffer.detached; // true +newBuffer.byteLength; // 16 + +// Transfer with resize (from the live buffer; `buffer` is detached and would throw) +const grown = newBuffer.transfer(64); + +// Convert resizable to fixed +const fixed = 
resizable.transferToFixedLength(); +``` + +## ES2025: Float16Array + +```typescript +const f16 = new Float16Array(4); +const f16arr = Float16Array.of(1.5, 2.5, 3.5); + +Float16Array.BYTES_PER_ELEMENT; // 2 +// Range: ±65504 (max), ±6.1e-5 (min positive normal) +``` + +### DataView Float16 + +```typescript +const view = new DataView(buffer); +view.setFloat16(0, 3.14, true); // little-endian +view.getFloat16(0, true); // ≈3.140625 +``` + +## ES2026: Uint8Array Base64 + +Not yet in Node.js v24. + +### Base64 + +```typescript +const bytes = new Uint8Array([72, 101, 108, 108, 111]); + +bytes.toBase64(); // "SGVsbG8=" +bytes.toBase64({ alphabet: "base64url" }); // URL-safe +bytes.toBase64({ omitPadding: true }); // no trailing = + +Uint8Array.fromBase64("SGVsbG8="); +Uint8Array.fromBase64("SGVsbG8", { alphabet: "base64url" }); + +// Write to existing buffer +const { read, written } = target.setFromBase64("SGVsbG8="); +``` diff --git a/.claude/skills/wasm/SKILL.md b/.claude/skills/wasm/SKILL.md new file mode 100644 index 0000000..a21d086 --- /dev/null +++ b/.claude/skills/wasm/SKILL.md @@ -0,0 +1,101 @@ +--- +name: wasm +description: | + Modern WebAssembly (Wasm) development expertise covering modern Wasm features + and optimization techniques. Use this skill when working with WebAssembly + modules or optimizing Wasm performance. 
+compatibility: WebAssembly v3.0 and later +--- + +# WebAssembly Development Skill + +## WAT Syntax + +Use **folded (S-expression) syntax** for readability: + +```wat +;; Folded syntax (preferred) +(i32.add (local.get $x) (local.get $y)) + +;; Flat syntax (avoid) +local.get $x +local.get $y +i32.add +``` + +## WebAssembly Features + +### Memory64 (64-bit Address Space) +- Memories and tables can use `i64` as address type +- Expands addressable space from 4GB to 16 exabytes +- Syntax: `(memory i64 1)` instead of `(memory 1)` + +### Multiple Memories +```wat +(module + (memory $main 1) + (memory $scratch 1)) +``` + +### Tail Call Optimization +- Efficient recursion via `return_call` and `return_call_indirect` +- Prevents stack overflow for tail-recursive functions +```wat +(func $factorial (param $n i64) (param $acc i64) (result i64) + (if (result i64) (i64.eqz (local.get $n)) + (then (local.get $acc)) + (else (return_call $factorial + (i64.sub (local.get $n) (i64.const 1)) + (i64.mul (local.get $n) (local.get $acc)))))) +``` + +### Exception Handling +- Native try/catch/throw semantics +- Interoperates with JavaScript exceptions +```wat +(tag $error (param i32)) +(func $may_throw + (throw $error (i32.const 42))) +``` + +### Relaxed SIMD +- Relaxed variants of fixed-width 128-bit SIMD instructions: results may differ across hardware in exchange for speed +- `i8x16.relaxed_swizzle`, `f32x4.relaxed_madd`, etc. 
+ +### WasmGC +- Native garbage-collected types: `struct`, `array` +- Instructions: `array.new`, `array.get`, `array.set`, `struct.new`, `struct.get` +- Reference types: `(ref $type)`, `(ref null $type)` + +### externref +- Opaque reference to host (JS) objects +- Cannot be inspected or modified in Wasm, only passed around +- Used with js-string-builtins for efficient string handling + +### js-string-builtins +- Import `"wasm:js-string"` for direct JS string operations +- Functions: `length`, `charCodeAt`, `fromCharCodeArray`, `intoCharCodeArray` +- Avoids costly JS↔Wasm boundary crossings for string processing + +### SIMD Example +```wat +;; Process 16 bytes at a time +(v128.store (local.get $dst) + (i8x16.add + (v128.load (local.get $src1)) + (v128.load (local.get $src2)))) +``` + +## Toolchain (Binaryen) + +| Task | Command | +|------|---------| +| Assemble WAT to Wasm | `wasm-as module.wat -o module.wasm` | +| Disassemble Wasm to WAT | `wasm-dis module.wasm -o module.wat` | +| Optimize for size | `wasm-opt -Oz in.wasm -o out.wasm` | +| Optimize for speed | `wasm-opt -O3 in.wasm -o out.wasm` | + +## Resources + +- [WebAssembly Specification](https://webassembly.github.io/spec/) +- [Binaryen](https://github.com/WebAssembly/binaryen) diff --git a/benchmark/count-utf8.ts b/benchmark/count-utf8.ts new file mode 100644 index 0000000..babfbc2 --- /dev/null +++ b/benchmark/count-utf8.ts @@ -0,0 +1,57 @@ +/* eslint-disable no-console */ +import { utf8CountJs, WASM_AVAILABLE } from "../src/utils/utf8.ts"; +import { getWasmError, utf8CountWasm } from "../src/utils/utf8-wasm.ts"; + +// @ts-ignore +import Benchmark from "benchmark"; + +// description +console.log("utf8CountJs - pure JS implementation"); +console.log("utf8CountWasm - WebAssembly implementation"); + +// Show wasm status +console.log("=".repeat(60)); +console.log("WebAssembly Status:"); +console.log(` WASM_AVAILABLE: ${WASM_AVAILABLE}`); +if (WASM_AVAILABLE) { + console.log(" js-string-builtins: enabled"); +} 
else { + const error = getWasmError(); + console.log(` Error: ${error?.message || "unknown"}`); + if (error?.message?.includes("js-string") || error?.message?.includes("builtin")) { + console.log("\n js-string-builtins is enabled by default in Node.js 24+ (V8 13.6+)."); + console.log(" For older versions, run with:"); + console.log(" node --experimental-wasm-imported-strings node_modules/.bin/ts-node benchmark/count-utf8.ts"); + } +} +console.log("=".repeat(60)); + +for (const baseStr of ["A", "あ", "🌏"]) { + const dataSet = [10, 30, 50, 100, 200, 500, 1000].map((n) => { + return baseStr.repeat(n); + }); + + for (const str of dataSet) { + const byteLength = utf8CountJs(str); + + console.log(`\n## string "${baseStr}" (strLength=${str.length}, byteLength=${byteLength})\n`); + + const suite = new Benchmark.Suite(); + + suite.add("utf8CountJs", () => { + utf8CountJs(str); + }); + + if (WASM_AVAILABLE) { + suite.add("utf8CountWasm", () => { + utf8CountWasm(str); + }); + } + + suite.on("cycle", (event: any) => { + console.log(String(event.target)); + }); + + suite.run(); + } +} diff --git a/benchmark/decode-string.ts b/benchmark/decode-string.ts index a6ea146..f9612ea 100644 --- a/benchmark/decode-string.ts +++ b/benchmark/decode-string.ts @@ -1,9 +1,32 @@ /* eslint-disable no-console */ -import { utf8EncodeJs, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8"; +import { utf8EncodeJs, utf8Count, utf8DecodeJs, utf8DecodeTD, WASM_AVAILABLE } from "../src/utils/utf8.ts"; +import { getWasmError, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts"; // @ts-ignore import Benchmark from "benchmark"; +// description +console.log("utf8DecodeJs - pure JS implementation"); +console.log("utf8DecodeTD - TextDecoder implementation"); +console.log("utf8DecodeWasm - WebAssembly implementation"); + +// Show wasm status +console.log("=".repeat(60)); +console.log("WebAssembly Status:"); +console.log(` WASM_AVAILABLE: ${WASM_AVAILABLE}`); +if (WASM_AVAILABLE) { + console.log(" 
js-string-builtins: enabled"); +} else { + const error = getWasmError(); + console.log(` Error: ${error?.message || "unknown"}`); + if (error?.message?.includes("js-string") || error?.message?.includes("builtin")) { + console.log("\n js-string-builtins is enabled by default in Node.js 24+ (V8 13.6+)."); + console.log(" For older versions, run with:"); + console.log(" node --experimental-wasm-imported-strings node_modules/.bin/ts-node benchmark/decode-string.ts"); + } +} +console.log("=".repeat(60)); + for (const baseStr of ["A", "あ", "🌏"]) { const dataSet = [10, 100, 500, 1_000].map((n) => { return baseStr.repeat(n); @@ -24,11 +47,20 @@ for (const baseStr of ["A", "あ", "🌏"]) { } }); - suite.add("TextDecoder", () => { + suite.add("utf8DecodeTD", () => { if (utf8DecodeTD(bytes, 0, byteLength) !== str) { throw new Error("wrong result!"); } }); + + if (WASM_AVAILABLE) { + suite.add("utf8DecodeWasm", () => { + if (utf8DecodeWasm(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + } + suite.on("cycle", (event: any) => { console.log(String(event.target)); }); diff --git a/benchmark/encode-string.ts b/benchmark/encode-string.ts index 3f6aac6..3a46529 100644 --- a/benchmark/encode-string.ts +++ b/benchmark/encode-string.ts @@ -1,9 +1,32 @@ /* eslint-disable no-console */ -import { utf8EncodeJs, utf8Count, utf8EncodeTE } from "../src/utils/utf8"; +import { utf8EncodeJs, utf8Count, utf8EncodeTE, WASM_AVAILABLE } from "../src/utils/utf8.ts"; +import { getWasmError, utf8EncodeWasm } from "../src/utils/utf8-wasm.ts"; // @ts-ignore import Benchmark from "benchmark"; +// description +console.log("utf8EncodeJs - pure JS implementation"); +console.log("utf8EncodeTE - TextEncoder implementation"); +console.log("utf8EncodeWasm - WebAssembly implementation"); + +// Show wasm status +console.log("=".repeat(60)); +console.log("WebAssembly Status:"); +console.log(` WASM_AVAILABLE: ${WASM_AVAILABLE}`); +if (WASM_AVAILABLE) { + console.log(" js-string-builtins: 
enabled"); +} else { + const error = getWasmError(); + console.log(` Error: ${error?.message || "unknown"}`); + if (error?.message?.includes("js-string") || error?.message?.includes("builtin")) { + console.log("\n js-string-builtins is enabled by default in Node.js 24+ (V8 13.6+)."); + console.log(" For older versions, run with:"); + console.log(" node --experimental-wasm-imported-strings node_modules/.bin/ts-node benchmark/encode-string.ts"); + } +} +console.log("=".repeat(60)); + for (const baseStr of ["A", "あ", "🌏"]) { const dataSet = [10, 30, 50, 100].map((n) => { return baseStr.repeat(n); @@ -21,9 +44,16 @@ for (const baseStr of ["A", "あ", "🌏"]) { utf8EncodeJs(str, buffer, 0); }); - suite.add("utf8DecodeTE", () => { + suite.add("utf8EncodeTE", () => { utf8EncodeTE(str, buffer, 0); }); + + if (WASM_AVAILABLE) { + suite.add("utf8EncodeWasm", () => { + utf8EncodeWasm(str, buffer, 0); + }); + } + suite.on("cycle", (event: any) => { console.log(String(event.target)); }); diff --git a/benchmark/key-decoder.ts b/benchmark/key-decoder.ts index 594bbab..55ab481 100644 --- a/benchmark/key-decoder.ts +++ b/benchmark/key-decoder.ts @@ -1,5 +1,5 @@ /* eslint-disable no-console */ -import { utf8EncodeJs, utf8Count, utf8DecodeJs } from "../src/utils/utf8"; +import { utf8EncodeJs, utf8Count, utf8DecodeJs } from "../src/utils/utf8.ts"; // @ts-ignore import Benchmark from "benchmark"; diff --git a/package-lock.json b/package-lock.json index 5a3f7d0..0a08567 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,9 +17,10 @@ "@types/node": "latest", "@typescript-eslint/eslint-plugin": "latest", "@typescript-eslint/parser": "latest", - "@typescript/native-preview": "^7.0.0-dev.20251225.1", + "@typescript/native-preview": "latest", "assert": "latest", "benchmark": "latest", + "binaryen": "latest", "buffer": "latest", "core-js": "latest", "eslint": "latest", @@ -1758,6 +1759,24 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/binaryen": { + "version": 
"125.0.0", + "resolved": "https://registry.npmjs.org/binaryen/-/binaryen-125.0.0.tgz", + "integrity": "sha512-X7CUM9ZnwL/Ow++JH5AJKiemc82J7JyeryuPvXQdXBLcL/rqrC5KMUB1mHiORSolietH9sotvaOZlr6HSwPAlw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "wasm-as": "bin/wasm-as", + "wasm-ctor-eval": "bin/wasm-ctor-eval", + "wasm-dis": "bin/wasm-dis", + "wasm-merge": "bin/wasm-merge", + "wasm-metadce": "bin/wasm-metadce", + "wasm-opt": "bin/wasm-opt", + "wasm-reduce": "bin/wasm-reduce", + "wasm-shell": "bin/wasm-shell", + "wasm2js": "bin/wasm2js" + } + }, "node_modules/body-parser": { "version": "1.20.4", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.4.tgz", diff --git a/package.json b/package.json index 67e4dd4..d4c60be 100644 --- a/package.json +++ b/package.json @@ -25,10 +25,12 @@ "sideEffects": false, "scripts": { "build": "npm publish --dry-run", + "build:wasm": "node wasm/build.mts", "prepare": "npm run clean && webpack --bail && tsgo --build tsconfig.dist.cjs.json tsconfig.dist.esm.json && tsimp tools/fix-ext.mts --mjs dist.esm/*.js dist.esm/*/*.js dist.esm/*.d.ts dist.esm/*/*.d.ts && tsimp tools/fix-ext.mts --cjs dist.cjs/*.js dist.cjs/*/*.js dist.cjs/*.d.ts dist.cjs/*/*.d.ts", "prepublishOnly": "npm run test:dist", "clean": "rimraf build dist dist.*", "test": "mocha 'test/**/*.test.ts'", + "test:wasm": "MSGPACK_WASM=force node --experimental-wasm-imported-strings node_modules/.bin/mocha 'test/**/*.test.ts'", "test:dist": "npm run lint && npm run test && npm run test:deno", "test:cover": "npm run cover:clean && npx nyc --no-clean npm run 'test' && npm run cover:report", "test:node_with_strip_types": "node --experimental-strip-types test/deno_test.ts", @@ -73,9 +75,10 @@ "@types/node": "latest", "@typescript-eslint/eslint-plugin": "latest", "@typescript-eslint/parser": "latest", - "@typescript/native-preview": "^7.0.0-dev.20251225.1", + "@typescript/native-preview": "latest", "assert": "latest", "benchmark": "latest", + "binaryen": 
"latest", "buffer": "latest", "core-js": "latest", "eslint": "latest", diff --git a/src/utils/utf8-wasm-binary.ts b/src/utils/utf8-wasm-binary.ts new file mode 100644 index 0000000..7e22293 --- /dev/null +++ b/src/utils/utf8-wasm-binary.ts @@ -0,0 +1,27 @@ +// Auto-generated by wasm/build.mts - DO NOT EDIT MANUALLY +// Source: wasm/utf8.wat + +export const wasmBinary = ` +AGFzbQEAAAABNQhedwFgAW8Bf2ACb38Bf2ADb2QAfwF/YANkAH9/AWRvYAJ/ZAABf2ABfwFkAGADZA +B/fwFvAnsEDndhc206anMtc3RyaW5nBmxlbmd0aAABDndhc206anMtc3RyaW5nCmNoYXJDb2RlQXQA +Ag53YXNtOmpzLXN0cmluZxFpbnRvQ2hhckNvZGVBcnJheQADDndhc206anMtc3RyaW5nEWZyb21DaG +FyQ29kZUFycmF5AAQDBgUBAgUGBwUDAQABB1QGBm1lbW9yeQIACXV0ZjhDb3VudAAECnV0ZjhFbmNv +ZGUABRF1dGY4RGVjb2RlVG9BcnJheQAGCmFsbG9jQXJyYXkABw1hcnJheVRvU3RyaW5nAAgKsQcFlA +EBBH8gABAAIQQDQCADIARPRQRAIAAgAxABIgJBgAFJBH8gAUEBagUgAkGAEEkEfyABQQJqBSACQf+3 +A00gAkGAsANPcQR/IANBAWoiAiAESQR/IAAgAhABQYD4A3FBgLgDRgR/IAIhAyABQQRqBSABQQNqCw +UgAUEDagsFIAFBA2oLCwshASADQQFqIQMMAQsLIAELwgMCBn8BZAAgASECIAAgABAAIgX7BwAiCEEA +EAIaA0AgBCAFT0UEQCAIIAT7DQAiA0GAAUkEfyACIAM6AAAgAkEBagUgA0GAEEkEfyACIANBBnZBwA +FyOgAAIAJBAWogA0E/cUGAAXI6AAAgAkECagUgA0H/twNNIANBgLADT3EEfyAEQQFqIgYgBUkEfyAI +IAb7DQAiB0GA+ANxQYC4A0YEfyAGIQQgAiADQQp0IAdqQYC4/xprIgNBEnZB8AFyOgAAIAJBAWogA0 +EMdkE/cUGAAXI6AAAgAkECaiADQQZ2QT9xQYABcjoAACACQQNqIANBP3FBgAFyOgAAIAJBBGoFIAIg +A0EMdkHgAXI6AAAgAkEBaiADQQZ2QT9xQYABcjoAACACQQJqIANBP3FBgAFyOgAAIAJBA2oLBSACIA +NBDHZB4AFyOgAAIAJBAWogA0EGdkE/cUGAAXI6AAAgAkECaiADQT9xQYABcjoAACACQQNqCwUgAiAD +QQx2QeABcjoAACACQQFqIANBBnZBP3FBgAFyOgAAIAJBAmogA0E/cUGAAXI6AAAgAkEDagsLCyECIA +RBAWohBAwBCwsgAiABawvBAgEDfwNAIAAgA01FBEAgAy0AACIEQYABcQR/IARB4AFxQcABRgR/IAEg +AiADQQFqLQAAQT9xIARBH3FBBnRy+w4AIAJBAWohAiADQQJqBSAEQfABcUHgAUYEfyABIAIgA0ECai +0AAEE/cSAEQQ9xQQx0IANBAWotAABBP3FBBnRycvsOACACQQFqIQIgA0EDagUgBEH4AXFB8AFGBH8g +ASACIANBA2otAABBP3EgBEEHcUESdCADQQFqLQAAQT9xQQx0ciADQQJqLQAAQT9xQQZ0cnJBgIAEay +IEQQp2QYCwA3L7DgAgASACQQFqIgIgBEH/B3FBgLgDcvsOACACQQFqIQIgA0EEagUgASACIAT7DgAg 
+AkEBaiECIANBAWoLCwsFIAEgAiAE+w4AIAJBAWohAiADQQFqCyEDDAELCyACCwcAIAD7BwALCgAgAC +ABIAIQAws= +`; diff --git a/src/utils/utf8-wasm.ts b/src/utils/utf8-wasm.ts new file mode 100644 index 0000000..89a029c --- /dev/null +++ b/src/utils/utf8-wasm.ts @@ -0,0 +1,170 @@ +/** + * WebAssembly-based UTF-8 string processing using js-string-builtins with GC arrays. + * + * Environment variables: + * - MSGPACK_WASM=force: Force wasm mode, throw error if wasm fails to load + * - MSGPACK_WASM=never: Disable wasm, always use pure JS + * + * This implementation uses WASM GC arrays with intoCharCodeArray/fromCharCodeArray + * for efficient bulk string operations instead of character-by-character processing. + */ + +import { wasmBinary } from "./utf8-wasm-binary.ts"; + +function getWasmMode(): "force" | "never" | "auto" { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== "undefined" && process.env) { + const mode = process.env["MSGPACK_WASM"]; + if (mode) { + switch (mode.toLowerCase()) { + case "force": + return "force"; + case "never": + return "never"; + default: + return "auto"; + } + } + } + return "auto"; +} + +const WASM_MODE = getWasmMode(); + +// GC array type (opaque reference) +type I16Array = object; + +interface WasmExports extends WebAssembly.Exports { + memory: WebAssembly.Memory; + utf8Count(str: string): number; + utf8Encode(str: string, offset: number): number; + utf8DecodeToArray(length: number, arr: I16Array): number; + allocArray(size: number): I16Array; + arrayToString(arr: I16Array, start: number, end: number): string; +} + +let wasmInstance: WasmExports | null = null; +let wasmInitError: Error | null = null; + +function base64ToBytes(base64: string): Uint8Array { + // @ts-expect-error - fromBase64 is not yet supported in TypeScript + if (Uint8Array.fromBase64) { + // @ts-expect-error - fromBase64 is not yet supported in TypeScript + return Uint8Array.fromBase64(base64); + } else if (typeof Buffer !== "undefined") 
{ + // Node.js + return new Uint8Array(Buffer.from(base64, "base64")); + } else { + // Legacy fallback + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes; + } +} + +function tryInitializeWasmInstance(): void { + if (WASM_MODE === "never") { + wasmInitError = new Error("MSGPACK_WASM=never: wasm disabled"); + return; + } + + try { + if (typeof WebAssembly === "undefined") { + throw new Error("WebAssembly not supported"); + } + + const bytes = base64ToBytes(wasmBinary); + + // Requires js-string builtins support (Node.js 24+ / Chrome 130+ / Firefox 134+) + const module: WebAssembly.Module = new (WebAssembly.Module as any)(bytes, { builtins: ["js-string"] }); + const instance = new WebAssembly.Instance(module); + wasmInstance = instance.exports as WasmExports; + } catch (e) { + wasmInitError = e instanceof Error ? e : new Error(String(e)); + + if (WASM_MODE === "force") { + throw new Error(`MSGPACK_WASM=force but wasm failed to load: ${wasmInitError.message}`, { cause: wasmInitError }); + } + } +} + +tryInitializeWasmInstance(); + +/** + * Whether wasm is available and initialized. + */ +// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition +export const WASM_AVAILABLE = wasmInstance !== null; + +export function getWasmError(): Error | null { + return wasmInitError; +} + +export function getWasmExports(): WasmExports | null { + return wasmInstance; +} + +/** + * Count UTF-8 byte length of a string. + */ +export function utf8CountWasm(str: string): number { + return wasmInstance!.utf8Count(str); +} + +/** + * Encode string to UTF-8 bytes in the provided buffer. + * Returns the number of bytes written. + */ +export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: number): number { + // Estimate max byte length without a full pass over the string. 
+ // Each UTF-16 code unit can produce at most 3 UTF-8 bytes (BMP chars). + // Surrogate pairs (2 code units) produce 4 bytes, so 3 bytes/code unit is safe. + const maxByteLength = str.length * 3; + + // Ensure wasm memory is large enough + const requiredPages = Math.ceil(maxByteLength / 65536); + const currentPages = wasmInstance!.memory.buffer.byteLength / 65536; + + if (requiredPages > currentPages) { + wasmInstance!.memory.grow(requiredPages - currentPages); + } + + // Encode to wasm memory (uses intoCharCodeArray for bulk char extraction) + const bytesWritten = wasmInstance!.utf8Encode(str, 0); + + // Copy from wasm memory to output buffer + const wasmBytes = new Uint8Array(wasmInstance!.memory.buffer, 0, bytesWritten); + output.set(wasmBytes, outputOffset); + + return bytesWritten; +} + +/** + * Decode UTF-8 bytes to string. + * Uses GC arrays with fromCharCodeArray for efficient string creation. + */ +export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string { + // Ensure wasm memory is large enough for UTF-8 input + const requiredPages = Math.ceil(byteLength / 65536); + const currentPages = wasmInstance!.memory.buffer.byteLength / 65536; + + if (requiredPages > currentPages) { + wasmInstance!.memory.grow(requiredPages - currentPages); + } + + // Copy UTF-8 bytes to wasm linear memory at offset 0 + const wasmBytes = new Uint8Array(wasmInstance!.memory.buffer, 0, byteLength); + wasmBytes.set(bytes.subarray(inputOffset, inputOffset + byteLength)); + + // Allocate GC array for UTF-16 output (max size = byteLength for ASCII) + const arr = wasmInstance!.allocArray(byteLength); + + // Decode UTF-8 to UTF-16 in GC array + const codeUnits = wasmInstance!.utf8DecodeToArray(byteLength, arr); + + // Create string directly from GC array using fromCharCodeArray + return wasmInstance!.arrayToString(arr, 0, codeUnits); +} diff --git a/src/utils/utf8.ts b/src/utils/utf8.ts index 1494f70..7fbd539 100644 --- a/src/utils/utf8.ts +++ 
b/src/utils/utf8.ts @@ -1,4 +1,8 @@ -export function utf8Count(str: string): number { +import { WASM_AVAILABLE, utf8CountWasm, utf8EncodeWasm, utf8DecodeWasm } from "./utf8-wasm.ts"; + +export { WASM_AVAILABLE }; + +export function utf8CountJs(str: string): number { const strLength = str.length; let byteLength = 0; @@ -38,6 +42,8 @@ export function utf8Count(str: string): number { return byteLength; } +export const utf8Count: (str: string) => number = WASM_AVAILABLE ? utf8CountWasm : utf8CountJs; + export function utf8EncodeJs(str: string, output: Uint8Array, outputOffset: number): void { const strLength = str.length; let offset = outputOffset; @@ -92,13 +98,32 @@ const sharedTextEncoder = new TextEncoder(); // This threshold should be determined by benchmarking, which might vary in engines and input data. // Run `npx ts-node benchmark/encode-string.ts` for details. +// For mixed content (ASCII + CJK + emoji), JS wins for strLength < 30-50. +// After that, WASM or TextEncoder is faster depending on content type. const TEXT_ENCODER_THRESHOLD = 50; export function utf8EncodeTE(str: string, output: Uint8Array, outputOffset: number): void { sharedTextEncoder.encodeInto(str, output.subarray(outputOffset)); } -export function utf8Encode(str: string, output: Uint8Array, outputOffset: number): void { +// Wasm threshold: use wasm for medium strings, TextEncoder for large strings. +// For pure ASCII, TextEncoder is ~1.7x faster at 100+ strLength. +// For CJK/emoji, WASM is ~1.4-1.6x faster than TextEncoder at all sizes. +// 1000 is a compromise for mixed content. 
+const WASM_ENCODE_MAX = 1000; + +function utf8EncodeWithWasm(str: string, output: Uint8Array, outputOffset: number): void { + const len = str.length; + if (len > WASM_ENCODE_MAX) { + utf8EncodeTE(str, output, outputOffset); + } else if (len > TEXT_ENCODER_THRESHOLD) { + utf8EncodeWasm(str, output, outputOffset); + } else { + utf8EncodeJs(str, output, outputOffset); + } +} + +function utf8EncodeNoWasm(str: string, output: Uint8Array, outputOffset: number): void { if (str.length > TEXT_ENCODER_THRESHOLD) { utf8EncodeTE(str, output, outputOffset); } else { @@ -106,6 +131,10 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number } } +export const utf8Encode: (str: string, output: Uint8Array, outputOffset: number) => void = WASM_AVAILABLE + ? utf8EncodeWithWasm + : utf8EncodeNoWasm; + const CHUNK_SIZE = 0x1_000; export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string { @@ -161,17 +190,39 @@ const sharedTextDecoder = new TextDecoder(); // This threshold should be determined by benchmarking, which might vary in engines and input data. // Run `npx ts-node benchmark/decode-string.ts` for details. -const TEXT_DECODER_THRESHOLD = 200; +// For mixed content (ASCII + CJK + emoji), JS wins for very short strings only. +// WASM becomes superior at ~30-50 bytes for non-ASCII content. +// NOTE(review): this constant is also the JS/TextDecoder cutoff in utf8DecodeNoWasm (previously 200) — confirm 50 is still right when wasm is unavailable. +const TEXT_DECODER_THRESHOLD = 50; export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string { const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength); return sharedTextDecoder.decode(stringBytes); } -export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string { +// Wasm decode threshold: use wasm for medium strings, TextDecoder for large strings. +// For pure ASCII, TextDecoder is ~5x faster at 1000+ bytes. +// For CJK/emoji, WASM is ~5-6x faster than TextDecoder at all sizes. +// 1000 is a compromise for mixed content. 
+const WASM_DECODE_MAX = 1000; + +function utf8DecodeWithWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string { + if (byteLength > WASM_DECODE_MAX) { + return utf8DecodeTD(bytes, inputOffset, byteLength); + } else if (byteLength > TEXT_DECODER_THRESHOLD) { + return utf8DecodeWasm(bytes, inputOffset, byteLength); + } else { + return utf8DecodeJs(bytes, inputOffset, byteLength); + } +} + +function utf8DecodeNoWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string { if (byteLength > TEXT_DECODER_THRESHOLD) { return utf8DecodeTD(bytes, inputOffset, byteLength); } else { return utf8DecodeJs(bytes, inputOffset, byteLength); } } + +export const utf8Decode: (bytes: Uint8Array, inputOffset: number, byteLength: number) => string = WASM_AVAILABLE + ? utf8DecodeWithWasm + : utf8DecodeNoWasm; diff --git a/test/utf8-wasm.test.ts b/test/utf8-wasm.test.ts new file mode 100644 index 0000000..f03e9d2 --- /dev/null +++ b/test/utf8-wasm.test.ts @@ -0,0 +1,364 @@ +import assert from "assert"; +import { WASM_AVAILABLE, getWasmError, getWasmExports, utf8CountWasm, utf8EncodeWasm, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts"; +import { utf8Count, utf8CountJs, utf8Encode, utf8EncodeJs, utf8Decode, utf8DecodeJs } from "../src/utils/utf8.ts"; + +describe("utf8-wasm", () => { + describe("initialization", () => { + it("reports WASM_AVAILABLE status", () => { + // In Node.js without the flag, wasm should fail to load + // but we should get a clear error message + console.log("WASM_AVAILABLE:", WASM_AVAILABLE); + console.log("WASM error:", getWasmError()?.message); + + // Just verify the exports work + assert.strictEqual(typeof WASM_AVAILABLE, "boolean"); + }); + + it("getWasmExports returns null or valid exports", () => { + const exports = getWasmExports(); + if (WASM_AVAILABLE) { + assert.ok(exports !== null); + assert.ok(typeof exports!.utf8Count === "function"); + assert.ok(typeof exports!.utf8Encode === "function"); + assert.ok(typeof 
exports!.utf8DecodeToArray === "function"); + assert.ok(typeof exports!.allocArray === "function"); + assert.ok(typeof exports!.arrayToString === "function"); + assert.ok(exports!.memory instanceof WebAssembly.Memory); + } else { + assert.strictEqual(exports, null); + } + }); + }); + + describe("utf8Count", () => { + const testCases = [ + { input: "", expected: 0, description: "empty string" }, + { input: "hello", expected: 5, description: "ASCII" }, + { input: "こんにちは", expected: 15, description: "Japanese hiragana (3 bytes each)" }, + { input: "🎉", expected: 4, description: "emoji (4 bytes)" }, + { input: "hello🎉world", expected: 14, description: "mixed ASCII and emoji" }, + { input: "Ω", expected: 2, description: "Greek omega (2 bytes)" }, + { input: "€", expected: 3, description: "Euro sign (3 bytes)" }, + { input: "𝄞", expected: 4, description: "Musical G clef (4 bytes, surrogate pair)" }, + ]; + + for (const { input, expected, description } of testCases) { + it(`counts ${description}: "${input}" = ${expected} bytes`, () => { + const jsResult = utf8CountJs(input); + const result = utf8Count(input); + + assert.strictEqual(jsResult, expected, `JS implementation failed for "${input}"`); + assert.strictEqual(result, expected, `utf8Count failed for "${input}"`); + }); + } + }); + + describe("utf8Encode", () => { + const testCases = [ + { input: "hello", description: "ASCII" }, + { input: "こんにちは", description: "Japanese" }, + { input: "🎉🎊🎁", description: "emojis" }, + { input: "hello🎉world", description: "mixed" }, + { input: "Ω€𝄞", description: "multi-byte chars" }, + { input: "a".repeat(100), description: "100 ASCII chars" }, + { input: "日".repeat(100), description: "100 Japanese chars" }, + ]; + + for (const { input, description } of testCases) { + it(`encodes ${description}`, () => { + const byteLength = utf8Count(input); + const buffer1 = new Uint8Array(byteLength); + const buffer2 = new Uint8Array(byteLength); + + utf8EncodeJs(input, buffer1, 0); + 
utf8Encode(input, buffer2, 0); + + // Compare with TextEncoder as ground truth + const expected = new TextEncoder().encode(input); + assert.deepStrictEqual(buffer1, expected, `JS encode failed for "${description}"`); + assert.deepStrictEqual(buffer2, expected, `utf8Encode failed for "${description}"`); + }); + } + }); + + describe("utf8Decode", () => { + const testCases = [ + { input: "hello", description: "ASCII" }, + { input: "こんにちは", description: "Japanese" }, + { input: "🎉🎊🎁", description: "emojis" }, + { input: "hello🎉world", description: "mixed" }, + { input: "Ω€𝄞", description: "multi-byte chars" }, + { input: "a".repeat(100), description: "100 ASCII chars" }, + { input: "日".repeat(100), description: "100 Japanese chars" }, + ]; + + for (const { input, description } of testCases) { + it(`decodes ${description}`, () => { + const bytes = new TextEncoder().encode(input); + + const jsResult = utf8DecodeJs(bytes, 0, bytes.length); + const result = utf8Decode(bytes, 0, bytes.length); + + assert.strictEqual(jsResult, input, `JS decode failed for "${description}"`); + assert.strictEqual(result, input, `utf8Decode failed for "${description}"`); + }); + } + }); + + describe("round-trip", () => { + const testStrings = [ + "", + "hello", + "Hello, 世界! 🌍", + "The quick brown fox jumps over the lazy dog", + "日本語テスト", + "Emoji: 😀🎉🚀💻🔥", + "\u0000\u0001\u0002", // control characters + "Tab:\tNewline:\n", + "Mixed: ASCII, Ελληνικά, 日本語, العربية, 🎌", + ]; + + for (const str of testStrings) { + it(`round-trips: "${str.slice(0, 30)}${str.length > 30 ? "..." 
: ""}"`, () => { + const byteLength = utf8Count(str); + const buffer = new Uint8Array(byteLength); + utf8Encode(str, buffer, 0); + const decoded = utf8Decode(buffer, 0, byteLength); + + assert.strictEqual(decoded, str); + }); + } + }); + + // Edge case tests for invalid/malformed data + // These tests ensure JS and WASM implementations behave identically + describe("edge cases: lone surrogates", () => { + // Lone high surrogate (0xD800-0xDBFF without following low surrogate) + const loneHighSurrogate = "\uD800"; // U+D800 + const loneHighSurrogateAtEnd = "abc\uD800"; + const loneHighSurrogateFollowedByAscii = "\uD800X"; + const loneHighSurrogateFollowedByHighSurrogate = "\uD800\uD800"; + + // Lone low surrogate (0xDC00-0xDFFF without preceding high surrogate) + const loneLowSurrogate = "\uDC00"; + const loneLowSurrogateAtStart = "\uDC00abc"; + const loneLowSurrogateBetweenAscii = "a\uDC00b"; + + // Mixed valid and invalid surrogates + const validSurrogatePair = "\uD83D\uDE00"; // 😀 + const validThenLoneHigh = "\uD83D\uDE00\uD800"; + const loneLowThenValid = "\uDC00\uD83D\uDE00"; + + const surrogateTestCases = [ + { str: loneHighSurrogate, description: "lone high surrogate" }, + { str: loneHighSurrogateAtEnd, description: "lone high surrogate at end" }, + { str: loneHighSurrogateFollowedByAscii, description: "lone high surrogate followed by ASCII" }, + { str: loneHighSurrogateFollowedByHighSurrogate, description: "two lone high surrogates" }, + { str: loneLowSurrogate, description: "lone low surrogate" }, + { str: loneLowSurrogateAtStart, description: "lone low surrogate at start" }, + { str: loneLowSurrogateBetweenAscii, description: "lone low surrogate between ASCII" }, + { str: validSurrogatePair, description: "valid surrogate pair (emoji)" }, + { str: validThenLoneHigh, description: "valid pair then lone high" }, + { str: loneLowThenValid, description: "lone low then valid pair" }, + ]; + + describe("utf8Count", () => { + for (const { str, description } of 
surrogateTestCases) { + it(`counts ${description} consistently`, () => { + const jsResult = utf8CountJs(str); + + // JS implementation is the reference - lone surrogates should be 3 bytes each + assert.ok(jsResult > 0, `JS count should be positive for "${description}"`); + + if (WASM_AVAILABLE) { + const wasmResult = utf8CountWasm(str); + assert.strictEqual(wasmResult, jsResult, `WASM count should match JS for "${description}"`); + } + }); + } + + it("lone high surrogate counts as 3 bytes", () => { + // A lone high surrogate (0xD800-0xDBFF) should be encoded as 3 bytes + // because it's in the 0x800-0xFFFF range + assert.strictEqual(utf8CountJs("\uD800"), 3); + if (WASM_AVAILABLE) { + assert.strictEqual(utf8CountWasm("\uD800"), 3); + } + }); + + it("lone low surrogate counts as 3 bytes", () => { + assert.strictEqual(utf8CountJs("\uDC00"), 3); + if (WASM_AVAILABLE) { + assert.strictEqual(utf8CountWasm("\uDC00"), 3); + } + }); + + it("valid surrogate pair counts as 4 bytes", () => { + assert.strictEqual(utf8CountJs("\uD83D\uDE00"), 4); // 😀 + if (WASM_AVAILABLE) { + assert.strictEqual(utf8CountWasm("\uD83D\uDE00"), 4); + } + }); + }); + + describe("utf8Encode", () => { + for (const { str, description } of surrogateTestCases) { + it(`encodes ${description} consistently`, () => { + const byteLength = utf8CountJs(str); + const jsBuffer = new Uint8Array(byteLength); + utf8EncodeJs(str, jsBuffer, 0); + + if (WASM_AVAILABLE) { + const wasmBuffer = new Uint8Array(byteLength); + utf8EncodeWasm(str, wasmBuffer, 0); + assert.deepStrictEqual(wasmBuffer, jsBuffer, `WASM encode should match JS for "${description}"`); + } + }); + } + }); + + describe("round-trip with lone surrogates", () => { + for (const { str, description } of surrogateTestCases) { + it(`round-trips ${description}`, () => { + const byteLength = utf8CountJs(str); + const buffer = new Uint8Array(byteLength); + utf8EncodeJs(str, buffer, 0); + const decoded = utf8DecodeJs(buffer, 0, byteLength); + + 
assert.strictEqual(decoded, str, `JS round-trip failed for "${description}"`); + + if (WASM_AVAILABLE) { + const wasmBuffer = new Uint8Array(byteLength); + utf8EncodeWasm(str, wasmBuffer, 0); + const wasmDecoded = utf8DecodeWasm(wasmBuffer, 0, byteLength); + assert.strictEqual(wasmDecoded, str, `WASM round-trip failed for "${description}"`); + } + }); + } + }); + }); + + describe("edge cases: invalid UTF-8 bytes in decode", () => { + // Invalid UTF-8 sequences that don't match any valid pattern + const invalidByteSequences = [ + { + bytes: new Uint8Array([0x80]), // Continuation byte without leading byte + description: "lone continuation byte 0x80", + }, + { + bytes: new Uint8Array([0xBF]), // Continuation byte without leading byte + description: "lone continuation byte 0xBF", + }, + { + bytes: new Uint8Array([0xFE]), // Invalid byte (never valid in UTF-8) + description: "invalid byte 0xFE", + }, + { + bytes: new Uint8Array([0xFF]), // Invalid byte (never valid in UTF-8) + description: "invalid byte 0xFF", + }, + { + bytes: new Uint8Array([0xF8, 0x80, 0x80, 0x80, 0x80]), // 5-byte sequence (invalid) + description: "5-byte sequence (invalid)", + }, + { + bytes: new Uint8Array([0x41, 0x80, 0x42]), // ASCII, invalid, ASCII + description: "invalid byte between ASCII", + }, + { + bytes: new Uint8Array([0xC0, 0x80]), // Overlong encoding of NUL + description: "overlong encoding of NUL", + }, + { + bytes: new Uint8Array([0xE0, 0x80, 0x80]), // Overlong encoding + description: "overlong 3-byte encoding", + }, + ]; + + describe("utf8Decode preserves invalid bytes", () => { + for (const { bytes, description } of invalidByteSequences) { + it(`preserves ${description}`, () => { + const jsResult = utf8DecodeJs(bytes, 0, bytes.length); + + // The JS implementation should preserve invalid bytes as code units + // So the result length should be > 0 + assert.ok(jsResult.length > 0, `JS decode should produce output for "${description}"`); + + if (WASM_AVAILABLE) { + const wasmResult 
= utf8DecodeWasm(bytes, 0, bytes.length); + assert.strictEqual( + wasmResult, + jsResult, + `WASM decode should match JS for "${description}": got "${wasmResult}" vs "${jsResult}"` + ); + } + }); + } + }); + + describe("invalid bytes are not dropped", () => { + it("0x80 byte is preserved, not dropped", () => { + const bytes = new Uint8Array([0x80]); + const jsResult = utf8DecodeJs(bytes, 0, 1); + // Should be a single character with code point 0x80 + assert.strictEqual(jsResult.length, 1); + assert.strictEqual(jsResult.charCodeAt(0), 0x80); + + if (WASM_AVAILABLE) { + const wasmResult = utf8DecodeWasm(bytes, 0, 1); + assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte"); + assert.strictEqual(wasmResult.charCodeAt(0), 0x80); + } + }); + + it("0xFF byte is preserved, not dropped", () => { + const bytes = new Uint8Array([0xFF]); + const jsResult = utf8DecodeJs(bytes, 0, 1); + assert.strictEqual(jsResult.length, 1); + assert.strictEqual(jsResult.charCodeAt(0), 0xFF); + + if (WASM_AVAILABLE) { + const wasmResult = utf8DecodeWasm(bytes, 0, 1); + assert.strictEqual(wasmResult.length, 1, "WASM should not drop the byte"); + assert.strictEqual(wasmResult.charCodeAt(0), 0xFF); + } + }); + + it("invalid bytes between valid UTF-8 are preserved", () => { + // "A" + invalid + "B" + const bytes = new Uint8Array([0x41, 0x80, 0x42]); + const jsResult = utf8DecodeJs(bytes, 0, 3); + + // Should be 3 characters: 'A', char(0x80), 'B' + assert.strictEqual(jsResult.length, 3); + assert.strictEqual(jsResult.charCodeAt(0), 0x41); // 'A' + assert.strictEqual(jsResult.charCodeAt(1), 0x80); // invalid byte preserved + assert.strictEqual(jsResult.charCodeAt(2), 0x42); // 'B' + + if (WASM_AVAILABLE) { + const wasmResult = utf8DecodeWasm(bytes, 0, 3); + assert.strictEqual(wasmResult.length, 3, "WASM should produce 3 chars"); + assert.strictEqual(wasmResult, jsResult, "WASM should match JS"); + } + }); + + it("multiple invalid bytes are all preserved", () => { + const bytes = 
new Uint8Array([0x80, 0x81, 0x82, 0xFE, 0xFF]); + const jsResult = utf8DecodeJs(bytes, 0, 5); + + assert.strictEqual(jsResult.length, 5, "All 5 invalid bytes should produce 5 chars"); + assert.strictEqual(jsResult.charCodeAt(0), 0x80); + assert.strictEqual(jsResult.charCodeAt(1), 0x81); + assert.strictEqual(jsResult.charCodeAt(2), 0x82); + assert.strictEqual(jsResult.charCodeAt(3), 0xFE); + assert.strictEqual(jsResult.charCodeAt(4), 0xFF); + + if (WASM_AVAILABLE) { + const wasmResult = utf8DecodeWasm(bytes, 0, 5); + assert.strictEqual(wasmResult.length, 5, "WASM should produce 5 chars"); + assert.strictEqual(wasmResult, jsResult, "WASM should match JS"); + } + }); + }); + }); +}); diff --git a/tsconfig.json b/tsconfig.json index a4cacbc..db0c53b 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,29 +1,31 @@ { "compilerOptions": { /* Basic Options */ - "target": "ES2020", /* the baseline */ - "module": "CommonJS", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */ - "lib": ["ES2024", "DOM"], /* Specify library files to be included in the compilation. */ + "target": "es2020", /* the baseline */ + "module": "esnext", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */ + "lib": [ + "esnext", + "dom" + ], /* Specify library files to be included in the compilation. */ // "allowJs": true, /* Allow javascript files to be compiled. */ // "checkJs": true, /* Report errors in .js files. */ // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ // "declaration": true, /* Generates corresponding '.d.ts' file. */ // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ - "sourceMap": true, /* Generates corresponding '.map' file. */ + "sourceMap": true, /* Generates corresponding '.map' file. */ // "outFile": "./", /* Concatenate and emit output to single file. 
*/ - "outDir": "./build", /* Redirect output structure to the directory. */ + "outDir": "./build", /* Redirect output structure to the directory. */ // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ // "composite": true, /* Enable project compilation */ - "incremental": true, /* Enable incremental compilation */ + "incremental": true, /* Enable incremental compilation */ // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ // "removeComments": true, /* Do not emit comments to output. */ // "noEmit": true, /* Do not emit outputs. */ - "importHelpers": false, /* Import emit helpers from 'tslib'. */ + "importHelpers": false, /* Import emit helpers from 'tslib'. */ // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ - /* Strict Type-Checking Options */ - "strict": true, /* Enable all strict type-checking options. */ + "strict": true, /* Enable all strict type-checking options. */ // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ // "strictNullChecks": true, /* Enable strict null checks. */ // "strictFunctionTypes": true, /* Enable strict checking of function types. */ @@ -31,21 +33,19 @@ // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ - /* Additional Checks */ // "noUnusedLocals": true, /* Report errors on unused locals. */ // "noUnusedParameters": true, /* Report errors on unused parameters. 
*/ - "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ - "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ + "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ + "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ "noUncheckedIndexedAccess": true, "noPropertyAccessFromIndexSignature": true, "noImplicitOverride": true, "verbatimModuleSyntax": false, "allowImportingTsExtensions": true, "noEmit": true, - /* Module Resolution Options */ - "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ + "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ // "paths": { // "@msgpack/msgpack": ["./src"] // }, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ @@ -53,23 +53,26 @@ // "typeRoots": [], /* List of folders to include type definitions from. */ // "types": [], /* Type declaration files to be included in compilation. */ // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ - "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ + "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. 
*/ "resolveJsonModule": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true - // "erasableSyntaxOnly": true - /* Source Map Options */ // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ - /* Experimental Options */ // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ }, - "exclude": ["example", "benchmark", "test/bun*", "test/deno*", "mod.ts"] + "exclude": [ + "example", + "benchmark", + "test/bun*", + "test/deno*", + "mod.ts" + ] } diff --git a/wasm/.gitignore b/wasm/.gitignore new file mode 100644 index 0000000..44ddaba --- /dev/null +++ b/wasm/.gitignore @@ -0,0 +1,2 @@ +# Generated wasm binary - rebuild with `npm run build:wasm` (wasm/build.mts) +*.wasm diff --git a/wasm/README.md b/wasm/README.md new file mode 100644 index 0000000..b3bc3e2 --- /dev/null +++ b/wasm/README.md @@ -0,0 +1,89 @@ +# WebAssembly UTF-8 String Processing + +## Background + +### Previous Attempt (2019-2020) + +- **PR #26**: Introduced AssemblyScript-based UTF-8 encode/decode +- **PR #95**: Removed it because "Wasm for UTF-8 encode/decode is not much faster than pureJS" + +The main issues were: +1. JS-to-Wasm call overhead negated encoding gains +2. String copying between JS and Wasm memory was expensive +3. 
Maintenance burden wasn't justified by performance gains + +### What Changed in 2025 + +**js-string-builtins** (WebAssembly 3.0) fundamentally changes the equation: + +- Direct import of JS string operations from `wasm:js-string` +- No glue code overhead - operations can be inlined by the engine +- Uses WASM GC arrays with `intoCharCodeArray`/`fromCharCodeArray` for bulk operations + +## Building + +Requires the `binaryen` npm package ([Binaryen](https://github.com/WebAssembly/binaryen)), which `wasm/build.mts` imports: + +```bash +npm run build:wasm +``` + +This compiles `utf8.wat` to `utf8.wasm` and generates `src/utils/utf8-wasm-binary.ts` with the base64-encoded binary. + +## Runtime Requirements + +| Environment | Support | +|-------------|---------| +| Node.js 24+ | Native (V8 13.6+) | +| Node.js 22-23 | `--experimental-wasm-imported-strings` flag | +| Chrome 131+ | Native | +| Firefox 134+ | Native | +| Safari | TBD | +| Older/unsupported | Falls back to pure JS | + +## Architecture + +Three-tier dispatch based on string/byte length: + +| Length | Method | Reason | +|--------|--------|--------| +| ≤ 50 | Pure JS | Lowest call overhead | +| 51-1000 | WASM | Optimal for medium strings | +| > 1000 | TextEncoder/TextDecoder | SIMD-optimized for bulk | + +## Optimization Attempts (2025) + +Several optimization approaches were tested for `utf8Count`: + +### 1. Bulk Array Copy (intoCharCodeArray) + +**Hypothesis**: Replace N `charCodeAt` calls with 1 bulk `intoCharCodeArray` + N array reads. + +**Result**: 17-29% slower. GC array allocation overhead outweighs boundary-crossing savings. + +### 2. codePointAt Instead of charCodeAt + +**Hypothesis**: Simplify surrogate pair handling with `codePointAt`. + +**Result**: Slightly slower. `codePointAt` does more internal work to decode surrogates. + +### 3. SIMD Processing + +**Hypothesis**: Copy to linear memory, then use SIMD to process 8 chars at once. + +**Result**: 23-49% slower. The O(n) copy from GC array to linear memory negates SIMD gains. 
+ +``` +JS String → GC Array (1 call) → Linear Memory (N scalar ops) → SIMD + ↑ + This kills SIMD +``` + +### Conclusion + +The scalar `charCodeAt` loop is already near-optimal. The `js-string-builtins` implementation is highly optimized, making per-character calls very cheap. The 2-3x speedup over pure JS is about as good as it gets with current WASM capabilities. + +## References + +- [js-string-builtins proposal](https://github.com/WebAssembly/js-string-builtins) +- [MDN: WebAssembly JavaScript builtins](https://developer.mozilla.org/en-US/docs/WebAssembly/Guides/JavaScript_builtins) diff --git a/wasm/build.mts b/wasm/build.mts new file mode 100644 index 0000000..e932967 --- /dev/null +++ b/wasm/build.mts @@ -0,0 +1,61 @@ +// Build script for UTF-8 wasm module +// Invoked by `npm run build:wasm` + +/* eslint-disable no-console */ + +import fs from "node:fs"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import binaryen from "binaryen"; +import binaryenMetadata from "binaryen/package.json" with { type: "json" }; + + +const dirname = path.dirname(fileURLToPath(import.meta.url)); + +const watPath = path.join(dirname, "utf8.wat"); +const wasmPath = path.join(dirname, "utf8.wasm"); +const tsOutputPath = path.join(dirname, "..", "src", "utils", "utf8-wasm-binary.ts"); + +console.log(`Compiling utf8.wat -> utf8.wasm with Binaryen v${binaryenMetadata.version}...`); + +// Read WAT source +const watSource = fs.readFileSync(watPath, "utf-8"); + +// Parse WAT to module +const mod = binaryen.parseText(watSource); + +// Enable required features +mod.setFeatures(binaryen.Features.ReferenceTypes | binaryen.Features.GC | binaryen.Features.Strings); + +// Optimize at Binaryen's currently configured optimize/shrink levels. +// NOTE(review): no level is set in this script, so this is presumably the binaryen.js default rather than wasm-opt -O4; call binaryen.setOptimizeLevel(4) and binaryen.setShrinkLevel(0) first if -O4 is intended - confirm. +mod.optimize(); + +// Emit binary +const wasmBinary = mod.emitBinary(); + +// Write wasm file +fs.writeFileSync(wasmPath, wasmBinary); + +console.log("Generating base64 TypeScript module..."); + +// Convert to base64 with line breaks (like base64 -b 78) +const 
base64 = Buffer.from(wasmBinary).toString("base64"); +const base64WithLineBreaks = base64.match(/.{1,78}/g)?.join("\n") ?? base64; + +// Generate TypeScript file +const tsContent = `// Auto-generated by wasm/build.mts - DO NOT EDIT MANUALLY +// Source: wasm/utf8.wat + +export const wasmBinary = \` +${base64WithLineBreaks} +\`; +`; + +fs.writeFileSync(tsOutputPath, tsContent); + +// Clean up +mod.dispose(); + +console.log("Done! Generated:"); +console.log(` - wasm/utf8.wasm (${wasmBinary.length} bytes)`); +console.log(` - src/utils/utf8-wasm-binary.ts (${Buffer.byteLength(tsContent)} bytes)`); diff --git a/wasm/utf8.wat b/wasm/utf8.wat new file mode 100644 index 0000000..c50d4d6 --- /dev/null +++ b/wasm/utf8.wat @@ -0,0 +1,280 @@ +;; UTF-8 string processing using js-string-builtins with GC arrays +;; https://github.com/WebAssembly/js-string-builtins +;; +;; Uses WASM GC arrays with intoCharCodeArray/fromCharCodeArray +;; for efficient bulk string operations. + +(module + ;; GC array type for UTF-16 code units + (type $i16_array (array (mut i16))) + + ;; Import js-string builtins + (import "wasm:js-string" "length" + (func $str_length (param externref) (result i32))) + (import "wasm:js-string" "charCodeAt" + (func $str_charCodeAt (param externref i32) (result i32))) + (import "wasm:js-string" "intoCharCodeArray" + (func $str_into_array (param externref (ref $i16_array) i32) (result i32))) + (import "wasm:js-string" "fromCharCodeArray" + (func $str_from_array (param (ref $i16_array) i32 i32) (result (ref extern)))) + + ;; Linear memory for UTF-8 bytes (64KB initial) + (memory (export "memory") 1) + + ;; Count UTF-8 byte length of a JS string + (func (export "utf8Count") (param $str externref) (result i32) + (local $len i32) + (local $i i32) + (local $byteLen i32) + (local $code i32) + (local $next i32) + + (local.set $len (call $str_length (local.get $str))) + + (block $break + (loop $continue + (br_if $break (i32.ge_u (local.get $i) (local.get $len))) + + (local.set 
$code (call $str_charCodeAt (local.get $str) (local.get $i))) + + (if (i32.lt_u (local.get $code) (i32.const 0x80)) + (then + ;; 1-byte: 0x00-0x7F + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 1)))) + (else + (if (i32.lt_u (local.get $code) (i32.const 0x800)) + (then + ;; 2-byte: 0x80-0x7FF + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 2)))) + (else + (if (i32.and + (i32.ge_u (local.get $code) (i32.const 0xD800)) + (i32.le_u (local.get $code) (i32.const 0xDBFF))) + (then + ;; High surrogate - check if next is valid low surrogate + (if (i32.lt_u (i32.add (local.get $i) (i32.const 1)) (local.get $len)) + (then + (local.set $next (call $str_charCodeAt (local.get $str) (i32.add (local.get $i) (i32.const 1)))) + (if (i32.eq (i32.and (local.get $next) (i32.const 0xFC00)) (i32.const 0xDC00)) + (then + ;; Valid surrogate pair: 4 bytes, skip low surrogate + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 4))) + (local.set $i (i32.add (local.get $i) (i32.const 1)))) + (else + ;; Lone high surrogate: 3 bytes + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 3)))))) + (else + ;; Lone high surrogate at end: 3 bytes + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 3)))))) + (else + ;; 3-byte: 0x800-0xFFFF (includes lone low surrogates) + (local.set $byteLen (i32.add (local.get $byteLen) (i32.const 3))))))))) + + (local.set $i (i32.add (local.get $i) (i32.const 1))) + (br $continue))) + + (local.get $byteLen)) + + ;; Encode JS string to UTF-8 bytes at offset in linear memory + ;; Returns number of bytes written + (func (export "utf8Encode") (param $str externref) (param $offset i32) (result i32) + (local $len i32) + (local $arr (ref $i16_array)) + (local $i i32) + (local $pos i32) + (local $code i32) + (local $code2 i32) + + (local.set $len (call $str_length (local.get $str))) + (local.set $pos (local.get $offset)) + + ;; Bulk copy all char codes into GC array + (local.set $arr (array.new $i16_array 
(i32.const 0) (local.get $len))) + (drop (call $str_into_array (local.get $str) (local.get $arr) (i32.const 0))) + + (block $break + (loop $continue + (br_if $break (i32.ge_u (local.get $i) (local.get $len))) + + (local.set $code (array.get_u $i16_array (local.get $arr) (local.get $i))) + + (if (i32.lt_u (local.get $code) (i32.const 0x80)) + (then + ;; 1-byte: ASCII + (i32.store8 (local.get $pos) (local.get $code)) + (local.set $pos (i32.add (local.get $pos) (i32.const 1)))) + (else + (if (i32.lt_u (local.get $code) (i32.const 0x800)) + (then + ;; 2-byte: 110xxxxx 10xxxxxx + (i32.store8 (local.get $pos) + (i32.or (i32.const 0xC0) (i32.shr_u (local.get $code) (i32.const 6)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 1)) + (i32.or (i32.const 0x80) (i32.and (local.get $code) (i32.const 0x3F)))) + (local.set $pos (i32.add (local.get $pos) (i32.const 2)))) + (else + (if (i32.and + (i32.ge_u (local.get $code) (i32.const 0xD800)) + (i32.le_u (local.get $code) (i32.const 0xDBFF))) + (then + ;; High surrogate - check if next is valid low surrogate + ;; Use nested if to ensure bounds check before array access (short-circuit) + (if (i32.lt_u (i32.add (local.get $i) (i32.const 1)) (local.get $len)) + (then + (local.set $code2 (array.get_u $i16_array (local.get $arr) (i32.add (local.get $i) (i32.const 1)))) + (if (i32.eq (i32.and (local.get $code2) (i32.const 0xFC00)) (i32.const 0xDC00)) + (then + ;; Valid surrogate pair: 4-byte encoding + (local.set $i (i32.add (local.get $i) (i32.const 1))) + ;; Decode: ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 + (local.set $code + (i32.add + (i32.const 0x10000) + (i32.add + (i32.shl + (i32.sub (local.get $code) (i32.const 0xD800)) + (i32.const 10)) + (i32.sub (local.get $code2) (i32.const 0xDC00))))) + ;; 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + (i32.store8 (local.get $pos) + (i32.or (i32.const 0xF0) (i32.shr_u (local.get $code) (i32.const 18)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 1)) + (i32.or (i32.const 
0x80) + (i32.and (i32.shr_u (local.get $code) (i32.const 12)) (i32.const 0x3F)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 2)) + (i32.or (i32.const 0x80) + (i32.and (i32.shr_u (local.get $code) (i32.const 6)) (i32.const 0x3F)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 3)) + (i32.or (i32.const 0x80) (i32.and (local.get $code) (i32.const 0x3F)))) + (local.set $pos (i32.add (local.get $pos) (i32.const 4)))) + (else + ;; Next char exists but not a low surrogate: 3-byte encoding + (i32.store8 (local.get $pos) + (i32.or (i32.const 0xE0) (i32.shr_u (local.get $code) (i32.const 12)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 1)) + (i32.or (i32.const 0x80) + (i32.and (i32.shr_u (local.get $code) (i32.const 6)) (i32.const 0x3F)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 2)) + (i32.or (i32.const 0x80) (i32.and (local.get $code) (i32.const 0x3F)))) + (local.set $pos (i32.add (local.get $pos) (i32.const 3)))))) + (else + ;; Lone high surrogate at end: 3-byte encoding + (i32.store8 (local.get $pos) + (i32.or (i32.const 0xE0) (i32.shr_u (local.get $code) (i32.const 12)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 1)) + (i32.or (i32.const 0x80) + (i32.and (i32.shr_u (local.get $code) (i32.const 6)) (i32.const 0x3F)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 2)) + (i32.or (i32.const 0x80) (i32.and (local.get $code) (i32.const 0x3F)))) + (local.set $pos (i32.add (local.get $pos) (i32.const 3)))))) + (else + ;; 3-byte: 1110xxxx 10xxxxxx 10xxxxxx (includes lone low surrogates) + (i32.store8 (local.get $pos) + (i32.or (i32.const 0xE0) (i32.shr_u (local.get $code) (i32.const 12)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 1)) + (i32.or (i32.const 0x80) + (i32.and (i32.shr_u (local.get $code) (i32.const 6)) (i32.const 0x3F)))) + (i32.store8 (i32.add (local.get $pos) (i32.const 2)) + (i32.or (i32.const 0x80) (i32.and (local.get $code) (i32.const 0x3F)))) + (local.set $pos (i32.add (local.get $pos) (i32.const 
3))))))))) + + (local.set $i (i32.add (local.get $i) (i32.const 1))) + (br $continue))) + + (i32.sub (local.get $pos) (local.get $offset))) + + ;; Decode UTF-8 bytes from linear memory to GC array + ;; Returns number of code units written + (func (export "utf8DecodeToArray") (param $length i32) (param $arr (ref $i16_array)) (result i32) + (local $pos i32) + (local $end i32) + (local $outIdx i32) + (local $b1 i32) + (local $b2 i32) + (local $b3 i32) + (local $b4 i32) + (local $cp i32) + + (local.set $end (local.get $length)) + + (block $break + (loop $continue + (br_if $break (i32.ge_u (local.get $pos) (local.get $end))) + + (local.set $b1 (i32.load8_u (local.get $pos))) + + (if (i32.eqz (i32.and (local.get $b1) (i32.const 0x80))) + (then + ;; 1-byte: 0xxxxxxx + (array.set $i16_array (local.get $arr) (local.get $outIdx) (local.get $b1)) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))) + (local.set $pos (i32.add (local.get $pos) (i32.const 1)))) + (else + (if (i32.eq (i32.and (local.get $b1) (i32.const 0xE0)) (i32.const 0xC0)) + (then + ;; 2-byte: 110xxxxx 10xxxxxx + (local.set $b2 (i32.load8_u (i32.add (local.get $pos) (i32.const 1)))) + (array.set $i16_array (local.get $arr) (local.get $outIdx) + (i32.or + (i32.shl (i32.and (local.get $b1) (i32.const 0x1F)) (i32.const 6)) + (i32.and (local.get $b2) (i32.const 0x3F)))) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))) + (local.set $pos (i32.add (local.get $pos) (i32.const 2)))) + (else + (if (i32.eq (i32.and (local.get $b1) (i32.const 0xF0)) (i32.const 0xE0)) + (then + ;; 3-byte: 1110xxxx 10xxxxxx 10xxxxxx + (local.set $b2 (i32.load8_u (i32.add (local.get $pos) (i32.const 1)))) + (local.set $b3 (i32.load8_u (i32.add (local.get $pos) (i32.const 2)))) + (array.set $i16_array (local.get $arr) (local.get $outIdx) + (i32.or + (i32.or + (i32.shl (i32.and (local.get $b1) (i32.const 0x0F)) (i32.const 12)) + (i32.shl (i32.and (local.get $b2) (i32.const 0x3F)) (i32.const 6))) + (i32.and 
(local.get $b3) (i32.const 0x3F)))) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))) + (local.set $pos (i32.add (local.get $pos) (i32.const 3)))) + (else + (if (i32.eq (i32.and (local.get $b1) (i32.const 0xF8)) (i32.const 0xF0)) + (then + ;; 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + (local.set $b2 (i32.load8_u (i32.add (local.get $pos) (i32.const 1)))) + (local.set $b3 (i32.load8_u (i32.add (local.get $pos) (i32.const 2)))) + (local.set $b4 (i32.load8_u (i32.add (local.get $pos) (i32.const 3)))) + ;; Assemble the raw 21-bit value; 0x10000 is subtracted only if it needs a surrogate pair + (local.set $cp + (i32.or + (i32.or + (i32.or + (i32.shl (i32.and (local.get $b1) (i32.const 0x07)) (i32.const 18)) + (i32.shl (i32.and (local.get $b2) (i32.const 0x3F)) (i32.const 12))) + (i32.shl (i32.and (local.get $b3) (i32.const 0x3F)) (i32.const 6))) + (i32.and (local.get $b4) (i32.const 0x3F)))) + (if (i32.gt_u (local.get $cp) (i32.const 0xFFFF)) + (then + ;; Above the BMP: split into a surrogate pair + (local.set $cp (i32.sub (local.get $cp) (i32.const 0x10000))) + ;; High surrogate, masked to 10 bits so values above U+10FFFF stay within 0xD800-0xDBFF + (array.set $i16_array (local.get $arr) (local.get $outIdx) + (i32.or (i32.const 0xD800) (i32.and (i32.shr_u (local.get $cp) (i32.const 10)) (i32.const 0x3FF)))) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))) + ;; Low surrogate + (array.set $i16_array (local.get $arr) (local.get $outIdx) + (i32.or (i32.const 0xDC00) (i32.and (local.get $cp) (i32.const 0x3FF)))) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1)))) + (else + ;; Overlong 4-byte form (value <= 0xFFFF): subtracting 0x10000 would underflow, so store one code unit + ;; NOTE(review): presumably this is what the JS reference decoder does - confirm against utf8DecodeJs + (array.set $i16_array (local.get $arr) (local.get $outIdx) (local.get $cp)) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))))) + (local.set $pos (i32.add (local.get $pos) (i32.const 4)))) + (else + ;; Invalid byte: preserve as code unit (same as JS) + (array.set $i16_array (local.get $arr) (local.get $outIdx) (local.get $b1)) + (local.set $outIdx (i32.add (local.get $outIdx) (i32.const 1))) + (local.set $pos (i32.add (local.get $pos) (i32.const 1))))))))))) + + (br $continue))) + + (local.get $outIdx)) + + ;; Allocate GC array for UTF-16 code units + (func (export "allocArray") (param $size i32) (result (ref $i16_array)) + (array.new $i16_array (i32.const 0) (local.get $size))) + + ;; Create string from GC array + (func (export "arrayToString") (param $arr (ref $i16_array)) (param $start i32) (param $end i32) 
(result externref) + (call $str_from_array (local.get $arr) (local.get $start) (local.get $end))) +)