From f542646ec68b48da78ba3d368c72680446dc854f Mon Sep 17 00:00:00 2001 From: Mert Can Altin Date: Fri, 23 Jan 2026 22:40:25 +0300 Subject: [PATCH 1/3] src: improve textEncoder encode performance with simdutf --- src/encoding_binding.cc | 96 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 11 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 7683d205aa6a3e..3a2893460bce94 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -306,31 +306,105 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { // Encode a single string to a UTF-8 Uint8Array (not Buffer). // Used in TextEncoder.prototype.encode. +// +// Optimized using simdutf, adapted from Cloudflare workerd: +// https://github.com/cloudflare/workerd/pull/5448 void BindingData::EncodeUtf8String(const FunctionCallbackInfo& args) { Isolate* isolate = args.GetIsolate(); CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); - Local str = args[0].As(); - size_t length = str->Utf8LengthV2(isolate); + Local source = args[0].As(); - Local ab; - { + // For small strings, use the V8 path + static constexpr int kSmallStringThreshold = 32; + if (source->Length() <= kSmallStringThreshold) { + size_t length = source->Utf8LengthV2(isolate); std::unique_ptr bs = ArrayBuffer::NewBackingStore( isolate, length, BackingStoreInitializationMode::kUninitialized); + CHECK(bs); + source->WriteUtf8V2(isolate, + static_cast(bs->Data()), + bs->MaxByteLength(), + String::WriteFlags::kReplaceInvalidUtf8); + Local ab = ArrayBuffer::New(isolate, std::move(bs)); + args.GetReturnValue().Set(Uint8Array::New(ab, 0, length)); + return; + } + + size_t length = source->Length(); + size_t utf8_length = 0; + bool is_one_byte = source->IsOneByte(); + + if (is_one_byte) { + // One-byte string (Latin1) - copy to buffer first, then process + MaybeStackBuffer latin1_buffer(length); + source->WriteOneByteV2(isolate, 0, length, latin1_buffer.out()); + + auto data = reinterpret_cast(latin1_buffer.out()); + + // Check if it's pure ASCII - if so, we can just copy + simdutf::result result = simdutf::validate_ascii_with_errors(data, length); + if (result.error == simdutf::SUCCESS) { + // Pure ASCII - direct copy + std::unique_ptr bs = ArrayBuffer::NewBackingStore( + isolate, length, BackingStoreInitializationMode::kUninitialized); + CHECK(bs); + memcpy(bs->Data(), data, length); + Local ab = ArrayBuffer::New(isolate, std::move(bs)); + args.GetReturnValue().Set(Uint8Array::New(ab, 0, length)); + return; + } + // Latin1 with non-ASCII characters - need conversion + utf8_length = simdutf::utf8_length_from_latin1(data, length); + std::unique_ptr bs = ArrayBuffer::NewBackingStore( + isolate, utf8_length, BackingStoreInitializationMode::kUninitialized); CHECK(bs); + [[maybe_unused]] size_t written = simdutf::convert_latin1_to_utf8( + data, length, static_cast(bs->Data())); + DCHECK_EQ(written, utf8_length); + Local ab = ArrayBuffer::New(isolate, std::move(bs)); + args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length)); + return; + } + + // Two-byte string (UTF-16) - copy to buffer first + MaybeStackBuffer utf16_buffer(length); + source->WriteV2(isolate, 0, length, utf16_buffer.out()); - // We are certain that `data` is sufficiently large - str->WriteUtf8V2(isolate, - static_cast(bs->Data()), - bs->MaxByteLength(), - String::WriteFlags::kReplaceInvalidUtf8); + auto data = reinterpret_cast(utf16_buffer.out()); - ab = ArrayBuffer::New(isolate, std::move(bs)); + // Check for unpaired surrogates + simdutf::result validation_result = + simdutf::validate_utf16_with_errors(data, length); + + if (validation_result.error == simdutf::SUCCESS) { + // Valid UTF-16 - use the fast path + utf8_length = simdutf::utf8_length_from_utf16(data, length); + std::unique_ptr bs = ArrayBuffer::NewBackingStore( + isolate, utf8_length, BackingStoreInitializationMode::kUninitialized); + CHECK(bs); + [[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8( + data, length, static_cast(bs->Data())); + DCHECK_EQ(written, utf8_length); + Local ab = ArrayBuffer::New(isolate, std::move(bs)); + args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length)); + return; } - args.GetReturnValue().Set(Uint8Array::New(ab, 0, length)); + // Invalid UTF-16 with unpaired surrogates - convert to well-formed in place + simdutf::to_well_formed_utf16(data, length, data); + + utf8_length = simdutf::utf8_length_from_utf16(data, length); + std::unique_ptr bs = ArrayBuffer::NewBackingStore( + isolate, utf8_length, BackingStoreInitializationMode::kUninitialized); + CHECK(bs); + [[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8( + data, length, static_cast(bs->Data())); + DCHECK_EQ(written, utf8_length); + Local ab = ArrayBuffer::New(isolate, std::move(bs)); + args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length)); } // Convert the input into an encoded string From 8a514d666c52e684b2e2a0924a9e42fe968fd09f Mon Sep 17 00:00:00 2001 From: Mert Can Altin Date: Fri, 23 Jan 2026 22:44:41 +0300 Subject: [PATCH 2/3] remove comment line --- src/encoding_binding.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 3a2893460bce94..a0dc65b76726a4 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -307,8 +307,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { // Encode a single string to a UTF-8 Uint8Array (not Buffer). // Used in TextEncoder.prototype.encode. // -// Optimized using simdutf, adapted from Cloudflare workerd: -// https://github.com/cloudflare/workerd/pull/5448 + void BindingData::EncodeUtf8String(const FunctionCallbackInfo& args) { Isolate* isolate = args.GetIsolate(); CHECK_GE(args.Length(), 1); From 0c17bf16ee65ad339ecc7a3b74a67e82574b59c0 Mon Sep 17 00:00:00 2001 From: Mert Can Altin Date: Fri, 23 Jan 2026 22:45:32 +0300 Subject: [PATCH 3/3] remove comment line --- src/encoding_binding.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index a0dc65b76726a4..047e8162f5e9ae 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -306,8 +306,6 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { // Encode a single string to a UTF-8 Uint8Array (not Buffer). // Used in TextEncoder.prototype.encode. -// - void BindingData::EncodeUtf8String(const FunctionCallbackInfo& args) { Isolate* isolate = args.GetIsolate(); CHECK_GE(args.Length(), 1);