fix(core): handle multibyte UTF-8 characters in socket message consumption (#34151)

Chanki-Min · web-flow · commit e35dcd205094 · 2026-02-01T22:09:53.000-05:00
## Current Behavior

When a socket data chunk boundary falls in the middle of a multibyte
UTF-8 character (e.g., CJK text such as Korean, Chinese, or Japanese),
each chunk is decoded on its own with `Buffer.toString()`, which turns
the incomplete byte sequences into replacement characters (�) and
corrupts the message.

This can occur when:
- File paths contain non-ASCII characters
- Project names include multibyte characters
- Any JSON message contains international text

## Expected Behavior

Multibyte UTF-8 characters should be properly decoded even when split
across multiple socket data chunks. The fix uses Node.js's
`StringDecoder`, which buffers an incomplete multibyte sequence until
the remaining bytes arrive.

## Related Issue(s)

Fixes socket message corruption for paths/names containing multibyte
characters.
diff --git a/packages/nx/src/utils/consume-messages-from-socket.spec.ts b/packages/nx/src/utils/consume-messages-from-socket.spec.ts
@@ -46,4 +46,22 @@ describe('consumeMessagesFromSocket', () => {
 
     expect(messages).toEqual([{ one: 1 }, { two: 2 }, { three: 3 }]);
   });
+
+  it('should handle multibyte UTF-8 characters split across chunks', () => {
+    const messages = [] as any[];
+    const r = consumeMessagesFromSocket((message) =>
+      messages.push(JSON.parse(message))
+    );
+
+    // "한글테스트" path included in JSON
+    const json = JSON.stringify({ path: '/test/한글테스트.tsx' });
+    const buffer = Buffer.from(json + MESSAGE_END_SEQ, 'utf8');
+
+    // Split at the midpoint (byte 23 of 47), which falls inside the 3-byte UTF-8 encoding of "테"
+    const mid = Math.floor(buffer.length / 2);
+    r(buffer.subarray(0, mid));
+    r(buffer.subarray(mid));
+
+    expect(messages).toEqual([{ path: '/test/한글테스트.tsx' }]);
+  });
 });
diff --git a/packages/nx/src/utils/consume-messages-from-socket.ts b/packages/nx/src/utils/consume-messages-from-socket.ts
@@ -1,11 +1,14 @@
+import { StringDecoder } from 'string_decoder';
+
 const VERY_END_CODE = 4;
 export const MESSAGE_END_SEQ =
   'NX_MSG_END' + String.fromCharCode(VERY_END_CODE);
 
 export function consumeMessagesFromSocket(callback: (message: string) => void) {
   let message = '';
+  const decoder = new StringDecoder('utf8');
   return (data) => {
-    const chunk = data.toString();
+    const chunk = decoder.write(data);
     message += chunk;
 
     // Check if accumulated message ends with MESSAGE_END_SEQ (not just the chunk)