From d40b459fcf87ebdfdfc82c86d9bad2fe8c60e47d Mon Sep 17 00:00:00 2001
From: Mateusz Sterczewski <mateusz@cycode.com>
Date: Wed, 21 Jan 2026 14:46:12 +0100
Subject: [PATCH] CM-57848-Fix UTF encoding for Windows characters

---
 .../files_collector/models/in_memory_zip.py   |  6 +++-
 .../cli/files_collector/test_in_memory_zip.py | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 tests/cli/files_collector/test_in_memory_zip.py

diff --git a/cycode/cli/files_collector/models/in_memory_zip.py b/cycode/cli/files_collector/models/in_memory_zip.py
index 93ac4ac7..8bb9bf9e 100644
--- a/cycode/cli/files_collector/models/in_memory_zip.py
+++ b/cycode/cli/files_collector/models/in_memory_zip.py
@@ -26,7 +26,11 @@ def append(self, filename: str, unique_id: Optional[str], content: str) -> None:
         if unique_id:
             filename = concat_unique_id(filename, unique_id)
 
-        self.zip.writestr(filename, content)
+        # Encode content to bytes explicitly so that lone surrogate characters, which
+        # cannot be encoded to UTF-8, do not raise UnicodeEncodeError. With 'replace',
+        # each unencodable character is written as '?' (U+FFFD is used only when decoding).
+        content_bytes = content.encode('utf-8', errors='replace')
+        self.zip.writestr(filename, content_bytes)
 
     def close(self) -> None:
         self.zip.close()
diff --git a/tests/cli/files_collector/test_in_memory_zip.py b/tests/cli/files_collector/test_in_memory_zip.py
new file mode 100644
index 00000000..d1790c7c
--- /dev/null
+++ b/tests/cli/files_collector/test_in_memory_zip.py
@@ -0,0 +1,29 @@
+"""Tests for InMemoryZip class, specifically for handling surrogate characters and encoding issues."""
+
+import zipfile
+from io import BytesIO
+
+from cycode.cli.files_collector.models.in_memory_zip import InMemoryZip
+
+
+def test_append_with_surrogate_characters() -> None:
+    """Test that lone surrogates are replaced with '?' instead of raising encoding errors."""
+    # Lone surrogates (U+D800 to U+DFFF) cannot be encoded to UTF-8 directly
+    zip_file = InMemoryZip()
+    content = 'Normal text \udc96 more text'
+
+    # Should not raise UnicodeEncodeError
+    zip_file.append('test.txt', None, content)
+    zip_file.close()
+
+    # Verify the ZIP was created successfully
+    zip_data = zip_file.read()
+    assert len(zip_data) > 0
+
+    # Verify we can read it back and the surrogate was replaced
+    with zipfile.ZipFile(BytesIO(zip_data), 'r') as zf:
+        # Strict decoding doubles as a check that the stored bytes are valid UTF-8.
+        extracted = zf.read('test.txt').decode('utf-8')
+        # On encoding, errors='replace' substitutes '?' (U+FFFD is used only when decoding),
+        # so compare the full text; a bare "'\udc96' not in extracted" check can never fail.
+        assert extracted == 'Normal text ? more text'