From d40b459fcf87ebdfdfc82c86d9bad2fe8c60e47d Mon Sep 17 00:00:00 2001 From: Mateusz Sterczewski Date: Wed, 21 Jan 2026 14:46:12 +0100 Subject: [PATCH] CM-57848-Fix UTF encoding for Windows characters --- .../files_collector/models/in_memory_zip.py | 6 +++- .../cli/files_collector/test_in_memory_zip.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 tests/cli/files_collector/test_in_memory_zip.py diff --git a/cycode/cli/files_collector/models/in_memory_zip.py b/cycode/cli/files_collector/models/in_memory_zip.py index 93ac4ac7..8bb9bf9e 100644 --- a/cycode/cli/files_collector/models/in_memory_zip.py +++ b/cycode/cli/files_collector/models/in_memory_zip.py @@ -26,7 +26,11 @@ def append(self, filename: str, unique_id: Optional[str], content: str) -> None: if unique_id: filename = concat_unique_id(filename, unique_id) - self.zip.writestr(filename, content) + # Encode explicitly with error handling so that lone surrogate characters, + # which cannot be encoded to UTF-8, do not raise. With errors='replace', each + # unencodable character becomes '?' (U+FFFD is only used when decoding). 
+ content_bytes = content.encode('utf-8', errors='replace') + self.zip.writestr(filename, content_bytes) def close(self) -> None: self.zip.close() diff --git a/tests/cli/files_collector/test_in_memory_zip.py b/tests/cli/files_collector/test_in_memory_zip.py new file mode 100644 index 00000000..d1790c7c --- /dev/null +++ b/tests/cli/files_collector/test_in_memory_zip.py @@ -0,0 +1,29 @@ +"""Tests for InMemoryZip class, specifically for handling surrogate characters and encoding issues.""" + +import zipfile +from io import BytesIO + +from cycode.cli.files_collector.models.in_memory_zip import InMemoryZip + + +def test_append_with_surrogate_characters() -> None: + """Test that surrogate characters are handled gracefully without raising encoding errors.""" + # Surrogate characters (U+D800 to U+DFFF) cannot be encoded to UTF-8 directly + zip_file = InMemoryZip() + content = 'Normal text \udc96 more text' + + # Should not raise UnicodeEncodeError + zip_file.append('test.txt', None, content) + zip_file.close() + + # Verify the ZIP was created successfully + zip_data = zip_file.read() + assert len(zip_data) > 0 + + # Verify we can read it back and the surrogate was replaced + with zipfile.ZipFile(BytesIO(zip_data), 'r') as zf: + extracted = zf.read('test.txt').decode('utf-8') + assert 'Normal text' in extracted + assert 'more text' in extracted + # The surrogate must be gone: errors='replace' substitutes '?' when encoding + assert '\udc96' not in extracted