diff --git a/CHANGELOG.md b/CHANGELOG.md index 28f6c20..c13731e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## Version 6.4.0 (unreleased) + +- Enhanced `monospaced_width()` to support emoji ZWJ sequences, flags, variation selectors, + skin tones, and many more languages and terminal sequences, like OSC 8 hyperlinks. It no longer + returns -1 for control characters; instead they are parsed (e.g. BACKSPACE, `\b`) for their + horizontal effects, or otherwise ignored (e.g. BEL, `\a`). +- `display_ljust/rjust/center()` now delegate directly to `wcwidth.ljust()/rjust()/center()`, gaining the same improvements. +- Updated `display_center()` to match standard Python `str.center()` "parity-odd" spacing. + ## Version 6.3.1 (October 25, 2024) - Fixed `license` metadata field in pyproject.toml. diff --git a/ftfy/formatting.py b/ftfy/formatting.py index 4295558..4a43853 100644 --- a/ftfy/formatting.py +++ b/ftfy/formatting.py @@ -6,11 +6,7 @@ the 'wcwidth' library. """ -from unicodedata import normalize - -from wcwidth import wcswidth, wcwidth - -from ftfy.fixes import remove_terminal_escapes +import wcwidth def character_width(char: str) -> int: @@ -31,7 +27,7 @@ def character_width(char: str) -> int: >>> character_width('\n') -1 """ - return int(wcwidth(char)) + return wcwidth.wcwidth(char) def monospaced_width(text: str) -> int: @@ -43,16 +39,12 @@ def monospaced_width(text: str) -> int: This can be useful for formatting text that may contain non-spacing characters, or CJK characters that take up two character cells. - Returns -1 if the string contains a non-printable or control character. - >>> monospaced_width('ちゃぶ台返し') 12 >>> len('ちゃぶ台返し') 6 >>> monospaced_width('owl\N{SOFT HYPHEN}flavored') - 11 - >>> monospaced_width('example\x80') - -1 + 12 A more complex example: The Korean word 'ibnida' can be written with 3 pre-composed characters or 7 jamo. Either way, it *looks* the same and @@ -67,13 +59,16 @@ def monospaced_width(text: str) -> int: 4 characters, when shown as intended. 
>>> monospaced_width('\x1b[34mblue\x1b[m') 4 + + Emoji ZWJ sequences are treated as single grapheme clusters with width 2. + >>> monospaced_width('👨‍👩‍👧') + 2 + + Control characters are parsed and treated as zero-width. + >>> monospaced_width('example\x80') + 7 """ - # NFC-normalize the text first, so that we don't need special cases for - # Hangul jamo. - # - # Remove terminal escapes before calculating width, because if they are - # displayed as intended, they will have zero width. - return int(wcswidth(remove_terminal_escapes(normalize("NFC", text)))) + return wcwidth.width(text, control_codes="parse") def display_ljust(text: str, width: int, fillchar: str = " ") -> str: @@ -102,13 +97,7 @@ def display_ljust(text: str, width: int, fillchar: str = " ") -> str: msg = "The padding character must have display width 1" raise ValueError(msg) - text_width = monospaced_width(text) - if text_width == -1: - # There's a control character here, so just don't add padding - return text - - padding = max(0, width - text_width) - return text + fillchar * padding + return wcwidth.ljust(text, width, fillchar=fillchar) def display_rjust(text: str, width: int, fillchar: str = " ") -> str: @@ -133,12 +122,7 @@ def display_rjust(text: str, width: int, fillchar: str = " ") -> str: msg = "The padding character must have display width 1" raise ValueError(msg) - text_width = monospaced_width(text) - if text_width == -1: - return text - - padding = max(0, width - text_width) - return fillchar * padding + text + return wcwidth.rjust(text, width, fillchar=fillchar) def display_center(text: str, width: int, fillchar: str = " ") -> str: @@ -159,11 +143,4 @@ def display_center(text: str, width: int, fillchar: str = " ") -> str: msg = "The padding character must have display width 1" raise ValueError(msg) - text_width = monospaced_width(text) - if text_width == -1: - return text - - padding = max(0, width - text_width) - left_padding = padding // 2 - right_padding = padding - left_padding - 
return fillchar * left_padding + text + fillchar * right_padding + return wcwidth.center(text, width, fillchar=fillchar) diff --git a/pyproject.toml b/pyproject.toml index 130dec2..ad2663b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Fixes mojibake and other problems with Unicode, after the fact" authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net" }] license = { text = "Apache-2.0" } readme = "README.md" -dependencies = ["wcwidth"] +dependencies = ["wcwidth>=0.4"] requires-python = ">=3.9" [project.scripts] diff --git a/tests/test_formatting.py b/tests/test_formatting.py new file mode 100644 index 0000000..a340046 --- /dev/null +++ b/tests/test_formatting.py @@ -0,0 +1,80 @@ +import pytest + +from ftfy.formatting import ( + character_width, + display_center, + display_ljust, + display_rjust, + monospaced_width, +) + + +class TestMonospacedWidth: + def test_ascii_and_cjk(self): + assert monospaced_width("hello") == 5 + assert monospaced_width("中文") == 4 + assert monospaced_width("ちゃぶ台返し") == 12 + assert monospaced_width("Hello 中文 👍") == 13 + + def test_grapheme_clusters(self): + assert monospaced_width("cafe\u0301") == 4 + assert monospaced_width("\u200d") == 0 + assert monospaced_width("👨‍👩‍👧") == 2 + assert monospaced_width("👩🏻‍💻") == 2 + assert monospaced_width("🇨🇦") == 2 + assert monospaced_width("❤️") == 2 + + def test_ansi_escape_sequences(self): + assert monospaced_width("\x1b[31mred\x1b[0m") == 3 + assert monospaced_width("\x1b[34mblue\x1b[m") == 4 + assert monospaced_width("\x1b[31;1mBold Red\x1b[0m") == 8 + + def test_osc8_hyperlinks(self): + assert monospaced_width("\x1b]8;;https://example.com\x07Click here\x1b]8;;\x07") == 10 + assert monospaced_width( + "\x1b]8;;https://example.com\x07\x1b[34mBlue Link\x1b[0m\x1b]8;;\x07" + ) == 9 + + def test_control_characters(self): + assert monospaced_width("example\x80") == 7 + assert monospaced_width("aaa\b\b\bxxx") == 3 + assert monospaced_width("hello\b\bXX") == 5 
+ + +class TestCharacterWidth: + def test_character_widths(self): + assert character_width("A") == 1 + assert character_width("車") == 2 + assert character_width("\N{ZERO WIDTH JOINER}") == 0 + assert character_width("\n") == -1 + + +class TestDisplayJustify: + def test_ljust(self): + assert display_ljust("hello", 10) == "hello " + assert display_ljust("中", 4) == "中 " + assert display_ljust("👍", 4) == "👍 " + assert display_ljust("hello", 3) == "hello" + assert display_ljust("hi", 5, ".") == "hi..." + + def test_rjust(self): + assert display_rjust("hello", 10) == " hello" + assert display_rjust("中", 4) == " 中" + assert display_rjust("👍", 4) == " 👍" + + def test_center(self): + assert display_center("hi", 6) == " hi " + assert display_center("中", 6) == " 中 " + assert display_center("hi", 5) == " hi " + + def test_invalid_fillchar(self): + with pytest.raises(ValueError, match="display width 1"): + display_ljust("hi", 10, "中") + with pytest.raises(ValueError, match="display width 1"): + display_ljust("hi", 10, "\u200d") + with pytest.raises(ValueError, match="display width 1"): + display_rjust("hi", 10, "中") + with pytest.raises(ValueError, match="display width 1"): + display_center("hi", 10, "中") + +