From 669f98db504af8fcb710c5da91e44d32cff1696f Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 12:23:25 -0400 Subject: [PATCH 01/13] fix: use UTF-16 offsets for Text operations (fixes #308) Set OffsetKind::Utf16 on yrs Doc so the wire format uses UTF-16 code unit offsets, matching JS yjs. Without this, pycrdt uses UTF-8 byte offsets, causing findIndexSS "Unexpected case" crashes when JS yjs clients apply incremental updates containing multi-byte characters. In the Python wrapper, convert character (code point) indices to UTF-16 code unit indices before passing to yrs. This ensures Text.insert(), __setitem__, __delitem__, and format() all work correctly with emoji and other non-BMP characters. Fixes: #308 Related: jupyter-ai-contrib/jupyter-server-documents#197 --- python/pycrdt/_text.py | 77 ++++++++++++++++++++++++++++++++++-------- src/doc.rs | 8 ++++- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 49356c0..418bab3 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -11,6 +11,36 @@ from ._doc import Doc +def _char_to_utf16(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-16 code unit index. + + Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 + UTF-16 code units but only 1 Python character. The underlying yrs library + uses UTF-16 offsets, so all indices passed to it must be converted. + + For pure-ASCII / BMP text this is a no-op (returns ``char_index`` + unchanged). 
+ """ + if char_index == 0: + return 0 + prefix = text[:char_index] + # Count characters that need a surrogate pair (code point > 0xFFFF) + extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF) + return char_index + extra + + +def _utf16_to_char(text: str, utf16_index: int) -> int: + """Convert a UTF-16 code unit index back to a Python character index.""" + char_idx = 0 + utf16_idx = 0 + for ch in text: + if utf16_idx >= utf16_index: + break + utf16_idx += 2 if ord(ch) > 0xFFFF else 1 + char_idx += 1 + return char_idx + + class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -89,10 +119,10 @@ def __len__(self) -> int: ``` Returns: - The length of the text. + The length of the text (in Python characters, not UTF-16 code units). """ - with self.doc.transaction() as txn: - return self.integrated.len(txn._txn) + # Return Python character count, not yrs UTF-16 code unit count + return len(str(self)) def __str__(self) -> str: """ @@ -169,13 +199,19 @@ def __delitem__(self, key: int | slice) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) if isinstance(key, int): - self.integrated.remove_range(txn._txn, key, 1) + utf16_idx = _char_to_utf16(current, key) + char_at = current[key] + utf16_len = 2 if ord(char_at) > 0xFFFF else 1 + self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) elif isinstance(key, slice): start, stop = self._check_slice(key) length = stop - start if length > 0: - self.integrated.remove_range(txn._txn, start, length) + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start) else: raise RuntimeError(f"Index not supported: {key}") @@ -214,20 +250,26 @@ def __setitem__(self, key: int | slice, value: str) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) if 
isinstance(key, int): value_len = len(value) if value_len != 1: raise RuntimeError( f"Single item assigned value must have a length of 1, not {value_len}" ) - del self[key] - self.integrated.insert(txn._txn, key, value) + utf16_idx = _char_to_utf16(current, key) + char_at = current[key] + utf16_len = 2 if ord(char_at) > 0xFFFF else 1 + self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) + self.integrated.insert(txn._txn, utf16_idx, value) elif isinstance(key, slice): start, stop = self._check_slice(key) - length = stop - start + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + length = utf16_stop - utf16_start if length > 0: - self.integrated.remove_range(txn._txn, start, length) - self.integrated.insert(txn._txn, start, value) + self.integrated.remove_range(txn._txn, utf16_start, length) + self.integrated.insert(txn._txn, utf16_start, value) else: raise RuntimeError(f"Index not supported: {key}") @@ -251,8 +293,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) -> """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + utf16_index = _char_to_utf16(current, index) self.integrated.insert( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None ) def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None: @@ -266,8 +310,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + utf16_index = _char_to_utf16(current, index) self.integrated.insert_embed( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None ) def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: 
@@ -282,9 +328,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) start, stop = self._check_slice(slice(start, stop)) - length = stop - start + current = str(self) + utf16_start = _char_to_utf16(current, start) + utf16_stop = _char_to_utf16(current, stop) + length = utf16_stop - utf16_start if length > 0: - self.integrated.format(txn._txn, start, length, iter(attrs.items())) + self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items())) def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: """ diff --git a/src/doc.rs b/src/doc.rs index 61109cc..27e875d 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList}; use yrs::{ - Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn + Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn }; use yrs::updates::encoder::{Encode, Encoder}; use yrs::updates::decoder::Decode; @@ -32,6 +32,7 @@ impl Doc { let mut options = yrs::Options::default(); options.client_id = original.doc.client_id(); options.skip_gc = original.doc.skip_gc(); + options.offset_kind = OffsetKind::Utf16; if let Some(collection_id) = original.doc.collection_id() { options.collection_id = Some(collection_id); } @@ -84,6 +85,11 @@ impl Doc { .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?; options.skip_gc = _skip_gc; } + // Use UTF-16 offsets for compatibility with JS yjs clients. + // Without this, pycrdt uses UTF-8 byte offsets which causes + // findIndexSS crashes when JS yjs applies incremental updates + // containing multi-byte characters. 
+ options.offset_kind = OffsetKind::Utf16; let doc = _Doc::with_options(options); Ok(Doc { doc }) } From f3a2e7e596fd2d7bf2b35ea4fdbf97eb7b5eb484 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:44:42 -0400 Subject: [PATCH 02/13] test: add 12 Unicode/emoji tests for Text operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cover insert, delete, setitem, slice, len, and cross-doc sync with: - emoji (surrogate pairs: 📊 🎉) - CJK (BMP: 价格 世界 特征工程) - Cyrillic (мир) - supplementary plane (𝒜 𠀀) - mixed scripts in one text These all fail on stock pycrdt 0.12.50 and pass with the OffsetKind::Utf16 fix. --- tests/test_text.py | 153 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index ba913a0..ab4e507 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -228,6 +228,159 @@ def test_sticky_index(serialize: str): assert text1[new_idx] == "*" +def test_unicode_emoji_insert(): + """Text.insert() after emoji characters should use character positions, not byte offsets.""" + doc = Doc() + doc["text"] = text = Text() + + text += "A📊B" + assert str(text) == "A📊B" + assert len(text) == 3 + + # Insert at position 2 = between 📊 and B + text.insert(2, "X") + assert str(text) == "A📊XB", f"Got {str(text)!r}, emoji insert position is wrong" + + +def test_unicode_emoji_sequential_inserts(): + """Sequential inserts after emoji should maintain correct positions.""" + doc = Doc() + doc["text"] = text = Text() + + text += "# Analysis 📊\n" + text.insert(len(text), "model = fit()\n") + text.insert(len(text), "# 特征工程\n") + text.insert(len(text), 'print("done")\n') + + expected = '# Analysis 📊\nmodel = fit()\n# 特征工程\nprint("done")\n' + assert str(text) == expected, f"Got {str(text)!r}" + + +def test_unicode_emoji_len(): + """len() should return Python character count, not byte count.""" + doc = 
Doc() + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + assert len(text) == 3 # 3 chars, not 6 bytes or 4 UTF-16 code units + + text += "πŸŽ‰" + assert len(text) == 4 + + +def test_unicode_emoji_delete(): + """Deleting a character after an emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBC") + + del text[2] # delete B (after emoji) + assert str(text) == "AπŸ“ŠC", f"Got {str(text)!r}" + + +def test_unicode_emoji_delete_emoji(): + """Deleting an emoji character itself should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠB") + + del text[1] # delete πŸ“Š + assert str(text) == "AB", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_delete(): + """Slice deletion across emoji boundaries should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + del text[1:4] # delete πŸ“ŠBπŸŽ‰ + assert str(text) == "AC", f"Got {str(text)!r}" + + +def test_unicode_emoji_setitem(): + """Replacing a character after an emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBC") + + text[2] = "X" # replace B (after emoji) + assert str(text) == "AπŸ“ŠXC", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_setitem(): + """Slice replacement spanning emoji should work correctly.""" + doc = Doc() + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + text[1:4] = "XYZ" # replace πŸ“ŠBπŸŽ‰ with XYZ + assert str(text) == "AXYZC", f"Got {str(text)!r}" + + +def test_unicode_cjk(): + """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly.""" + doc = Doc() + doc["text"] = text = Text() + + text += "δ»·ζ Ό" + text.insert(2, "X") + assert str(text) == "δ»·ζ ΌX", f"Got {str(text)!r}" + assert len(text) == 3 + + +def test_unicode_mixed_scripts(): + """Mixed ASCII, CJK, Cyrillic, and emoji in one text.""" + doc = Doc() + doc["text"] = text = Text() + + text += "Hello" + text.insert(5, " δΈ–η•Œ") + text.insert(8, " πŸ“Š") + text.insert(11, " ΠΌΠΈΡ€") + text.insert(15, "!") + + expected 
= "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!" + assert str(text) == expected, f"Got {str(text)!r}" + assert len(text) == 15 + + +def test_unicode_supplementary_plane(): + """Characters outside BMP (require UTF-16 surrogate pairs).""" + doc = Doc() + doc["text"] = text = Text() + + # π’œ (U+1D49C) = Mathematical Script Capital A + # π €€ (U+20000) = CJK Unified Ideograph Extension B + text += "Aπ’œBπ €€C" + assert len(text) == 5 + + text.insert(2, "X") # between π’œ and B + assert str(text) == "Aπ’œXBπ €€C", f"Got {str(text)!r}" + + text.insert(5, "Y") # between π €€ and C + assert str(text) == "Aπ’œXBπ €€YC", f"Got {str(text)!r}" + + +def test_unicode_cross_doc_sync(): + """Updates with Unicode content should sync correctly between two pycrdt docs.""" + doc1 = Doc() + doc1["text"] = text1 = Text() + + # Capture updates from doc1 + updates = [] + doc1.observe(lambda event: updates.append(event.update)) + + text1 += "# Analysis πŸ“Š\n" + text1.insert(len(text1), "model = fit()\n") + text1.insert(len(text1), "# 特征ε·₯程\n") + + # Apply to doc2 + doc2 = Doc() + doc2["text"] = Text() + for update in updates: + doc2.apply_update(update) + + assert str(doc2["text"]) == str(text1), ( + f"Docs diverged: doc1={str(text1)!r} doc2={str(doc2['text'])!r}" + ) + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 8264d95de14fb00435d338b76111dc81ab12b97a Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:48:36 -0400 Subject: [PATCH 03/13] test: add granular diff tests from jupyter_ydoc#370 11 parametrized test cases adapted from jupyter-server/jupyter_ydoc#370 covering emoji swaps, flags, ZWJ family sequences, combining marks, keycap sequences, RTL/LTR text, Japanese, and math operators. These exercise Text insert/delete/replace via SequenceMatcher-based diffing (the same pattern jupyter_ydoc.YUnicode.set() uses). 
--- tests/test_text.py | 116 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index ab4e507..dd77df2 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -381,6 +381,122 @@ def test_unicode_cross_doc_sync(): ) +# Test cases adapted from jupyter-server/jupyter_ydoc#370 (prior art for +# the workaround at the jupyter_ydoc layer). These exercise pycrdt's Text +# operations directly with the same Unicode edge cases. Each test sets +# initial content, then applies a granular edit (using SequenceMatcher on +# byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies +# the result is correct. +from difflib import SequenceMatcher + + +def _apply_diff(text, old_value, new_value): + """Apply a granular diff from old_value to new_value using character-level + SequenceMatcher. With the UTF-16 offset fix, pycrdt Text indices are + character-based, so we diff on characters (not bytes).""" + matcher = SequenceMatcher(a=old_value, b=new_value) + + offset = 0 + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "replace": + text[i1 + offset : i2 + offset] = new_value[j1:j2] + offset += (j2 - j1) - (i2 - i1) + elif tag == "delete": + del text[i1 + offset : i2 + offset] + offset -= i2 - i1 + elif tag == "insert": + text.insert(i1 + offset, new_value[j1:j2]) + offset += j2 - j1 + + +@pytest.mark.parametrize( + "initial, updated", + [ + # emojis swapped + ( + "I like security 🎨 but I really love painting πŸ”’", + "I like security πŸ”’ but I really love painting 🎨", + ), + # text changes, emojis stay in place + ( + "Here is a rocket: ⭐ and a star: πŸš€", + "Here is a star: ⭐ and a rocket: πŸš€", + ), + # change of text and emojis + ( + "Here are some happy faces: πŸ˜€πŸ˜πŸ˜‚", + "Here are some sad faces: 😞😒😭", + ), + # change of characters with combining marks + ( + "Combining characters: Γ‘ Γ© Γ­ Γ³ ΓΊ", + "Combining characters: ΓΊ Γ³ Γ­ Γ© Γ‘", + ), + # flags 
(regional indicator sequences) + ( + "Flags: πŸ‡ΊπŸ‡ΈπŸ‡¬πŸ‡§πŸ‡¨πŸ‡¦", + "Flags: πŸ‡¨πŸ‡¦πŸ‡¬πŸ‡§πŸ‡ΊπŸ‡Έ", + ), + # Zero-width joiner sequences (family emoji) + ( + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§\u200dπŸ‘¦ (with two children)", + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§ (with one child)", + ), + # Mixed RTL/LTR text + ( + "Hello Χ©ΧœΧ•Χ world", + "Hello Χ’Χ•ΧœΧ world", + ), + # Keycap sequences + ( + "Numbers: 1️⃣2️⃣3️⃣", + "Numbers: 3️⃣2️⃣1️⃣", + ), + # Emoji at boundaries + ( + "πŸ‘‹ middle text πŸŽ‰", + "πŸŽ‰ middle text πŸ‘‹", + ), + # Japanese characters + ( + "γ“γ‚“γ«γ‘γ―δΈ–η•Œ", + "γ“γ‚“γ«γ‘γ―εœ°ηƒ", + ), + # Julia math operators + ( + "x ∈ [1, 2, 3] && y β‰₯ 0", + "x βˆ‰ [1, 2, 3] || y ≀ 0", + ), + ], + ids=[ + "emoji_swap", + "text_change_emoji_stay", + "emoji_change", + "combining_marks", + "flags", + "zwj_family", + "rtl_ltr", + "keycap", + "emoji_boundaries", + "japanese", + "math_operators", + ], +) +def test_unicode_granular_diff(initial, updated): + """Granular text edits with multi-byte Unicode should produce correct results. + + Test cases adapted from jupyter-server/jupyter_ydoc#370. 
+ """ + doc = Doc() + doc["text"] = text = Text() + + text += initial + assert str(text) == initial + + _apply_diff(text, initial, updated) + assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 7cf5af44834ba1ae4f108ea99ed668b90530b723 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 10 Apr 2026 14:52:11 -0400 Subject: [PATCH 04/13] fix: move SequenceMatcher import to top of file (ruff E402) --- tests/test_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index dd77df2..5efdc1f 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,3 +1,5 @@ +from difflib import SequenceMatcher + import pytest from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus @@ -387,7 +389,6 @@ def test_unicode_cross_doc_sync(): # initial content, then applies a granular edit (using SequenceMatcher on # byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies # the result is correct. -from difflib import SequenceMatcher def _apply_diff(text, old_value, new_value): From b1ed6ae56be03f2555f4fc99527143b9a49bf526 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Mon, 13 Apr 2026 20:06:41 -0400 Subject: [PATCH 05/13] test: add tests for _utf16_to_char helper Addresses review feedback from @davidbrochart. Tests cover ASCII (identity), BMP characters, supplementary plane (emoji), multiple emoji, and roundtrip with _char_to_utf16. 
--- tests/test_text.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index 5efdc1f..d67f52e 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,6 +4,7 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text +from pycrdt._text import _char_to_utf16, _utf16_to_char pytestmark = pytest.mark.anyio @@ -498,6 +499,62 @@ def test_unicode_granular_diff(initial, updated): assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" +def test_utf16_to_char_ascii(): + """_utf16_to_char is identity for pure ASCII text.""" + text = "Hello, World!" + for i in range(len(text) + 1): + assert _utf16_to_char(text, i) == i + + +def test_utf16_to_char_bmp(): + """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each.""" + text = "δ»·ζ Όεˆ†ζž" # 4 BMP CJK chars = 4 UTF-16 code units + assert _utf16_to_char(text, 0) == 0 + assert _utf16_to_char(text, 1) == 1 + assert _utf16_to_char(text, 2) == 2 + assert _utf16_to_char(text, 4) == 4 + + +def test_utf16_to_char_supplementary(): + """Supplementary plane chars (emoji) take 2 UTF-16 code units.""" + text = "AπŸ“ŠB" # UTF-16: A(1) πŸ“Š(2) B(1) = 4 code units, 3 chars + assert _utf16_to_char(text, 0) == 0 # before A + assert _utf16_to_char(text, 1) == 1 # before πŸ“Š + assert _utf16_to_char(text, 3) == 2 # before B (1 + 2 = 3) + assert _utf16_to_char(text, 4) == 3 # end + + +def test_utf16_to_char_multiple_emoji(): + """Multiple supplementary plane characters.""" + text = "AπŸ“ŠBπŸŽ‰C" # UTF-16: A(1) πŸ“Š(2) B(1) πŸŽ‰(2) C(1) = 7 units, 5 chars + assert _utf16_to_char(text, 0) == 0 # before A + assert _utf16_to_char(text, 1) == 1 # before πŸ“Š + assert _utf16_to_char(text, 3) == 2 # before B + assert _utf16_to_char(text, 4) == 3 # before πŸŽ‰ + assert _utf16_to_char(text, 6) == 4 # before C + assert _utf16_to_char(text, 
7) == 5 # end + + +def test_utf16_to_char_roundtrip(): + """_char_to_utf16 and _utf16_to_char are inverses.""" + texts = [ + "Hello", + "A📊B", + "价格分析", + "# Analysis 📊\n", + "A𝒜B𠀀C", + "Hello 世界 📊 мир!", + "🎉📊🔒", + ] + for text in texts: + for char_idx in range(len(text) + 1): + utf16_idx = _char_to_utf16(text, char_idx) + assert _utf16_to_char(text, utf16_idx) == char_idx, ( + f"Roundtrip failed for {text!r} at char_idx={char_idx}: " + f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}" + ) + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From f48610b1a49c2eb312aeb4f4e4e38c76d9489488 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 17 Apr 2026 00:05:54 -0400 Subject: [PATCH 06/13] fix: convert UTF-16 offset in Text.__iadd__ and drop unused _utf16_to_char MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Text.__iadd__ passed len(self) (Python character count) to the yrs insert, but yrs expects a UTF-16 code unit index — so `t += "X"` after an emoji landed inside the surrogate pair. Convert the index through _char_to_utf16, matching every other mutating method. Also removes _utf16_to_char and its tests, which had no callers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/_text.py | 16 ++-------- tests/test_text.py | 69 ++++++++---------------------------------- 2 files changed, 15 insertions(+), 70 deletions(-) diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 418bab3..069da86 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -29,18 +29,6 @@ def _char_to_utf16(text: str, char_index: int) -> int: return char_index + extra -def _utf16_to_char(text: str, utf16_index: int) -> int: - """Convert a UTF-16 code unit index back to a Python character index.""" - char_idx = 0 - utf16_idx = 0 - for ch in text: - if utf16_idx >= utf16_index: - break - utf16_idx += 2 if ord(ch) > 0xFFFF else 1 - char_idx += 1 - return char_idx - - class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -158,7 +146,9 @@ def __iadd__(self, value: str) -> Text: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) - self.integrated.insert(txn._txn, len(self), value) + current = str(self) + utf16_index = _char_to_utf16(current, len(current)) + self.integrated.insert(txn._txn, utf16_index, value) return self def _check_slice(self, key: slice) -> tuple[int, int]: diff --git a/tests/test_text.py b/tests/test_text.py index d67f52e..fc06f0a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,7 +4,7 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text -from pycrdt._text import _char_to_utf16, _utf16_to_char +from pycrdt._text import _char_to_utf16 pytestmark = pytest.mark.anyio @@ -259,6 +259,17 @@ def test_unicode_emoji_sequential_inserts(): assert str(text) == expected, f"Got {str(text)!r}" +def test_unicode_emoji_iadd(): + """`+=` after emoji should append at the end (regression for UTF-16 offset bug).""" + doc = Doc() + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + text += 
"X" + + assert str(text) == "AπŸ“ŠBX" + + def test_unicode_emoji_len(): """len() should return Python character count, not byte count.""" doc = Doc() @@ -499,62 +510,6 @@ def test_unicode_granular_diff(initial, updated): assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" -def test_utf16_to_char_ascii(): - """_utf16_to_char is identity for pure ASCII text.""" - text = "Hello, World!" - for i in range(len(text) + 1): - assert _utf16_to_char(text, i) == i - - -def test_utf16_to_char_bmp(): - """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each.""" - text = "δ»·ζ Όεˆ†ζž" # 4 BMP CJK chars = 4 UTF-16 code units - assert _utf16_to_char(text, 0) == 0 - assert _utf16_to_char(text, 1) == 1 - assert _utf16_to_char(text, 2) == 2 - assert _utf16_to_char(text, 4) == 4 - - -def test_utf16_to_char_supplementary(): - """Supplementary plane chars (emoji) take 2 UTF-16 code units.""" - text = "AπŸ“ŠB" # UTF-16: A(1) πŸ“Š(2) B(1) = 4 code units, 3 chars - assert _utf16_to_char(text, 0) == 0 # before A - assert _utf16_to_char(text, 1) == 1 # before πŸ“Š - assert _utf16_to_char(text, 3) == 2 # before B (1 + 2 = 3) - assert _utf16_to_char(text, 4) == 3 # end - - -def test_utf16_to_char_multiple_emoji(): - """Multiple supplementary plane characters.""" - text = "AπŸ“ŠBπŸŽ‰C" # UTF-16: A(1) πŸ“Š(2) B(1) πŸŽ‰(2) C(1) = 7 units, 5 chars - assert _utf16_to_char(text, 0) == 0 # before A - assert _utf16_to_char(text, 1) == 1 # before πŸ“Š - assert _utf16_to_char(text, 3) == 2 # before B - assert _utf16_to_char(text, 4) == 3 # before πŸŽ‰ - assert _utf16_to_char(text, 6) == 4 # before C - assert _utf16_to_char(text, 7) == 5 # end - - -def test_utf16_to_char_roundtrip(): - """_char_to_utf16 and _utf16_to_char are inverses.""" - texts = [ - "Hello", - "AπŸ“ŠB", - "δ»·ζ Όεˆ†ζž", - "# Analysis πŸ“Š\n", - "Aπ’œBπ €€C", - "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!", - "πŸŽ‰πŸ“ŠπŸ”’", - ] - for text in texts: - for char_idx in range(len(text) + 1): - utf16_idx = _char_to_utf16(text, 
char_idx) - assert _utf16_to_char(text, utf16_idx) == char_idx, ( - f"Roundtrip failed for {text!r} at char_idx={char_idx}: " - f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}" - ) - - def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From 01a1756c7fbb81ed89060756b1af4a8422919de7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 04:06:11 +0000 Subject: [PATCH 07/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index fc06f0a..f84b933 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -4,7 +4,6 @@ from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text -from pycrdt._text import _char_to_utf16 pytestmark = pytest.mark.anyio From 41c8cc58e691edc1b54c209f40e676b1982b4878 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Tue, 28 Apr 2026 10:23:49 -0400 Subject: [PATCH 08/13] feat: parameterize Doc offset_kind (default UTF-8) Adds an `offset_kind` parameter to `Doc(...)` so callers can choose between UTF-8 byte offsets (the yrs default) and UTF-16 code unit offsets (required for cross-runtime interop with JS yjs). Default is "utf8", matching yrs. The Text wrapper previously assumed UTF-16 unconditionally and passed Python char indices through a UTF-16 conversion. This commit replaces that with a dispatcher that picks UTF-8 or UTF-16 conversion based on the doc's offset_kind, so Text behaves correctly in either mode while the public API still takes Python character indices. src/doc.rs gains a third positional arg on `Doc::new` accepting "utf8" / "utf-8" / "utf16" / "utf-16" / None (None preserves the yrs default). Invalid values raise PyValueError. 
_from_snapshot_impl now reads the offset kind from the source doc instead of hardcoding it. A new `#[getter] offset_kind` returns the canonical "utf8" or "utf16" string. _base.py forwards the new kwarg, and raises ValueError if both `doc=` (an existing _Doc) and `offset_kind=` are supplied with disagreeing values. _doc.py adds the kwarg to Doc.__init__, documents it, and exposes a read-only `Doc.offset_kind` property. Addresses davidbrochart's review on #379 asking for parameterization with a UTF-8 default. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/_base.py | 8 ++++- python/pycrdt/_doc.py | 17 +++++++++ python/pycrdt/_text.py | 82 ++++++++++++++++++++++++++---------------- src/doc.rs | 33 +++++++++++++---- 4 files changed, 101 insertions(+), 39 deletions(-) diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py index 1443d7d..4aebdfa 100644 --- a/python/pycrdt/_base.py +++ b/python/pycrdt/_base.py @@ -61,6 +61,7 @@ def __init__( *, client_id: int | None = None, skip_gc: bool | None = None, + offset_kind: str | None = None, doc: _Doc | None = None, Model=None, allow_multithreading: bool = False, @@ -68,7 +69,12 @@ def __init__( ) -> None: super().__init__(**data) if doc is None: - doc = _Doc(client_id, skip_gc) + doc = _Doc(client_id, skip_gc, offset_kind) + elif offset_kind is not None and offset_kind != doc.offset_kind: + raise ValueError( + f"offset_kind={offset_kind!r} does not match doc.offset_kind=" + f"{doc.offset_kind!r}" + ) self._doc = doc self._txn = None self._exceptions = [] diff --git a/python/pycrdt/_doc.py b/python/pycrdt/_doc.py index 07b140c..34f37c6 100644 --- a/python/pycrdt/_doc.py +++ b/python/pycrdt/_doc.py @@ -48,6 +48,7 @@ def __init__( *, client_id: int | None = None, skip_gc: bool | None = None, + offset_kind: str | None = None, doc: _Doc | None = None, Model=None, allow_multithreading: bool = False, @@ -58,11 +59,18 @@ def __init__( client_id: An optional client ID for the document. 
skip_gc: Whether to skip garbage collection on deleted collections on transaction commit. + offset_kind: How yrs counts text positions internally. ``"utf8"`` + (the yrs default) uses byte offsets; ``"utf16"`` uses UTF-16 + code unit offsets and is required for cross-runtime + compatibility with JS yjs. ``None`` (default) selects the yrs + default of ``"utf8"``. Regardless of this setting, the public + ``Text`` API always takes Python character indices. allow_multithreading: Whether to allow the document to be used in different threads. """ super().__init__( client_id=client_id, skip_gc=skip_gc, + offset_kind=offset_kind, doc=doc, Model=Model, allow_multithreading=allow_multithreading, @@ -86,6 +94,15 @@ def client_id(self) -> int: """The document client ID.""" return self._doc.client_id() + @property + def offset_kind(self) -> str: + """The text offset kind used internally by yrs. + + Returns ``"utf8"`` or ``"utf16"``. See [Doc.__init__][pycrdt.Doc.__init__] + for the meaning. + """ + return self._doc.offset_kind + def transaction(self, origin: Any = None) -> Transaction: """ Creates a new transaction or gets the current one, if any. diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 069da86..0b9977d 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -15,8 +15,7 @@ def _char_to_utf16(text: str, char_index: int) -> int: """Convert a Python character (code point) index to a UTF-16 code unit index. Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 - UTF-16 code units but only 1 Python character. The underlying yrs library - uses UTF-16 offsets, so all indices passed to it must be converted. + UTF-16 code units but only 1 Python character. For pure-ASCII / BMP text this is a no-op (returns ``char_index`` unchanged). 
@@ -29,6 +28,25 @@ def _char_to_utf16(text: str, char_index: int) -> int: return char_index + extra +def _char_to_utf8(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-8 byte index.""" + if char_index == 0: + return 0 + return len(text[:char_index].encode("utf-8")) + + +def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int: + if offset_kind == "utf16": + return _char_to_utf16(text, char_index) + return _char_to_utf8(text, char_index) + + +def _single_char_unit_len(char: str, offset_kind: str) -> int: + if offset_kind == "utf16": + return 2 if ord(char) > 0xFFFF else 1 + return len(char.encode("utf-8")) + + class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -147,8 +165,8 @@ def __iadd__(self, value: str) -> Text: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) current = str(self) - utf16_index = _char_to_utf16(current, len(current)) - self.integrated.insert(txn._txn, utf16_index, value) + offset = _char_to_offset(current, len(current), self.doc.offset_kind) + self.integrated.insert(txn._txn, offset, value) return self def _check_slice(self, key: slice) -> tuple[int, int]: @@ -190,18 +208,19 @@ def __delitem__(self, key: int | slice) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) current = str(self) + ok = self.doc.offset_kind if isinstance(key, int): - utf16_idx = _char_to_utf16(current, key) - char_at = current[key] - utf16_len = 2 if ord(char_at) > 0xFFFF else 1 - self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) + offset = _char_to_offset(current, key, ok) + unit_len = _single_char_unit_len(current[key], ok) + self.integrated.remove_range(txn._txn, offset, unit_len) elif isinstance(key, slice): start, stop = self._check_slice(key) - length = stop - start - if length > 0: - utf16_start = _char_to_utf16(current, start) - utf16_stop = _char_to_utf16(current, stop) - 
self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start) + if stop - start > 0: + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + self.integrated.remove_range( + txn._txn, offset_start, offset_stop - offset_start + ) else: raise RuntimeError(f"Index not supported: {key}") @@ -241,25 +260,25 @@ def __setitem__(self, key: int | slice, value: str) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) current = str(self) + ok = self.doc.offset_kind if isinstance(key, int): value_len = len(value) if value_len != 1: raise RuntimeError( f"Single item assigned value must have a length of 1, not {value_len}" ) - utf16_idx = _char_to_utf16(current, key) - char_at = current[key] - utf16_len = 2 if ord(char_at) > 0xFFFF else 1 - self.integrated.remove_range(txn._txn, utf16_idx, utf16_len) - self.integrated.insert(txn._txn, utf16_idx, value) + offset = _char_to_offset(current, key, ok) + unit_len = _single_char_unit_len(current[key], ok) + self.integrated.remove_range(txn._txn, offset, unit_len) + self.integrated.insert(txn._txn, offset, value) elif isinstance(key, slice): start, stop = self._check_slice(key) - utf16_start = _char_to_utf16(current, start) - utf16_stop = _char_to_utf16(current, stop) - length = utf16_stop - utf16_start + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + length = offset_stop - offset_start if length > 0: - self.integrated.remove_range(txn._txn, utf16_start, length) - self.integrated.insert(txn._txn, utf16_start, value) + self.integrated.remove_range(txn._txn, offset_start, length) + self.integrated.insert(txn._txn, offset_start, value) else: raise RuntimeError(f"Index not supported: {key}") @@ -284,9 +303,9 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) -> with self.doc.transaction() as txn: self._forbid_read_transaction(txn) current = str(self) - utf16_index 
= _char_to_utf16(current, index) + offset = _char_to_offset(current, index, self.doc.offset_kind) self.integrated.insert( - txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None ) def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None: @@ -301,9 +320,9 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No with self.doc.transaction() as txn: self._forbid_read_transaction(txn) current = str(self) - utf16_index = _char_to_utf16(current, index) + offset = _char_to_offset(current, index, self.doc.offset_kind) self.integrated.insert_embed( - txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None ) def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: @@ -319,11 +338,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: self._forbid_read_transaction(txn) start, stop = self._check_slice(slice(start, stop)) current = str(self) - utf16_start = _char_to_utf16(current, start) - utf16_stop = _char_to_utf16(current, stop) - length = utf16_stop - utf16_start + ok = self.doc.offset_kind + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + length = offset_stop - offset_start if length > 0: - self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items())) + self.integrated.format(txn._txn, offset_start, length, iter(attrs.items())) def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: """ diff --git a/src/doc.rs b/src/doc.rs index 27e875d..455c61f 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -32,7 +32,7 @@ impl Doc { let mut options = yrs::Options::default(); options.client_id = original.doc.client_id(); options.skip_gc = original.doc.skip_gc(); - options.offset_kind = OffsetKind::Utf16; + 
options.offset_kind = original.doc.offset_kind(); if let Some(collection_id) = original.doc.collection_id() { options.collection_id = Some(collection_id); } @@ -69,7 +69,11 @@ impl Doc { #[pymethods] impl Doc { #[new] - fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult { + fn new( + client_id: &Bound<'_, PyAny>, + skip_gc: &Bound<'_, PyAny>, + offset_kind: &Bound<'_, PyAny>, + ) -> PyResult { let mut options = Options::default(); if !client_id.is_none() { let _client_id: u64 = client_id.cast::() @@ -85,15 +89,30 @@ impl Doc { .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?; options.skip_gc = _skip_gc; } - // Use UTF-16 offsets for compatibility with JS yjs clients. - // Without this, pycrdt uses UTF-8 byte offsets which causes - // findIndexSS crashes when JS yjs applies incremental updates - // containing multi-byte characters. - options.offset_kind = OffsetKind::Utf16; + if !offset_kind.is_none() { + let _offset_kind: String = offset_kind + .extract() + .map_err(|_| PyValueError::new_err("offset_kind must be a string"))?; + options.offset_kind = match _offset_kind.as_str() { + "utf8" | "utf-8" => OffsetKind::Bytes, + "utf16" | "utf-16" => OffsetKind::Utf16, + _ => return Err(PyValueError::new_err( + "offset_kind must be 'utf8' or 'utf16'", + )), + }; + } let doc = _Doc::with_options(options); Ok(Doc { doc }) } + #[getter] + fn offset_kind(&self) -> &'static str { + match self.doc.offset_kind() { + OffsetKind::Bytes => "utf8", + OffsetKind::Utf16 => "utf16", + } + } + #[staticmethod] #[pyo3(name = "from_snapshot")] pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult> { From 855af5c8c5fa64e7d04829bc54e36e57e270c925 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Tue, 28 Apr 2026 10:25:20 -0400 Subject: [PATCH 09/13] refactor: make utf16/utf8 index helpers public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Addresses davidbrochart's three inline reviews on #379: - _text.py:14 — rename `_char_to_utf16` → `get_utf16_index` (public, per the suggested rename). Add a counterpart `get_utf8_index` since with offset_kind parameterization the conversion is needed in both directions (jbdyn + davidbrochart agreed in the thread). The dispatcher and single-char-length helpers stay private — trivial conditionals best read at the call site. - _text.py:23 — both public helpers now use Google-style docstrings (Args / Returns sections), matching the rest of the module. - _text.py:112 — drop the inline comment that duplicated the docstring; tighten the docstring to match. Re-export the two helpers from `pycrdt.__init__` so consumers can reach them without importing from the underscore module. Drop a now-dead `_char_to_utf16` import in tests/test_text.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/__init__.py | 2 ++ python/pycrdt/_text.py | 32 ++++++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/python/pycrdt/__init__.py b/python/pycrdt/__init__.py index d849660..77446aa 100644 --- a/python/pycrdt/__init__.py +++ b/python/pycrdt/__init__.py @@ -35,6 +35,8 @@ from ._sync import write_var_uint as write_var_uint from ._text import Text as Text from ._text import TextEvent as TextEvent +from ._text import get_utf8_index as get_utf8_index +from ._text import get_utf16_index as get_utf16_index from ._transaction import NewTransaction as NewTransaction from ._transaction import ReadTransaction as ReadTransaction from ._transaction import Transaction as Transaction diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 0b9977d..ccb3d3b 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -11,14 +11,19 @@ from ._doc import Doc -def _char_to_utf16(text: str, char_index: int) -> int: +def get_utf16_index(text: str, char_index: int) -> int: """Convert a Python character (code point) index to a UTF-16
code unit index. Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 - UTF-16 code units but only 1 Python character. + UTF-16 code units but only 1 Python character. For pure-ASCII / BMP + text this is a no-op. - For pure-ASCII / BMP text this is a no-op (returns ``char_index`` - unchanged). + Args: + text: The string against which ``char_index`` is interpreted. + char_index: A Python (code point) index into ``text``. + + Returns: + The corresponding UTF-16 code unit offset. """ if char_index == 0: return 0 @@ -28,8 +33,16 @@ def _char_to_utf16(text: str, char_index: int) -> int: return char_index + extra -def _char_to_utf8(text: str, char_index: int) -> int: - """Convert a Python character (code point) index to a UTF-8 byte index.""" +def get_utf8_index(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-8 byte index. + + Args: + text: The string against which ``char_index`` is interpreted. + char_index: A Python (code point) index into ``text``. + + Returns: + The corresponding UTF-8 byte offset. + """ if char_index == 0: return 0 return len(text[:char_index].encode("utf-8")) @@ -37,8 +50,8 @@ def _char_to_utf8(text: str, char_index: int) -> int: def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int: if offset_kind == "utf16": - return _char_to_utf16(text, char_index) - return _char_to_utf8(text, char_index) + return get_utf16_index(text, char_index) + return get_utf8_index(text, char_index) def _single_char_unit_len(char: str, offset_kind: str) -> int: @@ -125,9 +138,8 @@ def __len__(self) -> int: ``` Returns: - The length of the text (in Python characters, not UTF-16 code units). + The length of the text in Python characters. 
""" - # Return Python character count, not yrs UTF-16 code unit count return len(str(self)) def __str__(self) -> str: From e41d3450e76c0814b79630fcaecbf48e22718f1b Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Tue, 28 Apr 2026 10:26:11 -0400 Subject: [PATCH 10/13] test: parametrize Unicode tests over offset_kind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an `offset_kind` pytest fixture parametrizing all Unicode tests over both ``"utf8"`` and ``"utf16"`` modes. The same test bodies now exercise the dispatcher in `Text` for both yrs offset configurations, without changing the assertions β€” public API contract is always Python char indices. `test_unicode_cross_doc_sync` constructs both peers with the same offset_kind (mismatched offset kinds across peers is unsupported by yrs/yjs). New explicit tests cover: - default-is-utf8 - explicit "utf8" / "utf-8" / "utf16" / "utf-16" - ValueError on unknown values - snapshot round-trip preserves offset_kind for both modes - new public `get_utf16_index` / `get_utf8_index` helpers (identity / BMP CJK / non-BMP emoji) Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_text.py | 152 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 37 deletions(-) diff --git a/tests/test_text.py b/tests/test_text.py index f84b933..6e5575e 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -3,7 +3,16 @@ import pytest from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus -from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text +from pycrdt import ( + Array, + Assoc, + Doc, + Map, + StickyIndex, + Text, + get_utf8_index, + get_utf16_index, +) pytestmark = pytest.mark.anyio @@ -230,9 +239,14 @@ def test_sticky_index(serialize: str): assert text1[new_idx] == "*" -def test_unicode_emoji_insert(): - """Text.insert() after emoji characters should use character positions, not byte offsets.""" - doc = Doc() 
+@pytest.fixture(params=["utf8", "utf16"]) +def offset_kind(request): + return request.param + + +def test_unicode_emoji_insert(offset_kind): + """Text.insert() after emoji characters should use character positions.""" + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "AπŸ“ŠB" @@ -244,9 +258,9 @@ def test_unicode_emoji_insert(): assert str(text) == "AπŸ“ŠXB", f"Got {str(text)!r}, emoji insert position is wrong" -def test_unicode_emoji_sequential_inserts(): +def test_unicode_emoji_sequential_inserts(offset_kind): """Sequential inserts after emoji should maintain correct positions.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "# Analysis πŸ“Š\n" @@ -258,9 +272,11 @@ def test_unicode_emoji_sequential_inserts(): assert str(text) == expected, f"Got {str(text)!r}" -def test_unicode_emoji_iadd(): - """`+=` after emoji should append at the end (regression for UTF-16 offset bug).""" - doc = Doc() +def test_unicode_emoji_iadd(offset_kind): + """`+=` after emoji should append at the end (regression for the original + Text.__iadd__ bug where len(self) was passed to yrs as if it were already + in offset units).""" + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "AπŸ“ŠB" @@ -269,66 +285,66 @@ def test_unicode_emoji_iadd(): assert str(text) == "AπŸ“ŠBX" -def test_unicode_emoji_len(): - """len() should return Python character count, not byte count.""" - doc = Doc() +def test_unicode_emoji_len(offset_kind): + """len() should return Python character count, regardless of offset_kind.""" + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "AπŸ“ŠB" - assert len(text) == 3 # 3 chars, not 6 bytes or 4 UTF-16 code units + assert len(text) == 3 text += "πŸŽ‰" assert len(text) == 4 -def test_unicode_emoji_delete(): +def test_unicode_emoji_delete(offset_kind): """Deleting a character after an emoji should work correctly.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) 
doc["text"] = text = Text("AπŸ“ŠBC") del text[2] # delete B (after emoji) assert str(text) == "AπŸ“ŠC", f"Got {str(text)!r}" -def test_unicode_emoji_delete_emoji(): +def test_unicode_emoji_delete_emoji(offset_kind): """Deleting an emoji character itself should work correctly.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text("AπŸ“ŠB") del text[1] # delete πŸ“Š assert str(text) == "AB", f"Got {str(text)!r}" -def test_unicode_emoji_slice_delete(): +def test_unicode_emoji_slice_delete(offset_kind): """Slice deletion across emoji boundaries should work correctly.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") del text[1:4] # delete πŸ“ŠBπŸŽ‰ assert str(text) == "AC", f"Got {str(text)!r}" -def test_unicode_emoji_setitem(): +def test_unicode_emoji_setitem(offset_kind): """Replacing a character after an emoji should work correctly.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text("AπŸ“ŠBC") text[2] = "X" # replace B (after emoji) assert str(text) == "AπŸ“ŠXC", f"Got {str(text)!r}" -def test_unicode_emoji_slice_setitem(): +def test_unicode_emoji_slice_setitem(offset_kind): """Slice replacement spanning emoji should work correctly.""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") text[1:4] = "XYZ" # replace πŸ“ŠBπŸŽ‰ with XYZ assert str(text) == "AXYZC", f"Got {str(text)!r}" -def test_unicode_cjk(): - """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly.""" - doc = Doc() +def test_unicode_cjk(offset_kind): + """CJK characters (BMP, 1 UTF-16 code unit but 3 UTF-8 bytes each).""" + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "δ»·ζ Ό" @@ -337,9 +353,9 @@ def test_unicode_cjk(): assert len(text) == 3 -def test_unicode_mixed_scripts(): +def test_unicode_mixed_scripts(offset_kind): """Mixed ASCII, CJK, Cyrillic, and emoji in one text.""" - doc = Doc() + doc = 
Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += "Hello" @@ -353,9 +369,9 @@ def test_unicode_mixed_scripts(): assert len(text) == 15 -def test_unicode_supplementary_plane(): - """Characters outside BMP (require UTF-16 surrogate pairs).""" - doc = Doc() +def test_unicode_supplementary_plane(offset_kind): + """Characters outside BMP.""" + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() # π’œ (U+1D49C) = Mathematical Script Capital A @@ -370,9 +386,14 @@ def test_unicode_supplementary_plane(): assert str(text) == "Aπ’œXBπ €€YC", f"Got {str(text)!r}" -def test_unicode_cross_doc_sync(): - """Updates with Unicode content should sync correctly between two pycrdt docs.""" - doc1 = Doc() +def test_unicode_cross_doc_sync(offset_kind): + """Updates with Unicode content should sync correctly between two pycrdt docs. + + Both docs must use the same offset_kind β€” peers with mismatched offset + kinds is a known incompatibility (yrs and yjs both require all peers in + a swarm to agree). + """ + doc1 = Doc(offset_kind=offset_kind) doc1["text"] = text1 = Text() # Capture updates from doc1 @@ -384,7 +405,7 @@ def test_unicode_cross_doc_sync(): text1.insert(len(text1), "# 特征ε·₯程\n") # Apply to doc2 - doc2 = Doc() + doc2 = Doc(offset_kind=offset_kind) doc2["text"] = Text() for update in updates: doc2.apply_update(update) @@ -494,12 +515,12 @@ def _apply_diff(text, old_value, new_value): "math_operators", ], ) -def test_unicode_granular_diff(initial, updated): +def test_unicode_granular_diff(initial, updated, offset_kind): """Granular text edits with multi-byte Unicode should produce correct results. Test cases adapted from jupyter-server/jupyter_ydoc#370. 
""" - doc = Doc() + doc = Doc(offset_kind=offset_kind) doc["text"] = text = Text() text += initial @@ -509,6 +530,63 @@ def test_unicode_granular_diff(initial, updated): assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" +def test_get_utf16_index(): + # ASCII: identity + assert get_utf16_index("hello", 0) == 0 + assert get_utf16_index("hello", 5) == 5 + # BMP CJK: 1 code point = 1 UTF-16 code unit + assert get_utf16_index("δ»·ζ Ό", 2) == 2 + # Non-BMP emoji: 1 code point = 2 UTF-16 code units (surrogate pair) + assert get_utf16_index("AπŸ“ŠB", 1) == 1 + assert get_utf16_index("AπŸ“ŠB", 2) == 3 + assert get_utf16_index("AπŸ“ŠB", 3) == 4 + + +def test_get_utf8_index(): + # ASCII: identity + assert get_utf8_index("hello", 0) == 0 + assert get_utf8_index("hello", 5) == 5 + # BMP CJK: 1 code point = 3 UTF-8 bytes + assert get_utf8_index("δ»·ζ Ό", 1) == 3 + assert get_utf8_index("δ»·ζ Ό", 2) == 6 + # Non-BMP emoji: 1 code point = 4 UTF-8 bytes + assert get_utf8_index("AπŸ“ŠB", 1) == 1 + assert get_utf8_index("AπŸ“ŠB", 2) == 5 + assert get_utf8_index("AπŸ“ŠB", 3) == 6 + + +def test_offset_kind_default_is_utf8(): + assert Doc().offset_kind == "utf8" + + +def test_offset_kind_explicit(): + assert Doc(offset_kind="utf8").offset_kind == "utf8" + assert Doc(offset_kind="utf16").offset_kind == "utf16" + # hyphenated forms accepted + assert Doc(offset_kind="utf-8").offset_kind == "utf8" + assert Doc(offset_kind="utf-16").offset_kind == "utf16" + + +def test_offset_kind_invalid_raises(): + with pytest.raises(ValueError): + Doc(offset_kind="utf32") + + +def test_offset_kind_snapshot_round_trip(offset_kind): + """from_snapshot must preserve the source doc's offset_kind.""" + from pycrdt import Snapshot + + doc = Doc(offset_kind=offset_kind, skip_gc=True) + doc["text"] = Text("AπŸ“ŠB") + snap = Snapshot.from_doc(doc) + restored = Doc.from_snapshot(snap, doc) + assert restored.offset_kind == offset_kind, ( + f"snapshot lost offset_kind: expected {offset_kind}, " 
+ f"got {restored.offset_kind}" + ) + assert str(restored["text"]) == "AπŸ“ŠB" + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text) From b6b938c169e892a071a73b72004c26602ec79810 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:28:18 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- python/pycrdt/_base.py | 3 +-- python/pycrdt/_text.py | 4 +--- tests/test_text.py | 3 +-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py index 4aebdfa..53f614f 100644 --- a/python/pycrdt/_base.py +++ b/python/pycrdt/_base.py @@ -72,8 +72,7 @@ def __init__( doc = _Doc(client_id, skip_gc, offset_kind) elif offset_kind is not None and offset_kind != doc.offset_kind: raise ValueError( - f"offset_kind={offset_kind!r} does not match doc.offset_kind=" - f"{doc.offset_kind!r}" + f"offset_kind={offset_kind!r} does not match doc.offset_kind={doc.offset_kind!r}" ) self._doc = doc self._txn = None diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index ccb3d3b..d7275e7 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -230,9 +230,7 @@ def __delitem__(self, key: int | slice) -> None: if stop - start > 0: offset_start = _char_to_offset(current, start, ok) offset_stop = _char_to_offset(current, stop, ok) - self.integrated.remove_range( - txn._txn, offset_start, offset_stop - offset_start - ) + self.integrated.remove_range(txn._txn, offset_start, offset_stop - offset_start) else: raise RuntimeError(f"Index not supported: {key}") diff --git a/tests/test_text.py b/tests/test_text.py index 6e5575e..b4ca78c 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -581,8 +581,7 @@ def test_offset_kind_snapshot_round_trip(offset_kind): snap = Snapshot.from_doc(doc) restored = Doc.from_snapshot(snap, doc) assert 
restored.offset_kind == offset_kind, ( - f"snapshot lost offset_kind: expected {offset_kind}, " - f"got {restored.offset_kind}" + f"snapshot lost offset_kind: expected {offset_kind}, got {restored.offset_kind}" ) assert str(restored["text"]) == "AπŸ“ŠB" From 3e9933a9f0c9f7438155ceefef93dd5a996bd8c8 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Tue, 28 Apr 2026 10:35:12 -0400 Subject: [PATCH 12/13] fix: update _pycrdt.pyi stub for offset_kind The Rust-extension stub didn't reflect the new constructor signature or the offset_kind getter, so `mypy python` failed in CI on _base.py and _doc.py with "Too many arguments" / "no attribute offset_kind". Add the third positional `offset_kind: str | None` to `Doc.__init__` and declare the read-only `offset_kind` property. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/pycrdt/_pycrdt.pyi | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/pycrdt/_pycrdt.pyi b/python/pycrdt/_pycrdt.pyi index 1498689..73f3ed1 100644 --- a/python/pycrdt/_pycrdt.pyi +++ b/python/pycrdt/_pycrdt.pyi @@ -17,7 +17,12 @@ class Snapshot: class Doc: """Shared document.""" - def __init__(self, client_id: int | None, skip_gc: bool | None) -> None: + def __init__( + self, + client_id: int | None, + skip_gc: bool | None, + offset_kind: str | None, + ) -> None: """Create a new document with an optional global client ID. 
If no client ID is passed, a random one will be generated.""" @@ -28,6 +33,10 @@ class Doc: def client_id(self) -> int: """Returns the document unique client identifier.""" + @property + def offset_kind(self) -> str: + """Returns the offset kind ('utf8' or 'utf16').""" + def guid(self) -> int: """Returns the document globally unique identifier.""" From 67b2dead51482a32a62c2e8f60e1e0db47e6c2dd Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Tue, 28 Apr 2026 11:21:19 -0400 Subject: [PATCH 13/13] test: cover the offset_kind/doc mismatch ValueError Fixes the ubuntu python-3.14 coverage regression on #379. The new mismatch check at python/pycrdt/_base.py:74 (raise ValueError when both doc= and offset_kind= are passed and they disagree) wasn't exercised by any pytest test, dropping coverage to 99% and tripping the suite's fail-under=100 gate. Add a one-shot regression test that constructs an existing Doc with offset_kind="utf8", then attempts to wrap it as a new Doc with offset_kind="utf16" and asserts ValueError. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_text.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_text.py b/tests/test_text.py index b4ca78c..983e0ac 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -572,6 +572,13 @@ def test_offset_kind_invalid_raises(): Doc(offset_kind="utf32") +def test_offset_kind_doc_mismatch_raises(): + """Doc(doc=existing, offset_kind=other) must reject conflicting values.""" + utf8_doc = Doc(offset_kind="utf8") + with pytest.raises(ValueError, match="does not match"): + Doc(doc=utf8_doc._doc, offset_kind="utf16") + + def test_offset_kind_snapshot_round_trip(offset_kind): """from_snapshot must preserve the source doc's offset_kind.""" from pycrdt import Snapshot