diff --git a/python/pycrdt/__init__.py b/python/pycrdt/__init__.py index d849660..77446aa 100644 --- a/python/pycrdt/__init__.py +++ b/python/pycrdt/__init__.py @@ -35,6 +35,8 @@ from ._sync import write_var_uint as write_var_uint from ._text import Text as Text from ._text import TextEvent as TextEvent +from ._text import get_utf8_index as get_utf8_index +from ._text import get_utf16_index as get_utf16_index from ._transaction import NewTransaction as NewTransaction from ._transaction import ReadTransaction as ReadTransaction from ._transaction import Transaction as Transaction diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py index 1443d7d..53f614f 100644 --- a/python/pycrdt/_base.py +++ b/python/pycrdt/_base.py @@ -61,6 +61,7 @@ def __init__( *, client_id: int | None = None, skip_gc: bool | None = None, + offset_kind: str | None = None, doc: _Doc | None = None, Model=None, allow_multithreading: bool = False, @@ -68,7 +69,11 @@ def __init__( ) -> None: super().__init__(**data) if doc is None: - doc = _Doc(client_id, skip_gc) + doc = _Doc(client_id, skip_gc, offset_kind) + elif offset_kind is not None and offset_kind != doc.offset_kind: + raise ValueError( + f"offset_kind={offset_kind!r} does not match doc.offset_kind={doc.offset_kind!r}" + ) self._doc = doc self._txn = None self._exceptions = [] diff --git a/python/pycrdt/_doc.py b/python/pycrdt/_doc.py index 07b140c..34f37c6 100644 --- a/python/pycrdt/_doc.py +++ b/python/pycrdt/_doc.py @@ -48,6 +48,7 @@ def __init__( *, client_id: int | None = None, skip_gc: bool | None = None, + offset_kind: str | None = None, doc: _Doc | None = None, Model=None, allow_multithreading: bool = False, @@ -58,11 +59,18 @@ def __init__( client_id: An optional client ID for the document. skip_gc: Whether to skip garbage collection on deleted collections on transaction commit. + offset_kind: How yrs counts text positions internally. ``"utf8"`` + (the yrs default) uses byte offsets; ``"utf16"`` uses UTF-16 + code unit offsets and is required for cross-runtime + compatibility with JS yjs. ``None`` (default) selects the yrs + default of ``"utf8"``. Regardless of this setting, the public + ``Text`` API always takes Python character indices. allow_multithreading: Whether to allow the document to be used in different threads. """ super().__init__( client_id=client_id, skip_gc=skip_gc, + offset_kind=offset_kind, doc=doc, Model=Model, allow_multithreading=allow_multithreading, @@ -86,6 +94,15 @@ def client_id(self) -> int: """The document client ID.""" return self._doc.client_id() + @property + def offset_kind(self) -> str: + """The text offset kind used internally by yrs. + + Returns ``"utf8"`` or ``"utf16"``. See [Doc.__init__][pycrdt.Doc.__init__] + for the meaning. + """ + return self._doc.offset_kind + def transaction(self, origin: Any = None) -> Transaction: """ Creates a new transaction or gets the current one, if any. diff --git a/python/pycrdt/_pycrdt.pyi b/python/pycrdt/_pycrdt.pyi index 1498689..73f3ed1 100644 --- a/python/pycrdt/_pycrdt.pyi +++ b/python/pycrdt/_pycrdt.pyi @@ -17,7 +17,12 @@ class Snapshot: class Doc: """Shared document.""" - def __init__(self, client_id: int | None, skip_gc: bool | None) -> None: + def __init__( + self, + client_id: int | None, + skip_gc: bool | None, + offset_kind: str | None, + ) -> None: """Create a new document with an optional global client ID. If no client ID is passed, a random one will be generated.""" @@ -28,6 +33,10 @@ class Doc: def client_id(self) -> int: """Returns the document unique client identifier.""" + @property + def offset_kind(self) -> str: + """Returns the offset kind ('utf8' or 'utf16').""" + def guid(self) -> int: """Returns the document globally unique identifier.""" diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py index 49356c0..d7275e7 100644 --- a/python/pycrdt/_text.py +++ b/python/pycrdt/_text.py @@ -11,6 +11,55 @@ from ._doc import Doc +def get_utf16_index(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-16 code unit index. + + Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 + UTF-16 code units but only 1 Python character. For pure-ASCII / BMP + text this is a no-op. + + Args: + text: The string against which ``char_index`` is interpreted. + char_index: A Python (code point) index into ``text``. + + Returns: + The corresponding UTF-16 code unit offset. + """ + if char_index == 0: + return 0 + prefix = text[:char_index] + # Count characters that need a surrogate pair (code point > 0xFFFF) + extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF) + return char_index + extra + + +def get_utf8_index(text: str, char_index: int) -> int: + """Convert a Python character (code point) index to a UTF-8 byte index. + + Args: + text: The string against which ``char_index`` is interpreted. + char_index: A Python (code point) index into ``text``. + + Returns: + The corresponding UTF-8 byte offset. + """ + if char_index == 0: + return 0 + return len(text[:char_index].encode("utf-8")) + + +def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int: + if offset_kind == "utf16": + return get_utf16_index(text, char_index) + return get_utf8_index(text, char_index) + + +def _single_char_unit_len(char: str, offset_kind: str) -> int: + if offset_kind == "utf16": + return 2 if ord(char) > 0xFFFF else 1 + return len(char.encode("utf-8")) + + class Text(Sequence): """ A shared data type used for collaborative text editing, similar to a Python `str`. @@ -89,10 +138,9 @@ def __len__(self) -> int: ``` Returns: - The length of the text. + The length of the text in Python characters. """ - with self.doc.transaction() as txn: - return self.integrated.len(txn._txn) + return len(str(self)) def __str__(self) -> str: """ @@ -128,7 +176,9 @@ def __iadd__(self, value: str) -> Text: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) - self.integrated.insert(txn._txn, len(self), value) + current = str(self) + offset = _char_to_offset(current, len(current), self.doc.offset_kind) + self.integrated.insert(txn._txn, offset, value) return self def _check_slice(self, key: slice) -> tuple[int, int]: @@ -169,13 +219,18 @@ def __delitem__(self, key: int | slice) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + ok = self.doc.offset_kind if isinstance(key, int): - self.integrated.remove_range(txn._txn, key, 1) + offset = _char_to_offset(current, key, ok) + unit_len = _single_char_unit_len(current[key], ok) + self.integrated.remove_range(txn._txn, offset, unit_len) elif isinstance(key, slice): start, stop = self._check_slice(key) - length = stop - start - if length > 0: - self.integrated.remove_range(txn._txn, start, length) + if stop - start > 0: + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + self.integrated.remove_range(txn._txn, offset_start, offset_stop - offset_start) else: raise RuntimeError(f"Index not supported: {key}") @@ -214,20 +269,26 @@ def __setitem__(self, key: int | slice, value: str) -> None: """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + ok = self.doc.offset_kind if isinstance(key, int): value_len = len(value) if value_len != 1: raise RuntimeError( f"Single item assigned value must have a length of 1, not {value_len}" ) - del self[key] - self.integrated.insert(txn._txn, key, value) + offset = _char_to_offset(current, key, ok) + unit_len = _single_char_unit_len(current[key], ok) + self.integrated.remove_range(txn._txn, offset, unit_len) + self.integrated.insert(txn._txn, offset, value) elif isinstance(key, slice): start, stop = self._check_slice(key) - length = stop - start + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + length = offset_stop - offset_start if length > 0: - self.integrated.remove_range(txn._txn, start, length) - self.integrated.insert(txn._txn, start, value) + self.integrated.remove_range(txn._txn, offset_start, length) + self.integrated.insert(txn._txn, offset_start, value) else: raise RuntimeError(f"Index not supported: {key}") @@ -251,8 +312,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) -> """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + offset = _char_to_offset(current, index, self.doc.offset_kind) self.integrated.insert( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None ) def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None: @@ -266,8 +329,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No """ with self.doc.transaction() as txn: self._forbid_read_transaction(txn) + current = str(self) + offset = _char_to_offset(current, index, self.doc.offset_kind) self.integrated.insert_embed( - txn._txn, index, value, iter(attrs.items()) if attrs is not None else None + txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None ) def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: @@ -282,9 +347,13 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None: with self.doc.transaction() as txn: self._forbid_read_transaction(txn) start, stop = self._check_slice(slice(start, stop)) - length = stop - start + current = str(self) + ok = self.doc.offset_kind + offset_start = _char_to_offset(current, start, ok) + offset_stop = _char_to_offset(current, stop, ok) + length = offset_stop - offset_start if length > 0: - self.integrated.format(txn._txn, start, length, iter(attrs.items())) + self.integrated.format(txn._txn, offset_start, length, iter(attrs.items())) def diff(self) -> list[tuple[Any, dict[str, Any] | None]]: """ diff --git a/src/doc.rs b/src/doc.rs index 61109cc..455c61f 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList}; use yrs::{ - Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn + Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn }; use yrs::updates::encoder::{Encode, Encoder}; use yrs::updates::decoder::Decode; @@ -32,6 +32,7 @@ impl Doc { let mut options = yrs::Options::default(); options.client_id = original.doc.client_id(); options.skip_gc = original.doc.skip_gc(); + options.offset_kind = original.doc.offset_kind(); if let Some(collection_id) = original.doc.collection_id() { options.collection_id = Some(collection_id); } @@ -68,7 +69,11 @@ impl Doc { #[pymethods] impl Doc { #[new] - fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult { + fn new( + client_id: &Bound<'_, PyAny>, + skip_gc: &Bound<'_, PyAny>, + offset_kind: &Bound<'_, PyAny>, + ) -> PyResult { let mut options = Options::default(); if !client_id.is_none() { let _client_id: u64 = client_id.cast::() @@ -84,10 +89,30 @@ impl Doc { .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?; options.skip_gc = _skip_gc; } + if !offset_kind.is_none() { + let _offset_kind: String = offset_kind + .extract() + .map_err(|_| PyValueError::new_err("offset_kind must be a string"))?; + options.offset_kind = match _offset_kind.as_str() { + "utf8" | "utf-8" => OffsetKind::Bytes, + "utf16" | "utf-16" => OffsetKind::Utf16, + _ => return Err(PyValueError::new_err( + "offset_kind must be 'utf8' or 'utf16'", + )), + }; + } let doc = _Doc::with_options(options); Ok(Doc { doc }) } + #[getter] + fn offset_kind(&self) -> &'static str { + match self.doc.offset_kind() { + OffsetKind::Bytes => "utf8", + OffsetKind::Utf16 => "utf16", + } + } + #[staticmethod] #[pyo3(name = "from_snapshot")] pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult> { diff --git a/tests/test_text.py b/tests/test_text.py index ba913a0..983e0ac 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,7 +1,18 @@ +from difflib import SequenceMatcher + import pytest from anyio import TASK_STATUS_IGNORED, Event, create_task_group from anyio.abc import TaskStatus -from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text +from pycrdt import ( + Array, + Assoc, + Doc, + Map, + StickyIndex, + Text, + get_utf8_index, + get_utf16_index, +) pytestmark = pytest.mark.anyio @@ -228,6 +239,360 @@ def test_sticky_index(serialize: str): assert text1[new_idx] == "*" +@pytest.fixture(params=["utf8", "utf16"]) +def offset_kind(request): + return request.param + + +def test_unicode_emoji_insert(offset_kind): + """Text.insert() after emoji characters should use character positions.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + assert str(text) == "AπŸ“ŠB" + assert len(text) == 3 + + # Insert at position 2 = between πŸ“Š and B + text.insert(2, "X") + assert str(text) == "AπŸ“ŠXB", f"Got {str(text)!r}, emoji insert position is wrong" + + +def test_unicode_emoji_sequential_inserts(offset_kind): + """Sequential inserts after emoji should maintain correct positions.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "# Analysis πŸ“Š\n" + text.insert(len(text), "model = fit()\n") + text.insert(len(text), "# 特征ε·₯程\n") + text.insert(len(text), 'print("done")\n') + + expected = '# Analysis πŸ“Š\nmodel = fit()\n# 特征ε·₯程\nprint("done")\n' + assert str(text) == expected, f"Got {str(text)!r}" + + +def test_unicode_emoji_iadd(offset_kind): + """`+=` after emoji should append at the end (regression for the original + Text.__iadd__ bug where len(self) was passed to yrs as if it were already + in offset units).""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + text += "X" + + assert str(text) == "AπŸ“ŠBX" + + +def test_unicode_emoji_len(offset_kind): + """len() should return Python character count, regardless of offset_kind.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "AπŸ“ŠB" + assert len(text) == 3 + + text += "πŸŽ‰" + assert len(text) == 4 + + +def test_unicode_emoji_delete(offset_kind): + """Deleting a character after an emoji should work correctly.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text("AπŸ“ŠBC") + + del text[2] # delete B (after emoji) + assert str(text) == "AπŸ“ŠC", f"Got {str(text)!r}" + + +def test_unicode_emoji_delete_emoji(offset_kind): + """Deleting an emoji character itself should work correctly.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text("AπŸ“ŠB") + + del text[1] # delete πŸ“Š + assert str(text) == "AB", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_delete(offset_kind): + """Slice deletion across emoji boundaries should work correctly.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + del text[1:4] # delete πŸ“ŠBπŸŽ‰ + assert str(text) == "AC", f"Got {str(text)!r}" + + +def test_unicode_emoji_setitem(offset_kind): + """Replacing a character after an emoji should work correctly.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text("AπŸ“ŠBC") + + text[2] = "X" # replace B (after emoji) + assert str(text) == "AπŸ“ŠXC", f"Got {str(text)!r}" + + +def test_unicode_emoji_slice_setitem(offset_kind): + """Slice replacement spanning emoji should work correctly.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text("AπŸ“ŠBπŸŽ‰C") + + text[1:4] = "XYZ" # replace πŸ“ŠBπŸŽ‰ with XYZ + assert str(text) == "AXYZC", f"Got {str(text)!r}" + + +def test_unicode_cjk(offset_kind): + """CJK characters (BMP, 1 UTF-16 code unit but 3 UTF-8 bytes each).""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "δ»·ζ Ό" + text.insert(2, "X") + assert str(text) == "δ»·ζ ΌX", f"Got {str(text)!r}" + assert len(text) == 3 + + +def test_unicode_mixed_scripts(offset_kind): + """Mixed ASCII, CJK, Cyrillic, and emoji in one text.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += "Hello" + text.insert(5, " δΈ–η•Œ") + text.insert(8, " πŸ“Š") + text.insert(11, " ΠΌΠΈΡ€") + text.insert(15, "!") + + expected = "Hello δΈ–η•Œ πŸ“Š ΠΌΠΈΡ€!" + assert str(text) == expected, f"Got {str(text)!r}" + assert len(text) == 15 + + +def test_unicode_supplementary_plane(offset_kind): + """Characters outside BMP.""" + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + # π’œ (U+1D49C) = Mathematical Script Capital A + # π €€ (U+20000) = CJK Unified Ideograph Extension B + text += "Aπ’œBπ €€C" + assert len(text) == 5 + + text.insert(2, "X") # between π’œ and B + assert str(text) == "Aπ’œXBπ €€C", f"Got {str(text)!r}" + + text.insert(5, "Y") # between π €€ and C + assert str(text) == "Aπ’œXBπ €€YC", f"Got {str(text)!r}" + + +def test_unicode_cross_doc_sync(offset_kind): + """Updates with Unicode content should sync correctly between two pycrdt docs. + + Both docs must use the same offset_kind β€” peers with mismatched offset + kinds is a known incompatibility (yrs and yjs both require all peers in + a swarm to agree). + """ + doc1 = Doc(offset_kind=offset_kind) + doc1["text"] = text1 = Text() + + # Capture updates from doc1 + updates = [] + doc1.observe(lambda event: updates.append(event.update)) + + text1 += "# Analysis πŸ“Š\n" + text1.insert(len(text1), "model = fit()\n") + text1.insert(len(text1), "# 特征ε·₯程\n") + + # Apply to doc2 + doc2 = Doc(offset_kind=offset_kind) + doc2["text"] = Text() + for update in updates: + doc2.apply_update(update) + + assert str(doc2["text"]) == str(text1), ( + f"Docs diverged: doc1={str(text1)!r} doc2={str(doc2['text'])!r}" + ) + + +# Test cases adapted from jupyter-server/jupyter_ydoc#370 (prior art for +# the workaround at the jupyter_ydoc layer). These exercise pycrdt's Text +# operations directly with the same Unicode edge cases. Each test sets +# initial content, then applies a granular edit (using SequenceMatcher on +# byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies +# the result is correct. + + +def _apply_diff(text, old_value, new_value): + """Apply a granular diff from old_value to new_value using character-level + SequenceMatcher. With the UTF-16 offset fix, pycrdt Text indices are + character-based, so we diff on characters (not bytes).""" + matcher = SequenceMatcher(a=old_value, b=new_value) + + offset = 0 + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "replace": + text[i1 + offset : i2 + offset] = new_value[j1:j2] + offset += (j2 - j1) - (i2 - i1) + elif tag == "delete": + del text[i1 + offset : i2 + offset] + offset -= i2 - i1 + elif tag == "insert": + text.insert(i1 + offset, new_value[j1:j2]) + offset += j2 - j1 + + +@pytest.mark.parametrize( + "initial, updated", + [ + # emojis swapped + ( + "I like security 🎨 but I really love painting πŸ”’", + "I like security πŸ”’ but I really love painting 🎨", + ), + # text changes, emojis stay in place + ( + "Here is a rocket: ⭐ and a star: πŸš€", + "Here is a star: ⭐ and a rocket: πŸš€", + ), + # change of text and emojis + ( + "Here are some happy faces: πŸ˜€πŸ˜πŸ˜‚", + "Here are some sad faces: 😞😒😭", + ), + # change of characters with combining marks + ( + "Combining characters: Γ‘ Γ© Γ­ Γ³ ΓΊ", + "Combining characters: ΓΊ Γ³ Γ­ Γ© Γ‘", + ), + # flags (regional indicator sequences) + ( + "Flags: πŸ‡ΊπŸ‡ΈπŸ‡¬πŸ‡§πŸ‡¨πŸ‡¦", + "Flags: πŸ‡¨πŸ‡¦πŸ‡¬πŸ‡§πŸ‡ΊπŸ‡Έ", + ), + # Zero-width joiner sequences (family emoji) + ( + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§\u200dπŸ‘¦ (with two children)", + "A family πŸ‘¨\u200dπŸ‘©\u200dπŸ‘§ (with one child)", + ), + # Mixed RTL/LTR text + ( + "Hello Χ©ΧœΧ•Χ world", + "Hello Χ’Χ•ΧœΧ world", + ), + # Keycap sequences + ( + "Numbers: 1️⃣2️⃣3️⃣", + "Numbers: 3️⃣2️⃣1️⃣", + ), + # Emoji at boundaries + ( + "πŸ‘‹ middle text πŸŽ‰", + "πŸŽ‰ middle text πŸ‘‹", + ), + # Japanese characters + ( + "γ“γ‚“γ«γ‘γ―δΈ–η•Œ", + "γ“γ‚“γ«γ‘γ―εœ°ηƒ", + ), + # Julia math operators + ( + "x ∈ [1, 2, 3] && y β‰₯ 0", + "x βˆ‰ [1, 2, 3] || y ≀ 0", + ), + ], + ids=[ + "emoji_swap", + "text_change_emoji_stay", + "emoji_change", + "combining_marks", + "flags", + "zwj_family", + "rtl_ltr", + "keycap", + "emoji_boundaries", + "japanese", + "math_operators", + ], +) +def test_unicode_granular_diff(initial, updated, offset_kind): + """Granular text edits with multi-byte Unicode should produce correct results. + + Test cases adapted from jupyter-server/jupyter_ydoc#370. + """ + doc = Doc(offset_kind=offset_kind) + doc["text"] = text = Text() + + text += initial + assert str(text) == initial + + _apply_diff(text, initial, updated) + assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}" + + +def test_get_utf16_index(): + # ASCII: identity + assert get_utf16_index("hello", 0) == 0 + assert get_utf16_index("hello", 5) == 5 + # BMP CJK: 1 code point = 1 UTF-16 code unit + assert get_utf16_index("δ»·ζ Ό", 2) == 2 + # Non-BMP emoji: 1 code point = 2 UTF-16 code units (surrogate pair) + assert get_utf16_index("AπŸ“ŠB", 1) == 1 + assert get_utf16_index("AπŸ“ŠB", 2) == 3 + assert get_utf16_index("AπŸ“ŠB", 3) == 4 + + +def test_get_utf8_index(): + # ASCII: identity + assert get_utf8_index("hello", 0) == 0 + assert get_utf8_index("hello", 5) == 5 + # BMP CJK: 1 code point = 3 UTF-8 bytes + assert get_utf8_index("δ»·ζ Ό", 1) == 3 + assert get_utf8_index("δ»·ζ Ό", 2) == 6 + # Non-BMP emoji: 1 code point = 4 UTF-8 bytes + assert get_utf8_index("AπŸ“ŠB", 1) == 1 + assert get_utf8_index("AπŸ“ŠB", 2) == 5 + assert get_utf8_index("AπŸ“ŠB", 3) == 6 + + +def test_offset_kind_default_is_utf8(): + assert Doc().offset_kind == "utf8" + + +def test_offset_kind_explicit(): + assert Doc(offset_kind="utf8").offset_kind == "utf8" + assert Doc(offset_kind="utf16").offset_kind == "utf16" + # hyphenated forms accepted + assert Doc(offset_kind="utf-8").offset_kind == "utf8" + assert Doc(offset_kind="utf-16").offset_kind == "utf16" + + +def test_offset_kind_invalid_raises(): + with pytest.raises(ValueError): + Doc(offset_kind="utf32") + + +def test_offset_kind_doc_mismatch_raises(): + """Doc(doc=existing, offset_kind=other) must reject conflicting values.""" + utf8_doc = Doc(offset_kind="utf8") + with pytest.raises(ValueError, match="does not match"): + Doc(doc=utf8_doc._doc, offset_kind="utf16") + + +def test_offset_kind_snapshot_round_trip(offset_kind): + """from_snapshot must preserve the source doc's offset_kind.""" + from pycrdt import Snapshot + + doc = Doc(offset_kind=offset_kind, skip_gc=True) + doc["text"] = Text("AπŸ“ŠB") + snap = Snapshot.from_doc(doc) + restored = Doc.from_snapshot(snap, doc) + assert restored.offset_kind == offset_kind, ( + f"snapshot lost offset_kind: expected {offset_kind}, got {restored.offset_kind}" + ) + assert str(restored["text"]) == "AπŸ“ŠB" + + def test_sticky_index_transaction(): doc = Doc() text = doc.get("text", type=Text)