From 669f98db504af8fcb710c5da91e44d32cff1696f Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 12:23:25 -0400
Subject: [PATCH 01/13] fix: use UTF-16 offsets for Text operations (fixes
 #308)

Set OffsetKind::Utf16 on yrs Doc so the wire format uses UTF-16 code
unit offsets, matching JS yjs. Without this, pycrdt uses UTF-8 byte
offsets, causing findIndexSS "Unexpected case" crashes when JS yjs
clients apply incremental updates containing multi-byte characters.

In the Python wrapper, convert character (code point) indices to
UTF-16 code unit indices before passing to yrs. This ensures
Text.insert(), insert_embed(), __setitem__, __delitem__, and format()
all work correctly with emoji and other non-BMP characters.

Fixes: #308
Related: jupyter-ai-contrib/jupyter-server-documents#197
---
 python/pycrdt/_text.py | 77 ++++++++++++++++++++++++++++++++++--------
 src/doc.rs             |  8 ++++-
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 49356c0..418bab3 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -11,6 +11,36 @@
     from ._doc import Doc
 
 
+def _char_to_utf16(text: str, char_index: int) -> int:
+    """Convert a Python character (code point) index to a UTF-16 code unit index.
+
+    Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
+    UTF-16 code units but only 1 Python character.  The underlying yrs library
+    uses UTF-16 offsets, so all indices passed to it must be converted.
+
+    For pure-ASCII / BMP text this is a no-op (returns ``char_index``
+    unchanged).
+    """
+    if char_index == 0:
+        return 0
+    prefix = text[:char_index]
+    # Count characters that need a surrogate pair (code point > 0xFFFF)
+    extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF)
+    return char_index + extra
+
+
+def _utf16_to_char(text: str, utf16_index: int) -> int:
+    """Convert a UTF-16 code unit index back to a Python character index."""
+    char_idx = 0
+    utf16_idx = 0
+    for ch in text:
+        if utf16_idx >= utf16_index:
+            break
+        utf16_idx += 2 if ord(ch) > 0xFFFF else 1
+        char_idx += 1
+    return char_idx
+
+
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -89,10 +119,10 @@ def __len__(self) -> int:
         ```
 
         Returns:
-            The length of the text.
+            The length of the text (in Python characters, not UTF-16 code units).
         """
-        with self.doc.transaction() as txn:
-            return self.integrated.len(txn._txn)
+        # Return Python character count, not yrs UTF-16 code unit count
+        return len(str(self))
 
     def __str__(self) -> str:
         """
@@ -169,13 +199,19 @@ def __delitem__(self, key: int | slice) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
             if isinstance(key, int):
-                self.integrated.remove_range(txn._txn, key, 1)
+                utf16_idx = _char_to_utf16(current, key)
+                char_at = current[key]
+                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
+                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
                 length = stop - start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
+                    utf16_start = _char_to_utf16(current, start)
+                    utf16_stop = _char_to_utf16(current, stop)
+                    self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -214,20 +250,26 @@ def __setitem__(self, key: int | slice, value: str) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
             if isinstance(key, int):
                 value_len = len(value)
                 if value_len != 1:
                     raise RuntimeError(
                         f"Single item assigned value must have a length of 1, not {value_len}"
                     )
-                del self[key]
-                self.integrated.insert(txn._txn, key, value)
+                utf16_idx = _char_to_utf16(current, key)
+                char_at = current[key]
+                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
+                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
+                self.integrated.insert(txn._txn, utf16_idx, value)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                length = stop - start
+                utf16_start = _char_to_utf16(current, start)
+                utf16_stop = _char_to_utf16(current, stop)
+                length = utf16_stop - utf16_start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
-                self.integrated.insert(txn._txn, start, value)
+                    self.integrated.remove_range(txn._txn, utf16_start, length)
+                self.integrated.insert(txn._txn, utf16_start, value)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -251,8 +293,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) ->
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, index)
             self.integrated.insert(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
@@ -266,8 +310,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, index)
             self.integrated.insert_embed(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
@@ -282,9 +328,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             start, stop = self._check_slice(slice(start, stop))
-            length = stop - start
+            current = str(self)
+            utf16_start = _char_to_utf16(current, start)
+            utf16_stop = _char_to_utf16(current, stop)
+            length = utf16_stop - utf16_start
             if length > 0:
-                self.integrated.format(txn._txn, start, length, iter(attrs.items()))
+                self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items()))
 
     def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
         """
diff --git a/src/doc.rs b/src/doc.rs
index 61109cc..27e875d 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt;
 use pyo3::exceptions::{PyRuntimeError, PyValueError};
 use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList};
 use yrs::{
-    Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
+    Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
 };
 use yrs::updates::encoder::{Encode, Encoder};
 use yrs::updates::decoder::Decode;
@@ -32,6 +32,7 @@ impl Doc {
         let mut options = yrs::Options::default();
         options.client_id = original.doc.client_id();
         options.skip_gc = original.doc.skip_gc();
+        options.offset_kind = OffsetKind::Utf16;
         if let Some(collection_id) = original.doc.collection_id() {
             options.collection_id = Some(collection_id);
         }
@@ -84,6 +85,11 @@ impl Doc {
                 .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?;
             options.skip_gc = _skip_gc;
         }
+        // Use UTF-16 offsets for compatibility with JS yjs clients.
+        // Without this, pycrdt uses UTF-8 byte offsets which causes
+        // findIndexSS crashes when JS yjs applies incremental updates
+        // containing multi-byte characters.
+        options.offset_kind = OffsetKind::Utf16;
         let doc = _Doc::with_options(options);
         Ok(Doc { doc })
     }

From f3a2e7e596fd2d7bf2b35ea4fdbf97eb7b5eb484 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:44:42 -0400
Subject: [PATCH 02/13] test: add 12 Unicode/emoji tests for Text operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cover insert, delete, setitem, slice, len, and cross-doc sync with:
- emoji (surrogate pairs: 📊 🎉)
- CJK (BMP: 价格 世界 特征工程)
- Cyrillic (мир)
- supplementary plane (𝒜 𠀀)
- mixed scripts in one text

These all fail on stock pycrdt 0.12.50 and pass with the OffsetKind::Utf16 fix.
---
 tests/test_text.py | 153 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index ba913a0..ab4e507 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -228,6 +228,159 @@ def test_sticky_index(serialize: str):
     assert text1[new_idx] == "*"
 
 
+def test_unicode_emoji_insert():
+    """Text.insert() after emoji characters should use character positions, not byte offsets."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    assert str(text) == "A📊B"
+    assert len(text) == 3
+
+    # Insert at position 2 = between 📊 and B
+    text.insert(2, "X")
+    assert str(text) == "A📊XB", f"Got {str(text)!r}, emoji insert position is wrong"
+
+
+def test_unicode_emoji_sequential_inserts():
+    """Sequential inserts after emoji should maintain correct positions."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "# Analysis 📊\n"
+    text.insert(len(text), "model = fit()\n")
+    text.insert(len(text), "# 特征工程\n")
+    text.insert(len(text), 'print("done")\n')
+
+    expected = '# Analysis 📊\nmodel = fit()\n# 特征工程\nprint("done")\n'
+    assert str(text) == expected, f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_len():
+    """len() should return Python character count, not byte count."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    assert len(text) == 3  # 3 chars, not 6 bytes or 4 UTF-16 code units
+
+    text += "🎉"
+    assert len(text) == 4
+
+
+def test_unicode_emoji_delete():
+    """Deleting a character after an emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊BC")
+
+    del text[2]  # delete B (after emoji)
+    assert str(text) == "A📊C", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_delete_emoji():
+    """Deleting an emoji character itself should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B")
+
+    del text[1]  # delete 📊
+    assert str(text) == "AB", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_slice_delete():
+    """Slice deletion across emoji boundaries should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B🎉C")
+
+    del text[1:4]  # delete 📊B🎉
+    assert str(text) == "AC", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_setitem():
+    """Replacing a character after an emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊BC")
+
+    text[2] = "X"  # replace B (after emoji)
+    assert str(text) == "A📊XC", f"Got {str(text)!r}"
+
+
+def test_unicode_emoji_slice_setitem():
+    """Slice replacement spanning emoji should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text("A📊B🎉C")
+
+    text[1:4] = "XYZ"  # replace 📊B🎉 with XYZ
+    assert str(text) == "AXYZC", f"Got {str(text)!r}"
+
+
+def test_unicode_cjk():
+    """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "价格"
+    text.insert(2, "X")
+    assert str(text) == "价格X", f"Got {str(text)!r}"
+    assert len(text) == 3
+
+
+def test_unicode_mixed_scripts():
+    """Mixed ASCII, CJK, Cyrillic, and emoji in one text."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "Hello"
+    text.insert(5, " 世界")
+    text.insert(8, " 📊")
+    text.insert(11, " мир")
+    text.insert(15, "!")
+
+    expected = "Hello 世界 📊 мир!"
+    assert str(text) == expected, f"Got {str(text)!r}"
+    assert len(text) == 15
+
+
+def test_unicode_supplementary_plane():
+    """Characters outside BMP (require UTF-16 surrogate pairs)."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    # 𝒜 (U+1D49C) = Mathematical Script Capital A
+    # 𠀀 (U+20000) = CJK Unified Ideograph Extension B
+    text += "A𝒜B𠀀C"
+    assert len(text) == 5
+
+    text.insert(2, "X")  # between 𝒜 and B
+    assert str(text) == "A𝒜XB𠀀C", f"Got {str(text)!r}"
+
+    text.insert(5, "Y")  # between 𠀀 and C
+    assert str(text) == "A𝒜XB𠀀YC", f"Got {str(text)!r}"
+
+
+def test_unicode_cross_doc_sync():
+    """Updates with Unicode content should sync correctly between two pycrdt docs."""
+    doc1 = Doc()
+    doc1["text"] = text1 = Text()
+
+    # Capture updates from doc1
+    updates = []
+    doc1.observe(lambda event: updates.append(event.update))
+
+    text1 += "# Analysis 📊\n"
+    text1.insert(len(text1), "model = fit()\n")
+    text1.insert(len(text1), "# 特征工程\n")
+
+    # Apply to doc2
+    doc2 = Doc()
+    doc2["text"] = Text()
+    for update in updates:
+        doc2.apply_update(update)
+
+    assert str(doc2["text"]) == str(text1), (
+        f"Docs diverged: doc1={str(text1)!r} doc2={str(doc2['text'])!r}"
+    )
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 8264d95de14fb00435d338b76111dc81ab12b97a Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:48:36 -0400
Subject: [PATCH 03/13] test: add granular diff tests from jupyter_ydoc#370

11 parametrized test cases adapted from jupyter-server/jupyter_ydoc#370
covering emoji swaps, flags, ZWJ family sequences, combining marks,
keycap sequences, RTL/LTR text, Japanese, and math operators.

These exercise Text insert/delete/replace via SequenceMatcher-based
diffing (the same pattern jupyter_ydoc.YUnicode.set() uses).
---
 tests/test_text.py | 116 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index ab4e507..dd77df2 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -381,6 +381,122 @@ def test_unicode_cross_doc_sync():
     )
 
 
+# Test cases adapted from jupyter-server/jupyter_ydoc#370 (prior art for
+# the workaround at the jupyter_ydoc layer). These exercise pycrdt's Text
+# operations directly with the same Unicode edge cases. Each test sets
+# initial content, then applies a granular edit (using SequenceMatcher on
+# byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies
+# the result is correct.
+from difflib import SequenceMatcher
+
+
+def _apply_diff(text, old_value, new_value):
+    """Apply a granular diff from old_value to new_value using character-level
+    SequenceMatcher. With the UTF-16 offset fix, pycrdt Text indices are
+    character-based, so we diff on characters (not bytes)."""
+    matcher = SequenceMatcher(a=old_value, b=new_value)
+
+    offset = 0
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        if tag == "replace":
+            text[i1 + offset : i2 + offset] = new_value[j1:j2]
+            offset += (j2 - j1) - (i2 - i1)
+        elif tag == "delete":
+            del text[i1 + offset : i2 + offset]
+            offset -= i2 - i1
+        elif tag == "insert":
+            text.insert(i1 + offset, new_value[j1:j2])
+            offset += j2 - j1
+
+
+@pytest.mark.parametrize(
+    "initial, updated",
+    [
+        # emojis swapped
+        (
+            "I like security 🎨 but I really love painting 🔒",
+            "I like security 🔒 but I really love painting 🎨",
+        ),
+        # text changes, emojis stay in place
+        (
+            "Here is a rocket: ⭐ and a star: 🚀",
+            "Here is a star: ⭐ and a rocket: 🚀",
+        ),
+        # change of text and emojis
+        (
+            "Here are some happy faces: 😀😁😂",
+            "Here are some sad faces: 😞😢😭",
+        ),
+        # change of characters with combining marks
+        (
+            "Combining characters: á é í ó ú",
+            "Combining characters: ú ó í é á",
+        ),
+        # flags (regional indicator sequences)
+        (
+            "Flags: 🇺🇸🇬🇧🇨🇦",
+            "Flags: 🇨🇦🇬🇧🇺🇸",
+        ),
+        # Zero-width joiner sequences (family emoji)
+        (
+            "A family 👨\u200d👩\u200d👧\u200d👦 (with two children)",
+            "A family 👨\u200d👩\u200d👧 (with one child)",
+        ),
+        # Mixed RTL/LTR text
+        (
+            "Hello שלום world",
+            "Hello עולם world",
+        ),
+        # Keycap sequences
+        (
+            "Numbers: 1️⃣2️⃣3️⃣",
+            "Numbers: 3️⃣2️⃣1️⃣",
+        ),
+        # Emoji at boundaries
+        (
+            "👋 middle text 🎉",
+            "🎉 middle text 👋",
+        ),
+        # Japanese characters
+        (
+            "こんにちは世界",
+            "こんにちは地球",
+        ),
+        # Julia math operators
+        (
+            "x ∈ [1, 2, 3] && y ≥ 0",
+            "x ∉ [1, 2, 3] || y ≤ 0",
+        ),
+    ],
+    ids=[
+        "emoji_swap",
+        "text_change_emoji_stay",
+        "emoji_change",
+        "combining_marks",
+        "flags",
+        "zwj_family",
+        "rtl_ltr",
+        "keycap",
+        "emoji_boundaries",
+        "japanese",
+        "math_operators",
+    ],
+)
+def test_unicode_granular_diff(initial, updated):
+    """Granular text edits with multi-byte Unicode should produce correct results.
+
+    Test cases adapted from jupyter-server/jupyter_ydoc#370.
+    """
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += initial
+    assert str(text) == initial
+
+    _apply_diff(text, initial, updated)
+    assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 7cf5af44834ba1ae4f108ea99ed668b90530b723 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 10 Apr 2026 14:52:11 -0400
Subject: [PATCH 04/13] fix: move SequenceMatcher import to top of file (ruff
 E402)

---
 tests/test_text.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index dd77df2..5efdc1f 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -1,3 +1,5 @@
+from difflib import SequenceMatcher
+
 import pytest
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
@@ -387,7 +389,6 @@ def test_unicode_cross_doc_sync():
 # initial content, then applies a granular edit (using SequenceMatcher on
 # byte offsets, matching how jupyter_ydoc.YUnicode.set() works), and verifies
 # the result is correct.
-from difflib import SequenceMatcher
 
 
 def _apply_diff(text, old_value, new_value):

From b1ed6ae56be03f2555f4fc99527143b9a49bf526 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Mon, 13 Apr 2026 20:06:41 -0400
Subject: [PATCH 05/13] test: add tests for _utf16_to_char helper

Addresses review feedback from @davidbrochart. Tests cover ASCII
(identity), BMP characters, supplementary plane (emoji), multiple
emoji, and roundtrip with _char_to_utf16.
---
 tests/test_text.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index 5efdc1f..d67f52e 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,6 +4,7 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
+from pycrdt._text import _char_to_utf16, _utf16_to_char
 
 pytestmark = pytest.mark.anyio
 
@@ -498,6 +499,62 @@ def test_unicode_granular_diff(initial, updated):
     assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
 
 
+def test_utf16_to_char_ascii():
+    """_utf16_to_char is identity for pure ASCII text."""
+    text = "Hello, World!"
+    for i in range(len(text) + 1):
+        assert _utf16_to_char(text, i) == i
+
+
+def test_utf16_to_char_bmp():
+    """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each."""
+    text = "价格分析"  # 4 BMP CJK chars = 4 UTF-16 code units
+    assert _utf16_to_char(text, 0) == 0
+    assert _utf16_to_char(text, 1) == 1
+    assert _utf16_to_char(text, 2) == 2
+    assert _utf16_to_char(text, 4) == 4
+
+
+def test_utf16_to_char_supplementary():
+    """Supplementary plane chars (emoji) take 2 UTF-16 code units."""
+    text = "A📊B"  # UTF-16: A(1) 📊(2) B(1) = 4 code units, 3 chars
+    assert _utf16_to_char(text, 0) == 0  # before A
+    assert _utf16_to_char(text, 1) == 1  # before 📊
+    assert _utf16_to_char(text, 3) == 2  # before B (1 + 2 = 3)
+    assert _utf16_to_char(text, 4) == 3  # end
+
+
+def test_utf16_to_char_multiple_emoji():
+    """Multiple supplementary plane characters."""
+    text = "A📊B🎉C"  # UTF-16: A(1) 📊(2) B(1) 🎉(2) C(1) = 7 units, 5 chars
+    assert _utf16_to_char(text, 0) == 0  # before A
+    assert _utf16_to_char(text, 1) == 1  # before 📊
+    assert _utf16_to_char(text, 3) == 2  # before B
+    assert _utf16_to_char(text, 4) == 3  # before 🎉
+    assert _utf16_to_char(text, 6) == 4  # before C
+    assert _utf16_to_char(text, 7) == 5  # end
+
+
+def test_utf16_to_char_roundtrip():
+    """_char_to_utf16 and _utf16_to_char are inverses."""
+    texts = [
+        "Hello",
+        "A📊B",
+        "价格分析",
+        "# Analysis 📊\n",
+        "A𝒜B𠀀C",
+        "Hello 世界 📊 мир!",
+        "🎉📊🔒",
+    ]
+    for text in texts:
+        for char_idx in range(len(text) + 1):
+            utf16_idx = _char_to_utf16(text, char_idx)
+            assert _utf16_to_char(text, utf16_idx) == char_idx, (
+                f"Roundtrip failed for {text!r} at char_idx={char_idx}: "
+                f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}"
+            )
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From f48610b1a49c2eb312aeb4f4e4e38c76d9489488 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Fri, 17 Apr 2026 00:05:54 -0400
Subject: [PATCH 06/13] fix: convert UTF-16 offset in Text.__iadd__ and drop
 unused _utf16_to_char
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text.__iadd__ passed len(self) (Python character count) to the yrs
insert, but yrs expects a UTF-16 code unit index — so `t += "X"` on
text containing emoji landed one code unit too early for each non-BMP
character (e.g. inside the surrogate pair of a single trailing emoji).
Convert the index through _char_to_utf16, matching every other
mutating method.

Also removes _utf16_to_char and its tests, which had no callers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/_text.py | 16 ++--------
 tests/test_text.py     | 69 ++++++++----------------------------------
 2 files changed, 15 insertions(+), 70 deletions(-)

diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 418bab3..069da86 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -29,18 +29,6 @@ def _char_to_utf16(text: str, char_index: int) -> int:
     return char_index + extra
 
 
-def _utf16_to_char(text: str, utf16_index: int) -> int:
-    """Convert a UTF-16 code unit index back to a Python character index."""
-    char_idx = 0
-    utf16_idx = 0
-    for ch in text:
-        if utf16_idx >= utf16_index:
-            break
-        utf16_idx += 2 if ord(ch) > 0xFFFF else 1
-        char_idx += 1
-    return char_idx
-
-
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -158,7 +146,9 @@ def __iadd__(self, value: str) -> Text:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
-            self.integrated.insert(txn._txn, len(self), value)
+            current = str(self)
+            utf16_index = _char_to_utf16(current, len(current))
+            self.integrated.insert(txn._txn, utf16_index, value)
             return self
 
     def _check_slice(self, key: slice) -> tuple[int, int]:
diff --git a/tests/test_text.py b/tests/test_text.py
index d67f52e..fc06f0a 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,7 +4,7 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
-from pycrdt._text import _char_to_utf16, _utf16_to_char
+from pycrdt._text import _char_to_utf16
 
 pytestmark = pytest.mark.anyio
 
@@ -259,6 +259,17 @@ def test_unicode_emoji_sequential_inserts():
     assert str(text) == expected, f"Got {str(text)!r}"
 
 
+def test_unicode_emoji_iadd():
+    """`+=` after emoji should append at the end (regression for UTF-16 offset bug)."""
+    doc = Doc()
+    doc["text"] = text = Text()
+
+    text += "A📊B"
+    text += "X"
+
+    assert str(text) == "A📊BX"
+
+
 def test_unicode_emoji_len():
     """len() should return Python character count, not byte count."""
     doc = Doc()
@@ -499,62 +510,6 @@ def test_unicode_granular_diff(initial, updated):
     assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
 
 
-def test_utf16_to_char_ascii():
-    """_utf16_to_char is identity for pure ASCII text."""
-    text = "Hello, World!"
-    for i in range(len(text) + 1):
-        assert _utf16_to_char(text, i) == i
-
-
-def test_utf16_to_char_bmp():
-    """BMP characters (CJK, Cyrillic) are 1 UTF-16 code unit each."""
-    text = "价格分析"  # 4 BMP CJK chars = 4 UTF-16 code units
-    assert _utf16_to_char(text, 0) == 0
-    assert _utf16_to_char(text, 1) == 1
-    assert _utf16_to_char(text, 2) == 2
-    assert _utf16_to_char(text, 4) == 4
-
-
-def test_utf16_to_char_supplementary():
-    """Supplementary plane chars (emoji) take 2 UTF-16 code units."""
-    text = "A📊B"  # UTF-16: A(1) 📊(2) B(1) = 4 code units, 3 chars
-    assert _utf16_to_char(text, 0) == 0  # before A
-    assert _utf16_to_char(text, 1) == 1  # before 📊
-    assert _utf16_to_char(text, 3) == 2  # before B (1 + 2 = 3)
-    assert _utf16_to_char(text, 4) == 3  # end
-
-
-def test_utf16_to_char_multiple_emoji():
-    """Multiple supplementary plane characters."""
-    text = "A📊B🎉C"  # UTF-16: A(1) 📊(2) B(1) 🎉(2) C(1) = 7 units, 5 chars
-    assert _utf16_to_char(text, 0) == 0  # before A
-    assert _utf16_to_char(text, 1) == 1  # before 📊
-    assert _utf16_to_char(text, 3) == 2  # before B
-    assert _utf16_to_char(text, 4) == 3  # before 🎉
-    assert _utf16_to_char(text, 6) == 4  # before C
-    assert _utf16_to_char(text, 7) == 5  # end
-
-
-def test_utf16_to_char_roundtrip():
-    """_char_to_utf16 and _utf16_to_char are inverses."""
-    texts = [
-        "Hello",
-        "A📊B",
-        "价格分析",
-        "# Analysis 📊\n",
-        "A𝒜B𠀀C",
-        "Hello 世界 📊 мир!",
-        "🎉📊🔒",
-    ]
-    for text in texts:
-        for char_idx in range(len(text) + 1):
-            utf16_idx = _char_to_utf16(text, char_idx)
-            assert _utf16_to_char(text, utf16_idx) == char_idx, (
-                f"Roundtrip failed for {text!r} at char_idx={char_idx}: "
-                f"utf16={utf16_idx}, back={_utf16_to_char(text, utf16_idx)}"
-            )
-
-
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From 01a1756c7fbb81ed89060756b1af4a8422919de7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 17 Apr 2026 04:06:11 +0000
Subject: [PATCH 07/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_text.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index fc06f0a..f84b933 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -4,7 +4,6 @@
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
 from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
-from pycrdt._text import _char_to_utf16
 
 pytestmark = pytest.mark.anyio
 

From 41c8cc58e691edc1b54c209f40e676b1982b4878 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Tue, 28 Apr 2026 10:23:49 -0400
Subject: [PATCH 08/13] feat: parameterize Doc offset_kind (default UTF-8)

Adds an `offset_kind` parameter to `Doc(...)` so callers can choose
between UTF-8 byte offsets (the yrs default) and UTF-16 code unit
offsets (required for cross-runtime interop with JS yjs). Default is
"utf8", matching yrs.

The Text wrapper previously assumed UTF-16 unconditionally and passed
Python char indices through a UTF-16 conversion. This commit replaces
that with a dispatcher that picks UTF-8 or UTF-16 conversion based on
the doc's offset_kind, so Text behaves correctly in either mode while
the public API still takes Python character indices.

src/doc.rs gains a third positional arg on `Doc::new` accepting "utf8"
/ "utf-8" / "utf16" / "utf-16" / None (None preserves the yrs default).
Invalid values raise PyValueError. _from_snapshot_impl now reads the
offset kind from the source doc instead of hardcoding it. A new
`#[getter] offset_kind` returns the canonical "utf8" or "utf16" string.

_base.py forwards the new kwarg, and raises ValueError if both `doc=`
(an existing _Doc) and `offset_kind=` are supplied with disagreeing
values. _doc.py adds the kwarg to Doc.__init__, documents it, and
exposes a read-only `Doc.offset_kind` property.

Addresses davidbrochart's review on #379 asking for parameterization
with a UTF-8 default.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/_base.py |  8 ++++-
 python/pycrdt/_doc.py  | 17 +++++++++
 python/pycrdt/_text.py | 82 ++++++++++++++++++++++++++----------------
 src/doc.rs             | 33 +++++++++++++----
 4 files changed, 101 insertions(+), 39 deletions(-)

diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py
index 1443d7d..4aebdfa 100644
--- a/python/pycrdt/_base.py
+++ b/python/pycrdt/_base.py
@@ -61,6 +61,7 @@ def __init__(
         *,
         client_id: int | None = None,
         skip_gc: bool | None = None,
+        offset_kind: str | None = None,
         doc: _Doc | None = None,
         Model=None,
         allow_multithreading: bool = False,
@@ -68,7 +69,12 @@ def __init__(
     ) -> None:
         super().__init__(**data)
         if doc is None:
-            doc = _Doc(client_id, skip_gc)
+            doc = _Doc(client_id, skip_gc, offset_kind)
+        elif offset_kind is not None and offset_kind != doc.offset_kind:
+            raise ValueError(
+                f"offset_kind={offset_kind!r} does not match doc.offset_kind="
+                f"{doc.offset_kind!r}"
+            )
         self._doc = doc
         self._txn = None
         self._exceptions = []
diff --git a/python/pycrdt/_doc.py b/python/pycrdt/_doc.py
index 07b140c..34f37c6 100644
--- a/python/pycrdt/_doc.py
+++ b/python/pycrdt/_doc.py
@@ -48,6 +48,7 @@ def __init__(
         *,
         client_id: int | None = None,
         skip_gc: bool | None = None,
+        offset_kind: str | None = None,
         doc: _Doc | None = None,
         Model=None,
         allow_multithreading: bool = False,
@@ -58,11 +59,18 @@ def __init__(
             client_id: An optional client ID for the document.
             skip_gc: Whether to skip garbage collection on deleted collections
                 on transaction commit.
+            offset_kind: How yrs counts text positions internally. ``"utf8"``
+                (the yrs default) uses byte offsets; ``"utf16"`` uses UTF-16
+                code unit offsets and is required for cross-runtime
+                compatibility with JS yjs. ``None`` (default) selects the yrs
+                default of ``"utf8"``. Regardless of this setting, the public
+                ``Text`` API always takes Python character indices.
             allow_multithreading: Whether to allow the document to be used in different threads.
         """
         super().__init__(
             client_id=client_id,
             skip_gc=skip_gc,
+            offset_kind=offset_kind,
             doc=doc,
             Model=Model,
             allow_multithreading=allow_multithreading,
@@ -86,6 +94,15 @@ def client_id(self) -> int:
         """The document client ID."""
         return self._doc.client_id()
 
+    @property
+    def offset_kind(self) -> str:
+        """The text offset kind used internally by yrs.
+
+        Returns ``"utf8"`` or ``"utf16"``. See [Doc.__init__][pycrdt.Doc.__init__]
+        for the meaning.
+        """
+        return self._doc.offset_kind
+
     def transaction(self, origin: Any = None) -> Transaction:
         """
         Creates a new transaction or gets the current one, if any.
diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 069da86..0b9977d 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -15,8 +15,7 @@ def _char_to_utf16(text: str, char_index: int) -> int:
     """Convert a Python character (code point) index to a UTF-16 code unit index.
 
     Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
-    UTF-16 code units but only 1 Python character.  The underlying yrs library
-    uses UTF-16 offsets, so all indices passed to it must be converted.
+    UTF-16 code units but only 1 Python character.
 
     For pure-ASCII / BMP text this is a no-op (returns ``char_index``
     unchanged).
@@ -29,6 +28,25 @@ def _char_to_utf16(text: str, char_index: int) -> int:
     return char_index + extra
 
 
+def _char_to_utf8(text: str, char_index: int) -> int:
+    """Convert a Python character (code point) index to a UTF-8 byte index."""
+    if char_index == 0:
+        return 0
+    return len(text[:char_index].encode("utf-8"))
+
+
+def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int:
+    if offset_kind == "utf16":
+        return _char_to_utf16(text, char_index)
+    return _char_to_utf8(text, char_index)
+
+
+def _single_char_unit_len(char: str, offset_kind: str) -> int:
+    if offset_kind == "utf16":
+        return 2 if ord(char) > 0xFFFF else 1
+    return len(char.encode("utf-8"))
+
+
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -147,8 +165,8 @@ def __iadd__(self, value: str) -> Text:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             current = str(self)
-            utf16_index = _char_to_utf16(current, len(current))
-            self.integrated.insert(txn._txn, utf16_index, value)
+            offset = _char_to_offset(current, len(current), self.doc.offset_kind)
+            self.integrated.insert(txn._txn, offset, value)
             return self
 
     def _check_slice(self, key: slice) -> tuple[int, int]:
@@ -190,18 +208,19 @@ def __delitem__(self, key: int | slice) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             current = str(self)
+            ok = self.doc.offset_kind
             if isinstance(key, int):
-                utf16_idx = _char_to_utf16(current, key)
-                char_at = current[key]
-                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
-                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
+                offset = _char_to_offset(current, key, ok)
+                unit_len = _single_char_unit_len(current[key], ok)
+                self.integrated.remove_range(txn._txn, offset, unit_len)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                length = stop - start
-                if length > 0:
-                    utf16_start = _char_to_utf16(current, start)
-                    utf16_stop = _char_to_utf16(current, stop)
-                    self.integrated.remove_range(txn._txn, utf16_start, utf16_stop - utf16_start)
+                if stop - start > 0:
+                    offset_start = _char_to_offset(current, start, ok)
+                    offset_stop = _char_to_offset(current, stop, ok)
+                    self.integrated.remove_range(
+                        txn._txn, offset_start, offset_stop - offset_start
+                    )
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -241,25 +260,25 @@ def __setitem__(self, key: int | slice, value: str) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             current = str(self)
+            ok = self.doc.offset_kind
             if isinstance(key, int):
                 value_len = len(value)
                 if value_len != 1:
                     raise RuntimeError(
                         f"Single item assigned value must have a length of 1, not {value_len}"
                     )
-                utf16_idx = _char_to_utf16(current, key)
-                char_at = current[key]
-                utf16_len = 2 if ord(char_at) > 0xFFFF else 1
-                self.integrated.remove_range(txn._txn, utf16_idx, utf16_len)
-                self.integrated.insert(txn._txn, utf16_idx, value)
+                offset = _char_to_offset(current, key, ok)
+                unit_len = _single_char_unit_len(current[key], ok)
+                self.integrated.remove_range(txn._txn, offset, unit_len)
+                self.integrated.insert(txn._txn, offset, value)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                utf16_start = _char_to_utf16(current, start)
-                utf16_stop = _char_to_utf16(current, stop)
-                length = utf16_stop - utf16_start
+                offset_start = _char_to_offset(current, start, ok)
+                offset_stop = _char_to_offset(current, stop, ok)
+                length = offset_stop - offset_start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, utf16_start, length)
-                self.integrated.insert(txn._txn, utf16_start, value)
+                    self.integrated.remove_range(txn._txn, offset_start, length)
+                self.integrated.insert(txn._txn, offset_start, value)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -284,9 +303,9 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) ->
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             current = str(self)
-            utf16_index = _char_to_utf16(current, index)
+            offset = _char_to_offset(current, index, self.doc.offset_kind)
             self.integrated.insert(
-                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
@@ -301,9 +320,9 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             current = str(self)
-            utf16_index = _char_to_utf16(current, index)
+            offset = _char_to_offset(current, index, self.doc.offset_kind)
             self.integrated.insert_embed(
-                txn._txn, utf16_index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
@@ -319,11 +338,12 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
             self._forbid_read_transaction(txn)
             start, stop = self._check_slice(slice(start, stop))
             current = str(self)
-            utf16_start = _char_to_utf16(current, start)
-            utf16_stop = _char_to_utf16(current, stop)
-            length = utf16_stop - utf16_start
+            ok = self.doc.offset_kind
+            offset_start = _char_to_offset(current, start, ok)
+            offset_stop = _char_to_offset(current, stop, ok)
+            length = offset_stop - offset_start
             if length > 0:
-                self.integrated.format(txn._txn, utf16_start, length, iter(attrs.items()))
+                self.integrated.format(txn._txn, offset_start, length, iter(attrs.items()))
 
     def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
         """
diff --git a/src/doc.rs b/src/doc.rs
index 27e875d..455c61f 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -32,7 +32,7 @@ impl Doc {
         let mut options = yrs::Options::default();
         options.client_id = original.doc.client_id();
         options.skip_gc = original.doc.skip_gc();
-        options.offset_kind = OffsetKind::Utf16;
+        options.offset_kind = original.doc.offset_kind();
         if let Some(collection_id) = original.doc.collection_id() {
             options.collection_id = Some(collection_id);
         }
@@ -69,7 +69,11 @@ impl Doc {
 #[pymethods]
 impl Doc {
     #[new]
-    fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult<Self> {
+    fn new(
+        client_id: &Bound<'_, PyAny>,
+        skip_gc: &Bound<'_, PyAny>,
+        offset_kind: &Bound<'_, PyAny>,
+    ) -> PyResult<Self> {
         let mut options = Options::default();
         if !client_id.is_none() {
             let _client_id: u64 = client_id.cast::<PyInt>()
@@ -85,15 +89,30 @@ impl Doc {
                 .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?;
             options.skip_gc = _skip_gc;
         }
-        // Use UTF-16 offsets for compatibility with JS yjs clients.
-        // Without this, pycrdt uses UTF-8 byte offsets which causes
-        // findIndexSS crashes when JS yjs applies incremental updates
-        // containing multi-byte characters.
-        options.offset_kind = OffsetKind::Utf16;
+        if !offset_kind.is_none() {
+            let _offset_kind: String = offset_kind
+                .extract()
+                .map_err(|_| PyValueError::new_err("offset_kind must be a string"))?;
+            options.offset_kind = match _offset_kind.as_str() {
+                "utf8" | "utf-8" => OffsetKind::Bytes,
+                "utf16" | "utf-16" => OffsetKind::Utf16,
+                _ => return Err(PyValueError::new_err(
+                    "offset_kind must be 'utf8' or 'utf16'",
+                )),
+            };
+        }
         let doc = _Doc::with_options(options);
         Ok(Doc { doc })
     }
 
+    #[getter]
+    fn offset_kind(&self) -> &'static str {
+        match self.doc.offset_kind() {
+            OffsetKind::Bytes => "utf8",
+            OffsetKind::Utf16 => "utf16",
+        }
+    }
+
     #[staticmethod]
     #[pyo3(name = "from_snapshot")]
     pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult<Py<Doc>> {

From 855af5c8c5fa64e7d04829bc54e36e57e270c925 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Tue, 28 Apr 2026 10:25:20 -0400
Subject: [PATCH 09/13] refactor: make utf16/utf8 index helpers public
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses davidbrochart's three inline reviews on #379:

- _text.py:14 — rename `_char_to_utf16` → `get_utf16_index` (public,
  per the suggested rename). Add a counterpart `get_utf8_index` since
  with offset_kind parameterization the conversion is needed for both
  offset kinds (jbdyn + davidbrochart agreed in the thread). The
  dispatcher and single-char-length helpers stay private — trivial
  conditionals best read at the call site.
- _text.py:23 — both public helpers now use Google-style docstrings
  (Args / Returns sections), matching the rest of the module.
- _text.py:112 — drop the inline comment that duplicated the
  docstring; tighten the docstring to match.

Re-export the two helpers from `pycrdt.__init__` so consumers can
reach them without importing from the underscore module. Drop a
now-dead `_char_to_utf16` import in tests/test_text.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/__init__.py |  2 ++
 python/pycrdt/_text.py    | 32 ++++++++++++++++++++++----------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/python/pycrdt/__init__.py b/python/pycrdt/__init__.py
index d849660..77446aa 100644
--- a/python/pycrdt/__init__.py
+++ b/python/pycrdt/__init__.py
@@ -35,6 +35,8 @@
 from ._sync import write_var_uint as write_var_uint
 from ._text import Text as Text
 from ._text import TextEvent as TextEvent
+from ._text import get_utf8_index as get_utf8_index
+from ._text import get_utf16_index as get_utf16_index
 from ._transaction import NewTransaction as NewTransaction
 from ._transaction import ReadTransaction as ReadTransaction
 from ._transaction import Transaction as Transaction
diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index 0b9977d..ccb3d3b 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -11,14 +11,19 @@
     from ._doc import Doc
 
 
-def _char_to_utf16(text: str, char_index: int) -> int:
+def get_utf16_index(text: str, char_index: int) -> int:
     """Convert a Python character (code point) index to a UTF-16 code unit index.
 
     Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
-    UTF-16 code units but only 1 Python character.
+    UTF-16 code units but only 1 Python character. For pure-ASCII / BMP
+    text this is a no-op.
 
-    For pure-ASCII / BMP text this is a no-op (returns ``char_index``
-    unchanged).
+    Args:
+        text: The string against which ``char_index`` is interpreted.
+        char_index: A Python (code point) index into ``text``.
+
+    Returns:
+        The corresponding UTF-16 code unit offset.
     """
     if char_index == 0:
         return 0
@@ -28,8 +33,16 @@ def _char_to_utf16(text: str, char_index: int) -> int:
     return char_index + extra
 
 
-def _char_to_utf8(text: str, char_index: int) -> int:
-    """Convert a Python character (code point) index to a UTF-8 byte index."""
+def get_utf8_index(text: str, char_index: int) -> int:
+    """Convert a Python character (code point) index to a UTF-8 byte index.
+
+    Args:
+        text: The string against which ``char_index`` is interpreted.
+        char_index: A Python (code point) index into ``text``.
+
+    Returns:
+        The corresponding UTF-8 byte offset.
+    """
     if char_index == 0:
         return 0
     return len(text[:char_index].encode("utf-8"))
@@ -37,8 +50,8 @@ def _char_to_utf8(text: str, char_index: int) -> int:
 
 def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int:
     if offset_kind == "utf16":
-        return _char_to_utf16(text, char_index)
-    return _char_to_utf8(text, char_index)
+        return get_utf16_index(text, char_index)
+    return get_utf8_index(text, char_index)
 
 
 def _single_char_unit_len(char: str, offset_kind: str) -> int:
@@ -125,9 +138,8 @@ def __len__(self) -> int:
         ```
 
         Returns:
-            The length of the text (in Python characters, not UTF-16 code units).
+            The length of the text in Python characters.
         """
-        # Return Python character count, not yrs UTF-16 code unit count
         return len(str(self))
 
     def __str__(self) -> str:

From e41d3450e76c0814b79630fcaecbf48e22718f1b Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Tue, 28 Apr 2026 10:26:11 -0400
Subject: [PATCH 10/13] test: parametrize Unicode tests over offset_kind
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds an `offset_kind` pytest fixture parametrizing all Unicode tests
over both ``"utf8"`` and ``"utf16"`` modes. The same test bodies now
exercise the dispatcher in `Text` for both yrs offset configurations,
without changing the assertions — public API contract is always Python
char indices.

`test_unicode_cross_doc_sync` constructs both peers with the same
offset_kind (mismatched offset kinds across peers are unsupported by
yrs/yjs). New explicit tests cover:
- default-is-utf8
- explicit "utf8" / "utf-8" / "utf16" / "utf-16"
- ValueError on unknown values
- snapshot round-trip preserves offset_kind for both modes
- new public `get_utf16_index` / `get_utf8_index` helpers
  (identity / BMP CJK / non-BMP emoji)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_text.py | 152 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 37 deletions(-)

diff --git a/tests/test_text.py b/tests/test_text.py
index f84b933..6e5575e 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -3,7 +3,16 @@
 import pytest
 from anyio import TASK_STATUS_IGNORED, Event, create_task_group
 from anyio.abc import TaskStatus
-from pycrdt import Array, Assoc, Doc, Map, StickyIndex, Text
+from pycrdt import (
+    Array,
+    Assoc,
+    Doc,
+    Map,
+    StickyIndex,
+    Text,
+    get_utf8_index,
+    get_utf16_index,
+)
 
 pytestmark = pytest.mark.anyio
 
@@ -230,9 +239,14 @@ def test_sticky_index(serialize: str):
     assert text1[new_idx] == "*"
 
 
-def test_unicode_emoji_insert():
-    """Text.insert() after emoji characters should use character positions, not byte offsets."""
-    doc = Doc()
+@pytest.fixture(params=["utf8", "utf16"])
+def offset_kind(request):
+    return request.param
+
+
+def test_unicode_emoji_insert(offset_kind):
+    """Text.insert() after emoji characters should use character positions."""
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "A📊B"
@@ -244,9 +258,9 @@ def test_unicode_emoji_insert():
     assert str(text) == "A📊XB", f"Got {str(text)!r}, emoji insert position is wrong"
 
 
-def test_unicode_emoji_sequential_inserts():
+def test_unicode_emoji_sequential_inserts(offset_kind):
     """Sequential inserts after emoji should maintain correct positions."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "# Analysis 📊\n"
@@ -258,9 +272,11 @@ def test_unicode_emoji_sequential_inserts():
     assert str(text) == expected, f"Got {str(text)!r}"
 
 
-def test_unicode_emoji_iadd():
-    """`+=` after emoji should append at the end (regression for UTF-16 offset bug)."""
-    doc = Doc()
+def test_unicode_emoji_iadd(offset_kind):
+    """`+=` after emoji should append at the end (regression for the original
+    Text.__iadd__ bug where len(self) was passed to yrs as if it were already
+    in offset units)."""
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "A📊B"
@@ -269,66 +285,66 @@ def test_unicode_emoji_iadd():
     assert str(text) == "A📊BX"
 
 
-def test_unicode_emoji_len():
-    """len() should return Python character count, not byte count."""
-    doc = Doc()
+def test_unicode_emoji_len(offset_kind):
+    """len() should return Python character count, regardless of offset_kind."""
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "A📊B"
-    assert len(text) == 3  # 3 chars, not 6 bytes or 4 UTF-16 code units
+    assert len(text) == 3
 
     text += "🎉"
     assert len(text) == 4
 
 
-def test_unicode_emoji_delete():
+def test_unicode_emoji_delete(offset_kind):
     """Deleting a character after an emoji should work correctly."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text("A📊BC")
 
     del text[2]  # delete B (after emoji)
     assert str(text) == "A📊C", f"Got {str(text)!r}"
 
 
-def test_unicode_emoji_delete_emoji():
+def test_unicode_emoji_delete_emoji(offset_kind):
     """Deleting an emoji character itself should work correctly."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text("A📊B")
 
     del text[1]  # delete 📊
     assert str(text) == "AB", f"Got {str(text)!r}"
 
 
-def test_unicode_emoji_slice_delete():
+def test_unicode_emoji_slice_delete(offset_kind):
     """Slice deletion across emoji boundaries should work correctly."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text("A📊B🎉C")
 
     del text[1:4]  # delete 📊B🎉
     assert str(text) == "AC", f"Got {str(text)!r}"
 
 
-def test_unicode_emoji_setitem():
+def test_unicode_emoji_setitem(offset_kind):
     """Replacing a character after an emoji should work correctly."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text("A📊BC")
 
     text[2] = "X"  # replace B (after emoji)
     assert str(text) == "A📊XC", f"Got {str(text)!r}"
 
 
-def test_unicode_emoji_slice_setitem():
+def test_unicode_emoji_slice_setitem(offset_kind):
     """Slice replacement spanning emoji should work correctly."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text("A📊B🎉C")
 
     text[1:4] = "XYZ"  # replace 📊B🎉 with XYZ
     assert str(text) == "AXYZC", f"Got {str(text)!r}"
 
 
-def test_unicode_cjk():
-    """CJK characters (BMP, 1 UTF-16 code unit each) should work correctly."""
-    doc = Doc()
+def test_unicode_cjk(offset_kind):
+    """CJK characters (BMP, 1 UTF-16 code unit but 3 UTF-8 bytes each)."""
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "价格"
@@ -337,9 +353,9 @@ def test_unicode_cjk():
     assert len(text) == 3
 
 
-def test_unicode_mixed_scripts():
+def test_unicode_mixed_scripts(offset_kind):
     """Mixed ASCII, CJK, Cyrillic, and emoji in one text."""
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += "Hello"
@@ -353,9 +369,9 @@ def test_unicode_mixed_scripts():
     assert len(text) == 15
 
 
-def test_unicode_supplementary_plane():
-    """Characters outside BMP (require UTF-16 surrogate pairs)."""
-    doc = Doc()
+def test_unicode_supplementary_plane(offset_kind):
+    """Characters outside BMP."""
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     # 𝒜 (U+1D49C) = Mathematical Script Capital A
@@ -370,9 +386,14 @@ def test_unicode_supplementary_plane():
     assert str(text) == "A𝒜XB𠀀YC", f"Got {str(text)!r}"
 
 
-def test_unicode_cross_doc_sync():
-    """Updates with Unicode content should sync correctly between two pycrdt docs."""
-    doc1 = Doc()
+def test_unicode_cross_doc_sync(offset_kind):
+    """Updates with Unicode content should sync correctly between two pycrdt docs.
+
+    Both docs must use the same offset_kind — peers with mismatched offset
+    kinds is a known incompatibility (yrs and yjs both require all peers in
+    a swarm to agree).
+    """
+    doc1 = Doc(offset_kind=offset_kind)
     doc1["text"] = text1 = Text()
 
     # Capture updates from doc1
@@ -384,7 +405,7 @@ def test_unicode_cross_doc_sync():
     text1.insert(len(text1), "# 特征工程\n")
 
     # Apply to doc2
-    doc2 = Doc()
+    doc2 = Doc(offset_kind=offset_kind)
     doc2["text"] = Text()
     for update in updates:
         doc2.apply_update(update)
@@ -494,12 +515,12 @@ def _apply_diff(text, old_value, new_value):
         "math_operators",
     ],
 )
-def test_unicode_granular_diff(initial, updated):
+def test_unicode_granular_diff(initial, updated, offset_kind):
     """Granular text edits with multi-byte Unicode should produce correct results.
 
     Test cases adapted from jupyter-server/jupyter_ydoc#370.
     """
-    doc = Doc()
+    doc = Doc(offset_kind=offset_kind)
     doc["text"] = text = Text()
 
     text += initial
@@ -509,6 +530,63 @@ def test_unicode_granular_diff(initial, updated):
     assert str(text) == updated, f"Got {str(text)!r}, expected {updated!r}"
 
 
+def test_get_utf16_index():
+    # ASCII: identity
+    assert get_utf16_index("hello", 0) == 0
+    assert get_utf16_index("hello", 5) == 5
+    # BMP CJK: 1 code point = 1 UTF-16 code unit
+    assert get_utf16_index("价格", 2) == 2
+    # Non-BMP emoji: 1 code point = 2 UTF-16 code units (surrogate pair)
+    assert get_utf16_index("A📊B", 1) == 1
+    assert get_utf16_index("A📊B", 2) == 3
+    assert get_utf16_index("A📊B", 3) == 4
+
+
+def test_get_utf8_index():
+    # ASCII: identity
+    assert get_utf8_index("hello", 0) == 0
+    assert get_utf8_index("hello", 5) == 5
+    # BMP CJK: 1 code point = 3 UTF-8 bytes
+    assert get_utf8_index("价格", 1) == 3
+    assert get_utf8_index("价格", 2) == 6
+    # Non-BMP emoji: 1 code point = 4 UTF-8 bytes
+    assert get_utf8_index("A📊B", 1) == 1
+    assert get_utf8_index("A📊B", 2) == 5
+    assert get_utf8_index("A📊B", 3) == 6
+
+
+def test_offset_kind_default_is_utf8():
+    assert Doc().offset_kind == "utf8"
+
+
+def test_offset_kind_explicit():
+    assert Doc(offset_kind="utf8").offset_kind == "utf8"
+    assert Doc(offset_kind="utf16").offset_kind == "utf16"
+    # hyphenated forms accepted
+    assert Doc(offset_kind="utf-8").offset_kind == "utf8"
+    assert Doc(offset_kind="utf-16").offset_kind == "utf16"
+
+
+def test_offset_kind_invalid_raises():
+    with pytest.raises(ValueError):
+        Doc(offset_kind="utf32")
+
+
+def test_offset_kind_snapshot_round_trip(offset_kind):
+    """from_snapshot must preserve the source doc's offset_kind."""
+    from pycrdt import Snapshot
+
+    doc = Doc(offset_kind=offset_kind, skip_gc=True)
+    doc["text"] = Text("A📊B")
+    snap = Snapshot.from_doc(doc)
+    restored = Doc.from_snapshot(snap, doc)
+    assert restored.offset_kind == offset_kind, (
+        f"snapshot lost offset_kind: expected {offset_kind}, "
+        f"got {restored.offset_kind}"
+    )
+    assert str(restored["text"]) == "A📊B"
+
+
 def test_sticky_index_transaction():
     doc = Doc()
     text = doc.get("text", type=Text)

From b6b938c169e892a071a73b72004c26602ec79810 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 28 Apr 2026 14:28:18 +0000
Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/pycrdt/_base.py | 3 +--
 python/pycrdt/_text.py | 4 +---
 tests/test_text.py     | 3 +--
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py
index 4aebdfa..53f614f 100644
--- a/python/pycrdt/_base.py
+++ b/python/pycrdt/_base.py
@@ -72,8 +72,7 @@ def __init__(
             doc = _Doc(client_id, skip_gc, offset_kind)
         elif offset_kind is not None and offset_kind != doc.offset_kind:
             raise ValueError(
-                f"offset_kind={offset_kind!r} does not match doc.offset_kind="
-                f"{doc.offset_kind!r}"
+                f"offset_kind={offset_kind!r} does not match doc.offset_kind={doc.offset_kind!r}"
             )
         self._doc = doc
         self._txn = None
diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
index ccb3d3b..d7275e7 100644
--- a/python/pycrdt/_text.py
+++ b/python/pycrdt/_text.py
@@ -230,9 +230,7 @@ def __delitem__(self, key: int | slice) -> None:
                 if stop - start > 0:
                     offset_start = _char_to_offset(current, start, ok)
                     offset_stop = _char_to_offset(current, stop, ok)
-                    self.integrated.remove_range(
-                        txn._txn, offset_start, offset_stop - offset_start
-                    )
+                    self.integrated.remove_range(txn._txn, offset_start, offset_stop - offset_start)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
diff --git a/tests/test_text.py b/tests/test_text.py
index 6e5575e..b4ca78c 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -581,8 +581,7 @@ def test_offset_kind_snapshot_round_trip(offset_kind):
     snap = Snapshot.from_doc(doc)
     restored = Doc.from_snapshot(snap, doc)
     assert restored.offset_kind == offset_kind, (
-        f"snapshot lost offset_kind: expected {offset_kind}, "
-        f"got {restored.offset_kind}"
+        f"snapshot lost offset_kind: expected {offset_kind}, got {restored.offset_kind}"
     )
     assert str(restored["text"]) == "A📊B"
 

From 3e9933a9f0c9f7438155ceefef93dd5a996bd8c8 Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Tue, 28 Apr 2026 10:35:12 -0400
Subject: [PATCH 12/13] fix: update _pycrdt.pyi stub for offset_kind

The Rust-extension stub didn't reflect the new constructor signature
or the offset_kind getter, so `mypy python` failed in CI on _base.py
and _doc.py with "Too many arguments" / "no attribute offset_kind".

Add the third positional `offset_kind: str | None` to `Doc.__init__`
and declare the read-only `offset_kind` property.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/pycrdt/_pycrdt.pyi | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/pycrdt/_pycrdt.pyi b/python/pycrdt/_pycrdt.pyi
index 1498689..73f3ed1 100644
--- a/python/pycrdt/_pycrdt.pyi
+++ b/python/pycrdt/_pycrdt.pyi
@@ -17,7 +17,12 @@ class Snapshot:
 class Doc:
     """Shared document."""
 
-    def __init__(self, client_id: int | None, skip_gc: bool | None) -> None:
+    def __init__(
+        self,
+        client_id: int | None,
+        skip_gc: bool | None,
+        offset_kind: str | None,
+    ) -> None:
         """Create a new document with an optional global client ID.
         If no client ID is passed, a random one will be generated."""
 
@@ -28,6 +33,10 @@ class Doc:
     def client_id(self) -> int:
         """Returns the document unique client identifier."""
 
+    @property
+    def offset_kind(self) -> str:
+        """Returns the offset kind ('utf8' or 'utf16')."""
+
     def guid(self) -> int:
         """Returns the document globally unique identifier."""
 

From 67b2dead51482a32a62c2e8f60e1e0db47e6c2dd Mon Sep 17 00:00:00 2001
From: Xavier Lange <xrlange@gmail.com>
Date: Tue, 28 Apr 2026 11:21:19 -0400
Subject: [PATCH 13/13] test: cover the offset_kind/doc mismatch ValueError

Fixes the ubuntu python-3.14 coverage regression on #379. The new
mismatch check at python/pycrdt/_base.py:74 (raise ValueError when
both doc= and offset_kind= are passed and they disagree) wasn't
exercised by any pytest test, dropping coverage to 99% and tripping
the suite's fail-under=100 gate.

Add a one-shot regression test that constructs an existing Doc with
offset_kind="utf8", then attempts to wrap it as a new Doc with
offset_kind="utf16" and asserts ValueError.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_text.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_text.py b/tests/test_text.py
index b4ca78c..983e0ac 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -572,6 +572,13 @@ def test_offset_kind_invalid_raises():
         Doc(offset_kind="utf32")
 
 
+def test_offset_kind_doc_mismatch_raises():
+    """Doc(doc=existing, offset_kind=other) must reject conflicting values."""
+    utf8_doc = Doc(offset_kind="utf8")
+    with pytest.raises(ValueError, match="does not match"):
+        Doc(doc=utf8_doc._doc, offset_kind="utf16")
+
+
 def test_offset_kind_snapshot_round_trip(offset_kind):
     """from_snapshot must preserve the source doc's offset_kind."""
     from pycrdt import Snapshot