y-crdt · xrl · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/python/pycrdt/__init__.py b/python/pycrdt/__init__.py
@@ -35,6 +35,8 @@
 from ._sync import write_var_uint as write_var_uint
 from ._text import Text as Text
 from ._text import TextEvent as TextEvent
+from ._text import get_utf8_index as get_utf8_index
+from ._text import get_utf16_index as get_utf16_index
 from ._transaction import NewTransaction as NewTransaction
 from ._transaction import ReadTransaction as ReadTransaction
 from ._transaction import Transaction as Transaction

diff --git a/python/pycrdt/_base.py b/python/pycrdt/_base.py
@@ -61,14 +61,19 @@ def __init__(
         *,
         client_id: int | None = None,
         skip_gc: bool | None = None,
+        offset_kind: str | None = None,
         doc: _Doc | None = None,
         Model=None,
         allow_multithreading: bool = False,
         **data,
     ) -> None:
         super().__init__(**data)
         if doc is None:
-            doc = _Doc(client_id, skip_gc)
+            doc = _Doc(client_id, skip_gc, offset_kind)
+        elif offset_kind is not None and offset_kind != doc.offset_kind:
+            raise ValueError(
+                f"offset_kind={offset_kind!r} does not match doc.offset_kind={doc.offset_kind!r}"
+            )
         self._doc = doc
         self._txn = None
         self._exceptions = []

diff --git a/python/pycrdt/_doc.py b/python/pycrdt/_doc.py
@@ -48,6 +48,7 @@ def __init__(
         *,
         client_id: int | None = None,
         skip_gc: bool | None = None,
+        offset_kind: str | None = None,
         doc: _Doc | None = None,
         Model=None,
         allow_multithreading: bool = False,
@@ -58,11 +59,18 @@ def __init__(
             client_id: An optional client ID for the document.
             skip_gc: Whether to skip garbage collection on deleted collections
                 on transaction commit.
+            offset_kind: How yrs counts text positions internally. ``"utf8"``
+                (the yrs default) uses byte offsets; ``"utf16"`` uses UTF-16
+                code unit offsets and is required for cross-runtime
+                compatibility with JS yjs. ``None`` (default) selects the yrs
+                default of ``"utf8"``. Regardless of this setting, the public
+                ``Text`` API always takes Python character indices.
             allow_multithreading: Whether to allow the document to be used in different threads.
         """
         super().__init__(
             client_id=client_id,
             skip_gc=skip_gc,
+            offset_kind=offset_kind,
             doc=doc,
             Model=Model,
             allow_multithreading=allow_multithreading,
@@ -86,6 +94,15 @@ def client_id(self) -> int:
         """The document client ID."""
         return self._doc.client_id()
 
+    @property
+    def offset_kind(self) -> str:
+        """The text offset unit yrs uses internally: ``"utf8"`` or ``"utf16"``.
+
+        [Doc.__init__][pycrdt.Doc.__init__] describes what each value means.
+        """
+        native_doc = self._doc
+        return native_doc.offset_kind
+
     def transaction(self, origin: Any = None) -> Transaction:
         """
         Creates a new transaction or gets the current one, if any.

diff --git a/python/pycrdt/_pycrdt.pyi b/python/pycrdt/_pycrdt.pyi
@@ -17,7 +17,12 @@ class Snapshot:
 class Doc:
     """Shared document."""
 
-    def __init__(self, client_id: int | None, skip_gc: bool | None) -> None:
+    def __init__(
+        self,
+        client_id: int | None,
+        skip_gc: bool | None,
+        # Accepted: "utf8"/"utf-8" or "utf16"/"utf-16"; None keeps the yrs default.
+        offset_kind: str | None,
+    ) -> None:
         """Create a new document with an optional global client ID.
         If no client ID is passed, a random one will be generated."""
 
@@ -28,6 +33,10 @@ class Doc:
     def client_id(self) -> int:
         """Returns the document unique client identifier."""
 
+    @property
+    def offset_kind(self) -> str:
+        """Returns the offset kind name: 'utf8' (byte offsets) or 'utf16' (UTF-16 code units)."""
+
     def guid(self) -> int:
         """Returns the document globally unique identifier."""
 

diff --git a/python/pycrdt/_text.py b/python/pycrdt/_text.py
@@ -11,6 +11,55 @@
     from ._doc import Doc
 
 
+def get_utf16_index(text: str, char_index: int) -> int:
+    """Convert a Python character (code point) index to a UTF-16 code unit index.
+
+    Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
+    UTF-16 code units but only 1 Python character. For pure-ASCII / BMP
+    text and a non-negative, in-range index this is a no-op.
+
+    Args:
+        text: The string against which ``char_index`` is interpreted.
+        char_index: A Python (code point) index into ``text`` (slice semantics apply).
+
+    Returns:
+        The UTF-16 code unit offset of the end of ``text[:char_index]``.
+    """
+    if char_index == 0:
+        return 0
+    prefix = text[:char_index]
+    # Surrogate pairs add one unit each; len(prefix) honours slice clamping/negatives.
+    extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF)
+    return len(prefix) + extra
+
+
+def get_utf8_index(text: str, char_index: int) -> int:
+    """Translate a Python character (code point) index into a UTF-8 byte offset.
+
+    Args:
+        text: String that ``char_index`` refers to.
+        char_index: Code point index into ``text``; sliced as ``text[:char_index]``.
+
+    Returns:
+        Number of bytes the UTF-8 encoding of ``text[:char_index]`` occupies.
+    """
+    leading = "" if char_index == 0 else text[:char_index]
+    encoded = leading.encode("utf-8")
+    return len(encoded)
+
+
+def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int:
+    """Map a character index to an offset; any kind other than "utf16" counts UTF-8 bytes."""
+    to_units = get_utf16_index if offset_kind == "utf16" else get_utf8_index
+    return to_units(text, char_index)
+
+
+def _single_char_unit_len(char: str, offset_kind: str) -> int:
+    """Return how many offset units one character spans (UTF-16 code units or UTF-8 bytes)."""
+    is_utf16 = offset_kind == "utf16"
+    return (1 + (ord(char) > 0xFFFF)) if is_utf16 else len(char.encode("utf-8"))
+
+
 class Text(Sequence):
     """
     A shared data type used for collaborative text editing, similar to a Python `str`.
@@ -89,10 +138,9 @@ def __len__(self) -> int:
         ```
 
         Returns:
-            The length of the text.
+            The length of the text in Python characters.
         """
-        with self.doc.transaction() as txn:
-            return self.integrated.len(txn._txn)
+        return len(str(self))
 
     def __str__(self) -> str:
         """
@@ -128,7 +176,9 @@ def __iadd__(self, value: str) -> Text:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
-            self.integrated.insert(txn._txn, len(self), value)
+            # End-of-text offset. NOTE(review): assumes integrated.len() is in yrs units.
+            offset = self.integrated.len(txn._txn)
+            self.integrated.insert(txn._txn, offset, value)
             return self
 
     def _check_slice(self, key: slice) -> tuple[int, int]:
@@ -169,13 +219,18 @@ def __delitem__(self, key: int | slice) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            ok = self.doc.offset_kind
             if isinstance(key, int):
-                self.integrated.remove_range(txn._txn, key, 1)
+                offset = _char_to_offset(current, key, ok)
+                unit_len = _single_char_unit_len(current[key], ok)
+                self.integrated.remove_range(txn._txn, offset, unit_len)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                length = stop - start
-                if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
+                if stop - start > 0:
+                    offset_start = _char_to_offset(current, start, ok)
+                    offset_stop = _char_to_offset(current, stop, ok)
+                    self.integrated.remove_range(txn._txn, offset_start, offset_stop - offset_start)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -214,20 +269,26 @@ def __setitem__(self, key: int | slice, value: str) -> None:
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            ok = self.doc.offset_kind
             if isinstance(key, int):
                 value_len = len(value)
                 if value_len != 1:
                     raise RuntimeError(
                         f"Single item assigned value must have a length of 1, not {value_len}"
                     )
-                del self[key]
-                self.integrated.insert(txn._txn, key, value)
+                offset = _char_to_offset(current, key, ok)
+                unit_len = _single_char_unit_len(current[key], ok)
+                self.integrated.remove_range(txn._txn, offset, unit_len)
+                self.integrated.insert(txn._txn, offset, value)
             elif isinstance(key, slice):
                 start, stop = self._check_slice(key)
-                length = stop - start
+                offset_start = _char_to_offset(current, start, ok)
+                offset_stop = _char_to_offset(current, stop, ok)
+                length = offset_stop - offset_start
                 if length > 0:
-                    self.integrated.remove_range(txn._txn, start, length)
-                self.integrated.insert(txn._txn, start, value)
+                    self.integrated.remove_range(txn._txn, offset_start, length)
+                self.integrated.insert(txn._txn, offset_start, value)
             else:
                 raise RuntimeError(f"Index not supported: {key}")
 
@@ -251,8 +312,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) ->
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            offset = _char_to_offset(current, index, self.doc.offset_kind)
             self.integrated.insert(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
@@ -266,8 +329,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
         """
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
+            current = str(self)
+            offset = _char_to_offset(current, index, self.doc.offset_kind)
             self.integrated.insert_embed(
-                txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
+                txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
             )
 
     def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
@@ -282,9 +347,13 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
         with self.doc.transaction() as txn:
             self._forbid_read_transaction(txn)
             start, stop = self._check_slice(slice(start, stop))
-            length = stop - start
+            current = str(self)
+            ok = self.doc.offset_kind
+            offset_start = _char_to_offset(current, start, ok)
+            offset_stop = _char_to_offset(current, stop, ok)
+            length = offset_stop - offset_start
             if length > 0:
-                self.integrated.format(txn._txn, start, length, iter(attrs.items()))
+                self.integrated.format(txn._txn, offset_start, length, iter(attrs.items()))
 
     def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
         """

diff --git a/src/doc.rs b/src/doc.rs
@@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt;
 use pyo3::exceptions::{PyRuntimeError, PyValueError};
 use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList};
 use yrs::{
-    Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
+    Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
 };
 use yrs::updates::encoder::{Encode, Encoder};
 use yrs::updates::decoder::Decode;
@@ -32,6 +32,7 @@ impl Doc {
         let mut options = yrs::Options::default();
         options.client_id = original.doc.client_id();
         options.skip_gc = original.doc.skip_gc();
+        options.offset_kind = original.doc.offset_kind();
         if let Some(collection_id) = original.doc.collection_id() {
             options.collection_id = Some(collection_id);
         }
@@ -68,7 +69,11 @@ impl Doc {
 #[pymethods]
 impl Doc {
     #[new]
-    fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult<Self> {
+    fn new(
+        client_id: &Bound<'_, PyAny>,
+        skip_gc: &Bound<'_, PyAny>,
+        offset_kind: &Bound<'_, PyAny>,
+    ) -> PyResult<Self> {
         let mut options = Options::default();
         if !client_id.is_none() {
             let _client_id: u64 = client_id.cast::<PyInt>()
@@ -84,10 +89,30 @@ impl Doc {
                 .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?;
             options.skip_gc = _skip_gc;
         }
+        if !offset_kind.is_none() {
+            let _offset_kind: String = offset_kind
+                .extract()
+                .map_err(|_| PyValueError::new_err("offset_kind must be a string"))?;
+            options.offset_kind = match _offset_kind.as_str() {
+                "utf8" | "utf-8" => OffsetKind::Bytes,
+                "utf16" | "utf-16" => OffsetKind::Utf16,
+                _ => return Err(PyValueError::new_err(
+                    "offset_kind must be 'utf8' or 'utf16'",
+                )),
+            };
+        }
         let doc = _Doc::with_options(options);
         Ok(Doc { doc })
     }
 
+    #[getter]
+    fn offset_kind(&self) -> &'static str {
+        // Python-facing name of the document's yrs offset kind.
+        let kind = self.doc.offset_kind();
+        match kind {
+            OffsetKind::Utf16 => "utf16",
+            OffsetKind::Bytes => "utf8",
+        }
+    }
+
     #[staticmethod]
     #[pyo3(name = "from_snapshot")]
     pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult<Py<Doc>> {