Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/pycrdt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from ._sync import write_var_uint as write_var_uint
from ._text import Text as Text
from ._text import TextEvent as TextEvent
from ._text import get_utf8_index as get_utf8_index
from ._text import get_utf16_index as get_utf16_index
from ._transaction import NewTransaction as NewTransaction
from ._transaction import ReadTransaction as ReadTransaction
from ._transaction import Transaction as Transaction
Expand Down
7 changes: 6 additions & 1 deletion python/pycrdt/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,19 @@ def __init__(
*,
client_id: int | None = None,
skip_gc: bool | None = None,
offset_kind: str | None = None,
doc: _Doc | None = None,
Model=None,
allow_multithreading: bool = False,
**data,
) -> None:
super().__init__(**data)
if doc is None:
doc = _Doc(client_id, skip_gc)
doc = _Doc(client_id, skip_gc, offset_kind)
elif offset_kind is not None and offset_kind != doc.offset_kind:
raise ValueError(
f"offset_kind={offset_kind!r} does not match doc.offset_kind={doc.offset_kind!r}"
)
Comment on lines +72 to +76
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The offset_kind mismatch check compares the raw user string to doc.offset_kind, but the constructor accepts hyphenated aliases (e.g. utf-16). Passing an existing _Doc plus offset_kind='utf-16' will currently raise even though it's semantically the same as 'utf16'. Normalize offset_kind to the canonical form (e.g. strip hyphens / lowercase) before comparing, or accept both spellings in the comparison.

Copilot uses AI. Check for mistakes.
self._doc = doc
self._txn = None
self._exceptions = []
Expand Down
17 changes: 17 additions & 0 deletions python/pycrdt/_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
*,
client_id: int | None = None,
skip_gc: bool | None = None,
offset_kind: str | None = None,
doc: _Doc | None = None,
Model=None,
allow_multithreading: bool = False,
Expand All @@ -58,11 +59,18 @@ def __init__(
client_id: An optional client ID for the document.
skip_gc: Whether to skip garbage collection on deleted collections
on transaction commit.
offset_kind: How yrs counts text positions internally. ``"utf8"``
(the yrs default) uses byte offsets; ``"utf16"`` uses UTF-16
code unit offsets and is required for cross-runtime
compatibility with JS yjs. ``None`` (default) selects the yrs
default of ``"utf8"``. Regardless of this setting, the public
``Text`` API always takes Python character indices.
allow_multithreading: Whether to allow the document to be used in different threads.
"""
super().__init__(
client_id=client_id,
skip_gc=skip_gc,
offset_kind=offset_kind,
doc=doc,
Model=Model,
allow_multithreading=allow_multithreading,
Expand All @@ -86,6 +94,15 @@ def client_id(self) -> int:
"""The document client ID."""
return self._doc.client_id()

@property
def offset_kind(self) -> str:
    """Text offset unit that yrs uses internally for this document.

    Returns:
        ``"utf8"`` or ``"utf16"``. See [Doc.__init__][pycrdt.Doc.__init__]
        for the meaning of each value.
    """
    # Delegate to the wrapped native document, which owns the setting.
    kind = self._doc.offset_kind
    return kind

def transaction(self, origin: Any = None) -> Transaction:
"""
Creates a new transaction or gets the current one, if any.
Expand Down
11 changes: 10 additions & 1 deletion python/pycrdt/_pycrdt.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@ class Snapshot:
class Doc:
"""Shared document."""

def __init__(self, client_id: int | None, skip_gc: bool | None) -> None:
def __init__(
self,
client_id: int | None,
skip_gc: bool | None,
offset_kind: str | None,
) -> None:
"""Create a new document with an optional global client ID.
If no client ID is passed, a random one will be generated."""

Expand All @@ -28,6 +33,10 @@ class Doc:
def client_id(self) -> int:
"""Returns the document unique client identifier."""

@property
def offset_kind(self) -> str:
"""Returns the offset kind ('utf8' or 'utf16')."""

def guid(self) -> int:
"""Returns the document globally unique identifier."""

Expand Down
103 changes: 86 additions & 17 deletions python/pycrdt/_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,55 @@
from ._doc import Doc


def get_utf16_index(text: str, char_index: int) -> int:
"""Convert a Python character (code point) index to a UTF-16 code unit index.

Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2
UTF-16 code units but only 1 Python character. For pure-ASCII / BMP
text this is a no-op.

Args:
text: The string against which ``char_index`` is interpreted.
char_index: A Python (code point) index into ``text``.

Returns:
The corresponding UTF-16 code unit offset.
"""
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use Google style docstrings (Args, Returns).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 855af5c — both get_utf16_index and get_utf8_index now use Google-style Args / Returns docstrings.

if char_index == 0:
return 0
prefix = text[:char_index]
# Count characters that need a surrogate pair (code point > 0xFFFF)
extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF)
return char_index + extra
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_utf16_index() returns char_index + extra, which becomes incorrect when char_index is out of range or negative because Python slicing clamps but char_index is not clamped (e.g. get_utf16_index('A📊B', 10) returns 11 instead of the end offset 4). This can produce offsets beyond the actual text length and break insert/delete/format calls in UTF-16 mode. Compute the result from the sliced prefix length (or clamp char_index to [0, len(text)]) so the returned UTF-16 offset always stays within bounds.

Suggested change
return char_index + extra
return len(prefix) + extra

Copilot uses AI. Check for mistakes.


def get_utf8_index(text: str, char_index: int) -> int:
    """Translate a Python character (code point) index into a UTF-8 byte offset.

    The offset is the byte length of ``text[:char_index]`` once encoded as
    UTF-8, so ``char_index`` follows Python slice semantics: an index past the
    end yields the byte length of the whole string, and a negative index
    counts from the end.

    Args:
        text: The string against which ``char_index`` is interpreted.
        char_index: A Python (code point) index into ``text``.

    Returns:
        The corresponding UTF-8 byte offset.
    """
    # Index 0 always maps to offset 0; skip the slice-and-encode work.
    return 0 if char_index == 0 else len(text[:char_index].encode("utf-8"))


def _char_to_offset(text: str, char_index: int, offset_kind: str) -> int:
    """Convert a Python character index into the offset unit named by ``offset_kind``.

    Args:
        text: The string against which ``char_index`` is interpreted.
        char_index: A Python (code point) index into ``text``.
        offset_kind: ``"utf16"`` selects UTF-16 code unit offsets; any other
            value is handled as UTF-8 byte offsets.

    Returns:
        The offset produced by the selected converter.
    """
    converter = get_utf16_index if offset_kind == "utf16" else get_utf8_index
    return converter(text, char_index)


def _single_char_unit_len(char: str, offset_kind: str) -> int:
if offset_kind == "utf16":
return 2 if ord(char) > 0xFFFF else 1
return len(char.encode("utf-8"))


class Text(Sequence):
"""
A shared data type used for collaborative text editing, similar to a Python `str`.
Expand Down Expand Up @@ -89,10 +138,9 @@ def __len__(self) -> int:
```

Returns:
The length of the text.
The length of the text in Python characters.
"""
with self.doc.transaction() as txn:
return self.integrated.len(txn._txn)
return len(str(self))

def __str__(self) -> str:
"""
Expand Down Expand Up @@ -128,7 +176,9 @@ def __iadd__(self, value: str) -> Text:
"""
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
self.integrated.insert(txn._txn, len(self), value)
current = str(self)
offset = _char_to_offset(current, len(current), self.doc.offset_kind)
Comment on lines +179 to +180
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__iadd__ now calls str(self) just to compute the append offset, which makes text += ... O(n) in the current text size and allocates the whole string. For appends you can avoid this by using the underlying integrated length in the doc's offset units (e.g. self.integrated.len(txn._txn)) as the insert index, since that already matches the offset kind.

Suggested change
current = str(self)
offset = _char_to_offset(current, len(current), self.doc.offset_kind)
offset = self.integrated.len(txn._txn)

Copilot uses AI. Check for mistakes.
self.integrated.insert(txn._txn, offset, value)
return self

def _check_slice(self, key: slice) -> tuple[int, int]:
Expand Down Expand Up @@ -169,13 +219,18 @@ def __delitem__(self, key: int | slice) -> None:
"""
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
current = str(self)
ok = self.doc.offset_kind
if isinstance(key, int):
self.integrated.remove_range(txn._txn, key, 1)
offset = _char_to_offset(current, key, ok)
unit_len = _single_char_unit_len(current[key], ok)
self.integrated.remove_range(txn._txn, offset, unit_len)
elif isinstance(key, slice):
start, stop = self._check_slice(key)
length = stop - start
if length > 0:
self.integrated.remove_range(txn._txn, start, length)
if stop - start > 0:
offset_start = _char_to_offset(current, start, ok)
offset_stop = _char_to_offset(current, stop, ok)
self.integrated.remove_range(txn._txn, offset_start, offset_stop - offset_start)
else:
raise RuntimeError(f"Index not supported: {key}")

Expand Down Expand Up @@ -214,20 +269,26 @@ def __setitem__(self, key: int | slice, value: str) -> None:
"""
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
current = str(self)
ok = self.doc.offset_kind
if isinstance(key, int):
value_len = len(value)
if value_len != 1:
raise RuntimeError(
f"Single item assigned value must have a length of 1, not {value_len}"
)
del self[key]
self.integrated.insert(txn._txn, key, value)
offset = _char_to_offset(current, key, ok)
unit_len = _single_char_unit_len(current[key], ok)
self.integrated.remove_range(txn._txn, offset, unit_len)
self.integrated.insert(txn._txn, offset, value)
elif isinstance(key, slice):
start, stop = self._check_slice(key)
length = stop - start
offset_start = _char_to_offset(current, start, ok)
offset_stop = _char_to_offset(current, stop, ok)
length = offset_stop - offset_start
if length > 0:
self.integrated.remove_range(txn._txn, start, length)
self.integrated.insert(txn._txn, start, value)
self.integrated.remove_range(txn._txn, offset_start, length)
self.integrated.insert(txn._txn, offset_start, value)
else:
raise RuntimeError(f"Index not supported: {key}")

Expand All @@ -251,8 +312,10 @@ def insert(self, index: int, value: str, attrs: dict[str, Any] | None = None) ->
"""
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
current = str(self)
offset = _char_to_offset(current, index, self.doc.offset_kind)
self.integrated.insert(
txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
)

def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = None) -> None:
Expand All @@ -266,8 +329,10 @@ def insert_embed(self, index: int, value: Any, attrs: dict[str, Any] | None = No
"""
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
current = str(self)
offset = _char_to_offset(current, index, self.doc.offset_kind)
self.integrated.insert_embed(
txn._txn, index, value, iter(attrs.items()) if attrs is not None else None
txn._txn, offset, value, iter(attrs.items()) if attrs is not None else None
)

def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
Expand All @@ -282,9 +347,13 @@ def format(self, start: int, stop: int, attrs: dict[str, Any]) -> None:
with self.doc.transaction() as txn:
self._forbid_read_transaction(txn)
start, stop = self._check_slice(slice(start, stop))
length = stop - start
current = str(self)
ok = self.doc.offset_kind
offset_start = _char_to_offset(current, start, ok)
offset_stop = _char_to_offset(current, stop, ok)
length = offset_stop - offset_start
if length > 0:
self.integrated.format(txn._txn, start, length, iter(attrs.items()))
self.integrated.format(txn._txn, offset_start, length, iter(attrs.items()))

def diff(self) -> list[tuple[Any, dict[str, Any] | None]]:
"""
Expand Down
29 changes: 27 additions & 2 deletions src/doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt;
use pyo3::exceptions::{PyRuntimeError, PyValueError};
use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList};
use yrs::{
Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn
};
use yrs::updates::encoder::{Encode, Encoder};
use yrs::updates::decoder::Decode;
Expand Down Expand Up @@ -32,6 +32,7 @@ impl Doc {
let mut options = yrs::Options::default();
options.client_id = original.doc.client_id();
options.skip_gc = original.doc.skip_gc();
options.offset_kind = original.doc.offset_kind();
if let Some(collection_id) = original.doc.collection_id() {
options.collection_id = Some(collection_id);
}
Expand Down Expand Up @@ -68,7 +69,11 @@ impl Doc {
#[pymethods]
impl Doc {
#[new]
fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult<Self> {
fn new(
client_id: &Bound<'_, PyAny>,
skip_gc: &Bound<'_, PyAny>,
offset_kind: &Bound<'_, PyAny>,
) -> PyResult<Self> {
let mut options = Options::default();
if !client_id.is_none() {
let _client_id: u64 = client_id.cast::<PyInt>()
Expand All @@ -84,10 +89,30 @@ impl Doc {
.map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?;
options.skip_gc = _skip_gc;
}
if !offset_kind.is_none() {
let _offset_kind: String = offset_kind
.extract()
.map_err(|_| PyValueError::new_err("offset_kind must be a string"))?;
options.offset_kind = match _offset_kind.as_str() {
"utf8" | "utf-8" => OffsetKind::Bytes,
"utf16" | "utf-16" => OffsetKind::Utf16,
_ => return Err(PyValueError::new_err(
"offset_kind must be 'utf8' or 'utf16'",
)),
};
}
let doc = _Doc::with_options(options);
Comment on lines 77 to 104
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constructor only sets options.offset_kind when offset_kind is explicitly provided, which means Doc() will still use the yrs default (OffsetKind::Bytes). That conflicts with the PR description/title claiming the Doc is configured to use UTF-16 offsets by default. Either set options.offset_kind = OffsetKind::Utf16 unconditionally (and adjust tests/docs accordingly) or update the PR description to reflect that UTF-16 is opt-in via offset_kind.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update the PR description to reflect that UTF-16 is opt-in via offset_kind

I suppose we agreed on this one.

Ok(Doc { doc })
}

/// Returns the offset kind of the underlying document as a string:
/// `"utf8"` for `OffsetKind::Bytes`, `"utf16"` for `OffsetKind::Utf16`.
#[getter]
fn offset_kind(&self) -> &'static str {
    let kind = self.doc.offset_kind();
    // Keep the match exhaustive so a new `OffsetKind` variant is a compile error here.
    match kind {
        OffsetKind::Utf16 => "utf16",
        OffsetKind::Bytes => "utf8",
    }
}

#[staticmethod]
#[pyo3(name = "from_snapshot")]
pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult<Py<Doc>> {
Expand Down
Loading
Loading