-
Notifications
You must be signed in to change notification settings - Fork 33
fix: use UTF-16 offsets for Text operations #379
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
669f98d
f3a2e7e
8264d95
7cf5af4
b1ed6ae
f48610b
01a1756
41c8cc5
855af5c
e41d345
b6b938c
3e9933a
67b2dea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -11,6 +11,55 @@ | |||||||
| from ._doc import Doc | ||||||||
|
|
||||||||
|
|
||||||||
| def get_utf16_index(text: str, char_index: int) -> int: | ||||||||
| """Convert a Python character (code point) index to a UTF-16 code unit index. | ||||||||
|
|
||||||||
| Characters outside the Basic Multilingual Plane (e.g. emoji) occupy 2 | ||||||||
| UTF-16 code units but only 1 Python character. For pure-ASCII / BMP | ||||||||
| text this is a no-op. | ||||||||
|
|
||||||||
| Args: | ||||||||
| text: The string against which ``char_index`` is interpreted. | ||||||||
| char_index: A Python (code point) index into ``text``. | ||||||||
|
|
||||||||
| Returns: | ||||||||
| The corresponding UTF-16 code unit offset. | ||||||||
| """ | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use Google style docstrings (
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in 855af5c β both |
||||||||
| if char_index == 0: | ||||||||
| return 0 | ||||||||
| prefix = text[:char_index] | ||||||||
| # Count characters that need a surrogate pair (code point > 0xFFFF) | ||||||||
| extra = sum(1 for ch in prefix if ord(ch) > 0xFFFF) | ||||||||
| return char_index + extra | ||||||||
|
||||||||
| return char_index + extra | |
| return len(prefix) + extra |
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__iadd__ now calls str(self) just to compute the append offset, which makes text += ... O(n) in the current text size and allocates the whole string. For appends you can avoid this by using the underlying integrated length in the docβs offset units (e.g. self.integrated.len(txn._txn)) as the insert index, since that already matches the offset kind.
| current = str(self) | |
| offset = _char_to_offset(current, len(current), self.doc.offset_kind) | |
| offset = self.integrated.len(txn._txn) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,7 @@ use pyo3::IntoPyObjectExt; | |
| use pyo3::exceptions::{PyRuntimeError, PyValueError}; | ||
| use pyo3::types::{PyBool, PyBytes, PyDict, PyInt, PyList}; | ||
| use yrs::{ | ||
| Doc as _Doc, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn | ||
| Doc as _Doc, OffsetKind, Options, ReadTxn, StateVector, SubdocsEvent as _SubdocsEvent, Transact, TransactionCleanupEvent, TransactionMut, Update, WriteTxn | ||
| }; | ||
| use yrs::updates::encoder::{Encode, Encoder}; | ||
| use yrs::updates::decoder::Decode; | ||
|
|
@@ -32,6 +32,7 @@ impl Doc { | |
| let mut options = yrs::Options::default(); | ||
| options.client_id = original.doc.client_id(); | ||
| options.skip_gc = original.doc.skip_gc(); | ||
| options.offset_kind = original.doc.offset_kind(); | ||
| if let Some(collection_id) = original.doc.collection_id() { | ||
| options.collection_id = Some(collection_id); | ||
| } | ||
|
|
@@ -68,7 +69,11 @@ impl Doc { | |
| #[pymethods] | ||
| impl Doc { | ||
| #[new] | ||
| fn new(client_id: &Bound<'_, PyAny>, skip_gc: &Bound<'_, PyAny>) -> PyResult<Self> { | ||
| fn new( | ||
| client_id: &Bound<'_, PyAny>, | ||
| skip_gc: &Bound<'_, PyAny>, | ||
| offset_kind: &Bound<'_, PyAny>, | ||
| ) -> PyResult<Self> { | ||
| let mut options = Options::default(); | ||
| if !client_id.is_none() { | ||
| let _client_id: u64 = client_id.cast::<PyInt>() | ||
|
|
@@ -84,10 +89,30 @@ impl Doc { | |
| .map_err(|_| PyValueError::new_err("skip_gc must be a valid bool"))?; | ||
| options.skip_gc = _skip_gc; | ||
| } | ||
| if !offset_kind.is_none() { | ||
| let _offset_kind: String = offset_kind | ||
| .extract() | ||
| .map_err(|_| PyValueError::new_err("offset_kind must be a string"))?; | ||
| options.offset_kind = match _offset_kind.as_str() { | ||
| "utf8" | "utf-8" => OffsetKind::Bytes, | ||
| "utf16" | "utf-16" => OffsetKind::Utf16, | ||
| _ => return Err(PyValueError::new_err( | ||
| "offset_kind must be 'utf8' or 'utf16'", | ||
| )), | ||
| }; | ||
| } | ||
| let doc = _Doc::with_options(options); | ||
|
Comment on lines
77
to
104
|
||
| Ok(Doc { doc }) | ||
| } | ||
|
|
||
| #[getter] | ||
| fn offset_kind(&self) -> &'static str { | ||
| match self.doc.offset_kind() { | ||
| OffsetKind::Bytes => "utf8", | ||
| OffsetKind::Utf16 => "utf16", | ||
| } | ||
| } | ||
|
|
||
| #[staticmethod] | ||
| #[pyo3(name = "from_snapshot")] | ||
| pub fn from_snapshot(py: Python<'_>, snapshot: PyRef<'_, crate::snapshot::Snapshot>, doc: PyRef<'_, Doc>) -> PyResult<Py<Doc>> { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
offset_kindmismatch check compares the raw user string todoc.offset_kind, but the constructor accepts hyphenated aliases (e.g.utf-16). Passing an existing_Docplusoffset_kind='utf-16'will currently raise even though it's semantically the same as'utf16'. Normalizeoffset_kindto the canonical form (e.g. strip hyphens / lowercase) before comparing, or accept both spellings in the comparison.