
Commit ef81afc

Merge pull request #1323 from PyThaiNLP/copilot/fix-memory-usage-newmm-tokenization
fix: reduce Trie and newmm peak memory; add tcc_pos_array()
2 parents 621f8f7 + 7562b15 commit ef81afc

6 files changed

Lines changed: 114 additions & 35 deletions

File tree

CHANGELOG.md
pythainlp/tokenize/newmm.py
pythainlp/tokenize/tcc_p.py
pythainlp/util/trie.py
tests/core/test_tokenize.py
tests/core/test_util.py

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ The minimum requirement is now Python 3.9.
   downloaded (#1317)
 - `newmm` tokenization: Exponential-time explosion when text has
   many ambiguous breaking points (#1319)
+- `Trie`: Reduce memory usage and faster TCC boundary lookups (#1323)

 ### Security

pythainlp/tokenize/newmm.py

Lines changed: 5 additions & 5 deletions
@@ -27,7 +27,7 @@
 from pythainlp.util import Trie

 from pythainlp.tokenize import word_dict_trie
-from pythainlp.tokenize.tcc_p import tcc_pos
+from pythainlp.tokenize.tcc_p import tcc_pos_array

 # match non-Thai tokens
 # `|` is used as like "early return",
@@ -86,7 +86,7 @@ def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:

     graph_size = 0  # keep track of graph size, if too big, force cutoff

-    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid
+    valid_poss = tcc_pos_array(text)  # bytearray of valid TCC break positions

     len_text = len(text)
     pos_list = [0]  # priority queue of possible breaking positions
@@ -95,7 +95,7 @@ def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
         begin_pos = heappop(pos_list)
         for word in custom_dict.prefixes(text, begin_pos):
             end_pos_candidate = begin_pos + len(word)
-            if end_pos_candidate in valid_poss:
+            if valid_poss[end_pos_candidate]:
                 graph[begin_pos].append(end_pos_candidate)
                 graph_size = graph_size + 1

@@ -121,12 +121,12 @@ def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
                 end_pos = m.end()
             else:  # Thai token, find minimum skip
                 for pos in range(begin_pos + 1, len_text):
-                    if pos in valid_poss:
+                    if valid_poss[pos]:
                         words = [
                             word
                             for word in custom_dict.prefixes(text, pos)
                             if (
-                                (pos + len(word) in valid_poss)
+                                valid_poss[pos + len(word)]
                                 and not _PAT_THAI_TWOCHARS.match(word)
                             )
                         ]
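The hot-path change here is replacing a set-membership test with a bytearray index. A minimal standalone sketch of the two lookup styles, using a made-up boundary_set rather than the tokenizer's real data:

# Hypothetical data for illustration only; not the tokenizer's actual state.
text = "abcdef"
boundary_set = {3, 6}                    # old style: set[int] of positions
boundary_arr = bytearray(len(text) + 1)  # new style: one flag byte per position
for p in boundary_set:
    boundary_arr[p] = 1

# Same answer, but the bytearray needs no hashing and no per-int objects,
# so lookups are cheaper and the table is a single flat buffer.
pos = 3
assert (pos in boundary_set) == bool(boundary_arr[pos])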

pythainlp/tokenize/tcc_p.py

Lines changed: 25 additions & 1 deletion
@@ -88,7 +88,7 @@ def tcc_pos(text: str) -> set[int]:
     """TCC positions

     :param str text: text to be tokenized into character clusters
-    :return: list of the ending position of subwords
+    :return: set of the ending positions of character clusters
     :rtype: set[int]
     """
     if not text or not isinstance(text, str):
@@ -103,6 +103,30 @@ def tcc_pos(text: str) -> set[int]:
     return p_set


+def tcc_pos_array(text: str) -> bytearray:
+    """TCC positions as a bytearray.
+
+    Returns a bytearray of length ``len(text) + 1`` where index ``i``
+    is ``1`` if position ``i`` is a valid Thai Character Cluster boundary,
+    and ``0`` otherwise. Array-index lookup is faster and uses less
+    memory than set membership for large texts.
+
+    :param str text: text to be tokenized into character clusters
+    :return: bytearray of valid TCC boundary flags, indexed by position
+    :rtype: bytearray
+    """
+    if not text or not isinstance(text, str):
+        return bytearray(1)
+
+    arr = bytearray(len(text) + 1)
+    p = 0
+    for w in tcc(text):
+        p += len(w)
+        arr[p] = 1
+
+    return arr
+
+
 def segment(text: str) -> list[str]:
     """Subword segmentation

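A quick usage sketch of the new function next to the existing tcc_pos, assuming a PyThaiNLP build that includes this commit; the sample string is arbitrary:

from pythainlp.tokenize.tcc_p import tcc_pos, tcc_pos_array

text = "ประเทศไทย"
pos_set = tcc_pos(text)        # set of cluster-ending positions
pos_arr = tcc_pos_array(text)  # flag per position, len(text) + 1 entries

# Both encode the same boundaries; only the representation differs.
assert len(pos_arr) == len(text) + 1
assert {i for i, flag in enumerate(pos_arr) if flag} == pos_set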

pythainlp/util/trie.py

Lines changed: 61 additions & 29 deletions
@@ -9,7 +9,7 @@
 from __future__ import annotations

 from collections.abc import Iterable, Iterator
-from typing import Union
+from typing import Optional, Union


 class Trie(Iterable[str]):
@@ -46,64 +46,74 @@ class Trie(Iterable[str]):
     # output: 5
     """

-    words: set[str]
     root: Node
+    _word_count: int

     class Node:
         __slots__: tuple[str, str] = ("end", "children")

         def __init__(self) -> None:
             self.end: bool = False
-            self.children: dict[str, Trie.Node] = {}
+            # Children dict is created on demand to reduce memory for leaf nodes.
+            self.children: Optional[dict[str, Trie.Node]] = None

     def __init__(self, words: Iterable[str]) -> None:
-        self.words: set[str] = set(words)
+        self._word_count: int = 0
         self.root: Trie.Node = Trie.Node()
-
         for word in words:
             self.add(word)

     def add(self, word: str) -> None:
         """Add a word to the trie.
         Spaces in front of and following the word will be removed.

-        :param str text: a word
+        :param str word: a word
         """
         word = word.strip()
-        self.words.add(word)
         cur = self.root
         for ch in word:
+            if cur.children is None:
+                cur.children = {}
             child = cur.children.get(ch)
-            if not child:
+            if child is None:
                 child = Trie.Node()
                 cur.children[ch] = child
             cur = child
-        cur.end = True
+        if not cur.end:
+            cur.end = True
+            self._word_count += 1

     def remove(self, word: str) -> None:
         """Remove a word from the trie.
         If the word is not found, do nothing.

-        :param str text: a word
+        :param str word: a word
         """
-        # remove from set first
-        if word not in self.words:
-            return
-        self.words.remove(word)
-        # then remove from nodes
-        parent = self.root
-        data = []  # track path to leaf
+        # Navigate to the word's end node, recording the path.
+        node = self.root
+        path: list[tuple[Trie.Node, Trie.Node, str]] = []
         for ch in word:
-            child = parent.children[ch]
-            data.append((parent, child, ch))
-            parent = child
-        # remove the last one
-        child.end = False
-        # prune up the tree
-        for parent, child, ch in reversed(data):
+            if node.children is None:
+                return  # word not in trie
+            child = node.children.get(ch)
+            if child is None:
+                return  # word not in trie
+            path.append((node, child, ch))
+            node = child
+        if not node.end:
+            return  # path exists but not a complete word
+        node.end = False
+        self._word_count -= 1
+        # Prune nodes that are now unused (not an end and no children).
+        # parent.children is always non-None here because the path was
+        # built by traversing through existing children dicts.
+        for parent, child, ch in reversed(path):
             if child.end or child.children:
                 break
-            del parent.children[ch]  # remove from parent dict
+            if parent.children is not None:  # always true; narrows type
+                del parent.children[ch]
+                if not parent.children:
+                    parent.children = None  # free empty dict

     def prefixes(self, text: str, start: int = 0) -> list[str]:
         """List all possible words from first sequence of characters in a word.
@@ -118,8 +128,10 @@ def prefixes(self, text: str, start: int = 0) -> list[str]:
         i = start
         n = len(text)
         while i < n:
+            if cur.children is None:
+                break
             node = cur.children.get(text[i])
-            if not node:
+            if node is None:
                 break
             if node.end:
                 res.append(text[start : i + 1])
@@ -128,13 +140,33 @@ def prefixes(self, text: str, start: int = 0) -> list[str]:
         return res

     def __contains__(self, key: str) -> bool:
-        return key in self.words
+        cur = self.root
+        for ch in key:
+            if cur.children is None:
+                return False
+            node = cur.children.get(ch)
+            if node is None:
+                return False
+            cur = node
+        return cur.end

     def __iter__(self) -> Iterator[str]:
-        yield from self.words
+        # DFS through the trie to yield all stored words.
+        # A shared mutable prefix list is appended/popped to avoid
+        # O(k²) list copies that a stack-based approach would incur.
+        def _dfs(node: Trie.Node, prefix: list[str]) -> Iterator[str]:
+            if node.end:
+                yield "".join(prefix)
+            if node.children:
+                for ch, child in node.children.items():
+                    prefix.append(ch)
+                    yield from _dfs(child, prefix)
+                    prefix.pop()
+
+        yield from _dfs(self.root, [])

     def __len__(self) -> int:
-        return len(self.words)
+        return self._word_count


 def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
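With the words set gone, membership, iteration, and length now come from the nodes themselves. A short sketch of the public behavior this diff preserves, using an arbitrary word list:

from pythainlp.util import Trie

trie = Trie(["มา", "มาก", "มาตรา"])
assert len(trie) == 3                    # backed by _word_count, not a set
assert "มาก" in trie                     # __contains__ walks the nodes
assert trie.prefixes("มากมาย") == ["มา", "มาก"]

trie.remove("มาก")                       # prunes the now-unused leaf node
assert "มาก" not in trie
assert "มาตรา" in trie                   # shared prefix nodes survive pruning
assert sorted(trie) == ["มา", "มาตรา"]   # __iter__ does a DFS over nodes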

tests/core/test_tokenize.py

Lines changed: 8 additions & 0 deletions
@@ -684,6 +684,14 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
+        # tcc_pos_array: edge cases
+        self.assertIsInstance(tcc_p.tcc_pos_array(""), bytearray)
+        self.assertIsInstance(tcc_p.tcc_pos_array(None), bytearray)
+        self.assertIsInstance(tcc_p.tcc_pos_array(42), bytearray)
+        # valid text: array length must equal len(text)+1 and mark boundaries
+        arr = tcc_p.tcc_pos_array("ประเทศ")
+        self.assertEqual(len(arr), len("ประเทศ") + 1)
+        self.assertEqual(arr[0], 0)  # position 0 is never a boundary

     def test_display_cell_tokenize(self):
         self.assertEqual(display_cell_tokenize(""), [])

tests/core/test_util.py

Lines changed: 14 additions & 0 deletions
@@ -516,6 +516,20 @@ def test_trie(self):
         trie.remove("ทด")
         self.assertEqual(len(trie), 2)

+        # _word_count must not double-count re-added words
+        trie2 = Trie(["ก", "ข", "ก"])
+        self.assertEqual(len(trie2), 2)
+        trie2.add("ก")  # already present – count must stay the same
+        self.assertEqual(len(trie2), 2)
+        trie2.add("ค")
+        self.assertEqual(len(trie2), 3)
+        trie2.remove("ข")
+        self.assertEqual(len(trie2), 2)
+        trie2.remove("ข")  # removing non-existent word must not change count
+        self.assertEqual(len(trie2), 2)
+        # All remaining words must be reachable via __iter__
+        self.assertEqual(sorted(trie2), ["ก", "ค"])
+
         trie = Trie([])
         self.assertEqual(len(trie), 0)
         trie.remove("หมด")
