OAI-Labs
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 9 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 66 additions & 49 deletions
@@ -1,4 +1,12 @@
-## [0.3.0] [*](https://github.com/OAI-Labs/xfmr-zem/pull/6) - 2026-02-06
+## [0.3.1] - 2026-02-23
+
+### Changed
+- **Minimal Core Install**: Moved all heavy optional libraries (`nemo-curator`, `dask-cuda`, `sentence-transformers`, `faiss-cpu`, `transformers`, `unstructured`, `python-magic`) out of core `[dependencies]` into explicit optional extras. `pip install xfmr-zem` now only installs the lightweight engine core.
+- **Reorganized Extras**: Grouped optional extras by domain: `[nemo]`, `[datajuicer]`, `[deduplication]`, `[document]`, `[ocr]`, `[corrector]`, `[voice]`, `[evaluator]`, `[ui]`, `[dev]`.
+- **New `[nlp]` bundle**: Convenience extra that installs the full NLP stack (nemo + datajuicer + deduplication) without the OCR/Voice dependencies. Note that `dask-cuda` is still included as part of the nemo stack.
+- **Cleaned `[ocr]` extra**: Removed unrelated packages (`dotenv`, `ipython`, `matplotlib`, `seaborn`); `landingai-ade` is retained.
+
+
 
 ### Added
 - **Two-Stage Deduplication**: Implemented a high-precision pipeline combining MinHash LSH with NER-based filtering and semantic similarity detection.
 
@@ -10,28 +10,19 @@ authors = [
 ]
 keywords = ["xfmr-zem", "data-pipeline", "zenml", "nemo-curator", "data-juicer", "mlops"]
 
+# ── Core: only what is needed to run the Zem Engine + CLI ──
 dependencies = [
     "pandas>=2.0.0",
     "numpy>=1.24.0",
     "pyyaml>=6.0",
     "click>=8.0.0",
     "rich>=13.0.0",
     "loguru>=0.7.0",
+    "pydantic>=2.0.0",
+    "pyarrow>=15.0.0",
     "fastmcp>=0.1.0",
     "mcp>=0.1.0",
     "zenml[local,server]>=0.75.0",
-    "pyarrow>=15.0.0",
-    "nemo-curator>=1.0.0",
-    "dask-cuda>=24.0.0",
-    "ftfy>=6.3.1",
-    "pydantic>=2.0.0",
-    "datasketch>=1.6.0",
-    "underthesea>=9.2.9",
-    "sentence-transformers>=5.2.2",
-    "faiss-cpu>=1.13.2",
-    "transformers>=4.55.2",
-    "unstructured[all-docs]>=0.16.0",
-    "python-magic>=0.4.27",
 ]
 
 [project.urls]
@@ -41,25 +32,26 @@ Issues = "https://github.com/OAI-Labs/xfmr-zem/issues"
 Changelog = "https://github.com/OAI-Labs/xfmr-zem/blob/main/CHANGELOG.md"
 
 [project.optional-dependencies]
-zenml = [
-    "zenml>=0.75.0",
-]
+# ── NLP / Text Processing ─────────────────────────────────────────────────────
 nemo = [
-    "nemo-curator>=0.6.0",
+    "nemo-curator>=0.5.0",
+    "dask-cuda>=24.0.0",
+    "ftfy>=6.3.1",
 ]
 datajuicer = [
     "py-data-juicer>=1.0.0",
 ]
 deduplication = [
     "datasketch>=1.6.0",
     "underthesea>=6.8.0",
+    "sentence-transformers>=2.2.0",
+    "faiss-cpu>=1.7.0",
 ]
-all = [
-    "zenml>=0.75.0",
-    "nemo-curator>=0.6.0",
-    "py-data-juicer>=1.0.0",
-    "datasketch>=1.6.0",
-    "underthesea>=6.8.0",
+
+# ── Document / Multimodal ─────────────────────────────────────────────────────
+document = [
+    "unstructured[all-docs]>=0.16.0",
+    "python-magic>=0.4.27",
 ]
 audio = [
     "lhotse>=1.24.0",
@@ -70,19 +62,12 @@ vn_asr = [
     "transformers>=4.40.0",
 ]
 ocr = [
-    "dotenv>=0.9.9",
-    "ipython>=8.38.0",
-    "landingai-ade>=1.5.0",
-    "matplotlib>=3.10.8",
-    "numpy<2.0.0",
-    "pillow>=12.1.0",
-    "pymupdf>=1.26.7",
-    "pyyaml>=6.0.3",
-    "rich>=14.3.2",
-    "seaborn>=0.13.2",
+    "pymupdf>=1.23.0",
+    "pdfplumber>=0.11.0",
     "pytesseract>=0.3.10",
     "paddleocr>=2.7.0",
     "paddlepaddle>=2.6.0",
+    "landingai-ade>=1.5.0",
     "transformers>=4.40.0",
     "torch>=2.5.1",
     "torchvision>=0.20.1",
@@ -92,11 +77,20 @@ ocr = [
     "shapely",
     "pyclipper",
     "einops",
-    "pdfplumber>=0.11.0",
-    "pymupdf>=1.23.0",
     "ruamel.yaml>=0.17.0",
     "cachetools>=5.0.0",
+    "numpy<2.0.0",
+]
+corrector = [
+    "transformers>=4.55.2",
+    "torch>=2.5.0",
+    "torchvision>=0.20.1",
+    "pillow>=12.1.0",
+    "pymupdf>=1.26.7",
+    "numpy<2.0.0",
 ]
+
+# ── Audio / Voice ──────────────────────────────────────────────────────────────
 voice = [
     "pyannote-audio>=3.1.0",
     "librosa>=0.10.0",
@@ -121,11 +115,20 @@ voice = [
     "accelerate>=0.25.0",
     "thop>=0.1.1",
 ]
+
+# ── LLM / Evaluation ──────────────────────────────────────────────────────────
+evaluator = [
+    "opik>=1.10.9",
+]
+
+# ── Web UI ─────────────────────────────────────────────────────────────────────
 ui = [
     "fastapi",
     "uvicorn",
     "python-multipart",
 ]
+
+# ── Dev / Test ─────────────────────────────────────────────────────────────────
 dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
@@ -134,23 +137,37 @@ dev = [
     "mypy>=1.0.0",
 ]
 
-corrector = [
-    "dotenv>=0.9.9",
-    "ipython>=8.38.0",
-    "landingai-ade>=1.5.0",
-    "matplotlib>=3.10.8",
-    "numpy<2.0.0",
-    "pillow>=12.1.0",
-    "pymupdf>=1.26.7",
-    "pyyaml>=6.0.3",
-    "rich>=14.3.2",
-    "seaborn>=0.13.2",
-    "torch>=2.5.0",
-    "torchvision>=0.20.1",
-    "transformers>=4.55.2",
+# ── Bundles ───────────────────────────────────────────────────────────────────
+# Installs the full NLP stack (excludes OCR/Voice; NOTE: dask-cuda is still included)
+nlp = [
+    "nemo-curator>=0.5.0",
+    "dask-cuda>=24.0.0",
+    "ftfy>=6.3.1",
+    "py-data-juicer>=1.0.0",
+    "datasketch>=1.6.0",
+    "underthesea>=6.8.0",
+    "sentence-transformers>=2.2.0",
+    "faiss-cpu>=1.7.0",
 ]
-evaluator = [
+# Installs everything (except GPU-only tools)
+all = [
+    "nemo-curator>=0.5.0",
+    "dask-cuda>=24.0.0",
+    "ftfy>=6.3.1",
+    "py-data-juicer>=1.0.0",
+    "datasketch>=1.6.0",
+    "underthesea>=6.8.0",
+    "sentence-transformers>=2.2.0",
+    "faiss-cpu>=1.7.0",
+    "unstructured[all-docs]>=0.16.0",
+    "python-magic>=0.4.27",
+    "openai-whisper",
+    "librosa",
+    "soundfile",
     "opik>=1.10.9",
+    "fastapi",
+    "uvicorn",
+    "python-multipart",
 ]