Skip to content

Commit ca226a5

Browse files
authored
feat: Bổ sung tài liệu nghiên cứu chuyên sâu mô tả kiến trúc, cài đặt và cách sử dụng Zem SDK. (#12)
1 parent eb5016a commit ca226a5

4 files changed

Lines changed: 961 additions & 366 deletions

File tree

CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
## [0.3.0] [*](https://github.com/OAI-Labs/xfmr-zem/pull/6) - 2026-02-06
1+
## [0.3.1] - 2026-02-23
2+
3+
### Changed
4+
- **Minimal Core Install**: Moved all heavy optional libraries (`nemo-curator`, `dask-cuda`, `sentence-transformers`, `faiss-cpu`, `transformers`, `unstructured`, `python-magic`) out of core `[dependencies]` into explicit optional extras. `pip install xfmr-zem` now only installs the lightweight engine core.
5+
- **Reorganized Extras**: Grouped optional extras by domain: `[nemo]`, `[datajuicer]`, `[deduplication]`, `[document]`, `[ocr]`, `[corrector]`, `[voice]`, `[evaluator]`, `[ui]`, `[dev]`.
6+
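- **New `[nlp]` bundle**: Convenience extra that installs the full NLP stack (nemo + datajuicer + deduplication) without OCR/Voice dependencies. Note: it still includes `dask-cuda` via the nemo component, so it is not entirely free of GPU-oriented packages.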
7+
- **Cleaned `[ocr]` extra**: Removed unrelated packages (`dotenv`, `ipython`, `matplotlib`, `seaborn`); `landingai-ade` is retained in `[ocr]`.
8+
9+
210

311
### Added
412
- **Two-Stage Deduplication**: Implemented a high-precision pipeline combining MinHash LSH with NER-based filtering and semantic similarity detection.

pyproject.toml

Lines changed: 66 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,19 @@ authors = [
1010
]
1111
keywords = ["xfmr-zem", "data-pipeline", "zenml", "nemo-curator", "data-juicer", "mlops"]
1212

13+
# ── Core: only what is required to run the Zem Engine + CLI ──
1314
dependencies = [
1415
"pandas>=2.0.0",
1516
"numpy>=1.24.0",
1617
"pyyaml>=6.0",
1718
"click>=8.0.0",
1819
"rich>=13.0.0",
1920
"loguru>=0.7.0",
21+
"pydantic>=2.0.0",
22+
"pyarrow>=15.0.0",
2023
"fastmcp>=0.1.0",
2124
"mcp>=0.1.0",
2225
"zenml[local,server]>=0.75.0",
23-
"pyarrow>=15.0.0",
24-
"nemo-curator>=1.0.0",
25-
"dask-cuda>=24.0.0",
26-
"ftfy>=6.3.1",
27-
"pydantic>=2.0.0",
28-
"datasketch>=1.6.0",
29-
"underthesea>=9.2.9",
30-
"sentence-transformers>=5.2.2",
31-
"faiss-cpu>=1.13.2",
32-
"transformers>=4.55.2",
33-
"unstructured[all-docs]>=0.16.0",
34-
"python-magic>=0.4.27",
3526
]
3627

3728
[project.urls]
@@ -41,25 +32,26 @@ Issues = "https://github.com/OAI-Labs/xfmr-zem/issues"
4132
Changelog = "https://github.com/OAI-Labs/xfmr-zem/blob/main/CHANGELOG.md"
4233

4334
[project.optional-dependencies]
44-
zenml = [
45-
"zenml>=0.75.0",
46-
]
35+
# ── NLP / Text Processing ─────────────────────────────────────────────────────
4736
nemo = [
48-
"nemo-curator>=0.6.0",
37+
"nemo-curator>=0.5.0",
38+
"dask-cuda>=24.0.0",
39+
"ftfy>=6.3.1",
4940
]
5041
datajuicer = [
5142
"py-data-juicer>=1.0.0",
5243
]
5344
deduplication = [
5445
"datasketch>=1.6.0",
5546
"underthesea>=6.8.0",
47+
"sentence-transformers>=2.2.0",
48+
"faiss-cpu>=1.7.0",
5649
]
57-
all = [
58-
"zenml>=0.75.0",
59-
"nemo-curator>=0.6.0",
60-
"py-data-juicer>=1.0.0",
61-
"datasketch>=1.6.0",
62-
"underthesea>=6.8.0",
50+
51+
# ── Document / Multimodal ─────────────────────────────────────────────────────
52+
document = [
53+
"unstructured[all-docs]>=0.16.0",
54+
"python-magic>=0.4.27",
6355
]
6456
audio = [
6557
"lhotse>=1.24.0",
@@ -70,19 +62,12 @@ vn_asr = [
7062
"transformers>=4.40.0",
7163
]
7264
ocr = [
73-
"dotenv>=0.9.9",
74-
"ipython>=8.38.0",
75-
"landingai-ade>=1.5.0",
76-
"matplotlib>=3.10.8",
77-
"numpy<2.0.0",
78-
"pillow>=12.1.0",
79-
"pymupdf>=1.26.7",
80-
"pyyaml>=6.0.3",
81-
"rich>=14.3.2",
82-
"seaborn>=0.13.2",
65+
"pymupdf>=1.23.0",
66+
"pdfplumber>=0.11.0",
8367
"pytesseract>=0.3.10",
8468
"paddleocr>=2.7.0",
8569
"paddlepaddle>=2.6.0",
70+
"landingai-ade>=1.5.0",
8671
"transformers>=4.40.0",
8772
"torch>=2.5.1",
8873
"torchvision>=0.20.1",
@@ -92,11 +77,20 @@ ocr = [
9277
"shapely",
9378
"pyclipper",
9479
"einops",
95-
"pdfplumber>=0.11.0",
96-
"pymupdf>=1.23.0",
9780
"ruamel.yaml>=0.17.0",
9881
"cachetools>=5.0.0",
82+
"numpy<2.0.0",
83+
]
84+
corrector = [
85+
"transformers>=4.55.2",
86+
"torch>=2.5.0",
87+
"torchvision>=0.20.1",
88+
"pillow>=12.1.0",
89+
"pymupdf>=1.26.7",
90+
"numpy<2.0.0",
9991
]
92+
93+
# ── Audio / Voice ──────────────────────────────────────────────────────────────
10094
voice = [
10195
"pyannote-audio>=3.1.0",
10296
"librosa>=0.10.0",
@@ -121,11 +115,20 @@ voice = [
121115
"accelerate>=0.25.0",
122116
"thop>=0.1.1",
123117
]
118+
119+
# ── LLM / Evaluation ──────────────────────────────────────────────────────────
120+
evaluator = [
121+
"opik>=1.10.9",
122+
]
123+
124+
# ── Web UI ─────────────────────────────────────────────────────────────────────
124125
ui = [
125126
"fastapi",
126127
"uvicorn",
127128
"python-multipart",
128129
]
130+
131+
# ── Dev / Test ─────────────────────────────────────────────────────────────────
129132
dev = [
130133
"pytest>=7.0.0",
131134
"pytest-cov>=4.0.0",
@@ -134,23 +137,37 @@ dev = [
134137
"mypy>=1.0.0",
135138
]
136139

137-
corrector = [
138-
"dotenv>=0.9.9",
139-
"ipython>=8.38.0",
140-
"landingai-ade>=1.5.0",
141-
"matplotlib>=3.10.8",
142-
"numpy<2.0.0",
143-
"pillow>=12.1.0",
144-
"pymupdf>=1.26.7",
145-
"pyyaml>=6.0.3",
146-
"rich>=14.3.2",
147-
"seaborn>=0.13.2",
148-
"torch>=2.5.0",
149-
"torchvision>=0.20.1",
150-
"transformers>=4.55.2",
140+
# ── Bundles ───────────────────────────────────────────────────────────────────
141+
# Installs the full NLP stack (excludes OCR/Voice; intended to exclude GPU tooling, but dask-cuda is still listed)
142+
nlp = [
143+
"nemo-curator>=0.5.0",
144+
"dask-cuda>=24.0.0",
145+
"ftfy>=6.3.1",
146+
"py-data-juicer>=1.0.0",
147+
"datasketch>=1.6.0",
148+
"underthesea>=6.8.0",
149+
"sentence-transformers>=2.2.0",
150+
"faiss-cpu>=1.7.0",
151151
]
152-
evaluator = [
152+
# Installs everything (intended to exclude GPU-only tools; note that dask-cuda is still listed)
153+
all = [
154+
"nemo-curator>=0.5.0",
155+
"dask-cuda>=24.0.0",
156+
"ftfy>=6.3.1",
157+
"py-data-juicer>=1.0.0",
158+
"datasketch>=1.6.0",
159+
"underthesea>=6.8.0",
160+
"sentence-transformers>=2.2.0",
161+
"faiss-cpu>=1.7.0",
162+
"unstructured[all-docs]>=0.16.0",
163+
"python-magic>=0.4.27",
164+
"openai-whisper",
165+
"librosa",
166+
"soundfile",
153167
"opik>=1.10.9",
168+
"fastapi",
169+
"uvicorn",
170+
"python-multipart",
154171
]
155172

156173

0 commit comments

Comments
 (0)