@@ -10,28 +10,19 @@ authors = [
1010]
1111keywords = [" xfmr-zem" , " data-pipeline" , " zenml" , " nemo-curator" , " data-juicer" , " mlops" ]
1212
13+ # ── Core: only what is required to run the Zem Engine + CLI ──
1314dependencies = [
1415 " pandas>=2.0.0" ,
1516 " numpy>=1.24.0" ,
1617 " pyyaml>=6.0" ,
1718 " click>=8.0.0" ,
1819 " rich>=13.0.0" ,
1920 " loguru>=0.7.0" ,
21+ " pydantic>=2.0.0" ,
22+ " pyarrow>=15.0.0" ,
2023 " fastmcp>=0.1.0" ,
2124 " mcp>=0.1.0" ,
2225 " zenml[local,server]>=0.75.0" ,
23- " pyarrow>=15.0.0" ,
24- " nemo-curator>=1.0.0" ,
25- " dask-cuda>=24.0.0" ,
26- " ftfy>=6.3.1" ,
27- " pydantic>=2.0.0" ,
28- " datasketch>=1.6.0" ,
29- " underthesea>=9.2.9" ,
30- " sentence-transformers>=5.2.2" ,
31- " faiss-cpu>=1.13.2" ,
32- " transformers>=4.55.2" ,
33- " unstructured[all-docs]>=0.16.0" ,
34- " python-magic>=0.4.27" ,
3526]
3627
3728[project .urls ]
@@ -41,25 +32,26 @@ Issues = "https://github.com/OAI-Labs/xfmr-zem/issues"
4132Changelog = " https://github.com/OAI-Labs/xfmr-zem/blob/main/CHANGELOG.md"
4233
4334[project .optional-dependencies ]
44- zenml = [
45- " zenml>=0.75.0" ,
46- ]
35+ # ── NLP / Text Processing ─────────────────────────────────────────────────────
4736nemo = [
48- " nemo-curator>=0.6.0" ,
37+ " nemo-curator>=0.5.0" ,
38+ " dask-cuda>=24.0.0" ,
39+ " ftfy>=6.3.1" ,
4940]
5041datajuicer = [
5142 " py-data-juicer>=1.0.0" ,
5243]
5344deduplication = [
5445 " datasketch>=1.6.0" ,
5546 " underthesea>=6.8.0" ,
47+ " sentence-transformers>=2.2.0" ,
48+ " faiss-cpu>=1.7.0" ,
5649]
57- all = [
58- " zenml>=0.75.0" ,
59- " nemo-curator>=0.6.0" ,
60- " py-data-juicer>=1.0.0" ,
61- " datasketch>=1.6.0" ,
62- " underthesea>=6.8.0" ,
50+
51+ # ── Document / Multimodal ─────────────────────────────────────────────────────
52+ document = [
53+ " unstructured[all-docs]>=0.16.0" ,
54+ " python-magic>=0.4.27" ,
6355]
6456audio = [
6557 " lhotse>=1.24.0" ,
@@ -70,19 +62,12 @@ vn_asr = [
7062 " transformers>=4.40.0" ,
7163]
7264ocr = [
73- " dotenv>=0.9.9" ,
74- " ipython>=8.38.0" ,
75- " landingai-ade>=1.5.0" ,
76- " matplotlib>=3.10.8" ,
77- " numpy<2.0.0" ,
78- " pillow>=12.1.0" ,
79- " pymupdf>=1.26.7" ,
80- " pyyaml>=6.0.3" ,
81- " rich>=14.3.2" ,
82- " seaborn>=0.13.2" ,
65+ " pymupdf>=1.23.0" ,
66+ " pdfplumber>=0.11.0" ,
8367 " pytesseract>=0.3.10" ,
8468 " paddleocr>=2.7.0" ,
8569 " paddlepaddle>=2.6.0" ,
70+ " landingai-ade>=1.5.0" ,
8671 " transformers>=4.40.0" ,
8772 " torch>=2.5.1" ,
8873 " torchvision>=0.20.1" ,
@@ -92,11 +77,20 @@ ocr = [
9277 " shapely" ,
9378 " pyclipper" ,
9479 " einops" ,
95- " pdfplumber>=0.11.0" ,
96- " pymupdf>=1.23.0" ,
9780 " ruamel.yaml>=0.17.0" ,
9881 " cachetools>=5.0.0" ,
82+ " numpy<2.0.0" ,
83+ ]
84+ corrector = [
85+ " transformers>=4.55.2" ,
86+ " torch>=2.5.0" ,
87+ " torchvision>=0.20.1" ,
88+ " pillow>=12.1.0" ,
89+ " pymupdf>=1.26.7" ,
90+ " numpy<2.0.0" ,
9991]
92+
93+ # ── Audio / Voice ──────────────────────────────────────────────────────────────
10094voice = [
10195 " pyannote-audio>=3.1.0" ,
10296 " librosa>=0.10.0" ,
@@ -121,11 +115,20 @@ voice = [
121115 " accelerate>=0.25.0" ,
122116 " thop>=0.1.1" ,
123117]
118+
119+ # ── LLM / Evaluation ──────────────────────────────────────────────────────────
120+ evaluator = [
121+ " opik>=1.10.9" ,
122+ ]
123+
124+ # ── Web UI ─────────────────────────────────────────────────────────────────────
124125ui = [
125126 " fastapi" ,
126127 " uvicorn" ,
127128 " python-multipart" ,
128129]
130+
131+ # ── Dev / Test ─────────────────────────────────────────────────────────────────
129132dev = [
130133 " pytest>=7.0.0" ,
131134 " pytest-cov>=4.0.0" ,
@@ -134,23 +137,37 @@ dev = [
134137 " mypy>=1.0.0" ,
135138]
136139
137- corrector = [
138- " dotenv>=0.9.9" ,
139- " ipython>=8.38.0" ,
140- " landingai-ade>=1.5.0" ,
141- " matplotlib>=3.10.8" ,
142- " numpy<2.0.0" ,
143- " pillow>=12.1.0" ,
144- " pymupdf>=1.26.7" ,
145- " pyyaml>=6.0.3" ,
146- " rich>=14.3.2" ,
147- " seaborn>=0.13.2" ,
148- " torch>=2.5.0" ,
149- " torchvision>=0.20.1" ,
150- " transformers>=4.55.2" ,
140+ # ── Bundles ───────────────────────────────────────────────────────────────────
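141+ # Install the full NLP stack (does not include GPU/OCR/Voice). NOTE(review): dask-cuda is listed in this bundle and appears to need a CUDA environment — confirm it belongs here.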
142+ nlp = [
143+ " nemo-curator>=0.5.0" ,
144+ " dask-cuda>=24.0.0" ,
145+ " ftfy>=6.3.1" ,
146+ " py-data-juicer>=1.0.0" ,
147+ " datasketch>=1.6.0" ,
148+ " underthesea>=6.8.0" ,
149+ " sentence-transformers>=2.2.0" ,
150+ " faiss-cpu>=1.7.0" ,
151151]
152- evaluator = [
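152+ # Install everything (except GPU-only tools). NOTE(review): dask-cuda is listed in this bundle and appears to be GPU-only — confirm it belongs here.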
153+ all = [
154+ " nemo-curator>=0.5.0" ,
155+ " dask-cuda>=24.0.0" ,
156+ " ftfy>=6.3.1" ,
157+ " py-data-juicer>=1.0.0" ,
158+ " datasketch>=1.6.0" ,
159+ " underthesea>=6.8.0" ,
160+ " sentence-transformers>=2.2.0" ,
161+ " faiss-cpu>=1.7.0" ,
162+ " unstructured[all-docs]>=0.16.0" ,
163+ " python-magic>=0.4.27" ,
164+ " openai-whisper" ,
165+ " librosa" ,
166+ " soundfile" ,
153167 " opik>=1.10.9" ,
168+ " fastapi" ,
169+ " uvicorn" ,
170+ " python-multipart" ,
154171]
155172
156173
0 commit comments