Skip to content

Commit f20ca97

Browse files
authored
Merge branch 'main' into chenjiel/add_qwen_moe_test
2 parents 4d522da + 3ad4f4f commit f20ca97

271 files changed

Lines changed: 20036 additions & 5228 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env bash
2+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Re-vendor upstream Claude skills from NVIDIA-NeMo/Evaluator at a pinned SHA.
18+
#
19+
# Scope: only skills we vendor verbatim (launching-evals, accessing-mlflow).
20+
# The `evaluation` skill is a *modified* fork of upstream nel-assistant and is
21+
# NOT managed by this script — update it manually when pulling upstream changes.
22+
#
23+
# Usage:
24+
# .claude/scripts/sync-upstream-skills.sh # re-vendor at the pinned SHA
25+
# UPSTREAM_SHA=<sha> .claude/scripts/sync-upstream-skills.sh # bump to a new SHA
26+
#
27+
# Requires: gh, base64, awk. Run from the repo root.
28+
#
29+
# The script overwrites .claude/skills/<skill>/ with upstream contents and
30+
# re-applies our provenance lines into each SKILL.md frontmatter. If you have
31+
# local changes to a vendored skill, they will be lost — that is expected,
32+
# since vendored-verbatim skills should not be modified locally.
33+
34+
set -euo pipefail

# Pinned upstream commit. Bump this (or pass UPSTREAM_SHA=...) when syncing.
DEFAULT_SHA="8fa16b237d11e213ea665d5bad6b44d393762317"
SHA="${UPSTREAM_SHA:-$DEFAULT_SHA}"
SHORT_SHA="${SHA:0:7}"

UPSTREAM_REPO="NVIDIA-NeMo/Evaluator"
UPSTREAM_BASE="packages/nemo-evaluator-launcher/.claude/skills"
DEST_BASE=".claude/skills"

# Fail fast on missing prerequisites (the header documents: gh, base64, awk).
# Without this, a missing tool would only surface mid-sync, *after* rm -rf
# has already deleted a vendored skill directory.
for tool in gh base64 awk; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "error: required tool '$tool' not found in PATH" >&2
    exit 1
  fi
done

# Guard against running from the wrong directory: all destination paths are
# relative to the repo root.
if [[ ! -d "$DEST_BASE" ]]; then
  echo "error: run from the repo root (expected $DEST_BASE/ to exist)" >&2
  exit 1
fi

echo "Syncing upstream skills from $UPSTREAM_REPO @ $SHORT_SHA"
51+
52+
# List one directory level of a vendored skill in the upstream repo.
#
# Arguments:
#   $1 - skill name (directory under $UPSTREAM_BASE)
#   $2 - optional subpath inside the skill; may be omitted or empty
# Outputs: one "<type>\t<name>" line per directory entry on stdout.
#
# NOTE(review): this helper appears unused in the visible script
# (fetch_skill_recursive inlines the same gh call) — kept for callers
# outside this view.
fetch_tree() {
  local skill="$1"
  # Previously mandatory; an empty value produced a malformed URL ending in
  # "/?ref=". Now optional, mirroring fetch_skill_recursive's URL building.
  local path="${2:-}"
  local remote="$UPSTREAM_BASE/$skill"
  [[ -n "$path" ]] && remote="$remote/$path"
  gh api "repos/$UPSTREAM_REPO/contents/$remote?ref=$SHA" \
    -q '.[] | "\(.type)\t\(.name)"'
}
58+
59+
# Download one upstream file (base64-encoded by the GitHub contents API)
# and write the decoded bytes to a local path.
#
# Arguments:
#   $1 - repo-relative source path in $UPSTREAM_REPO
#   $2 - local destination path; parent directories are created as needed
fetch_file() {
  local src_path="$1"
  local dst_path="$2"
  local dst_dir
  dst_dir="$(dirname "$dst_path")"
  mkdir -p "$dst_dir"
  gh api "repos/$UPSTREAM_REPO/contents/$src_path?ref=$SHA" -q '.content' | base64 -d > "$dst_path"
}
65+
66+
# Recursively mirror one skill directory from upstream into $DEST_BASE.
#
# Arguments:
#   $1 - skill name (directory under $UPSTREAM_BASE)
#   $2 - optional subpath, used internally while recursing
fetch_skill_recursive() {
  local skill_name="$1"
  local sub="${2:-}"

  local remote_dir="$UPSTREAM_BASE/$skill_name"
  if [[ -n "$sub" ]]; then
    remote_dir="$remote_dir/$sub"
  fi

  # One "<type>\t<name>" line per directory entry at this level.
  local listing
  listing=$(gh api "repos/$UPSTREAM_REPO/contents/$remote_dir?ref=$SHA" -q '.[] | "\(.type)\t\(.name)"')

  local entry_type entry_name child
  while IFS=$'\t' read -r entry_type entry_name; do
    if [[ -n "$sub" ]]; then
      child="$sub/$entry_name"
    else
      child="$entry_name"
    fi

    case "$entry_type" in
      file)
        local local_dst="$DEST_BASE/$skill_name/$child"
        echo " fetch: $local_dst"
        fetch_file "$UPSTREAM_BASE/$skill_name/$child" "$local_dst"
        ;;
      dir)
        fetch_skill_recursive "$skill_name" "$child"
        ;;
    esac
  done <<< "$listing"
}
92+
93+
# Inject our provenance lines into a SKILL.md frontmatter, right after the
94+
# `description:` line. Idempotent — removes any existing provenance block first.
95+
inject_provenance() {
96+
local skill="$1"
97+
local extra_note="${2:-}"
98+
local path="$DEST_BASE/$skill/SKILL.md"
99+
100+
awk -v sha="$SHA" -v short="$SHORT_SHA" -v skill="$skill" -v extra="$extra_note" '
101+
BEGIN { in_fm = 0; injected = 0; fm_end_seen = 0 }
102+
# Skip any pre-existing provenance or license lines we own
103+
/^license: Apache-2\.0$/ && in_fm && !fm_end_seen { next }
104+
/^# Vendored verbatim/ && in_fm && !fm_end_seen { next }
105+
/^# https:\/\/github\.com\/NVIDIA-NeMo\/Evaluator\/tree\// && in_fm && !fm_end_seen { next }
106+
/^# To re-sync:/ && in_fm && !fm_end_seen { next }
107+
/^# Note: this skill depends on the mlflow-mcp/ && in_fm && !fm_end_seen { next }
108+
/^# configured in the user/ && in_fm && !fm_end_seen { next }
109+
{
110+
print
111+
if ($0 == "---") {
112+
if (in_fm == 0) { in_fm = 1 }
113+
else { in_fm = 0; fm_end_seen = 1 }
114+
}
115+
if (in_fm && !injected && $0 ~ /^description: /) {
116+
print "license: Apache-2.0"
117+
print "# Vendored verbatim from NVIDIA NeMo Evaluator (commit " short ")"
118+
print "# https://github.com/NVIDIA-NeMo/Evaluator/tree/" sha "/packages/nemo-evaluator-launcher/.claude/skills/" skill
119+
print "# To re-sync: .claude/scripts/sync-upstream-skills.sh"
120+
if (extra != "") {
121+
n = split(extra, lines, "\\|")
122+
for (i = 1; i <= n; i++) print "# " lines[i]
123+
}
124+
injected = 1
125+
}
126+
}
127+
' "$path" > "$path.tmp"
128+
mv "$path.tmp" "$path"
129+
}
130+
131+
# Re-vendor each verbatim skill: wipe the local copy, mirror upstream,
# then re-stamp provenance into the SKILL.md frontmatter.
for skill_name in launching-evals accessing-mlflow; do
  echo ""
  echo "== $skill_name =="
  rm -rf "${DEST_BASE:?}/$skill_name"
  fetch_skill_recursive "$skill_name"

  if [[ "$skill_name" == "accessing-mlflow" ]]; then
    inject_provenance "$skill_name" \
      "Note: this skill depends on the mlflow-mcp MCP server (https://github.com/kkruglik/mlflow-mcp)|configured in the user's Claude Code setup."
  else
    inject_provenance "$skill_name"
  fi
done

echo ""
echo "Done. Review with: git diff $DEST_BASE/launching-evals $DEST_BASE/accessing-mlflow"
echo "If the SHA changed, update DEFAULT_SHA at the top of this script before committing."
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
---
2+
name: accessing-mlflow
3+
description: Query and browse evaluation results stored in MLflow. Use when the user wants to look up runs by invocation ID, compare metrics across models, fetch artifacts (configs, logs, results), or set up the MLflow MCP server. ALWAYS triggers on mentions of MLflow, experiment results, run comparison, invocation IDs in the context of results, or MLflow MCP setup.
4+
license: Apache-2.0
5+
# Vendored verbatim from NVIDIA NeMo Evaluator (commit 8fa16b2)
6+
# https://github.com/NVIDIA-NeMo/Evaluator/tree/8fa16b237d11e213ea665d5bad6b44d393762317/packages/nemo-evaluator-launcher/.claude/skills/accessing-mlflow
7+
# To re-sync: .claude/scripts/sync-upstream-skills.sh
8+
# Note: this skill depends on the mlflow-mcp MCP server (https://github.com/kkruglik/mlflow-mcp)
9+
# configured in the user's Claude Code setup.
10+
---
11+
12+
# Accessing MLflow
13+
14+
## MCP Server
15+
16+
[mlflow-mcp](https://github.com/kkruglik/mlflow-mcp) gives agents direct access to MLflow — query runs, compare metrics, browse artifacts, all through natural language.
17+
18+
## ID Convention
19+
20+
When the user provides a hex ID (e.g. `71f3f3199ea5e1f0`) without specifying what it is, assume it is an **invocation_id** (not an MLflow run_id). An invocation_id identifies a launcher invocation and is stored as both a tag and a param on MLflow runs. One invocation can produce multiple MLflow runs (one per task). You may need to search across multiple experiments if you don't know which experiment the run belongs to.
21+
22+
## Querying Runs
23+
24+
```python
25+
# Find runs by invocation_id
26+
MLflow:search_runs_by_tags(experiment_id, {"invocation_id": "<invocation_id>"})
27+
28+
# Query for example model/task runs
29+
MLflow:query_runs(experiment_id, "tags.model LIKE '%<model>%'")
30+
MLflow:query_runs(experiment_id, "tags.task_name LIKE '%<task_name>%'")
31+
32+
# Get a config from run's artifacts
33+
MLflow:get_artifact_content(run_id, "config.yml")
34+
35+
# Get nested stats from run's artifacts
36+
MLflow:get_artifact_content(run_id, "artifacts/eval_factory_metrics.json")
37+
```
38+
39+
NOTE: You WILL NOT find PENDING, RUNNING, KILLED, or FAILED runs in MLflow! Only SUCCESSFUL runs are exported to MLflow.
40+
41+
## Workflow Tips
42+
43+
When comparing metrics across runs, fetch the data via MCP, then run the computation in Python for exact results rather than doing math in-context:
44+
45+
```bash
46+
uv run --with pandas python3 << 'EOF'
47+
import pandas as pd
48+
# ... compute deltas, averages, etc.
49+
EOF
50+
```
51+
52+
## Artifacts Structure
53+
54+
```
55+
<harness>.<task>/
56+
├── artifacts/
57+
│ ├── config.yml # Fully resolved config used during the evaluation
58+
│ ├── launcher_unresolved_config.yaml # Unresolved config passed to the launcher
59+
│ ├── results.yml # All results in YAML format
60+
│ ├── eval_factory_metrics.json # Runtime stats (latency, tokens count, memory)
61+
│ ├── report.html # Request-Response Pairs samples in HTML format (if enabled)
62+
│ └── report.json # Request-Response Pairs samples in JSON format (if enabled)
63+
└── logs/
64+
├── client-*.log # Evaluation client
65+
├── server-*-N.log # Deployment per node
66+
├── slurm-*.log # Slurm job
67+
└── proxy-*.log # Request proxy
68+
```
69+
70+
## Troubleshooting
71+
72+
If the MLflow MCP server fails to load or its tools are unavailable:
73+
74+
1. **`uvx` not found** — install [uv](https://docs.astral.sh/uv/getting-started/installation/):
75+
```bash
76+
curl -LsSf https://astral.sh/uv/install.sh | sh
77+
```
78+
2. **MCP server not configured** — add the config and restart the agent:
79+
80+
**For Claude Code** — add to `.claude/settings.json` (project or user level), under `"mcpServers"`:
81+
```json
82+
"MLflow": {
83+
"command": "uvx",
84+
"args": ["mlflow-mcp"],
85+
"env": {
86+
"MLFLOW_TRACKING_URI": "https://<your-mlflow-server>/"
87+
}
88+
}
89+
```
90+
91+
**For Cursor** — edit `~/.cursor/mcp.json` (Settings > Tools & MCP > New MCP Server):
92+
```json
93+
{
94+
"mcpServers": {
95+
"MLflow": {
96+
"command": "uvx",
97+
"args": ["mlflow-mcp"],
98+
"env": {
99+
"MLFLOW_TRACKING_URI": "https://<your-mlflow-server>/"
100+
}
101+
}
102+
}
103+
}
104+
```
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Credentials Setup
2+
3+
Tokens and registry credentials that ModelOpt workflows need across local and cluster environments. Not SLURM-specific — referenced from PTQ, deployment, evaluation, and slurm-setup skills.
4+
5+
## Check what's already set first
6+
7+
Before configuring anything, check what the user already has — many of these are likely in place from prior `hf auth login`, `docker login`, or previous SLURM work. Skip any section below for which credentials are already present.
8+
9+
```bash
10+
# HF token: env var or persisted from `hf auth login`
11+
[ -n "$HF_TOKEN" ] && echo "✓ HF_TOKEN set in env"
12+
[ -s ~/.cache/huggingface/token ] && echo "✓ HF token at ~/.cache/huggingface/token (from 'hf auth login')"
13+
14+
# Docker / NGC registry credentials
15+
grep -qE '"(nvcr\.io|https://index\.docker\.io)"' ~/.docker/config.json 2>/dev/null && echo "✓ Docker login present"
16+
17+
# Enroot / pyxis credentials (on cluster login node, for SLURM users)
18+
grep -qE '^machine nvcr\.io ' ~/.config/enroot/.credentials 2>/dev/null && echo "✓ Enroot NGC entry present"
19+
```
20+
21+
For remote clusters, run the same checks via SSH (`ssh <cluster-login> '<check>'`) — credentials live on the cluster, not your workstation.
22+
23+
## HuggingFace token (`HF_TOKEN`)
24+
25+
Required for gated models (e.g., Llama, Mistral, some Nemotron variants) and gated datasets (e.g., GPQA, HLE).
26+
27+
Generate at <https://huggingface.co/settings/tokens>. Two persistence options (you can use either or both):
28+
29+
1. **`hf auth login`** (recommended for interactive use) — stores the token at `~/.cache/huggingface/token`. The HF Python client picks it up automatically; `transformers`, `datasets`, and the `hf` CLI all read this file without needing `HF_TOKEN` in the env.
30+
31+
```bash
32+
pip install -U huggingface_hub
33+
hf auth login # paste the token interactively
34+
```
35+
36+
2. **Environment variable** (good for scripts, CI, and remote sessions):
37+
38+
```bash
39+
export HF_TOKEN=hf_...
40+
```
41+
42+
Persist in `~/.bashrc` or a project-local `.env` file. `HF_TOKEN` takes precedence when both are present.
43+
44+
## NGC API key (for `nvcr.io`)
45+
46+
Required for pulling NGC images (`nvcr.io/nvidia/pytorch:...`, `nvcr.io/nvidia/vllm:...`) via Docker, `srun --container-image`, or enroot.
47+
48+
Generate at <https://ngc.nvidia.com/setup/api-key>.
49+
50+
### Docker
51+
52+
```bash
53+
printf '%s' '<NGC_API_KEY>' | docker login nvcr.io -u '$oauthtoken' --password-stdin
54+
```
55+
56+
### Enroot (SLURM / pyxis)
57+
58+
Add an entry to `~/.config/enroot/.credentials` on the cluster. The file may already hold credentials for other registries — **append rather than overwrite**:
59+
60+
```bash
61+
mkdir -p ~/.config/enroot
62+
CREDS=~/.config/enroot/.credentials
63+
touch "$CREDS"
64+
grep -q '^machine nvcr\.io ' "$CREDS" || \
65+
echo 'machine nvcr.io login $oauthtoken password <NGC_API_KEY>' >> "$CREDS"
66+
chmod 600 "$CREDS"
67+
```
68+
69+
> **Note**: `$oauthtoken` is a **literal string** required by NGC, not a shell variable. Do not replace it and do not let your shell expand it — the single quotes above keep it literal.
70+
71+
Without this, `srun --container-image=nvcr.io/...` fails with `401 Unauthorized` when the compute node tries to pull.
72+
73+
## Docker Hub login
74+
75+
Only needed if you hit rate limits pulling public images:
76+
77+
```bash
78+
docker login
79+
```
80+
81+
## Summary
82+
83+
| Credential | Used for | Set via |
84+
|---|---|---|
85+
| `HF_TOKEN` | Gated HF models / datasets | Env var (`export HF_TOKEN=...`) or `.env` |
86+
| NGC API key | `nvcr.io` image pulls | `docker login` or `~/.config/enroot/.credentials` |
87+
| Docker Hub | Rate-limited public image pulls | `docker login` |

.claude/skills/common/remote-execution.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,16 @@ clusters:
2828
default_cluster: my-cluster
2929
```
3030
31+
### Staging checkpoints from your workstation
32+
33+
Workstation filesystems (`/home/scratch.*`, local NFS) are **not** mounted on the cluster. If a checkpoint was produced on your workstation, copy it to the cluster's own storage before submitting any job that references it — NEL and SLURM do NOT sync checkpoints automatically.
34+
35+
```bash
36+
rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/checkpoints/
37+
```
38+
39+
Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
40+
3141
See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.
3242

3343
---

.claude/skills/common/slurm-setup.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ srun \
5151
"
5252
```
5353

54+
### Container registry credentials (pyxis)
55+
56+
If `srun --container-image` uses an image from a private registry (e.g., `nvcr.io/nvidia/...`), pyxis/enroot needs registry credentials on the cluster in `~/.config/enroot/.credentials`. See `skills/common/credentials.md` for the NGC / Docker / HF token setup. Without this, `srun` fails with `401 Unauthorized` when the compute node pulls.
57+
5458
Submit and capture the job ID:
5559

5660
```bash

0 commit comments

Comments
 (0)