Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/NVIDIA/cutlass.git
[submodule "3rdparty/nccl"]
path = 3rdparty/nccl
url = https://github.com/NVIDIA/nccl.git
branch = v2.30u1
1 change: 1 addition & 0 deletions 3rdparty/nccl
Submodule nccl added at 6a9bc9
3 changes: 3 additions & 0 deletions qa/L1_cpp_distributed/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ if [[ $(nvidia-smi --list-gpus | wc -l) -ge 4 ]]; then
cmake -GNinja -S. -Bbuild
cmake --build build
mpirun --allow-run-as-root --np 4 --oversubscribe ./build/test_comm_gemm

# EP suites; runner self-skips on pre-Hopper GPUs.
bash ./run_test_ep.sh 4 ./build
fi
127 changes: 127 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,34 @@ def setup_common_extension() -> CMakeExtension:
cusolvermp_dir = os.getenv("CUSOLVERMP_HOME", "/usr")
cmake_flags.append(f"-DCUSOLVERMP_DIR={cusolvermp_dir}")

# NCCL EP: on by default; auto-disabled if no arch >= 90.
# Set NVTE_BUILD_WITH_NCCL_EP=0/1 to force off/on.
nccl_ep_env = os.getenv("NVTE_BUILD_WITH_NCCL_EP")
explicit_nccl_ep = nccl_ep_env is not None
build_with_nccl_ep = bool(int(nccl_ep_env)) if explicit_nccl_ep else True

if build_with_nccl_ep:
arch_tokens = [a.strip() for a in str(archs or "").split(";") if a.strip()]
has_hopper_or_newer = any(t.lower() == "native" for t in arch_tokens) or any(
int(t.rstrip("af")) >= 90 for t in arch_tokens if t.rstrip("af").isdigit()
)
if not has_hopper_or_newer:
if explicit_nccl_ep:
raise RuntimeError(
"NVTE_BUILD_WITH_NCCL_EP=1 requires at least one CUDA arch >= 90 in "
f"NVTE_CUDA_ARCHS (got '{archs}'). Add '90' or unset NVTE_BUILD_WITH_NCCL_EP."
)
print(
"[NCCL EP] No CUDA arch >= 90 in NVTE_CUDA_ARCHS"
f" ('{archs}'); auto-disabling NCCL EP (nvte_ep_* will throw at runtime)."
)
build_with_nccl_ep = False

if build_with_nccl_ep:
build_nccl_ep_submodule()
else:
cmake_flags.append("-DNVTE_WITH_NCCL_EP=OFF")

# Add custom CMake arguments from environment variable
nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS")
if nvte_cmake_extra_args:
Expand Down Expand Up @@ -128,6 +156,105 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
return [remove_dups(reqs) for reqs in [install_reqs, test_reqs]]


def _discover_nccl_home() -> str:
"""Resolve NCCL_HOME: honor env var, else probe well-known prefixes, else ldconfig."""
env_home = os.environ.get("NCCL_HOME")
if env_home:
if (Path(env_home) / "include" / "nccl.h").exists():
return env_home
print(
f"[NCCL EP] WARNING: NCCL_HOME='{env_home}' is set but "
f"'{env_home}/include/nccl.h' was not found; falling back to system probes."
)

for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"):
p = Path(cand)
if (p / "include" / "nccl.h").exists() and any(
(p / "lib" / name).exists() or (p / "lib64" / name).exists()
for name in ("libnccl.so", "libnccl.so.2")
):
return str(p)

try:
out = subprocess.check_output(["ldconfig", "-p"], stderr=subprocess.DEVNULL).decode()
for line in out.splitlines():
if "libnccl.so" in line and "=>" in line:
lib_path = Path(line.split("=>")[-1].strip())
root = lib_path.parent.parent
if (root / "include" / "nccl.h").exists():
return str(root)
except (subprocess.CalledProcessError, FileNotFoundError):
pass

raise RuntimeError(
"Could not locate NCCL core (nccl.h + libnccl.so). Set NCCL_HOME to the install prefix."
)


def build_nccl_ep_submodule() -> str:
"""Build libnccl_ep.so from the 3rdparty/nccl submodule.

NCCL EP is on by default; the system NCCL core (libnccl.so) supplies the
headers and runtime symbols. Returns the submodule build directory.
"""
nccl_root = current_file_path / "3rdparty" / "nccl"
if not (nccl_root / "Makefile").exists():
raise RuntimeError(
f"NCCL submodule not found at {nccl_root}. "
"Run `git submodule update --init --recursive`."
)

build_dir = nccl_root / "build"
nccl_ep_lib = build_dir / "lib" / "libnccl_ep.so"

archs = cuda_archs() or "90"
arch_list = []
for a in str(archs).split(";"):
a = a.strip().rstrip("af")
if a and a.isdigit() and int(a) >= 90:
arch_list.append(a)
if not arch_list:
arch_list = ["90"]
gencode = " ".join(f"-gencode=arch=compute_{a},code=sm_{a}" for a in arch_list)

nproc = os.cpu_count() or 8
env = os.environ.copy()
env["NVCC_GENCODE"] = gencode
# NCCL EP needs the core NCCL headers + libnccl.so; write NCCL EP build
# outputs to the submodule's local build/ tree.
nccl_home = _discover_nccl_home()
env["NCCL_HOME"] = nccl_home
env["NCCL_EP_BUILDDIR"] = str(build_dir)

if not nccl_ep_lib.exists():
print(f"[NCCL EP] Building libnccl_ep.so (gencode='{gencode}')")
subprocess.check_call(
["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"],
cwd=str(nccl_root),
env=env,
)

# TE's CMake expects nccl.h under 3rdparty/nccl/build/include/ for its
# version check. Mirror the top-level host headers from the system NCCL
# install — DON'T mirror nccl_device/ because the submodule ships its own
# newer copy at src/include/nccl_device/ with device-side templates that
# conflict with older system versions, and the JIT include path picks the
# submodule's.
nccl_include = build_dir / "include"
nccl_include.mkdir(parents=True, exist_ok=True)
for cand in (Path(nccl_home) / "include", Path("/usr/include")):
p = Path(cand)
if (p / "nccl.h").exists():
for name in ("nccl.h", "nccl_net.h", "nccl_tuner.h"):
src = p / name
dst = nccl_include / name
if src.exists() and not dst.exists():
dst.symlink_to(src)
break

return str(build_dir)


def git_check_submodules() -> None:
"""
Attempt to checkout git submodules automatically during setup.
Expand Down
91 changes: 89 additions & 2 deletions tests/cpp_distributed/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ if(NOT DEFINED TE_LIB_PATH)
get_filename_component(TE_LIB_PATH ${TE_LIB_FILE} DIRECTORY)
endif()

find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED)
find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED NO_CMAKE_SYSTEM_PATH)

message(STATUS "Found transformer_engine library: ${TE_LIB}")
include_directories(../../transformer_engine/common/include)
Expand All @@ -46,12 +46,99 @@ add_executable(test_comm_gemm

find_package(OpenMP REQUIRED)
find_package(MPI REQUIRED)

# ── NCCL library ──────────────────────────────────────────────────────────────
# Search order: NCCL_HOME env → 3rdparty/nccl submodule build → system paths.
set(NCCL_SUBMODULE_BUILD "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build")
find_library(NCCL_LIB
NAMES nccl libnccl
PATH_SUFFIXES lib
HINTS $ENV{NCCL_HOME}/lib ${NCCL_SUBMODULE_BUILD}/lib
PATH_SUFFIXES lib lib64
REQUIRED)

# NCCL headers: prefer submodule build output (has the handle_init API),
# then submodule src, then system (CUDA toolkit).
set(NCCL_SUBMODULE_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build/include")
set(NCCL_SUBMODULE_SRC_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/src/include")
if(EXISTS "${NCCL_SUBMODULE_INCLUDE}/nccl.h")
set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_INCLUDE}")
elseif(EXISTS "${NCCL_SUBMODULE_SRC_INCLUDE}/nccl.h")
set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_SRC_INCLUDE}")
elseif(DEFINED ENV{NCCL_HOME})
set(NCCL_INCLUDE_DIR "$ENV{NCCL_HOME}/include")
endif()
target_include_directories(test_comm_gemm PRIVATE ${MPI_CXX_INCLUDE_PATH} $ENV{CUBLASMP_HOME}/include)
target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest::gtest ${TE_LIB} CUDA::nvrtc MPI::MPI_CXX ${NCCL_LIB} OpenMP::OpenMP_CXX)

include(GoogleTest)
gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600)

# ── EP distributed tests (HT mode) ─────────────────────────────────────────
# No MPI dependency — processes are spawned by run_test_ep.sh with
# --rank / --nranks flags. ncclUniqueId exchange uses a
# shared temp file (see test_ep_common.h for details).
# Headers + libs come from the in-tree 3rdparty/nccl submodule build.
set(NCCL_EP_SUBMODULE_ROOT
"${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl")
find_library(NCCL_EP_LIB
NAMES nccl_ep libnccl_ep
HINTS ${NCCL_EP_SUBMODULE_ROOT}/build/lib
NO_DEFAULT_PATH
REQUIRED)

set(NCCL_EP_INCLUDE_DIR "${NCCL_EP_SUBMODULE_ROOT}/contrib/nccl_ep/include")
if(NOT EXISTS "${NCCL_EP_INCLUDE_DIR}/nccl_ep.h")
message(FATAL_ERROR
"NCCL EP header not found at ${NCCL_EP_INCLUDE_DIR}/nccl_ep.h. "
"Run `git submodule update --init --recursive` to checkout 3rdparty/nccl.")
endif()
message(STATUS "EP test: NCCL EP headers: ${NCCL_EP_INCLUDE_DIR}")

# Collect NCCL include dirs shared by all EP test targets (nccl_ep.h + nccl.h).
set(EP_TEST_NCCL_INCLUDES ${NCCL_EP_INCLUDE_DIR})
if(DEFINED NCCL_INCLUDE_DIR)
list(APPEND EP_TEST_NCCL_INCLUDES ${NCCL_INCLUDE_DIR})
message(STATUS "EP test: NCCL headers: ${NCCL_INCLUDE_DIR}")
endif()

set(EP_TEST_COMMON_INCLUDES
${EP_TEST_NCCL_INCLUDES}
../../transformer_engine/common/include
../../transformer_engine/common
${CMAKE_CURRENT_SOURCE_DIR})

set(EP_TEST_COMMON_LIBS
CUDA::cuda_driver
CUDA::cudart
CUDA::nvrtc
GTest::gtest
${TE_LIB}
${NCCL_LIB}
${NCCL_EP_LIB})

# nvrtc symbols are referenced from libtransformer_engine.so but not in its
# DT_NEEDED list (loaded via dlopen in Python). For cpp tests we link nvrtc
# explicitly with --no-as-needed so the linker keeps the dependency.
set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed")

# ── EP init tests (InitPath, HandleMemSizeQuery) ─────────────────────────────
add_executable(test_ep_init test_ep_init.cu)
target_include_directories(test_ep_init PRIVATE ${EP_TEST_COMMON_INCLUDES})
target_link_libraries(test_ep_init PUBLIC ${EP_TEST_COMMON_LIBS})
target_link_options(test_ep_init PUBLIC ${EP_TEST_LINK_OPTS})

# ── EP pipeline tests (dispatch, combine, bwd, integrated) ───────────────────
add_executable(test_ep_pipeline test_ep_pipeline.cu)
target_include_directories(test_ep_pipeline PRIVATE ${EP_TEST_COMMON_INCLUDES})
target_link_libraries(test_ep_pipeline PUBLIC ${EP_TEST_COMMON_LIBS})
target_link_options(test_ep_pipeline PUBLIC ${EP_TEST_LINK_OPTS})

# ── EP coverage tests (multi-handle, top_k=1, empty experts, negatives, threading) ──
add_executable(test_ep_coverage test_ep_coverage.cu)
target_include_directories(test_ep_coverage PRIVATE ${EP_TEST_COMMON_INCLUDES})
target_link_libraries(test_ep_coverage PUBLIC ${EP_TEST_COMMON_LIBS})
target_link_options(test_ep_coverage PUBLIC ${EP_TEST_LINK_OPTS})

# Do NOT use gtest_discover_tests — these binaries require multi-process
# launch via run_test_ep.sh, not direct single-process execution.
message(STATUS "EP distributed tests enabled: ${NCCL_EP_LIB}")
Loading
Loading