-
Notifications
You must be signed in to change notification settings - Fork 735
Expert Parallelism: common C API + NCCL EP backend #3034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
17e5126
cef4b33
0086be4
2dc9fe7
c93387c
ead4344
eb83342
319f9d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,6 +83,34 @@ def setup_common_extension() -> CMakeExtension: | |
| cusolvermp_dir = os.getenv("CUSOLVERMP_HOME", "/usr") | ||
| cmake_flags.append(f"-DCUSOLVERMP_DIR={cusolvermp_dir}") | ||
|
|
||
| # NCCL EP: on by default; auto-disabled if no arch >= 90. | ||
| # Set NVTE_BUILD_WITH_NCCL_EP=0/1 to force off/on. | ||
| nccl_ep_env = os.getenv("NVTE_BUILD_WITH_NCCL_EP") | ||
| explicit_nccl_ep = nccl_ep_env is not None | ||
| build_with_nccl_ep = bool(int(nccl_ep_env)) if explicit_nccl_ep else True | ||
|
|
||
| if build_with_nccl_ep: | ||
| arch_tokens = [a.strip() for a in str(archs or "").split(";") if a.strip()] | ||
| has_hopper_or_newer = any(t.lower() == "native" for t in arch_tokens) or any( | ||
| int(t.rstrip("af")) >= 90 for t in arch_tokens if t.rstrip("af").isdigit() | ||
| ) | ||
| if not has_hopper_or_newer: | ||
| if explicit_nccl_ep: | ||
| raise RuntimeError( | ||
| "NVTE_BUILD_WITH_NCCL_EP=1 requires at least one CUDA arch >= 90 in " | ||
| f"NVTE_CUDA_ARCHS (got '{archs}'). Add '90' or unset NVTE_BUILD_WITH_NCCL_EP." | ||
| ) | ||
| print( | ||
| "[NCCL EP] No CUDA arch >= 90 in NVTE_CUDA_ARCHS" | ||
| f" ('{archs}'); auto-disabling NCCL EP (nvte_ep_* will throw at runtime)." | ||
| ) | ||
| build_with_nccl_ep = False | ||
|
Comment on lines
+92
to
+107
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this needed? Considering that we are fine if they compile for anything AND Hopper+, then we should also be fine if they compile for the same things without Hopper+ (and then it should just say unsupported at runtime once somebody tries to run it on that hardware). |
||
|
|
||
| if build_with_nccl_ep: | ||
| build_nccl_ep_submodule() | ||
| else: | ||
| cmake_flags.append("-DNVTE_WITH_NCCL_EP=OFF") | ||
|
|
||
| # Add custom CMake arguments from environment variable | ||
| nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS") | ||
| if nvte_cmake_extra_args: | ||
|
|
@@ -128,6 +156,105 @@ def setup_requirements() -> Tuple[List[str], List[str]]: | |
| return [remove_dups(reqs) for reqs in [install_reqs, test_reqs]] | ||
|
|
||
|
|
||
| def _discover_nccl_home() -> str: | ||
| """Resolve NCCL_HOME: honor env var, else probe well-known prefixes, else ldconfig.""" | ||
| env_home = os.environ.get("NCCL_HOME") | ||
| if env_home: | ||
| if (Path(env_home) / "include" / "nccl.h").exists(): | ||
| return env_home | ||
| print( | ||
| f"[NCCL EP] WARNING: NCCL_HOME='{env_home}' is set but " | ||
| f"'{env_home}/include/nccl.h' was not found; falling back to system probes." | ||
| ) | ||
|
|
||
| for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"): | ||
| p = Path(cand) | ||
| if (p / "include" / "nccl.h").exists() and any( | ||
| (p / "lib" / name).exists() or (p / "lib64" / name).exists() | ||
| for name in ("libnccl.so", "libnccl.so.2") | ||
| ): | ||
| return str(p) | ||
|
|
||
| try: | ||
| out = subprocess.check_output(["ldconfig", "-p"], stderr=subprocess.DEVNULL).decode() | ||
| for line in out.splitlines(): | ||
| if "libnccl.so" in line and "=>" in line: | ||
| lib_path = Path(line.split("=>")[-1].strip()) | ||
| root = lib_path.parent.parent | ||
| if (root / "include" / "nccl.h").exists(): | ||
| return str(root) | ||
| except (subprocess.CalledProcessError, FileNotFoundError): | ||
| pass | ||
|
|
||
| raise RuntimeError( | ||
| "Could not locate NCCL core (nccl.h + libnccl.so). Set NCCL_HOME to the install prefix." | ||
| ) | ||
|
|
||
|
|
||
| def build_nccl_ep_submodule() -> str: | ||
| """Build libnccl_ep.so from the 3rdparty/nccl submodule. | ||
|
|
||
| NCCL EP is on by default; the system NCCL core (libnccl.so) supplies the | ||
| headers and runtime symbols. Returns the submodule build directory. | ||
| """ | ||
| nccl_root = current_file_path / "3rdparty" / "nccl" | ||
| if not (nccl_root / "Makefile").exists(): | ||
| raise RuntimeError( | ||
| f"NCCL submodule not found at {nccl_root}. " | ||
| "Run `git submodule update --init --recursive`." | ||
| ) | ||
|
|
||
| build_dir = nccl_root / "build" | ||
| nccl_ep_lib = build_dir / "lib" / "libnccl_ep.so" | ||
|
|
||
| archs = cuda_archs() or "90" | ||
| arch_list = [] | ||
| for a in str(archs).split(";"): | ||
| a = a.strip().rstrip("af") | ||
| if a and a.isdigit() and int(a) >= 90: | ||
| arch_list.append(a) | ||
| if not arch_list: | ||
| arch_list = ["90"] | ||
| gencode = " ".join(f"-gencode=arch=compute_{a},code=sm_{a}" for a in arch_list) | ||
|
|
||
| nproc = os.cpu_count() or 8 | ||
| env = os.environ.copy() | ||
| env["NVCC_GENCODE"] = gencode | ||
| # NCCL EP needs the core NCCL headers + libnccl.so; write NCCL EP build | ||
| # outputs to the submodule's local build/ tree. | ||
| nccl_home = _discover_nccl_home() | ||
| env["NCCL_HOME"] = nccl_home | ||
| env["NCCL_EP_BUILDDIR"] = str(build_dir) | ||
|
|
||
| if not nccl_ep_lib.exists(): | ||
| print(f"[NCCL EP] Building libnccl_ep.so (gencode='{gencode}')") | ||
| subprocess.check_call( | ||
| ["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"], | ||
| cwd=str(nccl_root), | ||
| env=env, | ||
| ) | ||
|
|
||
| # TE's CMake expects nccl.h under 3rdparty/nccl/build/include/ for its | ||
| # version check. Mirror the top-level host headers from the system NCCL | ||
| # install — DON'T mirror nccl_device/ because the submodule ships its own | ||
| # newer copy at src/include/nccl_device/ with device-side templates that | ||
| # conflict with older system versions, and the JIT include path picks the | ||
| # submodule's. | ||
| nccl_include = build_dir / "include" | ||
| nccl_include.mkdir(parents=True, exist_ok=True) | ||
| for cand in (Path(nccl_home) / "include", Path("/usr/include")): | ||
| p = Path(cand) | ||
| if (p / "nccl.h").exists(): | ||
| for name in ("nccl.h", "nccl_net.h", "nccl_tuner.h"): | ||
| src = p / name | ||
| dst = nccl_include / name | ||
| if src.exists() and not dst.exists(): | ||
| dst.symlink_to(src) | ||
| break | ||
|
Comment on lines
+241
to
+257
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really have to do it like that? |
||
|
|
||
| return str(build_dir) | ||
|
|
||
|
|
||
| def git_check_submodules() -> None: | ||
| """ | ||
| Attempt to checkout git submodules automatically during setup. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,7 +30,7 @@ if(NOT DEFINED TE_LIB_PATH) | |
| get_filename_component(TE_LIB_PATH ${TE_LIB_FILE} DIRECTORY) | ||
| endif() | ||
|
|
||
| find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED) | ||
| find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED NO_CMAKE_SYSTEM_PATH) | ||
|
phu0ngng marked this conversation as resolved.
Outdated
|
||
|
|
||
| message(STATUS "Found transformer_engine library: ${TE_LIB}") | ||
| include_directories(../../transformer_engine/common/include) | ||
|
|
@@ -46,12 +46,99 @@ add_executable(test_comm_gemm | |
|
|
||
| find_package(OpenMP REQUIRED) | ||
| find_package(MPI REQUIRED) | ||
|
|
||
| # ── NCCL library ────────────────────────────────────────────────────────────── | ||
| # Search order: NCCL_HOME env → 3rdparty/nccl submodule build → system paths. | ||
| set(NCCL_SUBMODULE_BUILD "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build") | ||
| find_library(NCCL_LIB | ||
| NAMES nccl libnccl | ||
| PATH_SUFFIXES lib | ||
| HINTS $ENV{NCCL_HOME}/lib ${NCCL_SUBMODULE_BUILD}/lib | ||
| PATH_SUFFIXES lib lib64 | ||
| REQUIRED) | ||
|
|
||
| # NCCL headers: prefer submodule build output (has the handle_init API), | ||
| # then submodule src, then system (CUDA toolkit). | ||
| set(NCCL_SUBMODULE_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build/include") | ||
| set(NCCL_SUBMODULE_SRC_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/src/include") | ||
| if(EXISTS "${NCCL_SUBMODULE_INCLUDE}/nccl.h") | ||
| set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_INCLUDE}") | ||
| elseif(EXISTS "${NCCL_SUBMODULE_SRC_INCLUDE}/nccl.h") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why would this ever not exist (other than the submodule not being downloaded, but then that is a problem by itself)? |
||
| set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_SRC_INCLUDE}") | ||
| elseif(DEFINED ENV{NCCL_HOME}) | ||
| set(NCCL_INCLUDE_DIR "$ENV{NCCL_HOME}/include") | ||
| endif() | ||
| target_include_directories(test_comm_gemm PRIVATE ${MPI_CXX_INCLUDE_PATH} $ENV{CUBLASMP_HOME}/include) | ||
| target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest::gtest ${TE_LIB} CUDA::nvrtc MPI::MPI_CXX ${NCCL_LIB} OpenMP::OpenMP_CXX) | ||
|
|
||
| include(GoogleTest) | ||
| gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600) | ||
|
|
||
| # ── EP distributed tests (HT mode) ───────────────────────────────────────── | ||
| # No MPI dependency — processes are spawned by run_test_ep.sh with | ||
| # --rank / --nranks flags. ncclUniqueId exchange uses a | ||
| # shared temp file (see test_ep_common.h for details). | ||
|
phu0ngng marked this conversation as resolved.
Outdated
|
||
| # Headers + libs come from the in-tree 3rdparty/nccl submodule build. | ||
| set(NCCL_EP_SUBMODULE_ROOT | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl") | ||
| find_library(NCCL_EP_LIB | ||
| NAMES nccl_ep libnccl_ep | ||
| HINTS ${NCCL_EP_SUBMODULE_ROOT}/build/lib | ||
| NO_DEFAULT_PATH | ||
| REQUIRED) | ||
|
Comment on lines
+79
to
+85
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The fact that you need to do that in tests is worrying - it is possible that this will make the experience of installing and using TE worse if people need to care about the NCCL EP installation and library loading. Could we link it statically into libtransformer_engine.so instead and just rely on libnccl? |
||
|
|
||
| set(NCCL_EP_INCLUDE_DIR "${NCCL_EP_SUBMODULE_ROOT}/contrib/nccl_ep/include") | ||
| if(NOT EXISTS "${NCCL_EP_INCLUDE_DIR}/nccl_ep.h") | ||
| message(FATAL_ERROR | ||
| "NCCL EP header not found at ${NCCL_EP_INCLUDE_DIR}/nccl_ep.h. " | ||
| "Run `git submodule update --init --recursive` to checkout 3rdparty/nccl.") | ||
| endif() | ||
| message(STATUS "EP test: NCCL EP headers: ${NCCL_EP_INCLUDE_DIR}") | ||
|
Comment on lines
+87
to
+93
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need NCCL EP headers to compile tests? |
||
|
|
||
| # Collect NCCL include dirs shared by all EP test targets (nccl_ep.h + nccl.h). | ||
| set(EP_TEST_NCCL_INCLUDES ${NCCL_EP_INCLUDE_DIR}) | ||
| if(DEFINED NCCL_INCLUDE_DIR) | ||
| list(APPEND EP_TEST_NCCL_INCLUDES ${NCCL_INCLUDE_DIR}) | ||
| message(STATUS "EP test: NCCL headers: ${NCCL_INCLUDE_DIR}") | ||
| endif() | ||
|
|
||
| set(EP_TEST_COMMON_INCLUDES | ||
| ${EP_TEST_NCCL_INCLUDES} | ||
| ../../transformer_engine/common/include | ||
| ../../transformer_engine/common | ||
| ${CMAKE_CURRENT_SOURCE_DIR}) | ||
|
|
||
| set(EP_TEST_COMMON_LIBS | ||
| CUDA::cuda_driver | ||
| CUDA::cudart | ||
| CUDA::nvrtc | ||
| GTest::gtest | ||
| ${TE_LIB} | ||
| ${NCCL_LIB} | ||
| ${NCCL_EP_LIB}) | ||
|
|
||
| # nvrtc symbols are referenced from libtransformer_engine.so but not in its | ||
| # DT_NEEDED list (loaded via dlopen in Python). For cpp tests we link nvrtc | ||
| # explicitly with --no-as-needed so the linker keeps the dependency. | ||
| set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed") | ||
|
phu0ngng marked this conversation as resolved.
Outdated
|
||
|
|
||
| # ── EP init tests (InitPath, HandleMemSizeQuery) ───────────────────────────── | ||
| add_executable(test_ep_init test_ep_init.cu) | ||
| target_include_directories(test_ep_init PRIVATE ${EP_TEST_COMMON_INCLUDES}) | ||
| target_link_libraries(test_ep_init PUBLIC ${EP_TEST_COMMON_LIBS}) | ||
| target_link_options(test_ep_init PUBLIC ${EP_TEST_LINK_OPTS}) | ||
|
|
||
| # ── EP pipeline tests (dispatch, combine, bwd, integrated) ─────────────────── | ||
| add_executable(test_ep_pipeline test_ep_pipeline.cu) | ||
| target_include_directories(test_ep_pipeline PRIVATE ${EP_TEST_COMMON_INCLUDES}) | ||
| target_link_libraries(test_ep_pipeline PUBLIC ${EP_TEST_COMMON_LIBS}) | ||
| target_link_options(test_ep_pipeline PUBLIC ${EP_TEST_LINK_OPTS}) | ||
|
|
||
| # ── EP coverage tests (multi-handle, top_k=1, empty experts, negatives, threading) ── | ||
| add_executable(test_ep_coverage test_ep_coverage.cu) | ||
| target_include_directories(test_ep_coverage PRIVATE ${EP_TEST_COMMON_INCLUDES}) | ||
| target_link_libraries(test_ep_coverage PUBLIC ${EP_TEST_COMMON_LIBS}) | ||
| target_link_options(test_ep_coverage PUBLIC ${EP_TEST_LINK_OPTS}) | ||
|
|
||
| # Do NOT use gtest_discover_tests — these binaries require multi-process | ||
| # launch via run_test_ep.sh, not direct single-process execution. | ||
| message(STATUS "EP distributed tests enabled: ${NCCL_EP_LIB}") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to specify the branch here rather than normally via the commit hash?