Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,6 @@ def build_nccl_ep_submodule() -> str:
)
gencode = " ".join(f"-gencode=arch=compute_{a},code=sm_{a}" for a in arch_list)

- nproc = os.cpu_count() or 8
env = os.environ.copy()
env["NVCC_GENCODE"] = gencode
# NCCL EP needs the core NCCL headers + libnccl.so; write NCCL EP build
Expand All @@ -277,7 +276,7 @@ def build_nccl_ep_submodule() -> str:
)
print(f"[NCCL EP] Building libnccl_ep.a (gencode='{gencode}')")
subprocess.check_call(
- ["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"],
+ ["make", "-j", "-C", "contrib/nccl_ep", "lib"],
cwd=str(nccl_root),
env=env,
)
Comment on lines 278 to 282

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Using `make -j` with no job limit means GNU make will spawn as many concurrent compiler processes as the dependency graph allows. NVCC compilations are memory-intensive (each process can consume several GB), so on high-core-count machines or in memory-constrained containers this can trigger OOM kills or cause the build to hang. A common middle ground is to honour any `MAKEFLAGS`/`MAKE_JOBS` already set in the environment and fall back to a capped default only when neither is present.

Suggested change
subprocess.check_call(
["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"],
["make", "-j", "-C", "contrib/nccl_ep", "lib"],
cwd=str(nccl_root),
env=env,
)
nproc = int(os.environ.get("MAKE_JOBS", os.cpu_count() or 8))
subprocess.check_call(
["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"],
cwd=str(nccl_root),
env=env,
)

Expand Down
Loading