Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
363 changes: 363 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml.pxd

Large diffs are not rendered by default.

7,463 changes: 7,463 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx

Large diffs are not rendered by default.

6,105 changes: 6,105 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx

Large diffs are not rendered by default.

447 changes: 447 additions & 0 deletions cuda_bindings/cuda/bindings/_nvml.pxd

Large diffs are not rendered by default.

27,503 changes: 27,503 additions & 0 deletions cuda_bindings/cuda/bindings/_nvml.pyx

Large diffs are not rendered by default.

2,092 changes: 2,092 additions & 0 deletions cuda_bindings/cuda/bindings/cy_nvml.pxd

Large diffs are not rendered by default.

1,411 changes: 1,411 additions & 0 deletions cuda_bindings/cuda/bindings/cy_nvml.pyx

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions cuda_bindings/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

[build-system]
requires = ["setuptools>=77.0.0", "cython>=3.1,<3.2", "pyclibrary>=0.1.7"]
requires = ["setuptools>=77.0.0", "cython>=3.2,<3.3", "pyclibrary>=0.1.7"]
build-backend = "setuptools.build_meta"

[project]
Expand Down Expand Up @@ -39,7 +39,7 @@ all = [
]

test = [
"cython>=3.1,<3.2",
"cython>=3.2,<3.3",
"setuptools>=77.0.0",
"numpy>=1.21.1",
"pytest>=6.2.4",
Expand Down
Empty file.
2 changes: 2 additions & 0 deletions cuda_bindings/tests/nvml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
131 changes: 131 additions & 0 deletions cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from collections import namedtuple

import pytest

from cuda.bindings import _nvml as nvml


class NVMLInitializer:
    """Context manager that brackets a block with NVML init and shutdown.

    Entering calls ``nvml.init_v2()``; leaving calls ``nvml.shutdown()``,
    whether or not the managed block raised. Nothing useful is bound by
    ``as`` and exceptions from the block are never suppressed.
    """

    def __init__(self):
        """Create the manager; no state is kept."""

    def __enter__(self):
        nvml.init_v2()
        return None

    def __exit__(self, exception_type, exception, trace):
        nvml.shutdown()
        # Returning None lets any in-flight exception propagate.
        return None


@pytest.fixture
def nvml_init():
    """Keep NVML initialized for the lifetime of the requesting test."""
    session = NVMLInitializer()
    with session:
        yield


@pytest.fixture(scope="session", autouse=True)
def device_info():
    """Snapshot per-device details once per session, keyed by device UUID.

    NVML is initialized only while the snapshot is taken and is shut down
    again before the fixture returns.

    Returns:
        dict: maps the UUID of each accessible device to a ``BoardDetails``
        namedtuple ``(name, board, arch_id, bus_id, device_id, serial)``.
        ``board`` is a ``BoardCfg`` namedtuple ``(name, ids_arr)`` holding a
        one-element list of ``(pci_device_id, pci_sub_system_id)``.
        ``serial`` is ``None`` when the serial query raises
        ``nvml.NvmlError``. Devices whose handle lookup raises
        ``nvml.NoPermissionError`` are omitted.
    """
    # Build the record types once: they do not depend on the device, and
    # sharing them gives every entry in the result the same class.
    BoardCfg = namedtuple("BoardCfg", "name, ids_arr")
    BoardDetails = namedtuple("BoardDetails", "name, board, arch_id, bus_id, device_id, serial")

    uuid_to_board_details = {}

    with NVMLInitializer():
        dev_count = nvml.device_get_count_v2()

        # Store some details for each device now when we know NVML is in known state
        for i in range(dev_count):
            try:
                dev = nvml.device_get_handle_by_index_v2(i)
            except nvml.NoPermissionError:
                continue
            pci_info = nvml.device_get_pci_info_v3(dev)

            name = nvml.device_get_name(dev)
            # Get architecture name ex: Ampere, Kepler
            arch_id = nvml.device_get_architecture(dev)

            board = BoardCfg(name, ids_arr=[(pci_info.pci_device_id, pci_info.pci_sub_system_id)])

            try:
                serial = nvml.device_get_serial(dev)
            except nvml.NvmlError:
                # Best effort: record the device without a serial.
                serial = None

            bus_id = pci_info.bus_id
            device_id = pci_info.device_
            uuid = nvml.device_get_uuid(dev)

            uuid_to_board_details[uuid] = BoardDetails(name, board, arch_id, bus_id, device_id, serial)

    return uuid_to_board_details


def get_devices(device_info):
    """Yield an NVML device handle for each UUID key of ``device_info``.

    The keys are snapshotted before iteration starts. A UUID whose lookup
    raises ``nvml.NoPermissionError`` is skipped.
    """
    known_uuids = tuple(device_info.keys())
    for device_uuid in known_uuids:
        try:
            yield nvml.device_get_handle_by_uuid(device_uuid)
        except nvml.NoPermissionError:
            # ignore devices that can't be accessed
            pass


@pytest.fixture
def all_devices(device_info):
    """Provide the sorted, de-duplicated device handles while NVML stays initialized."""
    with NVMLInitializer():
        unique_handles = set(get_devices(device_info))
        ordered_handles = sorted(unique_handles)
        yield ordered_handles


@pytest.fixture
def driver(nvml_init, request):
    """Return the leading ("major") component of the driver version as an int."""
    version_text = nvml.system_get_driver_version()
    # Everything before the first "." is the major version
    major_text, _, _ = version_text.partition(".")
    return int(major_text)


@pytest.fixture
def ngpus(nvml_init):
    """Return the NVML device count, asserting that it is positive."""
    device_count = nvml.device_get_count_v2()
    assert device_count > 0
    return device_count


@pytest.fixture
def handles(ngpus):
    """Return the device handles for indices ``0 .. ngpus - 1``, in index order."""
    device_handles = []
    for index in range(ngpus):
        device_handles.append(nvml.device_get_handle_by_index_v2(index))
    assert len(device_handles) == ngpus
    return device_handles


@pytest.fixture
def nmigs(handles):
    """Return the maximum MIG device count reported for the first device handle."""
    first_device = handles[0]
    max_mig_devices = nvml.device_get_max_mig_device_count(first_device)
    return max_mig_devices


@pytest.fixture
def mig_handles(nmigs, handles):
    """Return the MIG device handles of the first GPU for indices ``0 .. nmigs - 1``.

    ``nmigs`` is the maximum MIG device count queried from ``handles[0]``, so
    the MIG handles are looked up on that same parent device.

    NOTE(review): the underlying NVML call takes the parent device as well as
    the MIG index; this assumes the binding mirrors that as
    ``(device, index)`` -- confirm against the generated signature.
    """
    parent_device = handles[0]
    mig_devices = [nvml.device_get_mig_device_handle_by_index(parent_device, i) for i in range(nmigs)]
    assert len(mig_devices) == nmigs
    return mig_devices


@pytest.fixture
def serials(ngpus, handles):
    """Return the serial of each of the first ``ngpus`` handles, in index order."""
    collected = []
    for index in range(ngpus):
        collected.append(nvml.device_get_serial(handles[index]))
    assert len(collected) == ngpus
    return collected


@pytest.fixture
def uuids(ngpus, handles):
    """Return the UUID of each of the first ``ngpus`` handles, in index order."""
    collected = []
    for index in range(ngpus):
        collected.append(nvml.device_get_uuid(handles[index]))
    assert len(collected) == ngpus
    return collected


@pytest.fixture
def pci_info(ngpus, handles):
    """Return the PCI info record of each of the first ``ngpus`` handles, in index order."""
    records = []
    for index in range(ngpus):
        records.append(nvml.device_get_pci_info_v3(handles[index]))
    assert len(records) == ngpus
    return records
34 changes: 34 additions & 0 deletions cuda_bindings/tests/nvml/test_compute_mode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import sys

import pytest

from cuda.bindings import _nvml as nvml

# Compute modes that test_compute_mode_supported_nonroot attempts to set on
# each device, in this order.
COMPUTE_MODES = [
nvml.ComputeMode.COMPUTEMODE_DEFAULT,
nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
nvml.ComputeMode.COMPUTEMODE_EXCLUSIVE_PROCESS,
]


@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
def test_compute_mode_supported_nonroot(all_devices):
    """Check that setting a compute mode is refused and leaves the mode unchanged.

    For every device whose compute mode can be read, each entry of
    ``COMPUTE_MODES`` must be rejected with ``nvml.NoPermissionError`` and the
    mode read back afterwards must equal the one read beforehand. Devices
    that do not support the query are collected and reported as a skip once
    all devices have been visited.

    NOTE(review): the name suggests this expects an unprivileged user --
    confirm how it should behave when run with elevated privileges.
    """
    unsupported = set()
    for device in all_devices:
        try:
            mode_before = nvml.device_get_compute_mode(device)
        except nvml.NotSupportedError:
            unsupported.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
        else:
            for requested_mode in COMPUTE_MODES:
                with pytest.raises(nvml.NoPermissionError):
                    nvml.device_set_compute_mode(device, requested_mode)
                mode_after = nvml.device_get_compute_mode(device)
                assert mode_before == mode_after, "Compute mode shouldn't have changed"

    if unsupported:
        pytest.skip(" ; ".join(unsupported))
57 changes: 57 additions & 0 deletions cuda_bindings/tests/nvml/test_cuda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import cuda.bindings.driver as cuda
from cuda.bindings import _nvml as nvml

from .conftest import NVMLInitializer


def get_nvml_device_names():
    """List the devices NVML enumerates, in NVML index order.

    Returns:
        list[dict]: one ``{"name": <str>, "id": <int PCI bus>}`` entry per
        device index. NVML is initialized only for the duration of the call.
    """
    devices = []
    with NVMLInitializer():
        # uses NVML Library to get the device count, device id and device pci id
        for index in range(nvml.device_get_count_v2()):
            handle = nvml.device_get_handle_by_index_v2(index)
            device_name = nvml.device_get_name(handle)
            pci = nvml.device_get_pci_info_v3(handle)
            assert isinstance(pci.bus, int)
            assert isinstance(device_name, str)
            devices.append(dict(name=device_name, id=pci.bus))

    return devices


def get_cuda_device_names(sort_by_bus_id=True):
    """List the devices the CUDA driver API enumerates.

    Args:
        sort_by_bus_id: when true (the default), order the entries by their
            PCI bus id; otherwise keep CUDA's enumeration order.

    Returns:
        list[dict]: one ``{"name": <str>, "id": <int PCI bus id>}`` entry per
        CUDA device.

    Raises:
        AssertionError: if any driver call does not return ``CUDA_SUCCESS``
            or the PCI bus id attribute is not an ``int``.
    """
    result = []

    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS

    err, device_count = cuda.cuDeviceGetCount()
    assert err == cuda.CUresult.CUDA_SUCCESS

    for dev in range(device_count):
        size = 256
        err, name = cuda.cuDeviceGetName(size, dev)
        # Check the status before touching ``name``: on failure the payload is
        # not a usable buffer and decoding it would mask the real error.
        assert err == cuda.CUresult.CUDA_SUCCESS
        # The name comes back as a byte buffer; keep what precedes the first NUL.
        name = name.split(b"\x00")[0].decode()

        err, pci_bus_id = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert isinstance(pci_bus_id, int)

        result.append({"name": name, "id": pci_bus_id})

    if sort_by_bus_id:
        result = sorted(result, key=lambda k: k["id"])

    return result


def test_cuda_device_order():
    """Check that the bus-id-sorted CUDA device list equals the NVML device list."""
    from_cuda = get_cuda_device_names()
    from_nvml = get_nvml_device_names()

    assert from_cuda == from_nvml, "CUDA and NVML device lists do not match"
46 changes: 46 additions & 0 deletions cuda_bindings/tests/nvml/test_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import pytest

from cuda.bindings import _nvml as nvml

from . import util


def test_gpu_get_module_id(nvml_init):
    """Check that every non-vGPU device reports an integer module id."""
    # Unique module IDs cannot exceed the number of GPUs on the system
    for index in range(nvml.device_get_count_v2()):
        device = nvml.device_get_handle_by_index_v2(index)
        # The UUID itself is not used; the query is still issued for each device.
        nvml.device_get_uuid(device)

        if not util.is_vgpu(device):
            assert isinstance(nvml.device_get_module_id(device), int)


def test_gpu_get_platform_info(all_devices):
    """Check that platform info, where available, is a known ``PlatformInfo`` type.

    vGPU devices and devices for which the query raises
    ``nvml.NotSupportedError`` are not checked; their reasons are collected
    and reported as a single skip after all devices have been visited.
    """
    skip_reasons = set()
    for device in all_devices:
        if util.is_vgpu(device):
            skip_reasons.add(f"Not supported on vGPU device {device}")
            continue

        # TODO
        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
        #     test_utils.skip_test("Not supported on chip before Blackwell")

        try:
            platform_info = nvml.device_get_platform_info(device)
        except nvml.NotSupportedError:
            skip_reasons.add(f"Not supported returned, likely NVLink is disabled for {device}")
            continue

        assert isinstance(platform_info, (nvml.PlatformInfo_v2, nvml.PlatformInfo_v1))

    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))
54 changes: 54 additions & 0 deletions cuda_bindings/tests/nvml/test_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import sys

import pytest

from cuda.bindings import _nvml as nvml


def assert_nvml_is_initialized():
    """Assert that a device-count query succeeds and reports at least one device."""
    device_count = nvml.device_get_count_v2()
    assert device_count > 0


def assert_nvml_is_uninitialized():
    """Assert that a device-count query raises ``nvml.UninitializedError``."""
    expect_uninitialized = pytest.raises(nvml.UninitializedError)
    with expect_uninitialized:
        nvml.device_get_count_v2()


@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
def test_init_ref_count():
    """
    Verifies that NVML init and shutdown can be called repeatedly and that
    initialization is reference counted: NVML stays usable until the number of
    shutdown calls matches the number of init calls.
    """
    # Shutting down before any init must be rejected
    with pytest.raises(nvml.UninitializedError):
        nvml.shutdown()

    assert_nvml_is_uninitialized()

    cycles = 3
    inits_per_cycle = 5
    for _cycle in range(cycles):
        # Take five references; NVML must be usable after each one
        for _ in range(inits_per_cycle):
            nvml.init_v2()
            assert_nvml_is_initialized()

        # Drop all but one reference; NVML should remain initialized
        for _ in range(inits_per_cycle - 1):
            nvml.shutdown()
            assert_nvml_is_initialized()

        # Dropping the last reference must leave NVML uninitialized
        nvml.shutdown()
        assert_nvml_is_uninitialized()


def test_init_check_index(nvml_init):
    """
    Verifies that the index reported for a handle equals the index the handle
    was obtained with
    """
    for expected_index in range(nvml.device_get_count_v2()):
        handle = nvml.device_get_handle_by_index_v2(expected_index)
        reported_index = nvml.device_get_index(handle)
        # Round trip: index -> handle -> index
        assert expected_index == reported_index
29 changes: 29 additions & 0 deletions cuda_bindings/tests/nvml/test_nvlink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


from cuda.bindings import _nvml as nvml


def test_nvlink_get_link_count(all_devices):
    """
    Queries the NVLink link-count field of every device through both accepted
    argument forms of device_get_field_values and checks that each query
    returns either SUCCESS or ERROR_NOT_SUPPORTED. When the second query
    succeeds, the reported count must not exceed NVLINK_MAX_LINKS.
    """
    acceptable_returns = (nvml.Return.SUCCESS, nvml.Return.ERROR_NOT_SUPPORTED)

    for device in all_devices:
        # First form: a pre-built FieldValue array with the field id filled in
        request = nvml.FieldValue(1)
        request[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
        value = nvml.device_get_field_values(device, request)[0]
        assert value.nvml_return in acceptable_returns, (
            f"Unexpected return {value.nvml_return} for link count field query"
        )

        # Use the alternative argument to device_get_field_values
        field_ids = [nvml.FI.DEV_NVLINK_LINK_COUNT]
        value = nvml.device_get_field_values(device, field_ids)[0]
        assert value.nvml_return in acceptable_returns, (
            f"Unexpected return {value.nvml_return} for link count field query"
        )

        # The feature_nvlink_supported detection is not robust, so we
        # can't be more specific about how many links we should find.
        if value.nvml_return == nvml.Return.SUCCESS:
            assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
Loading
Loading