Skip to content

mlx_lm fails to import due to cudaMemAdvise invalid argument #12034

@pawalt

Description

@pawalt

Description

Trying to import the mlx-lm Python package fails due to an InvalidArgument error on the cudaMemAdvise call, but this behavior does not occur on runc. This code:

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    .uv_pip_install("mlx[cuda]==0.27.1", "mlx-lm==0.26.3")  # colab versions
    .entrypoint([])
)

app = modal.App("hi", image=image)


@app.function(max_containers=1, gpu="T4")
def import_mlx():
    from mlx_lm import load

    load

Will throw this error:

Traceback (most recent call last):
  File "/pkg/modal/_runtime/container_io_manager.py", line 767, in handle_input_exception
    yield
  File "/pkg/modal/_container_entrypoint.py", line 236, in run_input_sync
    res = io_context.call_finalized_function()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/pkg/modal/_runtime/container_io_manager.py", line 193, in call_finalized_function
    res = self.finalized_function.callable(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/emmellex.py", line 23, in import_mlx
    from mlx_lm import load
  File "/usr/local/lib/python3.12/site-packages/mlx_lm/__init__.py", line 9, in <module>
    from .convert import convert
  File "/usr/local/lib/python3.12/site-packages/mlx_lm/convert.py", line 7, in <module>
    import mlx.core as mx
ImportError: cudaMemAdvise(data_, small_pool_size, cudaMemAdviseSetReadMostly, 0) failed: invalid argument

This has also been reported in ml-explore/mlx#2428.

Steps to reproduce

Create a docker container with mlx[cuda]==0.27.1 and mlx-lm==0.26.3, and run `import mlx_lm`. This works when the runtime is runc, but it does not work when the runtime is runsc.

We've narrowed this down to a CUDA-only reproduction as well:

@app.function(max_containers=1, gpu="T4:1")
def test_cudamemadvise():
    import ctypes as C
    import os
    import subprocess

    print(subprocess.run(["nvidia-smi"], text=True, capture_output=True).stdout)
    print("CUDA_VISIBLE_DEVICES:", os.getenv("CUDA_VISIBLE_DEVICES"))

    # 0) Driver/Runtime versions
    libcuda = C.CDLL("libcuda.so.1")
    cudart = C.CDLL("libcudart.so")
    drv = C.c_int()
    rt = C.c_int()
    libcuda.cuDriverGetVersion(C.byref(drv))
    cudart.cudaRuntimeGetVersion(C.byref(rt))

    def ver(n):
        return f"{n // 1000}.{(n % 1000) // 10}"

    print(f"CUDA Driver API: {ver(drv.value)}   CUDA Runtime: {ver(rt.value)}")

    # 1) some constants from CUDA headers/enums
    # Highlighted Ref:
    # https://manpages.ubuntu.com/manpages/bionic/man3/CUDART_TYPES.3.html#:~:text=enum%20cudaMemoryAdvise%20%7B%20cudaMemAdviseSetReadMostly%20%3D%20%201%2C%20cudaMemAdviseUnsetReadMostly%20%3D%20%202%2C%0A%20%20%20%20%20%20%20%20%20%20%20cudaMemAdviseSetPreferredLocation%20%3D%20%203%2C%20cudaMemAdviseUnsetPreferredLocation%20%3D%20%204%2C%0A%20%20%20%20%20%20%20%20%20%20%20cudaMemAdviseSetAccessedBy%20%3D%20%205%2C%20cudaMemAdviseUnsetAccessedBy%20%3D%20%206%20%7D
    cudaMemAttachGlobal = 1
    cudaMemAdviseSetReadMostly = 1  # this does not work
    cudaMemAdviseSetAccessedBy = 5  # this works (per the cudaMemoryAdvise enum above; 3 is cudaMemAdviseSetPreferredLocation)

    # Helper to show the error *name* instead of a code
    cudart.cudaGetErrorString.restype = C.c_char_p

    def err_name(code: int) -> str:
        return cudart.cudaGetErrorString(code).decode()

    # 3) Minimal repro of the MLX-triggering call
    print("\n=== Unified Memory advise probe ===")

    # Allocate managed page(s)
    devPtr = C.c_void_p()
    size_t = C.c_size_t(4096)
    e = cudart.cudaMallocManaged(C.byref(devPtr), size_t, cudaMemAttachGlobal)
    print("cudaMallocManaged:", e, f"({err_name(e)})")

    device = 0
    e = cudart.cudaMemAdvise(devPtr, size_t, cudaMemAdviseSetReadMostly, device)
    print(f"cudaMemAdvise(...,SetReadMostly, device={device}):", e, f"({err_name(e)})")

    e = cudart.cudaMemAdvise(devPtr, size_t, cudaMemAdviseSetAccessedBy, device)
    print(f"cudaMemAdvise(...,SetAccessedBy, device={device}):", e, f"({err_name(e)})")

On gVisor, this throws:

=== Unified Memory advise probe ===
cudaMallocManaged: 0 (no error)
cudaMemAdvise(...,SetReadMostly, device=0): 1 (invalid argument)
cudaMemAdvise(...,SetAccessedBy, device=0): 0 (no error)

And on runc, this does not throw:

=== Unified Memory advise probe ===
cudaMallocManaged: 0 (no error)
cudaMemAdvise(...,SetReadMostly, device=0): 0 (no error)
cudaMemAdvise(...,SetAccessedBy, device=0): 0 (no error)

runsc version

runsc version 2099bc1c6a2e
spec: 1.2.0

docker version (if using docker)

N/A

uname

Linux ip-10-101-9-167.us-east-2.compute.internal 5.15.0-311.185.9.el9uek.x86_64 #2 SMP Wed Jul 30 05:28:17 PDT 2025 x86_64 x86_64 x86_64 GNU/Linux

kubectl (if using Kubernetes)

N/A

repo state (if built from source)

No response

runsc debug logs (if available)

I'll attach the file in another comment.

Metadata

Metadata

Assignees

No one assigned

    Labels

    area: gpuIssue related to sandboxed GPU accesstype: bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions