Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/mscclpp/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ class Context : public std::enable_shared_from_this<Context> {
friend class Endpoint;
friend class Connection;
friend class RegisteredMemory;
friend class SemaphoreStub;
};

/// Block of memory that has been registered to a Context.
Expand Down
3 changes: 3 additions & 0 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
using CUmemAllocationProp = hipMemAllocationProp;
using CUmemAccessDesc = hipMemAccessDesc;
using CUmemAllocationHandleType = hipMemAllocationHandleType;
using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags;

constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
Expand All @@ -44,6 +45,7 @@ constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned;
constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice;
constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor;
constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum;

#ifndef CUDA_SUCCESS
#define CUDA_SUCCESS hipSuccess
Expand Down Expand Up @@ -105,6 +107,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
#define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__)

#else

Expand Down
1 change: 1 addition & 0 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align =
}

size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag);
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag);

#endif // CUDA_NVLS_API_AVAILABLE

Expand Down
7 changes: 7 additions & 0 deletions src/context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
return it->second.get();
}

// Hands out one semaphore token from the context-wide pool.
// The pool itself is constructed lazily, on the first request.
// NOTE(review): lazy init is unsynchronized — assumes getToken() is not
// called from multiple threads concurrently; confirm with callers.
std::shared_ptr<uint64_t> Context::Impl::getToken() {
  if (tokenPool_ == nullptr) {
    tokenPool_ = std::make_shared<TokenPool>(maxNumTokens_);
  }
  return tokenPool_->getToken();
}

MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}

MSCCLPP_API_CPP Context::~Context() = default;
Expand Down
14 changes: 14 additions & 0 deletions src/gpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,20 @@ void* gpuCallocUncached(size_t bytes) {
#endif // defined(__HIP_PLATFORM_AMD__)

#if (CUDA_NVLS_API_AVAILABLE)
// Returns the CUDA VMM allocation granularity (minimum or recommended, per
// granFlag) for a pinned device allocation on the current device that is
// exportable through POSIX-FD or fabric handles.
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag) {
  size_t gran = 0;
  int deviceId = -1;
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = deviceId;
  prop.requestedHandleTypes =
      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
  // Fix: the driver-API status was silently discarded; on failure `gran`
  // stayed 0 and would have propagated a zero granularity to the physical
  // allocator. Wrap in the project's driver-API throw macro like sibling calls.
  MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, granFlag));
  return gran;
}

size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
size_t gran = 0;
int numDevices = 0;
Expand Down
4 changes: 4 additions & 0 deletions src/include/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@ class CudaIpcStream {
int deviceId() const { return deviceId_; }
};

class TokenPool;  // defined in utils_internal.hpp

/// Private implementation state of Context (pimpl).
struct Context::Impl {
  // One IB context per IB transport, created on demand by getIbContext().
  std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
  std::vector<std::shared_ptr<CudaIpcStream>> ipcStreams_;
  // Lazily constructed by getToken(); shared so token deleters can outlive us.
  std::shared_ptr<TokenPool> tokenPool_;
  const size_t maxNumTokens_ = 1 << 15;  // 32K tokens
  
  Impl();
  
  /// Returns the IB context for `ibTransport`, creating it on first use.
  IbCtx *getIbContext(Transport ibTransport);
  /// Returns a token from tokenPool_, constructing the pool on first call.
  std::shared_ptr<uint64_t> getToken();
};

} // namespace mscclpp
Expand Down
13 changes: 13 additions & 0 deletions src/include/utils_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ struct PairHash {
}
};

/// Fixed-capacity pool of device-resident uint64_t "tokens" carved out of one
/// contiguous GPU allocation. getToken() hands out a shared_ptr whose deleter
/// returns the token to the pool; the deleter keeps the pool alive via
/// enable_shared_from_this.
/// NOTE(review): no internal locking — appears to assume single-threaded
/// allocation/free; confirm with callers.
class TokenPool : public std::enable_shared_from_this<TokenPool> {
 public:
  /// @param nTokens total number of tokens backed by the pool's allocation.
  TokenPool(size_t nTokens);
  /// Returns a free token; throws when all tokens are in use.
  std::shared_ptr<uint64_t> getToken();

 private:
  size_t nToken_;                        // pool capacity
  uint64_t* baseAddr_;                   // base of the tokens' VA range
  uint64_t tailMask_;                    // valid-bit mask for the last bitmap word
  std::shared_ptr<uint64_t> tokens_;     // owns the underlying GPU allocation
  std::vector<uint64_t> allocationMap_;  // bitmap, one bit per token (1 = in use)
};

} // namespace mscclpp

#endif
17 changes: 10 additions & 7 deletions src/semaphore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@ struct SemaphoreStub::Impl {

Impl(const std::vector<char>& data);

std::shared_ptr<uint64_t> gpuCallocToken(std::shared_ptr<Context> context);

std::shared_ptr<Connection> connection_;
std::shared_ptr<uint64_t> token_;
RegisteredMemory idMemory_;
Device device_;
};

static std::shared_ptr<uint64_t> gpuCallocToken() {
// #if (CUDA_NVLS_API_AVAILABLE)
// if (isNvlsSupported()) {
// return detail::gpuCallocPhysicalShared<uint64_t>(1, 0);
// }
// #endif // CUDA_NVLS_API_AVAILABLE
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken(std::shared_ptr<Context> context) {
// NVLS is not supported in ROCm/HIP
#if (CUDA_NVLS_API_AVAILABLE)
if (isNvlsSupported()) {
return context->pimpl_->getToken();
}
#endif // CUDA_NVLS_API_AVAILABLE
#if defined(MSCCLPP_DEVICE_HIP)
return detail::gpuCallocUncachedShared<uint64_t>();
#else // !defined(MSCCLPP_DEVICE_HIP)
Expand All @@ -49,7 +52,7 @@ SemaphoreStub::Impl::Impl(std::shared_ptr<Connection> connection) : connection_(
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
}
MSCCLPP_CUDATHROW(cudaSetDevice(localDevice.id));
token_ = gpuCallocToken();
token_ = gpuCallocToken(connection_->context());
} else {
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
}
Expand Down
39 changes: 39 additions & 0 deletions src/utils_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <memory>
#include <mscclpp/env.hpp>
#include <mscclpp/errors.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <sstream>
#include <string>

Expand Down Expand Up @@ -232,4 +233,42 @@ void getRandomData(void* buffer, size_t bytes) {
}
}

// Builds a pool of `nToken` uint64_t tokens backed by a single physical GPU
// allocation (required so tokens work with NVLS), plus a one-bit-per-token
// allocation bitmap. Throws on builds without NVLS API support.
TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
#if (CUDA_NVLS_API_AVAILABLE)
  tokens_ = detail::gpuCallocPhysicalShared<uint64_t>(
      nToken, detail::getCuAllocationGranularity(CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  // Record the base of the mapped VA range so getToken() can translate a token
  // pointer back into a bitmap (word, bit) position.
  MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)(&baseAddr_), NULL, (CUdeviceptr)tokens_.get()));
  size_t nElems = (nToken + (UINT64_WIDTH - 1)) / UINT64_WIDTH;
  allocationMap_.resize(nElems, 0);
  // Mask of the valid bits in the final bitmap word (all ones when nToken is a
  // multiple of 64). Fix: use 1ULL/~0ULL — `unsigned long` is 32 bits on LLP64
  // ABIs, where shifting it by up to 63 is undefined behavior.
  tailMask_ = (nToken % UINT64_WIDTH) ? ((1ULL << (nToken % UINT64_WIDTH)) - 1) : ~0ULL;
#else
  throw Error("TokenPool only available on GPUs with NVLS support", ErrorCode::InvalidUsage);
#endif
}

// Allocates one token: scans the bitmap for the first clear bit, marks it
// used, and returns a shared_ptr whose deleter clears the bit again. The
// deleter captures shared_from_this(), so the pool (and the GPU allocation)
// outlives every outstanding token. Throws when the pool is exhausted.
// NOTE(review): the scan and the deleter both mutate allocationMap_ with no
// synchronization — confirm tokens are only allocated/freed from one thread.
std::shared_ptr<uint64_t> TokenPool::getToken() {
  auto deleter = [self = shared_from_this()](uint64_t* token) {
    size_t offset = token - self->baseAddr_;
    size_t index = offset / UINT64_WIDTH;
    size_t bit = offset % UINT64_WIDTH;
    // Fix: 1ULL, not 1UL — `unsigned long` may be 32 bits (LLP64), making a
    // shift by up to 63 undefined behavior.
    uint64_t mask = 1ULL << bit;
    if ((self->allocationMap_[index] & mask) == 0) {
      // Double-free (or foreign pointer): warn instead of corrupting the map.
      WARN("TokenPool tried to free a token that was not allocated");
      return;
    }
    self->allocationMap_[index] &= ~mask;
  };

  size_t size = allocationMap_.size();
  for (size_t i = 0; i < size; i++) {
    // Only the last word may contain bits beyond nToken_; mask those off.
    uint64_t mask = (i + 1 == size) ? tailMask_ : ~0ULL;
    uint64_t holes = (~allocationMap_[i]) & mask;
    if (!holes) continue;
    size_t bit = __builtin_ctzll(holes);  // lowest free slot in this word
    allocationMap_[i] |= (1ULL << bit);
    uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit;
    // Fix: %p requires a void* argument; passing uint64_t* unconverted is UB.
    INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", (void*)token);
    return std::shared_ptr<uint64_t>(token, deleter);
  }
  throw Error("TokenPool is exhausted", ErrorCode::InternalError);
}

} // namespace mscclpp
Loading