2 changes: 2 additions & 0 deletions ggml/CMakeLists.txt
@@ -125,6 +125,8 @@ option(GGML_CUDA_F16 "ggml: use 16 bit floats for some ca
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
+set (GGML_CUDA_MIN_BATCH_OFFLOAD "32" CACHE STRING
+"ggml: min batch size for GPU offload")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
@@ -382,6 +382,7 @@ if (GGML_CUDA)
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+add_compile_definitions(GGML_CUDA_MIN_BATCH_OFFLOAD=${GGML_CUDA_MIN_BATCH_OFFLOAD})

if (GGML_CUDA_USE_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
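The cache variable above is forwarded to the compiler as a preprocessor definition, so the threshold can be changed at configure time, e.g. cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_MIN_BATCH_OFFLOAD=64. Below is a minimal sketch, assuming one also wants the CUDA source to stay buildable when the build system does not pass the definition; this fallback is an illustration, not part of the diff:

// Assumed fallback so the source still compiles without the CMake-provided definition;
// 32 matches the cache default above and the previously hard-coded value.
#ifndef GGML_CUDA_MIN_BATCH_OFFLOAD
#define GGML_CUDA_MIN_BATCH_OFFLOAD 32
#endif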
31 changes: 27 additions & 4 deletions ggml/src/ggml-cuda.cu
@@ -3656,10 +3656,33 @@ GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, gg
}

GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));
+    constexpr int min_batch_size = GGML_CUDA_MIN_BATCH_OFFLOAD;
+
+    // Why do we want to do this? The heuristic that a batch must have at least min_batch_size tokens for offloading
+    // the required model weights to be worth it comes from dense models. For MoE models, the average number of tokens
+    // each expert deals with in a batch is (active_experts / total_experts) * batch_size. Hence, applying the same
+    // heuristic per expert, we need (active_experts / total_experts) * batch_size >= min_batch_size.
+    // Rearranging, we get
+    //
+    //     batch_size * active_experts >= min_batch_size * total_experts
+    //
+    // as the condition for offloading model weights residing in RAM to the GPU.
+    // For these ops the number of tokens is not in op->ne[1] as usual, but in op->ne[2].
+    if (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE) {
+        auto ids = op->op == GGML_OP_MUL_MAT_ID ? op->src[2] : op->src[3]; // expert selection ids
+        int64_t batch_size = op->ne[2];
+        if (batch_size < min_batch_size) return false;
+        int64_t n_experts_tot = op->src[0]->ne[2];
+        int64_t n_experts_active = ids->ne[0];
+        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]);
+        return batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+    }

+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+
+    // Original:
+    //return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+    //       (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));

GGML_UNUSED(backend);
}
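To make the new condition concrete, here is a small standalone sketch of the same inequality with illustrative numbers; the helper name and the expert counts are made up for the example and are not taken from the diff or from any specific model:

#include <cstdint>
#include <cstdio>

// Sketch of the MoE offload condition used in ggml_backend_cuda_offload_op:
// offload the weights only if batch_size * active_experts >= min_batch_size * total_experts.
static bool moe_offload_worth_it(int64_t batch_size, int64_t n_experts_active,
                                 int64_t n_experts_tot, int64_t min_batch_size = 32) {
    if (batch_size < min_batch_size) return false;
    return batch_size*n_experts_active >= min_batch_size*n_experts_tot;
}

int main() {
    // With 2 active out of 8 total experts the threshold becomes 32 * 8 / 2 = 128 tokens.
    printf("u-batch  64: %s\n", moe_offload_worth_it( 64, 2, 8) ? "offload" : "keep weights in RAM");
    printf("u-batch 128: %s\n", moe_offload_worth_it(128, 2, 8) ? "offload" : "keep weights in RAM");
    return 0;
}

For dense models the check keeps its old form, op->ne[1] >= min_batch_size, with the threshold now configurable through GGML_CUDA_MIN_BATCH_OFFLOAD instead of being hard-coded to 32; only the MoE branch changes behaviour.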