Commit 874c8dd

mgoin authored and bbartels committed

[CI Failure] Fix test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe (vllm-project#24750)

Signed-off-by: mgoin <[email protected]>
Signed-off-by: bbartels <[email protected]>

1 parent d9b07a7 commit 874c8dd

File tree

2 files changed: +3 −2 lines changed


csrc/attention/mla/sm100_cutlass_mla_kernel.cu

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode(
     torch::Tensor const& seq_lens,
     torch::Tensor const& page_table,
     torch::Tensor const& workspace,
+    double sm_scale,
     int64_t num_kv_splits) {
   TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
 }
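For context, `sm_scale` in attention kernels conventionally denotes the softmax scaling factor 1/sqrt(head_dim) applied to the Q·Kᵀ scores; the exact convention inside the vLLM kernel is an assumption here, not stated by this diff. A minimal generic sketch:

```python
import math

import torch

# Assumption: sm_scale is the conventional softmax scaling factor applied to
# attention scores before softmax. Generic illustration, not the vLLM kernel.
def softmax_scale(head_dim: int) -> float:
    return 1.0 / math.sqrt(head_dim)

# Tiny single-head attention reference using that scale.
q = torch.randn(1, 4, 64)  # (batch, seq_len, head_dim)
k = torch.randn(1, 4, 64)
v = torch.randn(1, 4, 64)

scores = (q @ k.transpose(-1, -2)) * softmax_scale(64)
out = scores.softmax(dim=-1) @ v
```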

tests/kernels/moe/test_mxfp4_moe.py

Lines changed: 2 additions & 2 deletions

@@ -771,11 +771,11 @@ def dequant_mxfp4_batches(mat_fp4: torch.Tensor,
     w13_ref = dequant_mxfp4_batches(
         w13_q.view(torch.uint8),
         w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, 2 * intermediate_size, hidden_size)
+            num_experts, 2 * intermediate_size, hidden_size).to(device)
     w2_ref = dequant_mxfp4_batches(
         w2_q.view(torch.uint8),
         w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, hidden_size, intermediate_size)
+            num_experts, hidden_size, intermediate_size).to(device)

     # Quantize activations for SM100 path and dequantize for reference
     hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32)
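The `.to(device)` additions fix a device mismatch: the dequantized reference tensors were built on the CPU while the tensors they are compared against live on the GPU. A minimal sketch of that failure mode, assuming the comparison uses `torch.testing.assert_close` (the variable names here are generic stand-ins, not the test's actual identifiers):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-ins for the kernel output and the dequantized reference. In the real
# test the reference comes from dequant_mxfp4_batches(...) built on the CPU.
out = torch.ones(4, device=device)
ref = torch.ones(4)  # built on the CPU

# Tensors on different devices fail assert_close's device check, so the
# reference is moved first, mirroring the .to(device) fix in the diff.
torch.testing.assert_close(out, ref.to(device))
```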
