Skip to content

Commit fe4eb4f

Browse files
fix logic for availability of v_dot2_f32_f16
1 parent 8821183 commit fe4eb4f

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
555555
}
556556

557557
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
558-
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN) || defined(CDNA))
558+
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
559559
asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
560560
#else
561561
#ifdef FAST_FP16_AVAILABLE
@@ -567,7 +567,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
567567
acc += tmpv.x * tmpu.x;
568568
acc += tmpv.y * tmpu.y;
569569
#endif // FAST_FP16_AVAILABLE
570-
#endif // defined(GGML_USE_HIP) && defined(GCN)
570+
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
571571
}
572572

573573
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.

ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,14 @@
162162
#define GCN
163163
#endif
164164

165+
#if defined(__gfx900__) || defined(__gfx906__)
166+
#define GCN5
167+
#endif
168+
169+
#if defined(__gfx803__)
170+
#define GCN4
171+
#endif
172+
165173
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
166174
#define CDNA // For the entire family
167175
#endif

0 commit comments

Comments
 (0)