Skip to content

Commit eded4e2

Browse files
ikawrakow (Iwan Kawrakow) authored (with co-authors)
IQ1_M_R4 CUDA implementation (#494)
* iq1_m_r4: CUDA dequantize * iq1_m_r4: CUDA dequantize --------- Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 8ffad18 commit eded4e2

File tree

5 files changed

+109
-3
lines changed

5 files changed

+109
-3
lines changed

ggml/src/ggml-cuda.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3477,6 +3477,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
34773477
case GGML_TYPE_IQ5_K_R4:
34783478
case GGML_TYPE_IQ5_KS_R4:
34793479
case GGML_TYPE_IQ1_S_R4:
3480+
case GGML_TYPE_IQ1_M_R4:
34803481
return true;
34813482
default:
34823483
return false;

ggml/src/ggml-cuda/convert.cu

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ static __global__ void dequantize_block_iq1_s_r4(const void * __restrict__ vx, d
542542
const int ib = tid%8; // 0...7
543543

544544
const half * dptr = (const half *)((const char *)vx + 4*row4*row_size);
545-
const float d = (float)dptr[ir];
545+
const float d = __half2float(dptr[ir]);
546546
const block_iq1_s_r4 * x = (const block_iq1_s_r4 *)(dptr + 4) + ibl;
547547
dst_t * y = yy + 256*ii + 32*ib + 8*il;
548548

@@ -561,6 +561,42 @@ static __global__ void dequantize_block_iq1_s_r4(const void * __restrict__ vx, d
561561
}
562562
}
563563

564+
template<typename dst_t>
// Dequantize IQ1_M_R4 (1.75-bpw IQ1_M with 4 interleaved rows) into dst_t.
// Launch contract: 32 threads per block; each block produces 256 contiguous
// output values (each thread writes 8). Mirrors dequantize_block_iq1_s_r4.
static __global__ void dequantize_block_iq1_m_r4(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

    int64_t ii = blockIdx.x;            // index of the 256-value chunk this block produces

    int64_t nblock = n_per_row/32;      // 32-value quant blocks per row
    int64_t row = (8*ii)/nblock;        // logical row holding this chunk (8 quant blocks per chunk)
    int64_t row4 = row/4;               // group of 4 interleaved rows
    int64_t ir = row%4;                 // row index within the 4-row group (0..3)
    int64_t ibl = (8*ii)%nblock;        // first quant block of this chunk within the row

    const int tid = threadIdx.x;
    const int il = tid/8; // 0...3     8-value sub-block within a 32-value quant block
    const int ib = tid%8; // 0...7     which of this chunk's 8 quant blocks

    // Each 4-row group is prefixed by 4 fp16 super-scales, one per interleaved row.
    const half * dptr = (const half *)((const char *)vx + 4*row4*row_size);
    const float d = __half2float(dptr[ir]);
    const block_iq1_m_r4 * x = (const block_iq1_m_r4 *)(dptr + 4) + ibl;
    dst_t * y = yy + 256*ii + 32*ib + 8*il;

    // High nibble/low nibble of qh holds, per 16-value half: 3 extra grid-index
    // bits (qh & 7) and the delta sign bit (qh & 0x8).
    uint8_t qh = x[ib].qh[4*(il/2)+ir] >> 4*(il%2);
    // 4-bit block scale for this half of the 32-value block, times the row super-scale.
    float dl = d*((x[ib].scales[ir] >> 4*(il/2)) & 0xf);
    // IQ1_M shifts grid values by -1±IQ1M_DELTA depending on the sign bit.
    float delta = dl * (qh & 0x8 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA);

    // Look up the 8-value codebook entry (4 bits per value) and split it into
    // two packed nibble words.
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[ib].qs[4*il+ir] | ((qh & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;

    if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
        for (int j = 0; j < 8; ++j) y[j] = __float2bfloat16(dl*q[j] + delta);
    } else {
        for (int j = 0; j < 8; ++j) y[j] = dl*q[j] + delta;
    }
}
599+
564600
template<typename dst_t>
565601
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
566602

@@ -1441,6 +1477,14 @@ static void dequantize_row_iq1_s_r4_cuda(const void * vx, dst_t * y, const int64
14411477
dequantize_block_iq1_s_r4<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
14421478
}
14431479

1480+
template<typename dst_t>
// Host-side launcher: dequantizes nrows x n_per_row IQ1_M_R4 data into y on
// the given stream. Grid: one 32-thread block per QK_K output values.
static void dequantize_row_iq1_m_r4_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
    // Total number of output values across all rows.
    const int64_t total = nrows * n_per_row;
    // Byte stride of one (interleaved) row, needed by the kernel to locate row groups.
    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ1_M_R4, n_per_row);
    // Ceiling division: each block covers QK_K values.
    const int num_blocks = (total + QK_K - 1) / QK_K;
    dequantize_block_iq1_m_r4<<<num_blocks, 32, 0, stream>>>(vx, y, n_per_row, row_size);
}
1487+
14441488
template<typename dst_t>
14451489
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
14461490
const int64_t k = nrows * n_per_row;
@@ -1696,6 +1740,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
16961740
return dequantize_row_iq5_ks_r4_cuda<nv_bfloat16>;
16971741
case GGML_TYPE_IQ1_S_R4:
16981742
return dequantize_row_iq1_s_r4_cuda<nv_bfloat16>;
1743+
case GGML_TYPE_IQ1_M_R4:
1744+
return dequantize_row_iq1_m_r4_cuda<nv_bfloat16>;
16991745
default:
17001746
return nullptr;
17011747
}
@@ -1746,6 +1792,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
17461792
return dequantize_row_iq1_s_cuda;
17471793
case GGML_TYPE_IQ1_S_R4:
17481794
return dequantize_row_iq1_s_r4_cuda;
1795+
case GGML_TYPE_IQ1_M_R4:
1796+
return dequantize_row_iq1_m_r4_cuda;
17491797
case GGML_TYPE_IQ1_M:
17501798
return dequantize_row_iq1_m_cuda;
17511799
case GGML_TYPE_IQ1_BN:
@@ -1839,6 +1887,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
18391887
return dequantize_row_iq1_s_cuda;
18401888
case GGML_TYPE_IQ1_S_R4:
18411889
return dequantize_row_iq1_s_r4_cuda;
1890+
case GGML_TYPE_IQ1_M_R4:
1891+
return dequantize_row_iq1_m_r4_cuda;
18421892
case GGML_TYPE_IQ1_M:
18431893
return dequantize_row_iq1_m_cuda;
18441894
case GGML_TYPE_IQ1_BN:

ggml/src/ggml-cuda/iqk_mmvq.cu

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K_R4> {
3636
static constexpr int qi = QI5_XS;
3737
};
3838

39+
template<>
// Quantization traits for IQ1_M_R4 used by the mul-mat-vec template machinery:
// qk = values per quant block, qr/qi as documented in the reminder comment below.
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
    static constexpr int qk = 32;
    static constexpr int qr = 2;
    static constexpr int qi = 4;
};
45+
3946
// Reminder:
4047
// constexpr int qk = ggml_cuda_type_traits<type>::qk;
4148
// constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -338,7 +345,6 @@ __device__ __forceinline__ void vec_dot_iq4_ks_r4_q8_1(
338345
}
339346
}
340347

341-
// TODO
342348
__device__ __forceinline__ void vec_dot_iq1_s_r4_q8_1(
343349
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {
344350

@@ -356,7 +362,7 @@ __device__ __forceinline__ void vec_dot_iq1_s_r4_q8_1(
356362
for (int k = 0; k < 4; ++k) minus = ggml_cuda_dp4a(0x01010101, q8[4*(iqs/2)+k], minus);
357363

358364
for (int i = 0; i < 4; ++i) {
359-
float dl = (float)dptr[i]*(2*((bq1->qh[i] >> 12) & 7) + 1) * d8;
365+
float dl = __half2float(dptr[i])*(2*((bq1->qh[i] >> 12) & 7) + 1) * d8;
360366
float ml = dl * (bq1->qh[i] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA);
361367
grid32[0] = iq1s_grid_gpu[bq1->qs[4*iqs+i] | (((bq1->qh[i] >> 3*iqs) & 7) << 8)];
362368
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
@@ -370,6 +376,38 @@ __device__ __forceinline__ void vec_dot_iq1_s_r4_q8_1(
370376
}
371377
}
372378

379+
// Dot product of one IQ1_M_R4 block (4 interleaved rows x 32 values) with the
// matching q8_1 quants; accumulates one partial sum per row into result[0..3].
__device__ __forceinline__ void vec_dot_iq1_m_r4_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs, float * result) {

    // 4 per-row fp16 super-scales precede the packed blocks.
    const half * dptr = (const half *)vbq;
    const block_iq1_m_r4 * bq1 = (const block_iq1_m_r4 *)(dptr + 4) + kbx;

    // iqs is 0 or 2
    const float d8 = __low2float(bq8_1->ds);
    const int32_t * q8 = (const int *)bq8_1->qs;

    int32_t grid32[2];
    const int * igrid = (const int *)grid32;

    // Byte-sums of the two 8-value q8 halves (dp4a with 0x01010101 sums 4 int8
    // lanes); needed because IQ1_M adds a per-half delta to every grid value.
    int minus1 = ggml_cuda_dp4a(0x01010101, q8[4*(iqs/2)+0], ggml_cuda_dp4a(0x01010101, q8[4*(iqs/2)+1], 0));
    int minus2 = ggml_cuda_dp4a(0x01010101, q8[4*(iqs/2)+2], ggml_cuda_dp4a(0x01010101, q8[4*(iqs/2)+3], 0));

    for (int i = 0; i < 4; ++i) {
        // Row super-scale * 4-bit block scale * q8 scale.
        float dl = __half2float(dptr[i])*((bq1->scales[i] >> 4*(iqs/2)) & 0xf) * d8;
        // Delta terms for each 8-value half: sign bit is 0x08 / 0x80 of qh.
        float ml1 = dl * (bq1->qh[4*(iqs/2)+i] & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA);
        float ml2 = dl * (bq1->qh[4*(iqs/2)+i] & 0x80 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA);
        // First 8 values: codebook index = 8 low bits from qs + 3 high bits from qh.
        grid32[0] = iq1s_grid_gpu[bq1->qs[4*iqs+i] | ((bq1->qh[4*(iqs/2)+i] & 0x07) << 8)];
        grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
        grid32[0] &= 0x0f0f0f0f;
        int sumi = ggml_cuda_dp4a(igrid[0], q8[4*(iqs/2)+0], ggml_cuda_dp4a(igrid[1], q8[4*(iqs/2)+1], 0));
        // Second 8 values: high bits come from the upper nibble of qh (0x70 << 4).
        grid32[0] = iq1s_grid_gpu[bq1->qs[4*iqs+i+4] | ((bq1->qh[4*(iqs/2)+i] & 0x70) << 4)];
        grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
        grid32[0] &= 0x0f0f0f0f;
        sumi = ggml_cuda_dp4a(igrid[0], q8[4*(iqs/2)+2], ggml_cuda_dp4a(igrid[1], q8[4*(iqs/2)+3], sumi));
        result[i] += dl * sumi + ml1 * minus1 + ml2*minus2;
    }
}
410+
373411
#define VDR_IQ4_KS_Q8_1_MMVQ 4
374412
#define VDR_IQ4_KS_Q8_1_MMQ 4
375413

@@ -1131,6 +1169,14 @@ void mul_mat_vec_iq1_s_r4_q8_1_cuda(
11311169
iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S_R4, 2, vec_dot_iq1_s_r4_q8_1, 4>(vx, vy, dst, ids_data, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream);
11321170
}
11331171

1172+
// Host-side entry point for IQ1_M_R4 x q8_1 matrix-vector multiplication.
// Delegates to the generic iqk launcher, instantiated with the IQ1_M_R4 traits
// and vec_dot_iq1_m_r4_q8_1 (template args mirror the iq1_s_r4 variant).
void mul_mat_vec_iq1_m_r4_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const char * ids_data,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst,
    const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, int64_t ids_nb0, cudaStream_t stream) {

    iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M_R4, 2, vec_dot_iq1_m_r4_q8_1, 4>(vx, vy, dst, ids_data, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream);
}
1179+
11341180
void mul_mat_vec_iq5_k_r4_q8_1_cuda(
11351181
const void * vx, const void * vy, float * dst, const char * ids_data,
11361182
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst,

ggml/src/ggml-cuda/iqk_mmvq.cuh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,8 @@ void mul_mat_vec_iq1_s_r4_q8_1_cuda(
9595
const void * vx, const void * vy, float * dst, const char * ids_data,
9696
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst,
9797
const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, cudaStream_t stream);
98+
99+
// Mat-vec multiplication for IQ1_M_R4 quantized src0 against q8_1 quantized src1.
void mul_mat_vec_iq1_m_r4_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const char * ids_data,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst,
    const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, cudaStream_t stream);

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,9 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
563563
case GGML_TYPE_IQ1_S_R4:
564564
mul_mat_vec_iq1_s_r4_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ids_data, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream);
565565
break;
566+
case GGML_TYPE_IQ1_M_R4:
567+
mul_mat_vec_iq1_m_r4_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ids_data, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream);
568+
break;
566569
default:
567570
GGML_ABORT("fatal error");
568571
break;
@@ -683,6 +686,7 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
683686
case GGML_TYPE_IQ5_K_R4:
684687
case GGML_TYPE_IQ5_KS_R4:
685688
case GGML_TYPE_IQ1_S_R4:
689+
case GGML_TYPE_IQ1_M_R4:
686690
return true;
687691
default:
688692
return false;

0 commit comments

Comments (0)