Commit 3a9926b

Author: Iwan Kawrakow (committed)
Checkpoint
Go to groups of 8 for iq3_kt: 2 x 8 = 16 bits for the magnitude plus 1 bpw for the sign. It gives a visible improvement in the PPL vs bpw plot, but that comes at the expense of much longer quantization time (7.5 minutes for LLaMA-3.1-8B on the Ryzen-5975WX). I also noticed that the 3INST generator is not actually generating a Gaussian distribution. But going to a better generator means readjusting all the hyper-parameters, so leaving it for later.
1 parent 2be4cff commit 3a9926b
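
For the bit accounting: each group of 8 weights now stores one 16-bit trellis index for the magnitudes (16/8 = 2 bpw) plus one sign bit per weight (1 bpw), i.e. 3 bpw before the 4-bit block scales and the per-row scale. The sketch below mirrors the decode path of the CUDA kernels in this commit on the host side. It is a sketch only: the 3INST constants `ka`, `kb`, `kmask`, `km32` do not appear in this diff and are assumed to match the QTIP-style values used elsewhere in the repository, and `fp16_to_float` is a simplified stand-in for CUDA `half` arithmetic.

```c++
#include <cstdint>
#include <cstring>
#include <cmath>

// Assumed 3INST constants (not part of this diff); QTIP-style values.
constexpr uint32_t ka    = 89226354u;
constexpr uint32_t kb    = 64248484u;
constexpr uint32_t kmask = 0x8fff8fffu;
constexpr uint32_t km32  = 0x3b603b60u;

// Simplified fp16 -> float conversion; handles normal values only.
static float fp16_to_float(uint16_t h) {
    uint32_t f = ((h & 0x8000u) << 16) | (((h & 0x7fffu) + (112u << 10)) << 13);
    float out;
    std::memcpy(&out, &f, sizeof out);
    return out;
}

// One trellis step, as in convert.cu: advance the 32-bit state and map it
// to a float by summing its two fp16 halves.
static float trellis_next(uint32_t& val) {
    val = ka*val + kb;
    uint32_t s = (val & kmask) ^ km32;
    return fp16_to_float((uint16_t)s) + fp16_to_float((uint16_t)(s >> 16));
}

// Decode group ib (0..31) of a 256-weight block: ql[ib] is the 16-bit
// magnitude index, qh[] holds the interleaved sign bits, dl is the
// combined row/block scale, exactly as dequantize_block_iq3_kt reads them.
static void decode_group_of_8(const uint16_t* ql, const uint8_t* qh,
                              int ib, float dl, float* y) {
    uint32_t idx  = (uint32_t)ql[ib] + 4096;   // same seed offset as the kernel
    uint8_t  mask = (uint8_t)(1u << (ib/4));
    for (int j = 0; j < 8; ++j)
        y[j] = dl * std::fabs(trellis_next(idx))
                  * ((qh[(8*ib + j) % 32] & mask) ? -1.f : 1.f);
}
```

With QK_K = 256 this gives 32 groups per block; `dl` corresponds to the 4-bit block scale times the row scale and the `31.75f * 1.01f` factor in the kernels below.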

File tree: 4 files changed (+237, -129 lines)

ggml/src/ggml-cuda/convert.cu (36 additions & 9 deletions)

```diff
@@ -349,7 +349,12 @@ float __device__ __forceinline__ trellis_next(uint32_t& val) {
     const half * h = (const half *)&s;
     val = ka*val + kb;
     s = (val & kmask) ^ km32;
-    return (float)(h[0] + h[1]);
+    //float r = (float)(h[0] + h[1]);
+    //val = ka*val + kb;
+    //s = (val & kmask) ^ km32;
+    //r += (float)(h[0] + h[1]);
+    //return r;
+    return (float)(h[0] + h[1]);
 }
 
 template<typename dst_t>
@@ -383,20 +388,42 @@ static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst
     const block_iq3_kt * x = (const block_iq3_kt *)(cx + sizeof(float));
     const int64_t i = ii - (row*n_per_row)/QK_K;
 
-    const int8_t * scale_values = iq4k_values + 16;
-
     const int64_t tid = threadIdx.x;
     const int64_t ib = tid; // 0...31
     dst_t * y = yy + ii*QK_K + 8*ib;
-    uint32_t idx1 = x[i].ql[2*ib+0] + ((x[i].qh[(2*ib+0)%32] << (8-4*((2*ib+0)/32))) & 0xf00) + 4096;
-    uint32_t idx2 = x[i].ql[2*ib+1] + ((x[i].qh[(2*ib+1)%32] << (8-4*((2*ib+1)/32))) & 0xf00) + 4096;
-    const float dl = scale * scale_values[((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf)] * 31.75f * 1.015f;
-    for (int j = 0; j < 4; ++j) {
-        y[j+0] = dl * trellis_next(idx1);
-        y[j+4] = dl * trellis_next(idx2);
+    const uint16_t * ql = (const uint16_t *)x[i].ql;
+    uint32_t idx = ql[ib] + 4096;
+    const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f;
+    uint8_t mask = 1 << (ib/4);
+    for (int j = 0; j < 8; ++j) {
+        y[j] = dl * std::abs(trellis_next(idx)) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f);
     }
 }
 
+//template<typename dst_t>
+//static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+//
+//    int64_t ii = blockIdx.x;
+//    int64_t row = (QK_K * ii) / n_per_row;
+//    const float * dptr = (const float *)((const char *)vx + row * row_size);
+//    float scale = dptr[0];
+//    float alpha = dptr[1];
+//    const block_iq3_kt * x = (const block_iq3_kt *)(dptr + 2);
+//    const int64_t i = ii - (row*n_per_row)/QK_K;
+//
+//    const int64_t tid = threadIdx.x;
+//    const int64_t ib = tid; // 0...31
+//    dst_t * y = yy + ii*QK_K + 8*ib;
+//    const uint16_t * ql = (const uint16_t *)x[i].ql;
+//    uint32_t idx = ql[ib] + 4096;
+//    const float dl = scale * ((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf) * 31.75f * 1.01f; //1.015f;
+//    uint8_t mask = 1 << (ib/4);
+//    for (int j = 0; j < 8; ++j) {
+//        float ay = std::abs(trellis_next(idx));
+//        y[j] = dl * ay/(1 - alpha*ay) * (x[i].qh[(8*ib+j)%32] & mask ? -1.f : 1.f);
+//    }
+//}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
```
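
On the commit message's observation that the 3INST generator is not actually Gaussian: a quick host-side check is to walk the generator and compare the fourth moment of its output against a normal distribution (excess kurtosis would be near 0 for a true Gaussian). A rough sketch, again assuming the QTIP-style 3INST constants since they are not part of this diff:

```c++
#include <cstdint>
#include <cstdio>
#include <cstring>

// Assumed 3INST constants, as in the sketch above (not shown in this diff).
constexpr uint32_t ka = 89226354u, kb = 64248484u;
constexpr uint32_t kmask = 0x8fff8fffu, km32 = 0x3b603b60u;

static float fp16_to_float(uint16_t h) {   // normal fp16 values only
    uint32_t f = ((h & 0x8000u) << 16) | (((h & 0x7fffu) + (112u << 10)) << 13);
    float out; std::memcpy(&out, &f, sizeof out); return out;
}

int main() {
    // Accumulate raw moments of the generator output over many seeds.
    double m1 = 0, m2 = 0, m3 = 0, m4 = 0;
    int64_t n = 0;
    for (uint32_t seed = 4096; seed < 4096 + 65536; ++seed) {
        uint32_t val = seed;
        for (int step = 0; step < 8; ++step) {
            val = ka*val + kb;
            uint32_t s = (val & kmask) ^ km32;
            double x = fp16_to_float((uint16_t)s) + fp16_to_float((uint16_t)(s >> 16));
            m1 += x; m2 += x*x; m3 += x*x*x; m4 += x*x*x*x; ++n;
        }
    }
    m1 /= n; m2 /= n; m3 /= n; m4 /= n;
    double var = m2 - m1*m1;                                // central 2nd moment
    double mu4 = m4 - 4*m1*m3 + 6*m1*m1*m2 - 3*m1*m1*m1*m1; // central 4th moment
    std::printf("mean %.4f  var %.4f  excess kurtosis %.4f\n",
                m1, var, mu4/(var*var) - 3.0);
    return 0;
}
```

A clearly non-zero skew or excess kurtosis would confirm the observation; switching generators would then mean retuning magic constants like the `31.75f * 1.01f` factor above, which is presumably why the commit defers it.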

ggml/src/ggml-cuda/dmmv.cu (59 additions & 13 deletions)

```diff
@@ -41,6 +41,54 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v
 #endif
 }
 
+//static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
+//    const half * h = (const half *)s;
+//    s[0] = trellis_next(val1);
+//    s[1] = trellis_next(val1);
+//    s[2] = trellis_next(val1);
+//    s[3] = trellis_next(val1);
+//#ifdef GGML_CUDA_F16
+//    bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1);
+//#else
+//    bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]);
+//    bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]);
+//#endif
+//    s[0] = trellis_next(val2);
+//    s[1] = trellis_next(val2);
+//    s[2] = trellis_next(val2);
+//    s[3] = trellis_next(val2);
+//#ifdef GGML_CUDA_F16
+//    bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2);
+//#else
+//    bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]);
+//    bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]);
+//#endif
+//}
+
+static __device__ __forceinline__ void trellis_accum_abs(uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2,
+        uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
+    const half * h = (const half *)s;
+    s[0] = trellis_next(val1);
+    s[1] = trellis_next(val1);
+    s[2] = trellis_next(val2);
+    s[3] = trellis_next(val2);
+#ifdef GGML_CUDA_F16
+    half h00 = __habs(h[0]+h[1]), h01 = __habs(h[2]+h[3]);
+    half h10 = __habs(h[4]+h[5]), h11 = __habs(h[6]+h[7]);
+    half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01};
+    half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11};
+    //half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1});
+    //half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1});
+    bdot1 = __hfma2(y[ 0], h1, bdot1);
+    bdot2 = __hfma2(y[64], h2, bdot2);
+#else
+    bdot1.x += y[ 0].x * fabsf((float)(h[0] + h[1])) * (signs1 & mask1 ? -1 : 1);
+    bdot1.y += y[ 0].y * fabsf((float)(h[2] + h[3])) * (signs2 & mask1 ? -1 : 1);
+    bdot2.x += y[64].x * fabsf((float)(h[4] + h[5])) * (signs1 & mask2 ? -1 : 1);
+    bdot2.y += y[64].y * fabsf((float)(h[6] + h[7])) * (signs2 & mask2 ? -1 : 1);
+#endif
+}
+
 static __device__ __forceinline__ void trellis_accum(const dfloat2& dl1, const dfloat2& dl2, const dfloat2& bdot1, const dfloat2& bdot2, dfloat2& tmp) {
 #ifdef GGML_CUDA_F16
     tmp = __hfma2(dl1, bdot1, tmp);
@@ -114,25 +162,23 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
 
     uint32_t s[4];
 
+    uint8_t mask1 = 1 << (it/4);
+    uint8_t mask2 = mask1 << 4;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {
         const dfloat2 * y = (const dfloat2 *)(yy + i * QK_K + 8*it);
-        const uint8_t * ql = x[i].ql;
-        const uint8_t * qh = x[i].qh;
-        const dfloat scale1 = iq4k_values[(x[i].scales[it/4] & 0xf)+16];
-        const dfloat scale2 = iq4k_values[(x[i].scales[it/4] >> 4)+16];
+        const uint16_t * ql = (const uint16_t *)x[i].ql;
+        const uint8_t * qh = x[i].qh;
+        const dfloat scale1 = (x[i].scales[it/4] & 0xf);
+        const dfloat scale2 = (x[i].scales[it/4] >> 4);
         const dfloat2 dl1 = {scale1, scale1};
         const dfloat2 dl2 = {scale2, scale2};
         dfloat2 bdot1 = {0, 0};
         dfloat2 bdot2 = {0, 0};
-        uint32_t val1 = ql[2*it+ 0] + ((qh[2*it+0] << 8) & 0xf00) + 4096;
-        uint32_t val2 = ql[2*it+32] + ((qh[2*it+0] << 4) & 0xf00) + 4096;
-        for (int k = 0; k < 2; ++k) {
-            trellis_accum(val1, val2, s, y+k, bdot1, bdot2);
-        }
-        val1 = ql[2*it+ 1] + ((qh[2*it+1] << 8) & 0xf00) + 4096;
-        val2 = ql[2*it+33] + ((qh[2*it+1] << 4) & 0xf00) + 4096;
-        for (int k = 2; k < 4; ++k) {
-            trellis_accum(val1, val2, s, y+k, bdot1, bdot2);
+        uint32_t val1 = ql[it+ 0] + 4096;
+        uint32_t val2 = ql[it+16] + 4096;
+        for (int k = 0; k < 4; ++k) {
+            trellis_accum_abs(qh[(8*it+2*k+0)%32], qh[(8*it+2*k+1)%32], mask1, mask2, val1, val2, s, y+k, bdot1, bdot2);
         }
         trellis_accum(dl1, dl2, bdot1, bdot2, tmp);
     }
```
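
Both kernels read the sign of element j in group ib from byte (8*ib + j) % 32 of qh, bit ib/4, so the 32-byte qh array holds exactly 256 sign bits: the 1 bpw from the commit message. Each (byte, bit) pair is addressed exactly once, since four consecutive groups share a bit plane and j selects distinct bytes within it. A hypothetical write-side helper illustrating this layout (the commit's actual quantization code is in the two changed files not shown here):

```c++
#include <cstdint>
#include <cstring>

// Pack the 256 sign bits of one QK_K = 256 block into qh[32] using the
// same addressing the CUDA kernels above read back:
//   element j of group ib -> byte (8*ib + j) % 32, bit ib/4.
// Illustrative only; not the repository's actual quantization routine.
static void pack_iq3_kt_signs(const float* x /* 256 weights */, uint8_t qh[32]) {
    std::memset(qh, 0, 32);
    for (int ib = 0; ib < 32; ++ib)            // 32 groups of 8 weights
        for (int j = 0; j < 8; ++j)
            if (x[8*ib + j] < 0.0f)
                qh[(8*ib + j) % 32] |= (uint8_t)(1u << (ib/4));
}
```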
