@@ -41,6 +41,54 @@ static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& v
41
41
#endif
42
42
}
43
43
44
+ // static __device__ __forceinline__ void trellis_accum(uint32_t& val1, uint32_t& val2, uint32_t* s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
45
+ // const half * h = (const half *)s;
46
+ // s[0] = trellis_next(val1);
47
+ // s[1] = trellis_next(val1);
48
+ // s[2] = trellis_next(val1);
49
+ // s[3] = trellis_next(val1);
50
+ // #ifdef GGML_CUDA_F16
51
+ // bdot1 = __hfma2(y[ 0], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot1);
52
+ // #else
53
+ // bdot1.x += y[ 0].x * (float)(h[0] + h[1] + h[2] + h[3]);
54
+ // bdot1.y += y[ 0].y * (float)(h[4] + h[5] + h[6] + h[7]);
55
+ // #endif
56
+ // s[0] = trellis_next(val2);
57
+ // s[1] = trellis_next(val2);
58
+ // s[2] = trellis_next(val2);
59
+ // s[3] = trellis_next(val2);
60
+ // #ifdef GGML_CUDA_F16
61
+ // bdot2 = __hfma2(y[64], {h[0]+h[1]+h[2]+h[3], h[4]+h[5]+h[6]+h[7]}, bdot2);
62
+ // #else
63
+ // bdot2.x += y[64].x * (float)(h[0] + h[1] + h[2] + h[3]);
64
+ // bdot2.y += y[64].y * (float)(h[4] + h[5] + h[6] + h[7]);
65
+ // #endif
66
+ // }
67
+
68
+ // Accumulate two sign-applied dot-product partials from two trellis PRNG states.
+ // Each state (val1, val2) is advanced twice via trellis_next(), the four 32-bit
+ // results stored in s[0..3] and reinterpreted as 8 halves through h. Adjacent
+ // half pairs are summed, the absolute value taken, and a sign applied from the
+ // packed sign bytes: signs1 drives the .x lanes, signs2 the .y lanes; mask1
+ // selects the sign bit for the bdot1 (y[0]) term, mask2 for the bdot2 (y[64])
+ // term. NOTE(review): y[64] presumably addresses the second half of a QK_K
+ // block, 64 dfloat2 further on — confirm against the caller's y layout.
+ static __device__ __forceinline__ void trellis_accum_abs (uint8_t signs1, uint8_t signs2, uint8_t mask1, uint8_t mask2,
69
+ uint32_t & val1, uint32_t & val2, uint32_t * s, const dfloat2* y, dfloat2& bdot1, dfloat2& bdot2) {
70
+ // View the four generated 32-bit words as 8 fp16 values.
+ const half * h = (const half *)s;
71
+ s[0 ] = trellis_next (val1);
72
+ s[1 ] = trellis_next (val1);
73
+ s[2 ] = trellis_next (val2);
74
+ s[3 ] = trellis_next (val2);
75
+ // fp16 build: do the abs/sign and fused multiply-add in half2 precision.
+ #ifdef GGML_CUDA_F16
76
+ half h00 = __habs (h[0 ]+h[1 ]), h01 = __habs (h[2 ]+h[3 ]);
77
+ half h10 = __habs (h[4 ]+h[5 ]), h11 = __habs (h[6 ]+h[7 ]);
78
+ // Negate per-lane according to the masked sign bits (branchless select).
+ half2 h1 = {signs1 & mask1 ? -h00 : h00, signs2 & mask1 ? -h01 : h01};
79
+ half2 h2 = {signs1 & mask2 ? -h10 : h10, signs2 & mask2 ? -h11 : h11};
80
+ // half2 h1 = __hmul2(__habs2({h[0]+h[1], h[2]+h[3]}), {signs1 & mask1 ? -1 : 1, signs2 & mask1 ? -1 : 1});
81
+ // half2 h2 = __hmul2(__habs2({h[4]+h[5], h[6]+h[7]}), {signs1 & mask2 ? -1 : 1, signs2 & mask2 ? -1 : 1});
82
+ bdot1 = __hfma2 (y[ 0 ], h1, bdot1);
83
+ bdot2 = __hfma2 (y[64 ], h2, bdot2);
84
+ // Float fallback: pair-sums still come from the fp16 view of s; the abs,
+ // sign application, and accumulation happen in float.
+ #else
85
+ bdot1.x += y[ 0 ].x * fabsf ((float )(h[0 ] + h[1 ])) * (signs1 & mask1 ? -1 : 1 );
86
+ bdot1.y += y[ 0 ].y * fabsf ((float )(h[2 ] + h[3 ])) * (signs2 & mask1 ? -1 : 1 );
87
+ bdot2.x += y[64 ].x * fabsf ((float )(h[4 ] + h[5 ])) * (signs1 & mask2 ? -1 : 1 );
88
+ bdot2.y += y[64 ].y * fabsf ((float )(h[6 ] + h[7 ])) * (signs2 & mask2 ? -1 : 1 );
89
+ #endif
90
+ }
91
+
44
92
static __device__ __forceinline__ void trellis_accum (const dfloat2& dl1, const dfloat2& dl2, const dfloat2& bdot1, const dfloat2& bdot2, dfloat2& tmp) {
45
93
#ifdef GGML_CUDA_F16
46
94
tmp = __hfma2 (dl1, bdot1, tmp);
@@ -114,25 +162,23 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
114
162
115
163
uint32_t s[4 ];
116
164
165
+ uint8_t mask1 = 1 << (it/4 );
166
+ uint8_t mask2 = mask1 << 4 ;
167
+
117
168
for (int i = ix; i < num_blocks_per_row; i += 2 ) {
118
169
const dfloat2 * y = (const dfloat2 *)(yy + i * QK_K + 8 *it);
119
- const uint8_t * ql = x[i].ql ;
120
- const uint8_t * qh = x[i].qh ;
121
- const dfloat scale1 = iq4k_values[ (x[i].scales [it/4 ] & 0xf )+ 16 ] ;
122
- const dfloat scale2 = iq4k_values[ (x[i].scales [it/4 ] >> 4 )+ 16 ] ;
170
+ const uint16_t * ql = ( const uint16_t *) x[i].ql ;
171
+ const uint8_t * qh = x[i].qh ;
172
+ const dfloat scale1 = (x[i].scales [it/4 ] & 0xf );
173
+ const dfloat scale2 = (x[i].scales [it/4 ] >> 4 );
123
174
const dfloat2 dl1 = {scale1, scale1};
124
175
const dfloat2 dl2 = {scale2, scale2};
125
176
dfloat2 bdot1 = {0 , 0 };
126
177
dfloat2 bdot2 = {0 , 0 };
127
- uint32_t val1 = ql[2 *it+ 0 ] + ((qh[2 *it+0 ] << 8 ) & 0xf00 ) + 4096 ;
128
- uint32_t val2 = ql[2 *it+32 ] + ((qh[2 *it+0 ] << 4 ) & 0xf00 ) + 4096 ;
129
- for (int k = 0 ; k < 2 ; ++k) {
130
- trellis_accum (val1, val2, s, y+k, bdot1, bdot2);
131
- }
132
- val1 = ql[2 *it+ 1 ] + ((qh[2 *it+1 ] << 8 ) & 0xf00 ) + 4096 ;
133
- val2 = ql[2 *it+33 ] + ((qh[2 *it+1 ] << 4 ) & 0xf00 ) + 4096 ;
134
- for (int k = 2 ; k < 4 ; ++k) {
135
- trellis_accum (val1, val2, s, y+k, bdot1, bdot2);
178
+ uint32_t val1 = ql[it+ 0 ] + 4096 ;
179
+ uint32_t val2 = ql[it+16 ] + 4096 ;
180
+ for (int k = 0 ; k < 4 ; ++k) {
181
+ trellis_accum_abs (qh[(8 *it+2 *k+0 )%32 ], qh[(8 *it+2 *k+1 )%32 ], mask1, mask2, val1, val2, s, y+k, bdot1, bdot2);
136
182
}
137
183
trellis_accum (dl1, dl2, bdot1, bdot2, tmp);
138
184
}
0 commit comments