127 changes: 127 additions & 0 deletions examples/quantize-stats/quantize-stats.cpp
@@ -3,6 +3,10 @@
#include "ggml.h"
#include "llama.h"

#define GGML_COMMON_DECL_C
#define GGML_COMMON_IMPL_C
#include "../ggml/src/ggml-common.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
@@ -21,6 +25,20 @@

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#include <intrin.h>
#include <ammintrin.h>
#include <nmmintrin.h>
#include <immintrin.h>
#include <stdlib.h>
inline int popcount(uint8_t x) { return __popcnt(x); }
inline int popcount(uint16_t x) { return __popcnt(x); }
inline int popcount(uint32_t x) { return __popcnt(x); }
inline int popcount(uint64_t x) { return _mm_popcnt_u64(x); }
#else
constexpr int popcount(uint8_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint16_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
#endif
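
These overloads give analyze_iq4ks below one popcount() name across MSVC intrinsics and GCC/Clang builtins (only the latter can appear in constexpr context, hence the split); it is used to test whether a packed 16-bit group of quants has even bit parity. A trivial sanity sketch of the intended behavior, mine rather than part of the diff:

#include <cassert>
#include <cstdint>

int main() {
    assert(popcount(uint16_t(0x0f0f)) == 8);   // four set bits in each byte
    assert(popcount(uint64_t(~0ull))  == 64);  // all 64 bits set
    return 0;
}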

struct quantize_stats_params {
@@ -228,6 +246,97 @@ static void test_roundtrip_on_layer(
}
}

static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const float * values, float& tot_mse, float& tot_elements) {
int row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
int nblock = n_per_row/QK_K;
int nthread = std::max(1, int(std::thread::hardware_concurrency()/2));
int chunk = (nrows + 8*nthread - 1)/(8*nthread);
std::mutex mutex;
int counter = 0;
float mse0 = 0, mse = 0;
auto compute = [&mutex, &counter, &mse0, &mse, values, row_size, nblock, nrows, n_per_row, chunk] () {
std::vector<char> Q(row_size);
float lmse0 = 0, lmse = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first = counter; counter += chunk;
if (first >= nrows) {
mse += lmse; mse0 += lmse0;
return;
}
lock.unlock();
int last = std::min(first + chunk, nrows);
for (int row = first; row < last; ++row) {
auto xr = values + row*n_per_row;
ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr);
const float * dptr = (const float *)Q.data();
const float d = *dptr;
const block_iq4_ks * iq4 = (const block_iq4_ks *)(dptr + 1);
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * xbl = xr + ibl*QK_K;
auto qs = iq4[ibl].qs;
for (int ib = 0; ib < QK_K/32; ++ib) {
const float * xb = xbl + 32*ib;
const float dl = d * ((iq4[ibl].scales[ib] & 254) - 127);
const int8_t * values = iq4k_values + ((iq4[ibl].scales[ib] & 1) << 4);
for (int j = 0; j < 16; j += 2) {
uint16_t v0 = *(const uint16_t *)(qs + j);
int non = popcount(v0);
float diff1 = xb[j+ 0] - dl*values[qs[j+0] & 0xf];
float diff2 = xb[j+16] - dl*values[qs[j+0] >> 4];
float diff3 = xb[j+ 1] - dl*values[qs[j+1] & 0xf];
float diff4 = xb[j+17] - dl*values[qs[j+1] >> 4];
lmse0 += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
if (non%2 == 0) {
lmse += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
} else {
float best = std::numeric_limits<float>::max();
for (int k = 0; k < 16; k += 4) {
uint16_t v = v0 ^ (1 << k);
uint8_t v1 = v;
uint8_t v2 = v >> 8;
diff1 = xb[j+ 0] - dl*values[v1 & 0xf];
diff2 = xb[j+16] - dl*values[v1 >> 4];
diff3 = xb[j+ 1] - dl*values[v2 & 0xf];
diff4 = xb[j+17] - dl*values[v2 >> 4];
float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
if (score < best) best = score;
}
lmse += best;
}
}
qs += 16;
}
}
}
}
};
std::vector<std::thread> workers(nthread-1);
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
tot_mse += mse;
tot_elements += n_per_row*nrows;
printf("%s: %g %g %g\n", name, sqrt(mse0/(n_per_row*nrows)), sqrt(mse/(n_per_row*nrows)), sqrt(tot_mse/tot_elements));
}
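
In short, analyze_iq4ks quantizes each row to IQ4_KS and prints three RMSE figures per tensor: the plain round-trip error (from mse0), the error when every 16-bit group of packed quants is constrained to even popcount, repairing odd groups with the best flip among bits 0, 4, 8 and 12 (from mse), and the running RMSE across all tensors analyzed so far. A condensed sketch of the parity-repair step, reusing the popcount() shim from the top of the file and a hypothetical score() standing in for the diff1..diff4 squared-error sum:

#include <cstdint>
#include <limits>

// Hypothetical stand-in: the real code scores a candidate 16-bit group
// against the four original weights it encodes.
static float score(uint16_t code) { return (float)popcount(code); }

static float parity_repaired_score(uint16_t v0) {
    if (popcount(v0) % 2 == 0) return score(v0);   // even parity: keep as is
    float best = std::numeric_limits<float>::max();
    for (int k = 0; k < 16; k += 4) {              // flip the low bit of each nibble
        float s = score(uint16_t(v0 ^ (1 << k)));
        if (s < best) best = s;
    }
    return best;
}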

static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_elements) {
if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) {
return;
}
if (t->type == GGML_TYPE_F32) {
analyze_iq4ks(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_elements);
} else {
std::vector<float> aux(t->ne[0]*t->ne[1]);
if (t->type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((const ggml_fp16_t *)t->data, aux.data(), aux.size());
} else {
ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size());
}
analyze_iq4ks(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_elements);
}
}

static void print_fp_stats(const char * msg, const uint64_t * counts) {
printf("===== %s\n", msg);
uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i];
@@ -263,6 +372,7 @@ int main(int argc, char ** argv) {
int max_thread = 0;
bool invalid_param = false;
bool analyze_fp = false;
bool analyze = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -278,6 +388,8 @@
params.per_layer_stats = true;
} else if (arg == "-afp" || arg == "--analyze-fp") {
analyze_fp = true;
} else if (arg == "-a" || arg == "--analyze") {
analyze = true;
} else if (arg == "--histogram") {
params.print_histogram = true;
} else if (arg == "-m" || arg == "--model") {
@@ -404,6 +516,21 @@
std::vector<char> quantized_scratch;
std::vector<float> output_scratch;

if (analyze) {
float tot_mse = 0, tot_elements = 0;
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) {
// we never quantize those
continue;
}
analyze_iq4ks(kv_tensor.second, tot_mse, tot_elements);
}
return 0;
}

if (analyze_fp) {
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -45,6 +45,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
{ "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",},
{ "IQ2_KS", LLAMA_FTYPE_MOSTLY_IQ2_KS, " 2.1875 bpw non-linear quantization",},
{ "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", },
{ "IQ3_KL", LLAMA_FTYPE_MOSTLY_IQ3_KL, " 4 bpw non-linear quantization mix",},
{ "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw non-linear quantization", },
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
@@ -404,6 +404,7 @@ extern "C" {
GGML_TYPE_IQ2_TN = 142,
GGML_TYPE_IQ1_TN = 143,
GGML_TYPE_IQ4_KS = 144,
GGML_TYPE_IQ2_KS = 145,
GGML_TYPE_COUNT,
};

@@ -460,6 +461,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors
};

// available tensor operations:
7 changes: 7 additions & 0 deletions ggml/src/ggml-common.h
@@ -455,6 +455,13 @@ typedef struct {
} block_iq2_k;
static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");

typedef struct {
uint16_t extra;
uint8_t scales[QK_K/64];
uint8_t qs[QK_K/4];
} block_iq2_ks;
static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
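
The 2.1875 bpw advertised for IQ2_KS in the quantize.cpp table above follows directly from this layout: 2 bytes of extra plus QK_K/64 scale bytes plus QK_K/4 quant bytes is 70 bytes per 256 weights, with a single fp16 super-scale per row stored ahead of the blocks (see the CUDA dequantizer below) adding a negligible 16 bits. A minimal check, assuming QK_K == 256 as elsewhere in ggml:

#include <cstdio>

int main() {
    const int QK_K        = 256;                    // ggml super-block size (assumed)
    const int block_bytes = 2 + QK_K/64 + QK_K/4;   // extra + scales + qs = 70
    printf("%d bytes -> %.4f bpw\n", block_bytes, 8.0*block_bytes/QK_K);  // 2.1875 bpw
    return 0;
}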

typedef struct {
ggml_half d;
uint16_t extra;
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda.cu
@@ -2830,6 +2830,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_KS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
7 changes: 7 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -515,6 +515,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ2_K> {
static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_KS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR4_XS;
static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_K> {
static constexpr int qk = QK_K;
48 changes: 44 additions & 4 deletions ggml/src/ggml-cuda/convert.cu
@@ -729,10 +729,10 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_
int il = tid%16; // 0...15
dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
const float d = (float)x[i].d;
-    const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15);
-    const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15);
-    const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15);
-    const float dl4 = d * (2*((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 15);
+    const float dl1 = d * (((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 8);
+    const float dl2 = d * (((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 8);
+    const float dl3 = d * (((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 8);
+    const float dl4 = d * (((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 8);
const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
const int16_t extra = x[i].extra >> (8*ib128 + (il/8));
for (int j = 0; j < 2; ++j) {
@@ -743,6 +743,34 @@
}
}

template<typename dst_t>
static __global__ void dequantize_block_iq2_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

int64_t ii = blockIdx.x;
int64_t row = (QK_K * ii) / n_per_row;
const char * cx = (const char *)vx + row * row_size;
const float d = (float)*(const half *)cx;
const block_iq2_ks * x = (const block_iq2_ks *)(cx + sizeof(half));
const int64_t i = ii - (row*n_per_row)/QK_K;

const int tid = threadIdx.x;
int ib128 = tid/16; // 0 or 1
int il = tid%16; // 0...15
dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
const int16_t extra = x[i].extra >> 4*ib128;
const float dl1 = d * (((x[i].scales[2*ib128+0] & 0xf) | ((extra >> 4) & 0x10)) - 16);
const float dl2 = d * (((x[i].scales[2*ib128+0] >> 4) | ((extra >> 5) & 0x10)) - 16);
const float dl3 = d * (((x[i].scales[2*ib128+1] & 0xf) | ((extra >> 6) & 0x10)) - 16);
const float dl4 = d * (((x[i].scales[2*ib128+1] >> 4) | ((extra >> 7) & 0x10)) - 16);
const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
for (int j = 0; j < 2; ++j) {
y[j+ 0] = dl1 * iq2nl_values[((qs[j] >> 0) & 0x03) + ((extra << 2) & 4)];
y[j+32] = dl2 * iq2nl_values[((qs[j] >> 2) & 0x03) + ((extra << 1) & 4)];
y[j+64] = dl3 * iq2nl_values[((qs[j] >> 4) & 0x03) + ((extra >> 0) & 4)];
y[j+96] = dl4 * iq2nl_values[((qs[j] >> 6) & 0x03) + ((extra >> 1) & 4)];
}
}
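
Restating the scale decode above as plain host code: each 128-weight half-block carries four 4-bit scale nibbles in two bytes, bits 8..11 of the pre-shifted extra word supply the fifth scale bit (signed 5-bit scales in [-16, 15]), and bits 0..3 pick the upper or lower half of the iq2nl_values table per 32-weight sub-block. A sketch under the assumption that this layout matches the kernel:

#include <cstdint>

// Decode the four 5-bit sub-block scales of one 128-weight half-block.
// sc points at the two packed nibble bytes for that half-block, and
// extra has already been shifted right by 4*ib128, as in the kernel.
inline void decode_iq2_ks_scales(const uint8_t * sc, uint16_t extra, int out[4]) {
    out[0] = int((sc[0] & 0xf) | ((extra >> 4) & 0x10)) - 16;  // fifth bit: extra bit 8
    out[1] = int((sc[0] >>  4) | ((extra >> 5) & 0x10)) - 16;  // fifth bit: extra bit 9
    out[2] = int((sc[1] & 0xf) | ((extra >> 6) & 0x10)) - 16;  // fifth bit: extra bit 10
    out[3] = int((sc[1] >>  4) | ((extra >> 7) & 0x10)) - 16;  // fifth bit: extra bit 11
}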

template<typename dst_t>
static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {

@@ -952,6 +980,14 @@ static void dequantize_row_iq4_ks_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq4_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
}

template<typename dst_t>
static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row);
const int nb = (k + QK_K - 1) / QK_K;
dequantize_block_iq2_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
}

template<typename dst_t>
static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
@@ -1116,6 +1152,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
return dequantize_row_iq4_ks_cuda;
case GGML_TYPE_IQ2_KS:
return dequantize_row_iq2_ks_cuda;
case GGML_TYPE_IQ2_K:
return dequantize_row_iq2_k_cuda;
case GGML_TYPE_IQ3_K:
@@ -1187,6 +1225,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
return dequantize_row_iq4_ks_cuda;
case GGML_TYPE_IQ2_KS:
return dequantize_row_iq2_ks_cuda;
case GGML_TYPE_IQ2_K:
return dequantize_row_iq2_k_cuda;
case GGML_TYPE_IQ3_K: