127 changes: 127 additions & 0 deletions examples/quantize-stats/quantize-stats.cpp
@@ -3,6 +3,10 @@
#include "ggml.h"
#include "llama.h"

#define GGML_COMMON_DECL_C
#define GGML_COMMON_IMPL_C
#include "../ggml/src/ggml-common.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
@@ -21,6 +25,20 @@

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#include <intrin.h>
#include <ammintrin.h>
#include <nmmintrin.h>
#include <immintrin.h>
#include <stdlib.h>
inline int popcount(uint8_t x) { return __popcnt(x); }
inline int popcount(uint16_t x) { return __popcnt(x); }
inline int popcount(uint32_t x) { return __popcnt(x); }
inline int popcount(uint64_t x) { return _mm_popcnt_u64(x); }
#else
constexpr int popcount(uint8_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint16_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
#endif
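
These overloads give analyze_iq4ks below one popcount() name across MSVC intrinsics and GCC/Clang builtins (only the latter can appear in constexpr context, hence the split); it is used to test whether a packed 16-bit group of quants has even bit parity. A trivial sanity sketch of the intended behavior, mine rather than part of the diff:

#include <cassert>
#include <cstdint>

int main() {
    assert(popcount(uint16_t(0x0f0f)) == 8);   // four set bits in each byte
    assert(popcount(uint64_t(~0ull))  == 64);  // all 64 bits set
    return 0;
}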

struct quantize_stats_params {
@@ -228,6 +246,97 @@ static void test_roundtrip_on_layer(
}
}

static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const float * values, float& tot_mse, float& tot_elements) {
int row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
int nblock = n_per_row/QK_K;
int nthread = std::max(1, int(std::thread::hardware_concurrency()/2));
int chunk = (nrows + 8*nthread - 1)/(8*nthread);
std::mutex mutex;
int counter = 0;
float mse0 = 0, mse = 0;
auto compute = [&mutex, &counter, &mse0, &mse, values, row_size, nblock, nrows, n_per_row, chunk] () {
std::vector<char> Q(row_size);
float lmse0 = 0, lmse = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first = counter; counter += chunk;
if (first >= nrows) {
mse += lmse; mse0 += lmse0;
return;
}
lock.unlock();
int last = std::min(first + chunk, nrows);
for (int row = first; row < last; ++row) {
auto xr = values + row*n_per_row;
ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr);
const float * dptr = (const float *)Q.data();
const float d = *dptr;
const block_iq4_ks * iq4 = (const block_iq4_ks *)(dptr + 1);
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * xbl = xr + ibl*QK_K;
auto qs = iq4[ibl].qs;
for (int ib = 0; ib < QK_K/32; ++ib) {
const float * xb = xbl + 32*ib;
const float dl = d * ((iq4[ibl].scales[ib] & 254) - 127);
const int8_t * values = iq4k_values + ((iq4[ibl].scales[ib] & 1) << 4);
for (int j = 0; j < 16; j += 2) {
uint16_t v0 = *(const uint16_t *)(qs + j);
int non = popcount(v0);
float diff1 = xb[j+ 0] - dl*values[qs[j+0] & 0xf];
float diff2 = xb[j+16] - dl*values[qs[j+0] >> 4];
float diff3 = xb[j+ 1] - dl*values[qs[j+1] & 0xf];
float diff4 = xb[j+17] - dl*values[qs[j+1] >> 4];
lmse0 += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
if (non%2 == 0) {
lmse += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
} else {
float best = std::numeric_limits<float>::max();
for (int k = 0; k < 16; k += 4) {
uint16_t v = v0 ^ (1 << k);
uint8_t v1 = v;
uint8_t v2 = v >> 8;
diff1 = xb[j+ 0] - dl*values[v1 & 0xf];
diff2 = xb[j+16] - dl*values[v1 >> 4];
diff3 = xb[j+ 1] - dl*values[v2 & 0xf];
diff4 = xb[j+17] - dl*values[v2 >> 4];
float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
if (score < best) best = score;
}
lmse += best;
}
}
qs += 16;
}
}
}
}
};
std::vector<std::thread> workers(nthread-1);
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
tot_mse += mse;
tot_elements += n_per_row*nrows;
printf("%s: %g %g %g\n", name, sqrt(mse0/(n_per_row*nrows)), sqrt(mse/(n_per_row*nrows)), sqrt(tot_mse/tot_elements));
}
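
In short, analyze_iq4ks quantizes each row to IQ4_KS and prints three RMSE figures per tensor: the plain round-trip error (from mse0), the error when every 16-bit group of packed quants is constrained to even popcount, repairing odd groups with the best flip among bits 0, 4, 8 and 12 (from mse), and the running RMSE across all tensors analyzed so far. A condensed sketch of the parity-repair step, reusing the popcount() shim from the top of the file and a hypothetical score() standing in for the diff1..diff4 squared-error sum:

#include <cstdint>
#include <limits>

// Hypothetical stand-in: the real code scores a candidate 16-bit group
// against the four original weights it encodes.
static float score(uint16_t code) { return (float)popcount(code); }

static float parity_repaired_score(uint16_t v0) {
    if (popcount(v0) % 2 == 0) return score(v0);   // even parity: keep as is
    float best = std::numeric_limits<float>::max();
    for (int k = 0; k < 16; k += 4) {              // flip the low bit of each nibble
        float s = score(uint16_t(v0 ^ (1 << k)));
        if (s < best) best = s;
    }
    return best;
}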

static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_elements) {
if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) {
return;
}
if (t->type == GGML_TYPE_F32) {
analyze_iq4ks(t->name, t->ne[1], t->ne[0], (const float *)t->data, tot_mse, tot_elements);
} else {
std::vector<float> aux(t->ne[0]*t->ne[1]);
if (t->type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((const ggml_fp16_t *)t->data, aux.data(), aux.size());
} else {
ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size());
}
analyze_iq4ks(t->name, t->ne[1], t->ne[0], aux.data(), tot_mse, tot_elements);
}
}

static void print_fp_stats(const char * msg, const uint64_t * counts) {
printf("===== %s\n", msg);
uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i];
@@ -263,6 +372,7 @@ int main(int argc, char ** argv) {
int max_thread = 0;
bool invalid_param = false;
bool analyze_fp = false;
bool analyze = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -278,6 +388,8 @@
params.per_layer_stats = true;
} else if (arg == "-afp" || arg == "--analyze-fp") {
analyze_fp = true;
} else if (arg == "-a" || arg == "--analyze") {
analyze = true;
} else if (arg == "--histogram") {
params.print_histogram = true;
} else if (arg == "-m" || arg == "--model") {
@@ -404,6 +516,21 @@
std::vector<char> quantized_scratch;
std::vector<float> output_scratch;

if (analyze) {
float tot_mse = 0, tot_elements = 0;
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) {
// we never quantize those
continue;
}
analyze_iq4ks(kv_tensor.second, tot_mse, tot_elements);
}
return 0;
}

if (analyze_fp) {
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -45,6 +45,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
{ "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",},
{ "IQ2_KS", LLAMA_FTYPE_MOSTLY_IQ2_KS, " 2.1875 bpw non-linear quantization",},
{ "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", },
{ "IQ3_KL", LLAMA_FTYPE_MOSTLY_IQ3_KL, " 4 bpw non-linear quantization mix",},
{ "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw non-linear quantization", },
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
@@ -404,6 +404,7 @@ extern "C" {
GGML_TYPE_IQ2_TN = 142,
GGML_TYPE_IQ1_TN = 143,
GGML_TYPE_IQ4_KS = 144,
GGML_TYPE_IQ2_KS = 145,
GGML_TYPE_COUNT,
};

@@ -460,6 +461,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors
};

// available tensor operations:
7 changes: 7 additions & 0 deletions ggml/src/ggml-common.h
@@ -455,6 +455,13 @@ typedef struct {
} block_iq2_k;
static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");

typedef struct {
uint16_t extra;
uint8_t scales[QK_K/64];
uint8_t qs[QK_K/4];
} block_iq2_ks;
static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
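
The 2.1875 bpw advertised for IQ2_KS in the quantize.cpp table above follows directly from this layout: 2 bytes of extra plus QK_K/64 scale bytes plus QK_K/4 quant bytes is 70 bytes per 256 weights, with a single fp16 super-scale per row stored ahead of the blocks (see the CUDA dequantizer below) adding a negligible 16 bits. A minimal check, assuming QK_K == 256 as elsewhere in ggml:

#include <cstdio>

int main() {
    const int QK_K        = 256;                    // ggml super-block size (assumed)
    const int block_bytes = 2 + QK_K/64 + QK_K/4;   // extra + scales + qs = 70
    printf("%d bytes -> %.4f bpw\n", block_bytes, 8.0*block_bytes/QK_K);  // 2.1875 bpw
    return 0;
}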

typedef struct {
ggml_half d;
uint16_t extra;
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda.cu
@@ -2830,6 +2830,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_KS:
case GGML_TYPE_IQ2_K:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ4_K:
case GGML_TYPE_IQ5_K:
7 changes: 7 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -515,6 +515,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ2_K> {
static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_KS> {
static constexpr int qk = QK_K;
static constexpr int qr = QR4_XS;
static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_K> {
static constexpr int qk = QK_K;
48 changes: 44 additions & 4 deletions ggml/src/ggml-cuda/convert.cu
@@ -729,10 +729,10 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_
int il = tid%16; // 0...15
dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
const float d = (float)x[i].d;
-    const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15);
-    const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15);
-    const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15);
-    const float dl4 = d * (2*((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 15);
+    const float dl1 = d * (((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 8);
+    const float dl2 = d * (((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 8);
+    const float dl3 = d * (((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 8);
+    const float dl4 = d * (((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 8);
const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
const int16_t extra = x[i].extra >> (8*ib128 + (il/8));
for (int j = 0; j < 2; ++j) {
@@ -743,6 +743,34 @@
}
}

template<typename dst_t>
static __global__ void dequantize_block_iq2_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {

int64_t ii = blockIdx.x;
int64_t row = (QK_K * ii) / n_per_row;
const char * cx = (const char *)vx + row * row_size;
const float d = (float)*(const half *)cx;
const block_iq2_ks * x = (const block_iq2_ks *)(cx + sizeof(half));
const int64_t i = ii - (row*n_per_row)/QK_K;

const int tid = threadIdx.x;
int ib128 = tid/16; // 0 or 1
int il = tid%16; // 0...15
dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
const int16_t extra = x[i].extra >> 4*ib128;
const float dl1 = d * (((x[i].scales[2*ib128+0] & 0xf) | ((extra >> 4) & 0x10)) - 16);
const float dl2 = d * (((x[i].scales[2*ib128+0] >> 4) | ((extra >> 5) & 0x10)) - 16);
const float dl3 = d * (((x[i].scales[2*ib128+1] & 0xf) | ((extra >> 6) & 0x10)) - 16);
const float dl4 = d * (((x[i].scales[2*ib128+1] >> 4) | ((extra >> 7) & 0x10)) - 16);
const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
for (int j = 0; j < 2; ++j) {
y[j+ 0] = dl1 * iq2nl_values[((qs[j] >> 0) & 0x03) + ((extra << 2) & 4)];
y[j+32] = dl2 * iq2nl_values[((qs[j] >> 2) & 0x03) + ((extra << 1) & 4)];
y[j+64] = dl3 * iq2nl_values[((qs[j] >> 4) & 0x03) + ((extra >> 0) & 4)];
y[j+96] = dl4 * iq2nl_values[((qs[j] >> 6) & 0x03) + ((extra >> 1) & 4)];
}
}
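
Restating the scale decode above as plain host code: each 128-weight half-block carries four 4-bit scale nibbles in two bytes, bits 8..11 of the pre-shifted extra word supply the fifth scale bit (signed 5-bit scales in [-16, 15]), and bits 0..3 pick the upper or lower half of the iq2nl_values table per 32-weight sub-block. A sketch under the assumption that this layout matches the kernel:

#include <cstdint>

// Decode the four 5-bit sub-block scales of one 128-weight half-block.
// sc points at the two packed nibble bytes for that half-block, and
// extra has already been shifted right by 4*ib128, as in the kernel.
inline void decode_iq2_ks_scales(const uint8_t * sc, uint16_t extra, int out[4]) {
    out[0] = int((sc[0] & 0xf) | ((extra >> 4) & 0x10)) - 16;  // fifth bit: extra bit 8
    out[1] = int((sc[0] >>  4) | ((extra >> 5) & 0x10)) - 16;  // fifth bit: extra bit 9
    out[2] = int((sc[1] & 0xf) | ((extra >> 6) & 0x10)) - 16;  // fifth bit: extra bit 10
    out[3] = int((sc[1] >>  4) | ((extra >> 7) & 0x10)) - 16;  // fifth bit: extra bit 11
}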

template<typename dst_t>
static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {

@@ -952,6 +980,14 @@ static void dequantize_row_iq4_ks_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq4_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
}

template<typename dst_t>
static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row);
const int nb = (k + QK_K - 1) / QK_K;
dequantize_block_iq2_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
}

template<typename dst_t>
static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
@@ -1116,6 +1152,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
return dequantize_row_iq4_ks_cuda;
case GGML_TYPE_IQ2_KS:
return dequantize_row_iq2_ks_cuda;
case GGML_TYPE_IQ2_K:
return dequantize_row_iq2_k_cuda;
case GGML_TYPE_IQ3_K:
@@ -1187,6 +1225,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
return dequantize_row_iq4_ks_cuda;
case GGML_TYPE_IQ2_KS:
return dequantize_row_iq2_ks_cuda;
case GGML_TYPE_IQ2_K:
return dequantize_row_iq2_k_cuda;
case GGML_TYPE_IQ3_K: