Skip to content

Commit aa719c2

Browse files
ggml : fix loongarch lsx compilation error (ggml-org#15864)
1 parent: 4cdd0bb · commit: aa719c2

File tree

2 files changed: 20 additions (+20) and 20 deletions (-20)

ggml/src/ggml-cpu/arch/loongarch/quants.c

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
105105

106106
return ((v4f32)res)[0];
107107
}
108+
109+
// multiply int8_t, add results pairwise twice
110+
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
111+
// Get absolute values of x vectors
112+
const __m128i ax = __lsx_vsigncov_b(x, x);
113+
// Sign the values of the y vectors
114+
const __m128i sy = __lsx_vsigncov_b(x, y);
115+
// Perform multiplication and create 16-bit values
116+
const __m128i dot = lsx_maddubs_h(ax, sy);
117+
const __m128i ones = __lsx_vreplgr2vr_h(1);
118+
return lsx_madd_h(ones, dot);
119+
}
108120
#endif
109121

110122
#if defined(__loongarch_asx)
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
323335
}
324336
}
325337

326-
// multiply int8_t, add results pairwise twice
327-
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
328-
// Get absolute values of x vectors
329-
const __m128i ax = __lsx_vsigncov_b(x, x);
330-
// Sign the values of the y vectors
331-
const __m128i sy = __lsx_vsigncov_b(x, y);
332-
// Perform multiplication and create 16-bit values
333-
const __m128i dot = lsx_maddubs_h(ax, sy);
334-
const __m128i ones = __lsx_vreplgr2vr_h(1);
335-
return lsx_madd_h(ones, dot);
336-
}
337-
338338
// horizontally add 8 floats
339339
static inline float hsum_float_8(const __m256 x) {
340340
__m128 res = lasx_extractf128(x, 1);

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
998998
#define GGML_F32_EPR 4
999999

10001000
#define GGML_F32x4 __m128
1001-
#define GGML_F32x4_ZERO __lsx_vldi(0)
1002-
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1003-
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
1001+
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
1002+
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1003+
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
10041004
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
10051005
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
10061006
#define GGML_F32x4_ADD __lsx_vfadd_s
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
10221022
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
10231023
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
10241024
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1025-
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1025+
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
10261026
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
10271027
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
10281028
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
10521052
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
10531053
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
10541054

1055-
return __lsx_vld(tmp, 0);
1055+
return (__m128)__lsx_vld(tmp, 0);
10561056
}
10571057

10581058
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
10671067
}
10681068

10691069
#define GGML_F32Cx4 __m128
1070-
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
1071-
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1072-
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
1070+
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
1071+
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1072+
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
10731073
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
10741074
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
10751075
#define GGML_F32Cx4_ADD __lsx_vfadd_s

0 commit comments

Comments (0)