@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
998
998
#define GGML_F32_EPR 4
999
999
1000
1000
#define GGML_F32x4 __m128
1001
- #define GGML_F32x4_ZERO __lsx_vldi(0)
1002
- #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1003
- #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
1001
+ #define GGML_F32x4_ZERO (__m128) __lsx_vldi(0)
1002
+ #define GGML_F32x4_SET1(x) (__m128) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1003
+ #define GGML_F32x4_LOAD(x) (__m128) __lsx_vld((x), 0)
1004
1004
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
1005
1005
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1006
1006
#define GGML_F32x4_ADD __lsx_vfadd_s
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
1022
1022
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
1023
1023
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
1024
1024
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1025
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1025
+ const __m128 t0 = (__m128) __lsx_vshuf4i_w(tmp, 0x88); \
1026
1026
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
1027
1027
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
1028
1028
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
1052
1052
tmp [2 ] = GGML_CPU_FP16_TO_FP32 (x [2 ]);
1053
1053
tmp [3 ] = GGML_CPU_FP16_TO_FP32 (x [3 ]);
1054
1054
1055
- return __lsx_vld (tmp , 0 );
1055
+ return ( __m128 ) __lsx_vld (tmp , 0 );
1056
1056
}
1057
1057
1058
1058
static inline void __lsx_f16x4_store (ggml_fp16_t * x , __m128 y ) {
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
1067
1067
}
1068
1068
1069
1069
#define GGML_F32Cx4 __m128
1070
- #define GGML_F32Cx4_ZERO __lsx_vldi(0)
1071
- #define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1072
- #define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
1070
+ #define GGML_F32Cx4_ZERO (__m128) __lsx_vldi(0)
1071
+ #define GGML_F32Cx4_SET1(x) (__m128) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1072
+ #define GGML_F32Cx4_LOAD(x) (__m128) __lsx_f16x4_load(x)
1073
1073
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
1074
1074
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
1075
1075
#define GGML_F32Cx4_ADD __lsx_vfadd_s
0 commit comments