Skip to content

Commit caba83c

Browse files
Use AVX/AVX2 masks in minmax_element and minmax vectorization (#4917)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent ab555ad commit caba83c

File tree

2 files changed

+224
-116
lines changed

2 files changed

+224
-116
lines changed

benchmarks/src/minmax_element.cpp

Lines changed: 80 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <random>
99
#include <ranges>
1010
#include <type_traits>
11+
#include <vector>
1112

1213
enum class Op {
1314
Min,
@@ -20,9 +21,9 @@ enum class Op {
2021

2122
using namespace std;
2223

23-
template <class T, size_t Size, Op Operation>
24+
template <class T, Op Operation>
2425
void bm(benchmark::State& state) {
25-
T a[Size];
26+
vector<T> a(static_cast<size_t>(state.range()));
2627

2728
mt19937 gen(84710);
2829

@@ -35,6 +36,8 @@ void bm(benchmark::State& state) {
3536
}
3637

3738
for (auto _ : state) {
39+
benchmark::DoNotOptimize(a);
40+
3841
if constexpr (Operation == Op::Min) {
3942
benchmark::DoNotOptimize(ranges::min_element(a));
4043
} else if constexpr (Operation == Op::Max) {
@@ -51,75 +54,81 @@ void bm(benchmark::State& state) {
5154
}
5255
}
5356

54-
BENCHMARK(bm<uint8_t, 8021, Op::Min>);
55-
BENCHMARK(bm<uint8_t, 8021, Op::Max>);
56-
BENCHMARK(bm<uint8_t, 8021, Op::Both>);
57-
BENCHMARK(bm<uint8_t, 8021, Op::Min_val>);
58-
BENCHMARK(bm<uint8_t, 8021, Op::Max_val>);
59-
BENCHMARK(bm<uint8_t, 8021, Op::Both_val>);
60-
61-
BENCHMARK(bm<uint16_t, 8021, Op::Min>);
62-
BENCHMARK(bm<uint16_t, 8021, Op::Max>);
63-
BENCHMARK(bm<uint16_t, 8021, Op::Both>);
64-
BENCHMARK(bm<uint16_t, 8021, Op::Min_val>);
65-
BENCHMARK(bm<uint16_t, 8021, Op::Max_val>);
66-
BENCHMARK(bm<uint16_t, 8021, Op::Both_val>);
67-
68-
BENCHMARK(bm<uint32_t, 8021, Op::Min>);
69-
BENCHMARK(bm<uint32_t, 8021, Op::Max>);
70-
BENCHMARK(bm<uint32_t, 8021, Op::Both>);
71-
BENCHMARK(bm<uint32_t, 8021, Op::Min_val>);
72-
BENCHMARK(bm<uint32_t, 8021, Op::Max_val>);
73-
BENCHMARK(bm<uint32_t, 8021, Op::Both_val>);
74-
75-
BENCHMARK(bm<uint64_t, 8021, Op::Min>);
76-
BENCHMARK(bm<uint64_t, 8021, Op::Max>);
77-
BENCHMARK(bm<uint64_t, 8021, Op::Both>);
78-
BENCHMARK(bm<uint64_t, 8021, Op::Min_val>);
79-
BENCHMARK(bm<uint64_t, 8021, Op::Max_val>);
80-
BENCHMARK(bm<uint64_t, 8021, Op::Both_val>);
81-
82-
BENCHMARK(bm<int8_t, 8021, Op::Min>);
83-
BENCHMARK(bm<int8_t, 8021, Op::Max>);
84-
BENCHMARK(bm<int8_t, 8021, Op::Both>);
85-
BENCHMARK(bm<int8_t, 8021, Op::Min_val>);
86-
BENCHMARK(bm<int8_t, 8021, Op::Max_val>);
87-
BENCHMARK(bm<int8_t, 8021, Op::Both_val>);
88-
89-
BENCHMARK(bm<int16_t, 8021, Op::Min>);
90-
BENCHMARK(bm<int16_t, 8021, Op::Max>);
91-
BENCHMARK(bm<int16_t, 8021, Op::Both>);
92-
BENCHMARK(bm<int16_t, 8021, Op::Min_val>);
93-
BENCHMARK(bm<int16_t, 8021, Op::Max_val>);
94-
BENCHMARK(bm<int16_t, 8021, Op::Both_val>);
95-
96-
BENCHMARK(bm<int32_t, 8021, Op::Min>);
97-
BENCHMARK(bm<int32_t, 8021, Op::Max>);
98-
BENCHMARK(bm<int32_t, 8021, Op::Both>);
99-
BENCHMARK(bm<int32_t, 8021, Op::Min_val>);
100-
BENCHMARK(bm<int32_t, 8021, Op::Max_val>);
101-
BENCHMARK(bm<int32_t, 8021, Op::Both_val>);
102-
103-
BENCHMARK(bm<int64_t, 8021, Op::Min>);
104-
BENCHMARK(bm<int64_t, 8021, Op::Max>);
105-
BENCHMARK(bm<int64_t, 8021, Op::Both>);
106-
BENCHMARK(bm<int64_t, 8021, Op::Min_val>);
107-
BENCHMARK(bm<int64_t, 8021, Op::Max_val>);
108-
BENCHMARK(bm<int64_t, 8021, Op::Both_val>);
109-
110-
BENCHMARK(bm<float, 8021, Op::Min>);
111-
BENCHMARK(bm<float, 8021, Op::Max>);
112-
BENCHMARK(bm<float, 8021, Op::Both>);
113-
BENCHMARK(bm<float, 8021, Op::Min_val>);
114-
BENCHMARK(bm<float, 8021, Op::Max_val>);
115-
BENCHMARK(bm<float, 8021, Op::Both_val>);
116-
117-
BENCHMARK(bm<double, 8021, Op::Min>);
118-
BENCHMARK(bm<double, 8021, Op::Max>);
119-
BENCHMARK(bm<double, 8021, Op::Both>);
120-
BENCHMARK(bm<double, 8021, Op::Min_val>);
121-
BENCHMARK(bm<double, 8021, Op::Max_val>);
122-
BENCHMARK(bm<double, 8021, Op::Both_val>);
57+
// Adds the argument values every benchmark in this file is run with. The `bm` benchmark template
// uses the value (read via state.range()) as the number of elements in its input vector.
// ElementSize is supplied by each registration as the byte size of the benchmarked element type
// (1 for 8-bit, 2 for 16-bit, 4 for 32-bit and float, 8 for 64-bit and double).
template <size_t ElementSize>
58+
void common_arg(auto bm) {
59+
bm->Arg(8021);
60+
// AVX tail tests: 63 / ElementSize elements (integer division) is at most 63 bytes of data.
// NOTE(review): presumably chosen to stay just below 64 bytes so the run is dominated by the
// vectorized tail handling rather than the main loop - confirm against the vector algorithms.
61+
bm->Arg(63 / ElementSize);
62+
}
12363

64+
BENCHMARK(bm<uint8_t, Op::Min>)->Apply(common_arg<1>);
65+
BENCHMARK(bm<uint8_t, Op::Max>)->Apply(common_arg<1>);
66+
BENCHMARK(bm<uint8_t, Op::Both>)->Apply(common_arg<1>);
67+
BENCHMARK(bm<uint8_t, Op::Min_val>)->Apply(common_arg<1>);
68+
BENCHMARK(bm<uint8_t, Op::Max_val>)->Apply(common_arg<1>);
69+
BENCHMARK(bm<uint8_t, Op::Both_val>)->Apply(common_arg<1>);
70+
71+
BENCHMARK(bm<uint16_t, Op::Min>)->Apply(common_arg<2>);
72+
BENCHMARK(bm<uint16_t, Op::Max>)->Apply(common_arg<2>);
73+
BENCHMARK(bm<uint16_t, Op::Both>)->Apply(common_arg<2>);
74+
BENCHMARK(bm<uint16_t, Op::Min_val>)->Apply(common_arg<2>);
75+
BENCHMARK(bm<uint16_t, Op::Max_val>)->Apply(common_arg<2>);
76+
BENCHMARK(bm<uint16_t, Op::Both_val>)->Apply(common_arg<2>);
77+
78+
BENCHMARK(bm<uint32_t, Op::Min>)->Apply(common_arg<4>);
79+
BENCHMARK(bm<uint32_t, Op::Max>)->Apply(common_arg<4>);
80+
BENCHMARK(bm<uint32_t, Op::Both>)->Apply(common_arg<4>);
81+
BENCHMARK(bm<uint32_t, Op::Min_val>)->Apply(common_arg<4>);
82+
BENCHMARK(bm<uint32_t, Op::Max_val>)->Apply(common_arg<4>);
83+
BENCHMARK(bm<uint32_t, Op::Both_val>)->Apply(common_arg<4>);
84+
85+
BENCHMARK(bm<uint64_t, Op::Min>)->Apply(common_arg<8>);
86+
BENCHMARK(bm<uint64_t, Op::Max>)->Apply(common_arg<8>);
87+
BENCHMARK(bm<uint64_t, Op::Both>)->Apply(common_arg<8>);
88+
BENCHMARK(bm<uint64_t, Op::Min_val>)->Apply(common_arg<8>);
89+
BENCHMARK(bm<uint64_t, Op::Max_val>)->Apply(common_arg<8>);
90+
BENCHMARK(bm<uint64_t, Op::Both_val>)->Apply(common_arg<8>);
91+
92+
BENCHMARK(bm<int8_t, Op::Min>)->Apply(common_arg<1>);
93+
BENCHMARK(bm<int8_t, Op::Max>)->Apply(common_arg<1>);
94+
BENCHMARK(bm<int8_t, Op::Both>)->Apply(common_arg<1>);
95+
BENCHMARK(bm<int8_t, Op::Min_val>)->Apply(common_arg<1>);
96+
BENCHMARK(bm<int8_t, Op::Max_val>)->Apply(common_arg<1>);
97+
BENCHMARK(bm<int8_t, Op::Both_val>)->Apply(common_arg<1>);
98+
99+
BENCHMARK(bm<int16_t, Op::Min>)->Apply(common_arg<2>);
100+
BENCHMARK(bm<int16_t, Op::Max>)->Apply(common_arg<2>);
101+
BENCHMARK(bm<int16_t, Op::Both>)->Apply(common_arg<2>);
102+
BENCHMARK(bm<int16_t, Op::Min_val>)->Apply(common_arg<2>);
103+
BENCHMARK(bm<int16_t, Op::Max_val>)->Apply(common_arg<2>);
104+
BENCHMARK(bm<int16_t, Op::Both_val>)->Apply(common_arg<2>);
105+
106+
BENCHMARK(bm<int32_t, Op::Min>)->Apply(common_arg<4>);
107+
BENCHMARK(bm<int32_t, Op::Max>)->Apply(common_arg<4>);
108+
BENCHMARK(bm<int32_t, Op::Both>)->Apply(common_arg<4>);
109+
BENCHMARK(bm<int32_t, Op::Min_val>)->Apply(common_arg<4>);
110+
BENCHMARK(bm<int32_t, Op::Max_val>)->Apply(common_arg<4>);
111+
BENCHMARK(bm<int32_t, Op::Both_val>)->Apply(common_arg<4>);
112+
113+
BENCHMARK(bm<int64_t, Op::Min>)->Apply(common_arg<8>);
114+
BENCHMARK(bm<int64_t, Op::Max>)->Apply(common_arg<8>);
115+
BENCHMARK(bm<int64_t, Op::Both>)->Apply(common_arg<8>);
116+
BENCHMARK(bm<int64_t, Op::Min_val>)->Apply(common_arg<8>);
117+
BENCHMARK(bm<int64_t, Op::Max_val>)->Apply(common_arg<8>);
118+
BENCHMARK(bm<int64_t, Op::Both_val>)->Apply(common_arg<8>);
119+
120+
BENCHMARK(bm<float, Op::Min>)->Apply(common_arg<4>);
121+
BENCHMARK(bm<float, Op::Max>)->Apply(common_arg<4>);
122+
BENCHMARK(bm<float, Op::Both>)->Apply(common_arg<4>);
123+
BENCHMARK(bm<float, Op::Min_val>)->Apply(common_arg<4>);
124+
BENCHMARK(bm<float, Op::Max_val>)->Apply(common_arg<4>);
125+
BENCHMARK(bm<float, Op::Both_val>)->Apply(common_arg<4>);
126+
127+
BENCHMARK(bm<double, Op::Min>)->Apply(common_arg<8>);
128+
BENCHMARK(bm<double, Op::Max>)->Apply(common_arg<8>);
129+
BENCHMARK(bm<double, Op::Both>)->Apply(common_arg<8>);
130+
BENCHMARK(bm<double, Op::Min_val>)->Apply(common_arg<8>);
131+
BENCHMARK(bm<double, Op::Max_val>)->Apply(common_arg<8>);
132+
BENCHMARK(bm<double, Op::Both_val>)->Apply(common_arg<8>);
124133

125134
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)