Skip to content

Commit 5afb032

Browse files
Vectorize adjacent_find (#5331)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 185398a commit 5afb032

File tree

6 files changed

+289
-0
lines changed

6 files changed

+289
-0
lines changed

benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ function(add_benchmark name)
107107
endfunction()
108108

109109
add_benchmark(adjacent_difference src/adjacent_difference.cpp)
110+
add_benchmark(adjacent_find src/adjacent_find.cpp)
110111
add_benchmark(bitset_from_string src/bitset_from_string.cpp)
111112
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
112113
add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)

benchmarks/src/adjacent_find.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#include <algorithm>
5+
#include <benchmark/benchmark.h>
6+
#include <cstddef>
7+
#include <cstdint>
8+
#include <cstdlib>
9+
#include <vector>
10+
11+
using namespace std;
12+
13+
enum class AlgType { Std, Rng };
14+
15+
template <AlgType Alg, class T>
16+
void bm(benchmark::State& state) {
17+
const size_t size = static_cast<size_t>(state.range(0));
18+
const size_t pos = static_cast<size_t>(state.range(1));
19+
20+
vector<T> v(size);
21+
22+
for (size_t i = 0; i != size; ++i) {
23+
v[i] = static_cast<T>(i & 3);
24+
}
25+
26+
if (pos == 0 || pos >= size) {
27+
abort();
28+
}
29+
30+
v[pos] = v[pos - 1];
31+
32+
for (auto _ : state) {
33+
benchmark::DoNotOptimize(v);
34+
if constexpr (Alg == AlgType::Std) {
35+
benchmark::DoNotOptimize(adjacent_find(v.begin(), v.end()));
36+
} else {
37+
benchmark::DoNotOptimize(ranges::adjacent_find(v));
38+
}
39+
}
40+
}
41+
42+
void common_args(auto bm) {
43+
bm->ArgPair(2525, 1142);
44+
}
45+
46+
BENCHMARK(bm<AlgType::Std, int8_t>)->Apply(common_args);
47+
BENCHMARK(bm<AlgType::Std, int16_t>)->Apply(common_args);
48+
BENCHMARK(bm<AlgType::Std, int32_t>)->Apply(common_args);
49+
BENCHMARK(bm<AlgType::Std, int64_t>)->Apply(common_args);
50+
51+
BENCHMARK(bm<AlgType::Rng, int8_t>)->Apply(common_args);
52+
BENCHMARK(bm<AlgType::Rng, int16_t>)->Apply(common_args);
53+
BENCHMARK(bm<AlgType::Rng, int32_t>)->Apply(common_args);
54+
BENCHMARK(bm<AlgType::Rng, int64_t>)->Apply(common_args);
55+
56+
BENCHMARK_MAIN();

stl/inc/algorithm

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,24 @@ _NODISCARD _CONSTEXPR20 _FwdIt adjacent_find(const _FwdIt _First, _FwdIt _Last,
542542
auto _UFirst = _STD _Get_unwrapped(_First);
543543
auto _ULast = _STD _Get_unwrapped(_Last);
544544
if (_UFirst != _ULast) {
545+
#if _USE_STD_VECTOR_ALGORITHMS
546+
if constexpr (_Equal_memcmp_is_safe<decltype(_UFirst), decltype(_UFirst), _Pr>) {
547+
if (!_STD _Is_constant_evaluated()) {
548+
const auto _First_ptr = _STD _To_address(_UFirst);
549+
const auto _Result = _STD _Adjacent_find_vectorized(_First_ptr, _STD _To_address(_ULast));
550+
551+
if constexpr (is_pointer_v<decltype(_ULast)>) {
552+
_ULast = _Result;
553+
} else {
554+
_ULast = _UFirst + (_Result - _First_ptr);
555+
}
556+
557+
_STD _Seek_wrapped(_Last, _ULast);
558+
return _Last;
559+
}
560+
}
561+
#endif // _USE_STD_VECTOR_ALGORITHMS
562+
545563
for (auto _UNext = _UFirst; ++_UNext != _ULast; _UFirst = _UNext) {
546564
if (_Pred(*_UFirst, *_UNext)) {
547565
_ULast = _UFirst;

stl/inc/xutility

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void*
9898
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
9999
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
100100

101+
const void* __stdcall __std_adjacent_find_1(const void* _First, const void* _Last) noexcept;
102+
const void* __stdcall __std_adjacent_find_2(const void* _First, const void* _Last) noexcept;
103+
const void* __stdcall __std_adjacent_find_4(const void* _First, const void* _Last) noexcept;
104+
const void* __stdcall __std_adjacent_find_8(const void* _First, const void* _Last) noexcept;
105+
101106
const void* __stdcall __std_search_1(
102107
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
103108
const void* __stdcall __std_search_2(
@@ -240,6 +245,21 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val
240245
}
241246
}
242247

248+
template <class _Ty>
249+
_Ty* _Adjacent_find_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
250+
if constexpr (sizeof(_Ty) == 1) {
251+
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_1(_First, _Last)));
252+
} else if constexpr (sizeof(_Ty) == 2) {
253+
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_2(_First, _Last)));
254+
} else if constexpr (sizeof(_Ty) == 4) {
255+
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_4(_First, _Last)));
256+
} else if constexpr (sizeof(_Ty) == 8) {
257+
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_8(_First, _Last)));
258+
} else {
259+
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
260+
}
261+
}
262+
243263
// find_first_of vectorization is likely to be a win after this size (in elements)
244264
_INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;
245265

@@ -6786,6 +6806,23 @@ namespace ranges {
67866806
return _First;
67876807
}
67886808

6809+
#if _USE_STD_VECTOR_ALGORITHMS
6810+
if constexpr (_Equal_memcmp_is_safe<_It, _It, _Pr> && sized_sentinel_for<_Se, _It>
6811+
&& is_same_v<_Pj, identity>) {
6812+
if (!_STD is_constant_evaluated()) {
6813+
const auto _First_ptr = _STD _To_address(_First);
6814+
const auto _Last_ptr = _First_ptr + (_Last - _First);
6815+
6816+
const auto _Result = _STD _Adjacent_find_vectorized(_First_ptr, _Last_ptr);
6817+
if constexpr (is_pointer_v<_It>) {
6818+
return _Result;
6819+
} else {
6820+
return _First + (_Result - _First_ptr);
6821+
}
6822+
}
6823+
}
6824+
#endif // _USE_STD_VECTOR_ALGORITHMS
6825+
67896826
for (auto _Next = _First;; ++_First) {
67906827
if (++_Next == _Last) {
67916828
return _Next;

stl/src/vector_algorithms.cpp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2751,6 +2751,96 @@ namespace {
27512751
}
27522752
}
27532753

2754+
template <class _Traits, class _Ty>
2755+
const void* __stdcall __std_adjacent_find_impl(const void* _First, const void* const _Last) noexcept {
2756+
if (_First == _Last) {
2757+
return _Last;
2758+
}
2759+
2760+
#ifndef _M_ARM64EC
2761+
const size_t _Size_bytes = _Byte_length(_First, _Last) - sizeof(_Ty);
2762+
2763+
if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
2764+
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
2765+
2766+
const void* _Stop_at = _First;
2767+
_Advance_bytes(_Stop_at, _Avx_size);
2768+
2769+
do {
2770+
const void* _Next = _First;
2771+
_Advance_bytes(_Next, sizeof(_Ty));
2772+
2773+
const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
2774+
const __m256i _Comparand = _mm256_loadu_si256(static_cast<const __m256i*>(_Next));
2775+
const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
2776+
2777+
if (_Bingo != 0) {
2778+
const unsigned long _Offset = _tzcnt_u32(_Bingo);
2779+
_Advance_bytes(_First, _Offset);
2780+
return _First;
2781+
}
2782+
2783+
_Advance_bytes(_First, 32);
2784+
} while (_First != _Stop_at);
2785+
2786+
if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
2787+
const void* _Next = _First;
2788+
_Advance_bytes(_Next, sizeof(_Ty));
2789+
2790+
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
2791+
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
2792+
const __m256i _Comparand = _mm256_maskload_epi32(static_cast<const int*>(_Next), _Tail_mask);
2793+
const int _Bingo =
2794+
_mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
2795+
2796+
if (_Bingo != 0) {
2797+
const unsigned long _Offset = _tzcnt_u32(_Bingo);
2798+
_Advance_bytes(_First, _Offset);
2799+
return _First;
2800+
}
2801+
2802+
_Advance_bytes(_First, _Avx_tail_size);
2803+
}
2804+
2805+
if constexpr (sizeof(_Ty) >= 4) {
2806+
return _Last;
2807+
}
2808+
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
2809+
const void* _Stop_at = _First;
2810+
_Advance_bytes(_Stop_at, _Sse_size);
2811+
2812+
do {
2813+
const void* _Next = _First;
2814+
_Advance_bytes(_Next, sizeof(_Ty));
2815+
2816+
const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
2817+
const __m128i _Comparand = _mm_loadu_si128(static_cast<const __m128i*>(_Next));
2818+
const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
2819+
2820+
if (_Bingo != 0) {
2821+
unsigned long _Offset;
2822+
// CodeQL [SM02313] _Offset is always initialized: we just tested `if (_Bingo != 0)`.
2823+
_BitScanForward(&_Offset, _Bingo);
2824+
_Advance_bytes(_First, _Offset);
2825+
return _First;
2826+
}
2827+
2828+
_Advance_bytes(_First, 16);
2829+
} while (_First != _Stop_at);
2830+
}
2831+
#endif // !_M_ARM64EC
2832+
2833+
auto _Ptr = static_cast<const _Ty*>(_First);
2834+
auto _Next = _Ptr + 1;
2835+
for (; _Next != _Last; ++_Ptr, ++_Next) {
2836+
if (*_Ptr == *_Next) {
2837+
return _Ptr;
2838+
}
2839+
}
2840+
2841+
return _Last;
2842+
}
2843+
27542844
struct _Count_traits_8 : _Find_traits_8 {
27552845
#ifndef _M_ARM64EC
27562846
static __m256i _Sub_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept {
@@ -4788,6 +4878,22 @@ __declspec(noalias) size_t __stdcall __std_find_last_not_ch_pos_8(
47884878
return __std_find_last_pos<_Find_traits_8, _Find_one_predicate::_Not_equal>(_First, _Last, _Val);
47894879
}
47904880

4881+
const void* __stdcall __std_adjacent_find_1(const void* const _First, const void* const _Last) noexcept {
4882+
return __std_adjacent_find_impl<_Find_traits_1, uint8_t>(_First, _Last);
4883+
}
4884+
4885+
const void* __stdcall __std_adjacent_find_2(const void* const _First, const void* const _Last) noexcept {
4886+
return __std_adjacent_find_impl<_Find_traits_2, uint16_t>(_First, _Last);
4887+
}
4888+
4889+
const void* __stdcall __std_adjacent_find_4(const void* const _First, const void* const _Last) noexcept {
4890+
return __std_adjacent_find_impl<_Find_traits_4, uint32_t>(_First, _Last);
4891+
}
4892+
4893+
const void* __stdcall __std_adjacent_find_8(const void* const _First, const void* const _Last) noexcept {
4894+
return __std_adjacent_find_impl<_Find_traits_8, uint64_t>(_First, _Last);
4895+
}
4896+
47914897
__declspec(noalias) size_t __stdcall __std_count_trivial_1(
47924898
const void* const _First, const void* const _Last, const uint8_t _Val) noexcept {
47934899
return __std_count_trivial_impl<_Count_traits_1>(_First, _Last, _Val);

tests/std/tests/VSO_0000000_vector_algorithms/test.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,67 @@ void test_adjacent_difference_with_heterogeneous_types() {
103103
assert(output == expected);
104104
}
105105

106+
template <class FwdIt>
107+
FwdIt last_known_good_adj_find(FwdIt first, FwdIt last) {
108+
if (first == last) {
109+
return last;
110+
}
111+
112+
auto next = first;
113+
for (++next; next != last; ++first, ++next) {
114+
if (*first == *next) {
115+
return first;
116+
}
117+
}
118+
119+
return last;
120+
}
121+
122+
template <class T>
123+
void test_case_adj_find(const vector<T>& input) {
124+
const auto actual = adjacent_find(input.begin(), input.end());
125+
const auto expected = last_known_good_adj_find(input.begin(), input.end());
126+
assert(actual == expected);
127+
128+
#if _HAS_CXX20
129+
const auto actual_r = ranges::adjacent_find(input);
130+
assert(actual_r == expected);
131+
#endif // _HAS_CXX20
132+
}
133+
134+
template <class T>
135+
void test_adjacent_find(mt19937_64& gen) {
136+
constexpr size_t replicaCount = 4;
137+
138+
using Limits = numeric_limits<T>;
139+
140+
uniform_int_distribution<conditional_t<sizeof(T) == 1, int, T>> dis(Limits::min(), Limits::max());
141+
142+
vector<T> original_input;
143+
vector<T> input;
144+
145+
original_input.reserve(dataCount);
146+
input.reserve(dataCount);
147+
148+
test_case_adj_find(input);
149+
for (size_t attempts = 0; attempts < dataCount; ++attempts) {
150+
original_input.push_back(static_cast<T>(dis(gen)));
151+
input = original_input;
152+
153+
test_case_adj_find(input);
154+
155+
if (original_input.size() > 2) {
156+
uniform_int_distribution<size_t> pos_dis(0, original_input.size() - 2);
157+
158+
for (size_t replicas = 0; replicas < replicaCount; ++replicas) {
159+
const size_t replica_pos = pos_dis(gen);
160+
input[replica_pos] = input[replica_pos + 1];
161+
test_case_adj_find(input);
162+
}
163+
}
164+
}
165+
}
166+
106167
template <class FwdIt, class T>
107168
ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
108169
ptrdiff_t result = 0;
@@ -763,6 +824,16 @@ void test_vector_algorithms(mt19937_64& gen) {
763824

764825
test_adjacent_difference_with_heterogeneous_types();
765826

827+
test_adjacent_find<char>(gen);
828+
test_adjacent_find<signed char>(gen);
829+
test_adjacent_find<unsigned char>(gen);
830+
test_adjacent_find<short>(gen);
831+
test_adjacent_find<unsigned short>(gen);
832+
test_adjacent_find<int>(gen);
833+
test_adjacent_find<unsigned int>(gen);
834+
test_adjacent_find<long long>(gen);
835+
test_adjacent_find<unsigned long long>(gen);
836+
766837
test_count<char>(gen);
767838
test_count<signed char>(gen);
768839
test_count<unsigned char>(gen);

0 commit comments

Comments
 (0)