Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ function(add_benchmark name)
endfunction()

add_benchmark(adjacent_difference src/adjacent_difference.cpp)
add_benchmark(adjacent_find src/adjacent_find.cpp)
add_benchmark(bitset_from_string src/bitset_from_string.cpp)
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)
Expand Down
56 changes: 56 additions & 0 deletions benchmarks/src/adjacent_find.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

using namespace std;

enum class AlgType { Std, Rng };

template <AlgType Alg, class T>
void bm(benchmark::State& state) {
const size_t size = static_cast<size_t>(state.range(0));
const size_t pos = static_cast<size_t>(state.range(1));

vector<T> v(size);

for (size_t i = 0; i != size; ++i) {
v[i] = static_cast<T>(i & 3);
}

if (pos == 0 || pos >= size) {
abort();
}

v[pos] = v[pos - 1];

for (auto _ : state) {
benchmark::DoNotOptimize(v);
if constexpr (Alg == AlgType::Std) {
benchmark::DoNotOptimize(adjacent_find(v.begin(), v.end()));
} else {
benchmark::DoNotOptimize(ranges::adjacent_find(v));
}
}
}

void common_args(auto bm) {
bm->ArgPair(2525, 1142);
}

BENCHMARK(bm<AlgType::Std, int8_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Std, int16_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Std, int32_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Std, int64_t>)->Apply(common_args);

BENCHMARK(bm<AlgType::Rng, int8_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Rng, int16_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Rng, int32_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::Rng, int64_t>)->Apply(common_args);

BENCHMARK_MAIN();
18 changes: 18 additions & 0 deletions stl/inc/algorithm
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,24 @@ _NODISCARD _CONSTEXPR20 _FwdIt adjacent_find(const _FwdIt _First, _FwdIt _Last,
auto _UFirst = _STD _Get_unwrapped(_First);
auto _ULast = _STD _Get_unwrapped(_Last);
if (_UFirst != _ULast) {
#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Equal_memcmp_is_safe<decltype(_UFirst), decltype(_UFirst), _Pr>) {
if (!_STD _Is_constant_evaluated()) {
const auto _First_ptr = _STD _To_address(_UFirst);
const auto _Result = _STD _Adjacent_find_vectorized(_First_ptr, _STD _To_address(_ULast));

if constexpr (is_pointer_v<decltype(_ULast)>) {
_ULast = _Result;
} else {
_ULast = _UFirst + (_Result - _First_ptr);
}

_STD _Seek_wrapped(_Last, _ULast);
return _Last;
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

for (auto _UNext = _UFirst; ++_UNext != _ULast; _UFirst = _UNext) {
if (_Pred(*_UFirst, *_UNext)) {
_ULast = _UFirst;
Expand Down
37 changes: 37 additions & 0 deletions stl/inc/xutility
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void*
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;

const void* __stdcall __std_adjacent_find_1(const void* _First, const void* _Last) noexcept;
const void* __stdcall __std_adjacent_find_2(const void* _First, const void* _Last) noexcept;
const void* __stdcall __std_adjacent_find_4(const void* _First, const void* _Last) noexcept;
const void* __stdcall __std_adjacent_find_8(const void* _First, const void* _Last) noexcept;

const void* __stdcall __std_search_1(
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
const void* __stdcall __std_search_2(
Expand Down Expand Up @@ -240,6 +245,21 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val
}
}

template <class _Ty>
_Ty* _Adjacent_find_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
if constexpr (sizeof(_Ty) == 1) {
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_1(_First, _Last)));
} else if constexpr (sizeof(_Ty) == 2) {
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_2(_First, _Last)));
} else if constexpr (sizeof(_Ty) == 4) {
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_4(_First, _Last)));
} else if constexpr (sizeof(_Ty) == 8) {
return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_8(_First, _Last)));
} else {
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
}
}

// find_first_of vectorization is likely to be a win after this size (in elements)
_INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;

Expand Down Expand Up @@ -6786,6 +6806,23 @@ namespace ranges {
return _First;
}

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Equal_memcmp_is_safe<_It, _It, _Pr> && sized_sentinel_for<_Se, _It>
&& is_same_v<_Pj, identity>) {
if (!_STD is_constant_evaluated()) {
const auto _First_ptr = _STD _To_address(_First);
const auto _Last_ptr = _First_ptr + (_Last - _First);

const auto _Result = _STD _Adjacent_find_vectorized(_First_ptr, _Last_ptr);
if constexpr (is_pointer_v<_It>) {
return _Result;
} else {
return _First + (_Result - _First_ptr);
}
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

for (auto _Next = _First;; ++_First) {
if (++_Next == _Last) {
return _Next;
Expand Down
106 changes: 106 additions & 0 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2751,6 +2751,96 @@ namespace {
}
}

template <class _Traits, class _Ty>
const void* __stdcall __std_adjacent_find_impl(const void* _First, const void* const _Last) noexcept {
if (_First == _Last) {
return _Last;
}

#ifndef _M_ARM64EC
const size_t _Size_bytes = _Byte_length(_First, _Last) - sizeof(_Ty);

if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Avx_size);

do {
const void* _Next = _First;
_Advance_bytes(_Next, sizeof(_Ty));

const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
const __m256i _Comparand = _mm256_loadu_si256(static_cast<const __m256i*>(_Next));
const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));

if (_Bingo != 0) {
const unsigned long _Offset = _tzcnt_u32(_Bingo);
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, 32);
} while (_First != _Stop_at);

if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
const void* _Next = _First;
_Advance_bytes(_Next, sizeof(_Ty));

const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
const __m256i _Comparand = _mm256_maskload_epi32(static_cast<const int*>(_Next), _Tail_mask);
const int _Bingo =
_mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));

if (_Bingo != 0) {
const unsigned long _Offset = _tzcnt_u32(_Bingo);
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, _Avx_tail_size);
}

if constexpr (sizeof(_Ty) >= 4) {
return _Last;
}
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Sse_size);

do {
const void* _Next = _First;
_Advance_bytes(_Next, sizeof(_Ty));

const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
const __m128i _Comparand = _mm_loadu_si128(static_cast<const __m128i*>(_Next));
const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));

if (_Bingo != 0) {
unsigned long _Offset;
// CodeQL [SM02313] _Offset is always initialized: we just tested `if (_Bingo != 0)`.
_BitScanForward(&_Offset, _Bingo);
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, 16);
} while (_First != _Stop_at);
}
#endif // !_M_ARM64EC

auto _Ptr = static_cast<const _Ty*>(_First);
auto _Next = _Ptr + 1;
for (; _Next != _Last; ++_Ptr, ++_Next) {
if (*_Ptr == *_Next) {
return _Ptr;
}
}

return _Last;
}

struct _Count_traits_8 : _Find_traits_8 {
#ifndef _M_ARM64EC
static __m256i _Sub_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept {
Expand Down Expand Up @@ -4788,6 +4878,22 @@ __declspec(noalias) size_t __stdcall __std_find_last_not_ch_pos_8(
return __std_find_last_pos<_Find_traits_8, _Find_one_predicate::_Not_equal>(_First, _Last, _Val);
}

const void* __stdcall __std_adjacent_find_1(const void* const _First, const void* const _Last) noexcept {
return __std_adjacent_find_impl<_Find_traits_1, uint8_t>(_First, _Last);
}

const void* __stdcall __std_adjacent_find_2(const void* const _First, const void* const _Last) noexcept {
return __std_adjacent_find_impl<_Find_traits_2, uint16_t>(_First, _Last);
}

const void* __stdcall __std_adjacent_find_4(const void* const _First, const void* const _Last) noexcept {
return __std_adjacent_find_impl<_Find_traits_4, uint32_t>(_First, _Last);
}

const void* __stdcall __std_adjacent_find_8(const void* const _First, const void* const _Last) noexcept {
return __std_adjacent_find_impl<_Find_traits_8, uint64_t>(_First, _Last);
}

__declspec(noalias) size_t __stdcall __std_count_trivial_1(
const void* const _First, const void* const _Last, const uint8_t _Val) noexcept {
return __std_count_trivial_impl<_Count_traits_1>(_First, _Last, _Val);
Expand Down
71 changes: 71 additions & 0 deletions tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,67 @@ void test_adjacent_difference_with_heterogeneous_types() {
assert(output == expected);
}

template <class FwdIt>
FwdIt last_known_good_adj_find(FwdIt first, FwdIt last) {
if (first == last) {
return last;
}

auto next = first;
for (++next; next != last; ++first, ++next) {
if (*first == *next) {
return first;
}
}

return last;
}

template <class T>
void test_case_adj_find(const vector<T>& input) {
const auto actual = adjacent_find(input.begin(), input.end());
const auto expected = last_known_good_adj_find(input.begin(), input.end());
assert(actual == expected);

#if _HAS_CXX20
const auto actual_r = ranges::adjacent_find(input);
assert(actual_r == expected);
#endif // _HAS_CXX20
}

template <class T>
void test_adjacent_find(mt19937_64& gen) {
constexpr size_t replicaCount = 4;

using Limits = numeric_limits<T>;

uniform_int_distribution<conditional_t<sizeof(T) == 1, int, T>> dis(Limits::min(), Limits::max());

vector<T> original_input;
vector<T> input;

original_input.reserve(dataCount);
input.reserve(dataCount);

test_case_adj_find(input);
for (size_t attempts = 0; attempts < dataCount; ++attempts) {
original_input.push_back(static_cast<T>(dis(gen)));
input = original_input;

test_case_adj_find(input);

if (original_input.size() > 2) {
uniform_int_distribution<size_t> pos_dis(0, original_input.size() - 2);

for (size_t replicas = 0; replicas < replicaCount; ++replicas) {
const size_t replica_pos = pos_dis(gen);
input[replica_pos] = input[replica_pos + 1];
test_case_adj_find(input);
}
}
}
}

template <class FwdIt, class T>
ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
ptrdiff_t result = 0;
Expand Down Expand Up @@ -763,6 +824,16 @@ void test_vector_algorithms(mt19937_64& gen) {

test_adjacent_difference_with_heterogeneous_types();

test_adjacent_find<char>(gen);
test_adjacent_find<signed char>(gen);
test_adjacent_find<unsigned char>(gen);
test_adjacent_find<short>(gen);
test_adjacent_find<unsigned short>(gen);
test_adjacent_find<int>(gen);
test_adjacent_find<unsigned int>(gen);
test_adjacent_find<long long>(gen);
test_adjacent_find<unsigned long long>(gen);

test_count<char>(gen);
test_count<signed char>(gen);
test_count<unsigned char>(gen);
Expand Down