Skip to content

Commit d38c194

Browse files
Vectorize rotate even better (#5525)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 38c0237 commit d38c194

File tree

3 files changed

+219
-115
lines changed

3 files changed

+219
-115
lines changed

benchmarks/src/rotate.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,6 @@ BENCHMARK(bm_rotate<uint64_t, AlgType::Rng>)->Apply(common_args);
5555
BENCHMARK(bm_rotate<color, AlgType::Std>)->Apply(common_args);
5656
BENCHMARK(bm_rotate<color, AlgType::Rng>)->Apply(common_args);
5757

58+
BENCHMARK(bm_rotate<uint8_t, AlgType::Std>)->Args({35000, 520})->Args({35000, 3000});
59+
5860
BENCHMARK_MAIN();

stl/src/vector_algorithms.cpp

Lines changed: 210 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -144,13 +144,12 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
144144
#endif
145145
#endif // ^^^ !defined(_M_ARM64EC) ^^^
146146

147-
auto _First1c = static_cast<unsigned char*>(_First1);
148-
const auto _Last1c = static_cast<unsigned char*>(_Last1);
149-
auto _First2c = static_cast<unsigned char*>(_First2);
150-
for (; _First1c != _Last1c; ++_First1c, ++_First2c) {
151-
unsigned char _Ch = *_First1c;
152-
*_First1c = *_First2c;
153-
*_First2c = _Ch;
147+
auto _First1c = static_cast<unsigned char*>(_First1);
148+
auto _First2c = static_cast<unsigned char*>(_First2);
149+
for (; _First1c != _Last1; ++_First1c, ++_First2c) {
150+
const unsigned char _Ch = *_First1c;
151+
*_First1c = *_First2c;
152+
*_First2c = _Ch;
154153
}
155154
}
156155

@@ -163,6 +162,210 @@ void* __cdecl __std_swap_ranges_trivially_swappable(
163162

164163
} // extern "C"
165164

165+
namespace {
166+
namespace _Rotating {
167+
void _Swap_3_ranges(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept {
168+
#ifndef _M_ARM64EC
169+
constexpr size_t _Mask_32 = ~((static_cast<size_t>(1) << 5) - 1);
170+
if (_Byte_length(_First1, _Last1) >= 32 && _Use_avx2()) {
171+
const void* _Stop_at = _First1;
172+
_Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_32);
173+
do {
174+
const __m256i _Val1 = _mm256_loadu_si256(static_cast<__m256i*>(_First1));
175+
const __m256i _Val2 = _mm256_loadu_si256(static_cast<__m256i*>(_First2));
176+
const __m256i _Val3 = _mm256_loadu_si256(static_cast<__m256i*>(_First3));
177+
_mm256_storeu_si256(static_cast<__m256i*>(_First1), _Val2);
178+
_mm256_storeu_si256(static_cast<__m256i*>(_First2), _Val3);
179+
_mm256_storeu_si256(static_cast<__m256i*>(_First3), _Val1);
180+
_Advance_bytes(_First1, 32);
181+
_Advance_bytes(_First2, 32);
182+
_Advance_bytes(_First3, 32);
183+
} while (_First1 != _Stop_at);
184+
185+
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
186+
}
187+
188+
constexpr size_t _Mask_16 = ~((static_cast<size_t>(1) << 4) - 1);
189+
if (_Byte_length(_First1, _Last1) >= 16 && _Use_sse42()) {
190+
const void* _Stop_at = _First1;
191+
_Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_16);
192+
do {
193+
const __m128i _Val1 = _mm_loadu_si128(static_cast<__m128i*>(_First1));
194+
const __m128i _Val2 = _mm_loadu_si128(static_cast<__m128i*>(_First2));
195+
const __m128i _Val3 = _mm_loadu_si128(static_cast<__m128i*>(_First3));
196+
_mm_storeu_si128(static_cast<__m128i*>(_First1), _Val2);
197+
_mm_storeu_si128(static_cast<__m128i*>(_First2), _Val3);
198+
_mm_storeu_si128(static_cast<__m128i*>(_First3), _Val1);
199+
_Advance_bytes(_First1, 16);
200+
_Advance_bytes(_First2, 16);
201+
_Advance_bytes(_First3, 16);
202+
} while (_First1 != _Stop_at);
203+
}
204+
205+
#if defined(_M_X64) // NOTE: UNALIGNED MEMORY ACCESSES
206+
constexpr size_t _Mask_8 = ~((static_cast<size_t>(1) << 3) - 1);
207+
if (_Byte_length(_First1, _Last1) >= 8) {
208+
const void* _Stop_at = _First1;
209+
_Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_8);
210+
do {
211+
const unsigned long long _Val1 = *static_cast<unsigned long long*>(_First1);
212+
const unsigned long long _Val2 = *static_cast<unsigned long long*>(_First2);
213+
const unsigned long long _Val3 = *static_cast<unsigned long long*>(_First3);
214+
*static_cast<unsigned long long*>(_First1) = _Val2;
215+
*static_cast<unsigned long long*>(_First2) = _Val3;
216+
*static_cast<unsigned long long*>(_First3) = _Val1;
217+
_Advance_bytes(_First1, 8);
218+
_Advance_bytes(_First2, 8);
219+
_Advance_bytes(_First3, 8);
220+
} while (_First1 != _Stop_at);
221+
}
222+
#elif defined(_M_IX86) // NOTE: UNALIGNED MEMORY ACCESSES
223+
constexpr size_t _Mask_4 = ~((static_cast<size_t>(1) << 2) - 1);
224+
if (_Byte_length(_First1, _Last1) >= 4) {
225+
const void* _Stop_at = _First1;
226+
_Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_4);
227+
do {
228+
const unsigned long _Val1 = *static_cast<unsigned long*>(_First1);
229+
const unsigned long _Val2 = *static_cast<unsigned long*>(_First2);
230+
const unsigned long _Val3 = *static_cast<unsigned long*>(_First3);
231+
*static_cast<unsigned long*>(_First1) = _Val2;
232+
*static_cast<unsigned long*>(_First2) = _Val3;
233+
*static_cast<unsigned long*>(_First3) = _Val1;
234+
_Advance_bytes(_First1, 4);
235+
_Advance_bytes(_First2, 4);
236+
_Advance_bytes(_First3, 4);
237+
} while (_First1 != _Stop_at);
238+
}
239+
#else
240+
#error Unsupported architecture
241+
#endif
242+
#endif // ^^^ !defined(_M_ARM64EC) ^^^
243+
244+
auto _First1c = static_cast<unsigned char*>(_First1);
245+
auto _First2c = static_cast<unsigned char*>(_First2);
246+
auto _First3c = static_cast<unsigned char*>(_First3);
247+
for (; _First1c != _Last1; ++_First1c, ++_First2c, ++_First3c) {
248+
const unsigned char _Ch = *_First1c;
249+
*_First1c = *_First2c;
250+
*_First2c = *_First3c;
251+
*_First3c = _Ch;
252+
}
253+
}
254+
255+
256+
// TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs":
257+
// As a workaround, the following code calls memmove() for 8 KB portions.
258+
constexpr size_t _Portion_size = 8192;
259+
constexpr size_t _Portion_mask = _Portion_size - 1;
260+
static_assert((_Portion_size & _Portion_mask) == 0);
261+
262+
void _Move_to_lower_address(void* _Dest, const void* _Src, const size_t _Size) noexcept {
263+
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
264+
265+
void* _Dest_end = _Dest;
266+
_Advance_bytes(_Dest_end, _Whole_portions_size);
267+
268+
while (_Dest != _Dest_end) {
269+
memmove(_Dest, _Src, _Portion_size);
270+
_Advance_bytes(_Dest, _Portion_size);
271+
_Advance_bytes(_Src, _Portion_size);
272+
}
273+
274+
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
275+
memmove(_Dest, _Src, _Tail);
276+
}
277+
}
278+
279+
void _Move_to_higher_address(void* const _Dest, const void* const _Src, const size_t _Size) noexcept {
280+
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
281+
282+
void* _Dest_end = _Dest;
283+
_Advance_bytes(_Dest_end, _Whole_portions_size);
284+
const void* _Src_end = _Src;
285+
_Advance_bytes(_Src_end, _Whole_portions_size);
286+
287+
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
288+
memmove(_Dest_end, _Src_end, _Tail);
289+
}
290+
291+
while (_Dest_end != _Dest) {
292+
_Rewind_bytes(_Dest_end, _Portion_size);
293+
_Rewind_bytes(_Src_end, _Portion_size);
294+
memmove(_Dest_end, _Src_end, _Portion_size);
295+
}
296+
}
297+
298+
constexpr size_t _Buf_size = 512;
299+
300+
bool _Use_buffer(const size_t _Smaller, const size_t _Larger) noexcept {
301+
return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2);
302+
}
303+
} // namespace _Rotating
304+
} // unnamed namespace
305+
306+
extern "C" {
307+
308+
__declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Last) noexcept {
309+
unsigned char _Buf[_Rotating::_Buf_size];
310+
311+
for (;;) {
312+
const size_t _Left = _Byte_length(_First, _Mid);
313+
const size_t _Right = _Byte_length(_Mid, _Last);
314+
315+
if (_Left <= _Right) {
316+
if (_Left == 0) {
317+
break;
318+
}
319+
320+
if (_Rotating::_Use_buffer(_Left, _Right)) {
321+
memcpy(_Buf, _First, _Left);
322+
_Rotating::_Move_to_lower_address(_First, _Mid, _Right);
323+
_Advance_bytes(_First, _Right);
324+
memcpy(_First, _Buf, _Left);
325+
break;
326+
}
327+
328+
void* _Mid2 = _Last;
329+
_Rewind_bytes(_Mid2, _Left);
330+
if (_Left * 2 > _Right) {
331+
__std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First);
332+
_Last = _Mid2;
333+
} else {
334+
void* _Mid3 = _Mid2;
335+
_Rewind_bytes(_Mid3, _Left);
336+
_Rotating::_Swap_3_ranges(_Mid2, _Last, _First, _Mid3);
337+
_Last = _Mid3;
338+
}
339+
} else {
340+
if (_Right == 0) {
341+
break;
342+
}
343+
344+
if (_Rotating::_Use_buffer(_Right, _Left)) {
345+
_Rewind_bytes(_Last, _Right);
346+
memcpy(_Buf, _Last, _Right);
347+
void* _Mid2 = _First;
348+
_Advance_bytes(_Mid2, _Right);
349+
_Rotating::_Move_to_higher_address(_Mid2, _First, _Left);
350+
memcpy(_First, _Buf, _Right);
351+
break;
352+
}
353+
354+
if (_Right * 2 > _Left) {
355+
__std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First);
356+
_Advance_bytes(_First, _Right);
357+
} else {
358+
void* _Mid2 = _First;
359+
_Advance_bytes(_Mid2, _Right);
360+
_Rotating::_Swap_3_ranges(_Mid, _Last, _Mid2, _First);
361+
_Advance_bytes(_First, _Right * 2);
362+
}
363+
}
364+
}
365+
}
366+
367+
} // extern "C"
368+
166369
namespace {
167370
namespace _Reversing {
168371
#ifdef _M_ARM64EC
@@ -382,107 +585,6 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
382585

383586
} // extern "C"
384587

385-
namespace {
386-
namespace _Rotating {
387-
// TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs":
388-
// As a workaround, the following code calls memmove() for 8 KB portions.
389-
constexpr size_t _Portion_size = 8192;
390-
constexpr size_t _Portion_mask = _Portion_size - 1;
391-
static_assert((_Portion_size & _Portion_mask) == 0);
392-
393-
void _Move_to_lower_address(void* _Dest, const void* _Src, const size_t _Size) noexcept {
394-
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
395-
396-
void* _Dest_end = _Dest;
397-
_Advance_bytes(_Dest_end, _Whole_portions_size);
398-
399-
while (_Dest != _Dest_end) {
400-
memmove(_Dest, _Src, _Portion_size);
401-
_Advance_bytes(_Dest, _Portion_size);
402-
_Advance_bytes(_Src, _Portion_size);
403-
}
404-
405-
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
406-
memmove(_Dest, _Src, _Tail);
407-
}
408-
}
409-
410-
void _Move_to_higher_address(void* const _Dest, const void* const _Src, const size_t _Size) noexcept {
411-
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
412-
413-
void* _Dest_end = _Dest;
414-
_Advance_bytes(_Dest_end, _Whole_portions_size);
415-
const void* _Src_end = _Src;
416-
_Advance_bytes(_Src_end, _Whole_portions_size);
417-
418-
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
419-
memmove(_Dest_end, _Src_end, _Tail);
420-
}
421-
422-
while (_Dest_end != _Dest) {
423-
_Rewind_bytes(_Dest_end, _Portion_size);
424-
_Rewind_bytes(_Src_end, _Portion_size);
425-
memmove(_Dest_end, _Src_end, _Portion_size);
426-
}
427-
}
428-
429-
constexpr size_t _Buf_size = 512;
430-
431-
bool _Use_buffer(const size_t _Smaller, const size_t _Larger) noexcept {
432-
return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2);
433-
}
434-
} // namespace _Rotating
435-
} // unnamed namespace
436-
437-
extern "C" {
438-
439-
__declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Last) noexcept {
440-
unsigned char _Buf[_Rotating::_Buf_size];
441-
442-
for (;;) {
443-
const size_t _Left = _Byte_length(_First, _Mid);
444-
const size_t _Right = _Byte_length(_Mid, _Last);
445-
446-
if (_Left <= _Right) {
447-
if (_Left == 0) {
448-
break;
449-
}
450-
451-
if (_Rotating::_Use_buffer(_Left, _Right)) {
452-
memcpy(_Buf, _First, _Left);
453-
_Rotating::_Move_to_lower_address(_First, _Mid, _Right);
454-
_Advance_bytes(_First, _Right);
455-
memcpy(_First, _Buf, _Left);
456-
break;
457-
}
458-
459-
void* _Mid2 = _Last;
460-
_Rewind_bytes(_Mid2, _Left);
461-
__std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First);
462-
_Last = _Mid2;
463-
} else {
464-
if (_Right == 0) {
465-
break;
466-
}
467-
468-
if (_Rotating::_Use_buffer(_Right, _Left)) {
469-
_Rewind_bytes(_Last, _Right);
470-
memcpy(_Buf, _Last, _Right);
471-
void* _Mid2 = _First;
472-
_Advance_bytes(_Mid2, _Right);
473-
_Rotating::_Move_to_higher_address(_Mid2, _First, _Left);
474-
memcpy(_First, _Buf, _Right);
475-
break;
476-
}
477-
478-
__std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First);
479-
_Advance_bytes(_First, _Right);
480-
}
481-
}
482-
}
483-
484-
} // extern "C"
485-
486588
namespace {
487589
namespace _Sorting {
488590
enum _Min_max_mode {

tests/std/tests/VSO_0000000_vector_algorithms/test.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -778,17 +778,17 @@ void test_case_rotate(
778778
}
779779

780780
template <class T>
781-
void test_rotate(mt19937_64& gen) {
781+
void test_rotate(mt19937_64& gen, const size_t data_count = dataCount) {
782782
vector<T> actual;
783783
vector<T> actual_r;
784784
vector<T> expected;
785785
vector<T> tmp;
786-
actual.reserve(dataCount);
787-
actual_r.reserve(dataCount);
788-
expected.reserve(dataCount);
789-
tmp.reserve(dataCount);
786+
actual.reserve(data_count);
787+
actual_r.reserve(data_count);
788+
expected.reserve(data_count);
789+
tmp.reserve(data_count);
790790
test_case_rotate(actual, actual_r, expected, 0, tmp);
791-
for (size_t attempts = 0; attempts < dataCount; ++attempts) {
791+
for (size_t attempts = 0; attempts < data_count; ++attempts) {
792792
const T val = static_cast<T>(gen()); // intentionally narrows
793793
actual.push_back(val);
794794
actual_r.push_back(val);
@@ -1241,7 +1241,7 @@ void test_vector_algorithms(mt19937_64& gen) {
12411241
test_reverse_copy<double>(gen);
12421242
test_reverse_copy<long double>(gen);
12431243

1244-
test_rotate<char>(gen);
1244+
test_rotate<char>(gen, 20000); // one real long rotate run, as for smaller arrays some strategies aren't executed
12451245
test_rotate<signed char>(gen);
12461246
test_rotate<unsigned char>(gen);
12471247
test_rotate<short>(gen);

0 commit comments

Comments
 (0)