@@ -144,13 +144,12 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
144
144
#endif
145
145
#endif // ^^^ !defined(_M_ARM64EC) ^^^
146
146
147
- auto _First1c = static_cast <unsigned char *>(_First1);
148
- const auto _Last1c = static_cast <unsigned char *>(_Last1);
149
- auto _First2c = static_cast <unsigned char *>(_First2);
150
- for (; _First1c != _Last1c; ++_First1c, ++_First2c) {
151
- unsigned char _Ch = *_First1c;
152
- *_First1c = *_First2c;
153
- *_First2c = _Ch;
147
+ auto _First1c = static_cast <unsigned char *>(_First1);
148
+ auto _First2c = static_cast <unsigned char *>(_First2);
149
+ for (; _First1c != _Last1; ++_First1c, ++_First2c) {
150
+ const unsigned char _Ch = *_First1c;
151
+ *_First1c = *_First2c;
152
+ *_First2c = _Ch;
154
153
}
155
154
}
156
155
@@ -163,6 +162,210 @@ void* __cdecl __std_swap_ranges_trivially_swappable(
163
162
164
163
} // extern "C"
165
164
165
+ namespace {
166
+ namespace _Rotating {
167
+ void _Swap_3_ranges (void * _First1, void * const _Last1, void * _First2, void * _First3) noexcept {
168
+ #ifndef _M_ARM64EC
169
+ constexpr size_t _Mask_32 = ~((static_cast <size_t >(1 ) << 5 ) - 1 );
170
+ if (_Byte_length (_First1, _Last1) >= 32 && _Use_avx2 ()) {
171
+ const void * _Stop_at = _First1;
172
+ _Advance_bytes (_Stop_at, _Byte_length (_First1, _Last1) & _Mask_32);
173
+ do {
174
+ const __m256i _Val1 = _mm256_loadu_si256 (static_cast <__m256i*>(_First1));
175
+ const __m256i _Val2 = _mm256_loadu_si256 (static_cast <__m256i*>(_First2));
176
+ const __m256i _Val3 = _mm256_loadu_si256 (static_cast <__m256i*>(_First3));
177
+ _mm256_storeu_si256 (static_cast <__m256i*>(_First1), _Val2);
178
+ _mm256_storeu_si256 (static_cast <__m256i*>(_First2), _Val3);
179
+ _mm256_storeu_si256 (static_cast <__m256i*>(_First3), _Val1);
180
+ _Advance_bytes (_First1, 32 );
181
+ _Advance_bytes (_First2, 32 );
182
+ _Advance_bytes (_First3, 32 );
183
+ } while (_First1 != _Stop_at);
184
+
185
+ _mm256_zeroupper (); // TRANSITION, DevCom-10331414
186
+ }
187
+
188
+ constexpr size_t _Mask_16 = ~((static_cast <size_t >(1 ) << 4 ) - 1 );
189
+ if (_Byte_length (_First1, _Last1) >= 16 && _Use_sse42 ()) {
190
+ const void * _Stop_at = _First1;
191
+ _Advance_bytes (_Stop_at, _Byte_length (_First1, _Last1) & _Mask_16);
192
+ do {
193
+ const __m128i _Val1 = _mm_loadu_si128 (static_cast <__m128i*>(_First1));
194
+ const __m128i _Val2 = _mm_loadu_si128 (static_cast <__m128i*>(_First2));
195
+ const __m128i _Val3 = _mm_loadu_si128 (static_cast <__m128i*>(_First3));
196
+ _mm_storeu_si128 (static_cast <__m128i*>(_First1), _Val2);
197
+ _mm_storeu_si128 (static_cast <__m128i*>(_First2), _Val3);
198
+ _mm_storeu_si128 (static_cast <__m128i*>(_First3), _Val1);
199
+ _Advance_bytes (_First1, 16 );
200
+ _Advance_bytes (_First2, 16 );
201
+ _Advance_bytes (_First3, 16 );
202
+ } while (_First1 != _Stop_at);
203
+ }
204
+
205
+ #if defined(_M_X64) // NOTE: UNALIGNED MEMORY ACCESSES
206
+ constexpr size_t _Mask_8 = ~((static_cast <size_t >(1 ) << 3 ) - 1 );
207
+ if (_Byte_length (_First1, _Last1) >= 8 ) {
208
+ const void * _Stop_at = _First1;
209
+ _Advance_bytes (_Stop_at, _Byte_length (_First1, _Last1) & _Mask_8);
210
+ do {
211
+ const unsigned long long _Val1 = *static_cast <unsigned long long *>(_First1);
212
+ const unsigned long long _Val2 = *static_cast <unsigned long long *>(_First2);
213
+ const unsigned long long _Val3 = *static_cast <unsigned long long *>(_First3);
214
+ *static_cast <unsigned long long *>(_First1) = _Val2;
215
+ *static_cast <unsigned long long *>(_First2) = _Val3;
216
+ *static_cast <unsigned long long *>(_First3) = _Val1;
217
+ _Advance_bytes (_First1, 8 );
218
+ _Advance_bytes (_First2, 8 );
219
+ _Advance_bytes (_First3, 8 );
220
+ } while (_First1 != _Stop_at);
221
+ }
222
+ #elif defined(_M_IX86) // NOTE: UNALIGNED MEMORY ACCESSES
223
+ constexpr size_t _Mask_4 = ~((static_cast <size_t >(1 ) << 2 ) - 1 );
224
+ if (_Byte_length (_First1, _Last1) >= 4 ) {
225
+ const void * _Stop_at = _First1;
226
+ _Advance_bytes (_Stop_at, _Byte_length (_First1, _Last1) & _Mask_4);
227
+ do {
228
+ const unsigned long _Val1 = *static_cast <unsigned long *>(_First1);
229
+ const unsigned long _Val2 = *static_cast <unsigned long *>(_First2);
230
+ const unsigned long _Val3 = *static_cast <unsigned long *>(_First3);
231
+ *static_cast <unsigned long *>(_First1) = _Val2;
232
+ *static_cast <unsigned long *>(_First2) = _Val3;
233
+ *static_cast <unsigned long *>(_First3) = _Val1;
234
+ _Advance_bytes (_First1, 4 );
235
+ _Advance_bytes (_First2, 4 );
236
+ _Advance_bytes (_First3, 4 );
237
+ } while (_First1 != _Stop_at);
238
+ }
239
+ #else
240
+ #error Unsupported architecture
241
+ #endif
242
+ #endif // ^^^ !defined(_M_ARM64EC) ^^^
243
+
244
+ auto _First1c = static_cast <unsigned char *>(_First1);
245
+ auto _First2c = static_cast <unsigned char *>(_First2);
246
+ auto _First3c = static_cast <unsigned char *>(_First3);
247
+ for (; _First1c != _Last1; ++_First1c, ++_First2c, ++_First3c) {
248
+ const unsigned char _Ch = *_First1c;
249
+ *_First1c = *_First2c;
250
+ *_First2c = *_First3c;
251
+ *_First3c = _Ch;
252
+ }
253
+ }
254
+
255
+
256
+ // TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs":
257
+ // As a workaround, the following code calls memmove() for 8 KB portions.
258
+ constexpr size_t _Portion_size = 8192 ;
259
+ constexpr size_t _Portion_mask = _Portion_size - 1 ;
260
+ static_assert ((_Portion_size & _Portion_mask) == 0 );
261
+
262
+ void _Move_to_lower_address (void * _Dest, const void * _Src, const size_t _Size) noexcept {
263
+ const size_t _Whole_portions_size = _Size & ~_Portion_mask;
264
+
265
+ void * _Dest_end = _Dest;
266
+ _Advance_bytes (_Dest_end, _Whole_portions_size);
267
+
268
+ while (_Dest != _Dest_end) {
269
+ memmove (_Dest, _Src, _Portion_size);
270
+ _Advance_bytes (_Dest, _Portion_size);
271
+ _Advance_bytes (_Src, _Portion_size);
272
+ }
273
+
274
+ if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0 ) {
275
+ memmove (_Dest, _Src, _Tail);
276
+ }
277
+ }
278
+
279
+ void _Move_to_higher_address (void * const _Dest, const void * const _Src, const size_t _Size) noexcept {
280
+ const size_t _Whole_portions_size = _Size & ~_Portion_mask;
281
+
282
+ void * _Dest_end = _Dest;
283
+ _Advance_bytes (_Dest_end, _Whole_portions_size);
284
+ const void * _Src_end = _Src;
285
+ _Advance_bytes (_Src_end, _Whole_portions_size);
286
+
287
+ if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0 ) {
288
+ memmove (_Dest_end, _Src_end, _Tail);
289
+ }
290
+
291
+ while (_Dest_end != _Dest) {
292
+ _Rewind_bytes (_Dest_end, _Portion_size);
293
+ _Rewind_bytes (_Src_end, _Portion_size);
294
+ memmove (_Dest_end, _Src_end, _Portion_size);
295
+ }
296
+ }
297
+
298
+ constexpr size_t _Buf_size = 512 ;
299
+
300
+ bool _Use_buffer (const size_t _Smaller, const size_t _Larger) noexcept {
301
+ return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2 );
302
+ }
303
+ } // namespace _Rotating
304
+ } // unnamed namespace
305
+
306
+ extern " C" {
307
+
308
+ __declspec (noalias) void __stdcall __std_rotate(void * _First, void * const _Mid, void * _Last) noexcept {
309
+ unsigned char _Buf[_Rotating::_Buf_size];
310
+
311
+ for (;;) {
312
+ const size_t _Left = _Byte_length (_First, _Mid);
313
+ const size_t _Right = _Byte_length (_Mid, _Last);
314
+
315
+ if (_Left <= _Right) {
316
+ if (_Left == 0 ) {
317
+ break ;
318
+ }
319
+
320
+ if (_Rotating::_Use_buffer (_Left, _Right)) {
321
+ memcpy (_Buf, _First, _Left);
322
+ _Rotating::_Move_to_lower_address (_First, _Mid, _Right);
323
+ _Advance_bytes (_First, _Right);
324
+ memcpy (_First, _Buf, _Left);
325
+ break ;
326
+ }
327
+
328
+ void * _Mid2 = _Last;
329
+ _Rewind_bytes (_Mid2, _Left);
330
+ if (_Left * 2 > _Right) {
331
+ __std_swap_ranges_trivially_swappable_noalias (_Mid2, _Last, _First);
332
+ _Last = _Mid2;
333
+ } else {
334
+ void * _Mid3 = _Mid2;
335
+ _Rewind_bytes (_Mid3, _Left);
336
+ _Rotating::_Swap_3_ranges (_Mid2, _Last, _First, _Mid3);
337
+ _Last = _Mid3;
338
+ }
339
+ } else {
340
+ if (_Right == 0 ) {
341
+ break ;
342
+ }
343
+
344
+ if (_Rotating::_Use_buffer (_Right, _Left)) {
345
+ _Rewind_bytes (_Last, _Right);
346
+ memcpy (_Buf, _Last, _Right);
347
+ void * _Mid2 = _First;
348
+ _Advance_bytes (_Mid2, _Right);
349
+ _Rotating::_Move_to_higher_address (_Mid2, _First, _Left);
350
+ memcpy (_First, _Buf, _Right);
351
+ break ;
352
+ }
353
+
354
+ if (_Right * 2 > _Left) {
355
+ __std_swap_ranges_trivially_swappable_noalias (_Mid, _Last, _First);
356
+ _Advance_bytes (_First, _Right);
357
+ } else {
358
+ void * _Mid2 = _First;
359
+ _Advance_bytes (_Mid2, _Right);
360
+ _Rotating::_Swap_3_ranges (_Mid, _Last, _Mid2, _First);
361
+ _Advance_bytes (_First, _Right * 2 );
362
+ }
363
+ }
364
+ }
365
+ }
366
+
367
+ } // extern "C"
368
+
166
369
namespace {
167
370
namespace _Reversing {
168
371
#ifdef _M_ARM64EC
@@ -382,107 +585,6 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
382
585
383
586
} // extern "C"
384
587
385
- namespace {
386
- namespace _Rotating {
387
- // TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs":
388
- // As a workaround, the following code calls memmove() for 8 KB portions.
389
- constexpr size_t _Portion_size = 8192 ;
390
- constexpr size_t _Portion_mask = _Portion_size - 1 ;
391
- static_assert ((_Portion_size & _Portion_mask) == 0 );
392
-
393
- void _Move_to_lower_address (void * _Dest, const void * _Src, const size_t _Size) noexcept {
394
- const size_t _Whole_portions_size = _Size & ~_Portion_mask;
395
-
396
- void * _Dest_end = _Dest;
397
- _Advance_bytes (_Dest_end, _Whole_portions_size);
398
-
399
- while (_Dest != _Dest_end) {
400
- memmove (_Dest, _Src, _Portion_size);
401
- _Advance_bytes (_Dest, _Portion_size);
402
- _Advance_bytes (_Src, _Portion_size);
403
- }
404
-
405
- if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0 ) {
406
- memmove (_Dest, _Src, _Tail);
407
- }
408
- }
409
-
410
- void _Move_to_higher_address (void * const _Dest, const void * const _Src, const size_t _Size) noexcept {
411
- const size_t _Whole_portions_size = _Size & ~_Portion_mask;
412
-
413
- void * _Dest_end = _Dest;
414
- _Advance_bytes (_Dest_end, _Whole_portions_size);
415
- const void * _Src_end = _Src;
416
- _Advance_bytes (_Src_end, _Whole_portions_size);
417
-
418
- if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0 ) {
419
- memmove (_Dest_end, _Src_end, _Tail);
420
- }
421
-
422
- while (_Dest_end != _Dest) {
423
- _Rewind_bytes (_Dest_end, _Portion_size);
424
- _Rewind_bytes (_Src_end, _Portion_size);
425
- memmove (_Dest_end, _Src_end, _Portion_size);
426
- }
427
- }
428
-
429
- constexpr size_t _Buf_size = 512 ;
430
-
431
- bool _Use_buffer (const size_t _Smaller, const size_t _Larger) noexcept {
432
- return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2 );
433
- }
434
- } // namespace _Rotating
435
- } // unnamed namespace
436
-
437
- extern " C" {
438
-
439
- __declspec (noalias) void __stdcall __std_rotate(void * _First, void * const _Mid, void * _Last) noexcept {
440
- unsigned char _Buf[_Rotating::_Buf_size];
441
-
442
- for (;;) {
443
- const size_t _Left = _Byte_length (_First, _Mid);
444
- const size_t _Right = _Byte_length (_Mid, _Last);
445
-
446
- if (_Left <= _Right) {
447
- if (_Left == 0 ) {
448
- break ;
449
- }
450
-
451
- if (_Rotating::_Use_buffer (_Left, _Right)) {
452
- memcpy (_Buf, _First, _Left);
453
- _Rotating::_Move_to_lower_address (_First, _Mid, _Right);
454
- _Advance_bytes (_First, _Right);
455
- memcpy (_First, _Buf, _Left);
456
- break ;
457
- }
458
-
459
- void * _Mid2 = _Last;
460
- _Rewind_bytes (_Mid2, _Left);
461
- __std_swap_ranges_trivially_swappable_noalias (_Mid2, _Last, _First);
462
- _Last = _Mid2;
463
- } else {
464
- if (_Right == 0 ) {
465
- break ;
466
- }
467
-
468
- if (_Rotating::_Use_buffer (_Right, _Left)) {
469
- _Rewind_bytes (_Last, _Right);
470
- memcpy (_Buf, _Last, _Right);
471
- void * _Mid2 = _First;
472
- _Advance_bytes (_Mid2, _Right);
473
- _Rotating::_Move_to_higher_address (_Mid2, _First, _Left);
474
- memcpy (_First, _Buf, _Right);
475
- break ;
476
- }
477
-
478
- __std_swap_ranges_trivially_swappable_noalias (_Mid, _Last, _First);
479
- _Advance_bytes (_First, _Right);
480
- }
481
- }
482
- }
483
-
484
- } // extern "C"
485
-
486
588
namespace {
487
589
namespace _Sorting {
488
590
enum _Min_max_mode {
0 commit comments