@@ -1837,15 +1837,15 @@ namespace {
1837
1837
template <class _Traits , class _Ty >
1838
1838
const void * __stdcall __std_find_trivial_impl (const void * _First, const void * _Last, _Ty _Val) noexcept {
1839
1839
#ifndef _M_ARM64EC
1840
- size_t _Size_bytes = _Byte_length (_First, _Last);
1840
+ const size_t _Size_bytes = _Byte_length (_First, _Last);
1841
1841
1842
- const size_t _Avx_size = _Size_bytes & ~size_t {0x1F };
1843
- if (_Avx_size != 0 && _Use_avx2 ()) {
1842
+ if (const size_t _Avx_size = _Size_bytes & ~size_t {0x1F }; _Avx_size != 0 && _Use_avx2 ()) {
1844
1843
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
1845
1844
1846
1845
const __m256i _Comparand = _Traits::_Set_avx (_Val);
1847
1846
const void * _Stop_at = _First;
1848
1847
_Advance_bytes (_Stop_at, _Avx_size);
1848
+
1849
1849
do {
1850
1850
const __m256i _Data = _mm256_loadu_si256 (static_cast <const __m256i*>(_First));
1851
1851
const int _Bingo = _mm256_movemask_epi8 (_Traits::_Cmp_avx (_Data, _Comparand));
@@ -1858,14 +1858,30 @@ namespace {
1858
1858
1859
1859
_Advance_bytes (_First, 32 );
1860
1860
} while (_First != _Stop_at);
1861
- _Size_bytes &= 0x1F ;
1862
- }
1863
1861
1864
- const size_t _Sse_size = _Size_bytes & ~size_t {0xF };
1865
- if (_Sse_size != 0 && _Use_sse42 ()) {
1862
+ if (const size_t _Avx_tail_size = _Size_bytes & 0x1C ; _Avx_tail_size != 0 ) {
1863
+ const __m256i _Tail_mask = _Avx2_tail_mask_32 (_Avx_tail_size >> 2 );
1864
+ const __m256i _Data = _mm256_maskload_epi32 (static_cast <const int *>(_First), _Tail_mask);
1865
+ const int _Bingo =
1866
+ _mm256_movemask_epi8 (_mm256_and_si256 (_Traits::_Cmp_avx (_Data, _Comparand), _Tail_mask));
1867
+
1868
+ if (_Bingo != 0 ) {
1869
+ const unsigned long _Offset = _tzcnt_u32 (_Bingo);
1870
+ _Advance_bytes (_First, _Offset);
1871
+ return _First;
1872
+ }
1873
+
1874
+ _Advance_bytes (_First, _Avx_tail_size);
1875
+ }
1876
+
1877
+ if constexpr (sizeof (_Ty) >= 4 ) {
1878
+ return _First;
1879
+ }
1880
+ } else if (const size_t _Sse_size = _Size_bytes & ~size_t {0xF }; _Sse_size != 0 && _Use_sse42 ()) {
1866
1881
const __m128i _Comparand = _Traits::_Set_sse (_Val);
1867
1882
const void * _Stop_at = _First;
1868
1883
_Advance_bytes (_Stop_at, _Sse_size);
1884
+
1869
1885
do {
1870
1886
const __m128i _Data = _mm_loadu_si128 (static_cast <const __m128i*>(_First));
1871
1887
const int _Bingo = _mm_movemask_epi8 (_Traits::_Cmp_sse (_Data, _Comparand));
@@ -1892,15 +1908,15 @@ namespace {
1892
1908
const void * __stdcall __std_find_last_trivial_impl (const void * _First, const void * _Last, _Ty _Val) noexcept {
1893
1909
const void * const _Real_last = _Last;
1894
1910
#ifndef _M_ARM64EC
1895
- size_t _Size_bytes = _Byte_length (_First, _Last);
1911
+ const size_t _Size_bytes = _Byte_length (_First, _Last);
1896
1912
1897
- const size_t _Avx_size = _Size_bytes & ~size_t {0x1F };
1898
- if (_Avx_size != 0 && _Use_avx2 ()) {
1913
+ if (const size_t _Avx_size = _Size_bytes & ~size_t {0x1F }; _Avx_size != 0 && _Use_avx2 ()) {
1899
1914
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
1900
1915
1901
1916
const __m256i _Comparand = _Traits::_Set_avx (_Val);
1902
1917
const void * _Stop_at = _Last;
1903
1918
_Rewind_bytes (_Stop_at, _Avx_size);
1919
+
1904
1920
do {
1905
1921
_Rewind_bytes (_Last, 32 );
1906
1922
const __m256i _Data = _mm256_loadu_si256 (static_cast <const __m256i*>(_Last));
@@ -1912,14 +1928,29 @@ namespace {
1912
1928
return _Last;
1913
1929
}
1914
1930
} while (_Last != _Stop_at);
1915
- _Size_bytes &= 0x1F ;
1916
- }
1917
1931
1918
- const size_t _Sse_size = _Size_bytes & ~size_t {0xF };
1919
- if (_Sse_size != 0 && _Use_sse42 ()) {
1932
+ if (const size_t _Avx_tail_size = _Size_bytes & 0x1C ; _Avx_tail_size != 0 ) {
1933
+ _Rewind_bytes (_Last, _Avx_tail_size);
1934
+ const __m256i _Tail_mask = _Avx2_tail_mask_32 (_Avx_tail_size >> 2 );
1935
+ const __m256i _Data = _mm256_maskload_epi32 (static_cast <const int *>(_Last), _Tail_mask);
1936
+ const int _Bingo =
1937
+ _mm256_movemask_epi8 (_mm256_and_si256 (_Traits::_Cmp_avx (_Data, _Comparand), _Tail_mask));
1938
+
1939
+ if (_Bingo != 0 ) {
1940
+ const unsigned long _Offset = _lzcnt_u32 (_Bingo);
1941
+ _Advance_bytes (_Last, (31 - _Offset) - (sizeof (_Ty) - 1 ));
1942
+ return _Last;
1943
+ }
1944
+ }
1945
+
1946
+ if constexpr (sizeof (_Ty) >= 4 ) {
1947
+ return _Real_last;
1948
+ }
1949
+ } else if (const size_t _Sse_size = _Size_bytes & ~size_t {0xF }; _Sse_size != 0 && _Use_sse42 ()) {
1920
1950
const __m128i _Comparand = _Traits::_Set_sse (_Val);
1921
1951
const void * _Stop_at = _Last;
1922
1952
_Rewind_bytes (_Stop_at, _Sse_size);
1953
+
1923
1954
do {
1924
1955
_Rewind_bytes (_Last, 16 );
1925
1956
const __m128i _Data = _mm_loadu_si128 (static_cast <const __m128i*>(_Last));
@@ -1952,40 +1983,53 @@ namespace {
1952
1983
size_t _Result = 0 ;
1953
1984
1954
1985
#ifndef _M_ARM64EC
1955
- size_t _Size_bytes = _Byte_length (_First, _Last);
1986
+ const size_t _Size_bytes = _Byte_length (_First, _Last);
1956
1987
1957
- const size_t _Avx_size = _Size_bytes & ~size_t {0x1F };
1958
- if (_Avx_size != 0 && _Use_avx2 ()) {
1988
+ if (const size_t _Avx_size = _Size_bytes & ~size_t {0x1F }; _Avx_size != 0 && _Use_avx2 ()) {
1959
1989
const __m256i _Comparand = _Traits::_Set_avx (_Val);
1960
1990
const void * _Stop_at = _First;
1961
1991
_Advance_bytes (_Stop_at, _Avx_size);
1992
+
1962
1993
do {
1963
1994
const __m256i _Data = _mm256_loadu_si256 (static_cast <const __m256i*>(_First));
1964
1995
const int _Bingo = _mm256_movemask_epi8 (_Traits::_Cmp_avx (_Data, _Comparand));
1965
1996
_Result += __popcnt (_Bingo); // Assume available with SSE4.2
1966
1997
_Advance_bytes (_First, 32 );
1967
1998
} while (_First != _Stop_at);
1968
- _Size_bytes &= 0x1F ;
1999
+
2000
+ if (const size_t _Avx_tail_size = _Size_bytes & 0x1C ; _Avx_tail_size != 0 ) {
2001
+ const __m256i _Tail_mask = _Avx2_tail_mask_32 (_Avx_tail_size >> 2 );
2002
+ const __m256i _Data = _mm256_maskload_epi32 (static_cast <const int *>(_First), _Tail_mask);
2003
+ const int _Bingo =
2004
+ _mm256_movemask_epi8 (_mm256_and_si256 (_Traits::_Cmp_avx (_Data, _Comparand), _Tail_mask));
2005
+ _Result += __popcnt (_Bingo); // Assume available with SSE4.2
2006
+ _Advance_bytes (_First, _Avx_tail_size);
2007
+ }
1969
2008
1970
2009
_mm256_zeroupper (); // TRANSITION, DevCom-10331414
1971
- }
1972
2010
1973
- const size_t _Sse_size = _Size_bytes & ~size_t {0xF };
1974
- if (_Sse_size != 0 && _Use_sse42 ()) {
2011
+ _Result >>= _Traits::_Shift;
2012
+
2013
+ if constexpr (sizeof (_Ty) >= 4 ) {
2014
+ return _Result;
2015
+ }
2016
+ } else if (const size_t _Sse_size = _Size_bytes & ~size_t {0xF }; _Sse_size != 0 && _Use_sse42 ()) {
1975
2017
const __m128i _Comparand = _Traits::_Set_sse (_Val);
1976
2018
const void * _Stop_at = _First;
1977
2019
_Advance_bytes (_Stop_at, _Sse_size);
2020
+
1978
2021
do {
1979
2022
const __m128i _Data = _mm_loadu_si128 (static_cast <const __m128i*>(_First));
1980
2023
const int _Bingo = _mm_movemask_epi8 (_Traits::_Cmp_sse (_Data, _Comparand));
1981
2024
_Result += __popcnt (_Bingo); // Assume available with SSE4.2
1982
2025
_Advance_bytes (_First, 16 );
1983
2026
} while (_First != _Stop_at);
2027
+
2028
+ _Result >>= _Traits::_Shift;
1984
2029
}
1985
2030
#endif // !_M_ARM64EC
1986
- _Result >>= _Traits::_Shift;
1987
- auto _Ptr = static_cast <const _Ty*>(_First);
1988
- for (; _Ptr != _Last; ++_Ptr) {
2031
+
2032
+ for (auto _Ptr = static_cast <const _Ty*>(_First); _Ptr != _Last; ++_Ptr) {
1989
2033
if (*_Ptr == _Val) {
1990
2034
++_Result;
1991
2035
}
0 commit comments