Skip to content

Commit b8a9b19

Browse files
authored
SIM905: Fix handling of U+001C..U+001F whitespace (#19849)
Fixes #19845

## Summary

The linked issue explains it well: Rust and Python do not agree on what counts as whitespace for the purposes of `str.split`.
1 parent 4d8ccb6 commit b8a9b19

File tree

3 files changed

+103
-3
lines changed

3 files changed

+103
-3
lines changed

crates/ruff_linter/resources/test/fixtures/flake8_simplify/SIM905.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,8 @@
161161
'no need' to escape
162162
"swap" quote style
163163
"use' ugly triple quotes""".split("\n")
164+
165+
# https://github.com/astral-sh/ruff/issues/19845
166+
print("S\x1cP\x1dL\x1eI\x1fT".split())
167+
print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
168+
print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))

crates/ruff_linter/src/rules/flake8_simplify/rules/split_static_string.rs

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,9 @@ fn split_default(
199199
// - "".split(maxsplit=0) -> []
200200
// - " ".split(maxsplit=0) -> []
201201
let processed_str = if direction == Direction::Left {
202-
string_val.trim_start()
202+
string_val.trim_start_matches(py_unicode_is_whitespace)
203203
} else {
204-
string_val.trim_end()
204+
string_val.trim_end_matches(py_unicode_is_whitespace)
205205
};
206206
let list_items: &[_] = if processed_str.is_empty() {
207207
&[]
@@ -214,7 +214,10 @@ fn split_default(
214214
))
215215
}
216216
Ordering::Less => {
217-
let list_items: Vec<&str> = string_val.split_whitespace().collect();
217+
let list_items: Vec<&str> = string_val
218+
.split(py_unicode_is_whitespace)
219+
.filter(|s| !s.is_empty())
220+
.collect();
218221
Some(construct_replacement(
219222
&list_items,
220223
str_value.first_literal_flags(),
@@ -292,3 +295,34 @@ enum Direction {
292295
Left,
293296
Right,
294297
}
298+
299+
/// Reports whether `ch` counts as whitespace under Python's rules.
///
/// This is the Python-flavoured counterpart of [`char::is_whitespace`]. The
/// accepted set additionally contains the separator controls
/// U+001C..=U+001F, so it is the predicate to use when reproducing what an
/// argument-less `str.split` / `str.rsplit` does on the Python side.
///
/// <https://github.com/astral-sh/ruff/issues/19845>
/// <https://github.com/python/cpython/blob/v3.14.0rc1/Objects/unicodetype_db.h#L6673-L6711>
#[rustfmt::skip]
#[inline]
const fn py_unicode_is_whitespace(ch: char) -> bool {
    // Contiguous code points are folded into ranges; the accepted set is
    // exactly the one enumerated in CPython's table linked above.
    matches!(
        ch,
        '\u{0009}'..='\u{000D}'   // TAB, LF, VT, FF, CR
        | '\u{001C}'..='\u{0020}' // FS, GS, RS, US, SPACE
        | '\u{0085}'              // NEXT LINE
        | '\u{00A0}'              // NO-BREAK SPACE
        | '\u{1680}'              // OGHAM SPACE MARK
        | '\u{2000}'..='\u{200A}' // EN QUAD ..= HAIR SPACE
        | '\u{2028}'..='\u{2029}' // LINE SEPARATOR, PARAGRAPH SEPARATOR
        | '\u{202F}'              // NARROW NO-BREAK SPACE
        | '\u{205F}'              // MEDIUM MATHEMATICAL SPACE
        | '\u{3000}'              // IDEOGRAPHIC SPACE
    )
}

crates/ruff_linter/src/rules/flake8_simplify/snapshots/ruff_linter__rules__flake8_simplify__tests__SIM905_SIM905.py.snap

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,6 +1402,8 @@ SIM905 [*] Consider using a list literal instead of `str.split`
14021402
162 | | "swap" quote style
14031403
163 | | "use' ugly triple quotes""".split("\n")
14041404
| |_______________________________________^
1405+
164 |
1406+
165 | # https://github.com/astral-sh/ruff/issues/19845
14051407
|
14061408
help: Replace with list literal
14071409

@@ -1414,3 +1416,62 @@ help: Replace with list literal
14141416
162 |-"swap" quote style
14151417
163 |-"use' ugly triple quotes""".split("\n")
14161418
160 |+[r"first", r"'no need' to escape", r'"swap" quote style', r""""use' ugly triple quotes"""]
1419+
164 161 |
1420+
165 162 | # https://github.com/astral-sh/ruff/issues/19845
1421+
166 163 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1422+
1423+
SIM905 [*] Consider using a list literal instead of `str.split`
1424+
--> SIM905.py:166:7
1425+
|
1426+
165 | # https://github.com/astral-sh/ruff/issues/19845
1427+
166 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1428+
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1429+
167 | print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1430+
168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1431+
|
1432+
help: Replace with list literal
1433+
1434+
Safe fix
1435+
163 163 | "use' ugly triple quotes""".split("\n")
1436+
164 164 |
1437+
165 165 | # https://github.com/astral-sh/ruff/issues/19845
1438+
166 |-print("S\x1cP\x1dL\x1eI\x1fT".split())
1439+
166 |+print(["S", "P", "L", "I", "T"])
1440+
167 167 | print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1441+
168 168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1442+
1443+
SIM905 [*] Consider using a list literal instead of `str.split`
1444+
--> SIM905.py:167:7
1445+
|
1446+
165 | # https://github.com/astral-sh/ruff/issues/19845
1447+
166 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1448+
167 | print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1449+
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1450+
168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1451+
|
1452+
help: Replace with list literal
1453+
1454+
Safe fix
1455+
164 164 |
1456+
165 165 | # https://github.com/astral-sh/ruff/issues/19845
1457+
166 166 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1458+
167 |-print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1459+
167 |+print([">"])
1460+
168 168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1461+
1462+
SIM905 [*] Consider using a list literal instead of `str.split`
1463+
--> SIM905.py:168:7
1464+
|
1465+
166 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1466+
167 | print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1467+
168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1468+
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1469+
|
1470+
help: Replace with list literal
1471+
1472+
Safe fix
1473+
165 165 | # https://github.com/astral-sh/ruff/issues/19845
1474+
166 166 | print("S\x1cP\x1dL\x1eI\x1fT".split())
1475+
167 167 | print("\x1c\x1d\x1e\x1f>".split(maxsplit=0))
1476+
168 |-print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
1477+
168 |+print(["<"])

0 commit comments

Comments
 (0)