@@ -2,17 +2,129 @@ module Tokenize
2
2
3
3
export tokenize, untokenize
4
4
5
- using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
5
+ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str
6
6
7
- import ..JuliaSyntax: kind, Unicode,
7
+ import ..JuliaSyntax: kind,
8
8
is_literal, is_error, is_contextual_keyword, is_word_operator
9
9
10
10
#-------------------------------------------------------------------------------
11
11
# Character-based predicates for tokenization
12
12
13
13
const EOF_CHAR = typemax(Char)
14
14
15
- using .Unicode: is_identifier_char, is_identifier_start_char
15
+ # Julia identifier parsing predicates
16
+
17
+ using UnicodeNext
18
+
19
+ import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO,
20
+ CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN,
21
+ CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD,
22
+ CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS
23
+
24
# Port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
#
# Return `true` when the code point `c`, whose category code is `cat`, passes
# one of the identifier-start rules below. `cat` is compared against the
# `CATEGORY_*` constants imported from UnicodeNext; every other rule is a
# plain comparison on the numeric value of `c`.
function _is_identifier_start_char(c::UInt32, cat::Integer)
    # Letters (Lu, Ll, Lt, Lm, Lo), letter numbers (Nl); also allow currency
    # symbols (Sc)
    if cat == CATEGORY_LU || cat == CATEGORY_LL || cat == CATEGORY_LT ||
       cat == CATEGORY_LM || cat == CATEGORY_LO || cat == CATEGORY_NL ||
       cat == CATEGORY_SC
        return true
    end

    # Other symbols (So), but not arrows or replacement characters
    if cat == CATEGORY_SO
        excluded = (0x2190 <= c <= 0x21ff) ||       # arrows block
                   c == 0xfffc || c == 0xfffd ||    # replacement characters
                   c == 0x233f ||                   # notslash
                   c == 0x00a6                      # broken bar
        excluded || return true
    end

    # Math symbol (category Sm) whitelist; the outer range test cheaply
    # rejects everything outside the span covered by the whitelist.
    if 0x2140 <= c <= 0x2a1c
        if (0x2140 <= c <= 0x2144) ||                      # ⅀, ⅁, ⅂, ⅃, ⅄
           c == 0x223f || c == 0x22be || c == 0x22bf ||    # ∿, ⊾, ⊿
           c == 0x22a4 || c == 0x22a5 ||                   # ⊤ ⊥
           (0x22c0 <= c <= 0x22c3) ||                      # N-ary big ops: ⋀, ⋁, ⋂, ⋃
           (0x25f8 <= c <= 0x25ff)                         # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿
            return true
        end

        if 0x2200 <= c <= 0x2233
            # ∀, ∂, ∃, ∄, ∅, ∆, ∇, ∎, ∏, ∐, ∑, ∞, ∟ and the integrals ∫ … ∳
            if (c >= 0x222b) ||
               (c in (0x2200, 0x2202, 0x2203, 0x2204, 0x2205, 0x2206, 0x2207,
                      0x220e, 0x220f, 0x2210, 0x2211, 0x221e, 0x221f))
                return true
            end
        end

        if c >= 0x266f
            if c == 0x266f || c == 0x27d8 || c == 0x27d9 ||    # ♯, ⟘, ⟙
               (0x27c0 <= c <= 0x27c1) ||                      # ⟀, ⟁
               (0x29b0 <= c <= 0x29b4) ||                      # ⦰, ⦱, ⦲, ⦳, ⦴
               (0x2a00 <= c <= 0x2a06) ||                      # ⨀ through ⨆
               (0x2a09 <= c <= 0x2a16) ||                      # ⨉ through ⨖
               c == 0x2a1b || c == 0x2a1c                      # ⨛, ⨜
                return true
            end
        end
    end

    # Variants of \nabla and \partial
    if (c >= 0x1d6c1) &&
       (c in (0x1d6c1, 0x1d6db, 0x1d6fb, 0x1d715, 0x1d735,
              0x1d74f, 0x1d76f, 0x1d789, 0x1d7a9, 0x1d7c3))
        return true
    end

    return (0x207a <= c <= 0x207e) ||    # superscript +-=()
           (0x208a <= c <= 0x208e) ||    # subscript +-=()
           (0x2220 <= c <= 0x2222) ||    # angle symbols: ∠, ∡, ∢
           (0x299b <= c <= 0x29af) ||    # angle symbols: ⦛ through ⦯
           c == 0x2118 || c == 0x212e || # Other_ID_Start: ℘, ℮
           (0x309b <= c <= 0x309c) ||    # Other_ID_Start: katakana-hiragana sound marks
           (0x1d7ce <= c <= 0x1d7e1)     # bold digits 𝟎 through 𝟗, double-struck 𝟘 through 𝟡
end
83
+
84
# Utility returning the ASCII byte of `c` when `c` is ASCII, and the sentinel
# 0xff otherwise (non-ASCII or invalid chars). Based on the `isascii` source
# code: after byte-swapping the raw `Char` bits, only an ASCII char yields a
# value below 0x80.
@inline function _ascii_byte(c::Char)
    bits = bswap(reinterpret(UInt32, c))
    if bits >= 0x80
        return 0xff
    end
    return bits % UInt8
end
91
+
92
# From jl_id_start_char in julia/src/flisp/julia_extensions.c
#
# Return `true` if `c` may begin an identifier. ASCII chars are decided
# directly from their byte value; any other char must be valid and at least
# U+00A1, and is then classified by `_is_identifier_start_char` using its
# category code.
function is_identifier_start_char(c::Char)
    byte = _ascii_byte(c)
    if byte == 0xff
        # Not ASCII: reject invalid chars and everything below U+00A1
        (isvalid(c) && c >= Char(0xA1)) || return false
        codepoint = UInt32(c)
        return _is_identifier_start_char(codepoint, UnicodeNext.category_code(codepoint))
    end
    # ASCII fast path: letters and underscore
    return byte == u8"_" || (u8"a" <= byte <= u8"z") || (u8"A" <= byte <= u8"Z")
end
104
+
105
# From jl_id_char in julia/src/flisp/julia_extensions.c
#
# Return `true` if `c` may appear inside an identifier. ASCII chars are
# decided directly from their byte value (letters, digits, `_` and `!`).
# Any other char must be valid and at least U+00A1; it is accepted if it is an
# identifier-start char, belongs to one of the extra categories tested below,
# or is one of the prime characters.
function is_identifier_char(c::Char)
    byte = _ascii_byte(c)
    if byte == 0xff
        # Not ASCII: reject invalid chars and everything below U+00A1
        (isvalid(c) && c >= Char(0xA1)) || return false
        codepoint = UInt32(c)
        category = UnicodeNext.category_code(codepoint)
        if _is_identifier_start_char(codepoint, category)
            return true
        end
        return category == CATEGORY_MN || category == CATEGORY_MC ||
               category == CATEGORY_ND || category == CATEGORY_PC ||
               category == CATEGORY_SK || category == CATEGORY_ME ||
               category == CATEGORY_NO ||
               # primes (single, double, triple, their reverses, and quadruple)
               (0x2032 <= codepoint <= 0x2037) || codepoint == 0x2057
    end
    # ASCII fast path
    return (u8"a" <= byte <= u8"z") || (u8"A" <= byte <= u8"Z") ||
           (u8"0" <= byte <= u8"9") || byte == u8"_" || byte == u8"!"
end
16
128
17
129
function is_invisible_char(c::Char)
18
130
# These are the chars considered invisible by the reference parser.
33
145
# Chars that we will never allow to be part of a valid non-operator identifier
34
146
function is_never_id_char(ch::Char)
35
147
isvalid(ch) || return true
36
- cat = Unicode .category_code(ch)
148
+ cat = UnicodeNext .category_code(ch)
37
149
c = UInt32(ch)
38
150
return (
39
151
# spaces and control characters:
40
- (cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS ) ||
152
+ (cat >= CATEGORY_ZS && cat <= CATEGORY_CS ) ||
41
153
42
154
# ASCII and Latin1 non-connector punctuation
43
155
(c < 0xff &&
44
- cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO ) ||
156
+ cat >= CATEGORY_PD && cat <= CATEGORY_PO ) ||
45
157
46
158
c == UInt32('`') ||
47
159
61
173
# Read the next `Char` from `io`; when `io` is already at end-of-file, return
# the `EOF_CHAR` sentinel instead of reading.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
62
174
63
175
# Some unicode operators are normalized by the tokenizer into their equivalent
64
- # kinds. See also Unicode. normalize_identifier()
176
+ # kinds. See also normalize_identifier()
65
177
const _ops_with_unicode_aliases = [
66
178
# \minus '−' is normalized into K"-",
67
179
'−' => K"-"
@@ -126,10 +238,10 @@ end
126
238
if (u < 0xa1 || u > 0x10ffff)
127
239
return false
128
240
end
129
- cat = Unicode .category_code(u)
130
- if (cat == Unicode.UTF8PROC_CATEGORY_MN ||
131
- cat == Unicode.UTF8PROC_CATEGORY_MC ||
132
- cat == Unicode.UTF8PROC_CATEGORY_ME )
241
+ cat = UnicodeNext .category_code(u)
242
+ if (cat == CATEGORY_MN ||
243
+ cat == CATEGORY_MC ||
244
+ cat == CATEGORY_ME )
133
245
return true
134
246
end
135
247
# Additional allowed cases
215
327
# True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
@inline function ishex(c::Char)
    isdigit(c) && return true
    return ('A' <= c <= 'F') || ('a' <= c <= 'f')
end
216
328
# True when `c` is a binary digit, i.e. exactly '0' or '1'.
@inline function isbinary(c::Char)
    return c in ('0', '1')
end
217
329
# True when `c` lies between '0' and '7' inclusive (an octal digit).
@inline function isoctal(c::Char)
    return c >= '0' && c <= '7'
end
218
- @inline iswhitespace(c::Char) = (isvalid(c) && Unicode .isspace(c)) || c === '\ufeff'
330
# True when `c` is U+FEFF, or is a valid char that `UnicodeNext.isspace`
# classifies as a space. Invalid chars are never passed to `isspace`.
@inline function iswhitespace(c::Char)
    c === '\ufeff' && return true
    return isvalid(c) && UnicodeNext.isspace(c)
end
219
331
220
332
struct StringState
221
333
triplestr::Bool
@@ -1278,25 +1390,27 @@ function lex_identifier(l::Lexer, c)
1278
1390
h = simple_hash(c, UInt64(0))
1279
1391
n = 1
1280
1392
ascii = isascii(c)
1281
- graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
1282
- graphemestate_peek = Ref(zero(Int32))
1393
+ graphemestate = UnicodeNext.GraphemeState(c)
1283
1394
while true
1284
1395
pc, ppc = dpeekchar(l)
1285
- pc_byte = Unicode.ascii_byte (pc)
1396
+ pc_byte = _ascii_byte (pc)
1286
1397
ascii = ascii && pc_byte != 0xff
1287
1398
if ascii # fast path
1288
1399
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
1289
1400
break
1290
1401
end
1291
- elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
1292
- if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1293
- break
1294
- end
1295
- elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1296
- # ZWJ/ZWNJ only within grapheme sequences, not at end
1297
- graphemestate_peek[] = graphemestate[]
1298
- if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
1299
- break
1402
+ else
1403
+ graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc)
1404
+ if isbreak
1405
+ if ((pc == '!' && ppc == '=') || !is_identifier_char(pc))
1406
+ break
1407
+ end
1408
+ elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1409
+ # ZWJ/ZWNJ only within grapheme sequences, not at end
1410
+ _, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc)
1411
+ if isbreak_peek
1412
+ break
1413
+ end
1300
1414
end
1301
1415
end
1302
1416
c = readchar(l)
0 commit comments