Skip to content

Commit c820724

Browse files
committed
Use UnicodeNext for unicode functionality
Here I've put tokenization-related functionality such as `is_identifier_start_char()` and `is_identifier_char()` in the Tokenize module.
1 parent 1506668 commit c820724

File tree

6 files changed

+160
-234
lines changed

6 files changed

+160
-234
lines changed

Project.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@ authors = ["Claire Foster <[email protected]> and contributors"]
44
version = "0.4.6"
55

66
[deps]
7-
utf8proc_jll = "00992c89-a35c-5347-9984-e6609dacc59a"
7+
UnicodeNext = "7b9d9d2f-29eb-4111-b31d-f1cfc33d1412"
88

99
[compat]
1010
julia = "1.0"
11-
utf8proc_jll = "~2.9" # = 2.9.x for Unicode 15.1
1211

1312
[extras]
1413
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"

src/JuliaSyntax.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
module JuliaSyntax
22

3+
using UnicodeNext
4+
35
# Conservative list of exports - only export the most common/useful things
46
# here.
57

@@ -20,7 +22,6 @@ export SyntaxNode
2022

2123
# Helper utilities
2224
include("utils.jl")
23-
include("unicode.jl")
2425

2526
include("kinds.jl")
2627

src/literal_parsing.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,24 @@ end
331331
#-------------------------------------------------------------------------------
332332
# Unicode normalization.
333333

334-
using .Unicode: normalize_identifier
334+
"""
    normalize_identifier(c::Char)

Map a single character to the canonical character used inside identifiers.
A handful of look-alike characters are folded onto one representative; every
other character is returned unchanged.
"""
function normalize_identifier(c::Char)
    # ASCII common case: nothing at or below '~' is ever remapped.
    c <= '~' && return c

    if c == '\u025B'
        return '\u03B5'   # 'ɛ' => 'ε'
    elseif c == '\u00B5'
        return '\u03BC'   # 'µ' => 'μ'
    elseif c == '\u00B7' || c == '\u0387'
        return '\u22C5'   # both '·' variants => '⋅'
    elseif c == '\u2212'
        return '\u002D'   # '−' (\minus) => '-'
    elseif c == '\u210F'
        return '\u0127'   # 'ℏ' (\hslash) => 'ħ' \hbar
    end
    return c
end
346+
347+
"""
    normalize_identifier(str::AbstractString)

Normalize an identifier string. Pure-ASCII input is returned as-is; anything
else goes through `UnicodeNext.normalize` with `stable=true`, `compose=true`
and the per-character `normalize_identifier` applied as `chartransform`.
"""
function normalize_identifier(str::AbstractString)
    # Fast path: ASCII identifiers never need normalization.
    if isascii(str)
        return str
    end
    return UnicodeNext.normalize(str; stable=true, compose=true,
                                 chartransform=normalize_identifier)
end
335352

336353
#-------------------------------------------------------------------------------
337354
function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)

src/source_files.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color,
145145
# Getting exactly the same width of whitespace as `str` is tricky.
146146
# Especially for mixtures of tabs and spaces.
147147
# tabs are zero width according to textwidth
148-
indent = join(Unicode.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
148+
indent = join(UnicodeNext.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
149149

150150
# Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?)
151151
w = textwidth(str) + 4*count(c->c=='\t', str)

src/tokenize.jl

Lines changed: 138 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,129 @@ module Tokenize
22

33
export tokenize, untokenize
44

5-
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
5+
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str
66

7-
import ..JuliaSyntax: kind, Unicode,
7+
import ..JuliaSyntax: kind,
88
is_literal, is_error, is_contextual_keyword, is_word_operator
99

1010
#-------------------------------------------------------------------------------
1111
# Character-based predicates for tokenization
1212

1313
const EOF_CHAR = typemax(Char)
1414

15-
using .Unicode: is_identifier_char, is_identifier_start_char
15+
# Julia identifier parsing predicates
16+
17+
using UnicodeNext
18+
19+
import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO,
20+
CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN,
21+
CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD,
22+
CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS
23+
24+
# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
25+
"""
    _is_identifier_start_char(c::UInt32, cat::Integer)

Return `true` when the code point `c`, whose general category code is `cat`,
is accepted as the first character of an identifier.

The decision combines category membership with explicit code point
whitelists; the checks below are independent and any one of them suffices.
"""
function _is_identifier_start_char(c::UInt32, cat::Integer)
    # Letters, letter numbers and currency symbols are accepted outright.
    if cat == CATEGORY_LU || cat == CATEGORY_LL || cat == CATEGORY_LT ||
       cat == CATEGORY_LM || cat == CATEGORY_LO || cat == CATEGORY_NL ||
       cat == CATEGORY_SC
        return true
    end

    # Other symbols (So), but not arrows or replacement characters.
    # A rejected So character still falls through to the whitelists below.
    if cat == CATEGORY_SO
        rejected = (0x2190 <= c <= 0x21FF) ||     # arrows
                   c == 0xfffc || c == 0xfffd ||  # replacement characters
                   c == 0x233f ||                 # notslash
                   c == 0x00a6                    # broken bar
        rejected || return true
    end

    # Math symbol (category Sm) whitelist; the outer range test is only a
    # cheap pre-filter for the individual entries.
    if 0x2140 <= c <= 0x2a1c
        if (0x2140 <= c <= 0x2144) ||                    # ⅀, ⅁, ⅂, ⅃, ⅄
           c == 0x223f || c == 0x22be || c == 0x22bf ||  # ∿, ⊾, ⊿
           c == 0x22a4 || c == 0x22a5 ||                 # ⊤ ⊥
           (0x22c0 <= c <= 0x22c3) ||                    # N-ary big ops: ⋀, ⋁, ⋂, ⋃
           (0x25F8 <= c <= 0x25ff)                       # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿
            return true
        end

        if 0x2200 <= c <= 0x2233
            if c == 0x2200 || c == 0x2203 || c == 0x2204 ||  # ∀, ∃, ∄
               c == 0x2202 || c == 0x2205 || c == 0x2206 ||  # ∂, ∅, ∆
               c == 0x2207 || c == 0x220e || c == 0x220f ||  # ∇, ∎, ∏
               c == 0x2210 || c == 0x2211 ||                 # ∐, ∑
               c == 0x221e || c == 0x221f ||                 # ∞, ∟
               c >= 0x222b                                   # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳
                return true
            end
        end

        if c >= 0x266f
            if c == 0x266f || c == 0x27d8 || c == 0x27d9 ||  # ♯, ⟘, ⟙
               (0x27c0 <= c <= 0x27c1) ||                    # ⟀, ⟁
               (0x29b0 <= c <= 0x29b4) ||                    # ⦰, ⦱, ⦲, ⦳, ⦴
               (0x2a00 <= c <= 0x2a06) ||                    # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
               (0x2a09 <= c <= 0x2a16) ||                    # ⨉ through ⨖
               c == 0x2a1b || c == 0x2a1c                    # ⨛, ⨜
                return true
            end
        end
    end

    # Variants of \nabla and \partial.
    if c >= 0x1d6c1
        if c == 0x1d6c1 || c == 0x1d6db ||
           c == 0x1d6fb || c == 0x1d715 ||
           c == 0x1d735 || c == 0x1d74f ||
           c == 0x1d76f || c == 0x1d789 ||
           c == 0x1d7a9 || c == 0x1d7c3
            return true
        end
    end

    return (0x207a <= c <= 0x207e) ||    # superscript +-=()
           (0x208a <= c <= 0x208e) ||    # subscript +-=()
           (0x2220 <= c <= 0x2222) ||    # angle symbols ∠, ∡, ∢
           (0x299b <= c <= 0x29af) ||    # angle symbols ⦛ through ⦯
           c == 0x2118 || c == 0x212E || # Other_ID_Start: ℘, ℮
           (0x309B <= c <= 0x309C) ||    # katakana-hiragana sound marks
           (0x1D7CE <= c <= 0x1D7E1)     # bold digits 𝟎-𝟗 and double-struck digits 𝟘-𝟡
end
83+
84+
# utility function to return the ASCII byte if isascii(c),
85+
# and otherwise (for non-ASCII or invalid chars) return 0xff,
86+
# based on the isascii source code.
87+
"""
    _ascii_byte(c::Char)

Return the byte value of `c` as a `UInt8` when `c` is ASCII, and the sentinel
`0xff` for every other character, including malformed ones.
"""
@inline function _ascii_byte(c::Char)
    # Same bit test that `isascii(::Char)` performs.
    bits = bswap(reinterpret(UInt32, c))
    if bits < 0x80
        return bits % UInt8
    end
    return 0xff
end
91+
92+
# from jl_id_start_char in julia/src/flisp/julia_extensions.c
93+
"""
    is_identifier_start_char(c::Char)

Return `true` when `c` may be the first character of an identifier.
"""
function is_identifier_start_char(c::Char)
    byte = _ascii_byte(c)
    if byte == 0xff
        # Not ASCII: reject everything below U+00A1 and malformed characters,
        # then decide from the code point and its category code.
        (c < Char(0xA1) || !isvalid(c)) && return false
        code = UInt32(c)
        return _is_identifier_start_char(code, UnicodeNext.category_code(code))
    end
    # ASCII fast path: letters and underscore only.
    return (u8"A" <= byte <= u8"Z") || (u8"a" <= byte <= u8"z") || byte == u8"_"
end
104+
105+
# from jl_id_char in julia/src/flisp/julia_extensions.c
106+
"""
    is_identifier_char(c::Char)

Return `true` when `c` may appear inside an identifier, i.e. anywhere in it,
not only at the start.
"""
function is_identifier_char(c::Char)
    byte = _ascii_byte(c)
    if byte != 0xff
        # ASCII fast path: letters, underscore, digits and '!'.
        is_letter = (u8"A" <= byte <= u8"Z") || (u8"a" <= byte <= u8"z")
        is_digit = u8"0" <= byte <= u8"9"
        return is_letter || byte == u8"_" || is_digit || byte == u8"!"
    end

    # Reject everything below U+00A1 and malformed characters.
    (c < Char(0xA1) || !isvalid(c)) && return false

    code = UInt32(c)
    cat = UnicodeNext.category_code(code)

    # Whatever can start an identifier can also continue one.
    if _is_identifier_start_char(code, cat)
        return true
    end

    # Categories accepted only in non-initial position.
    is_continue_cat = cat == CATEGORY_MN || cat == CATEGORY_MC ||
                      cat == CATEGORY_ND || cat == CATEGORY_PC ||
                      cat == CATEGORY_SK || cat == CATEGORY_ME ||
                      cat == CATEGORY_NO
    # primes (single, double, triple, their reverses, and quadruple)
    is_prime = (0x2032 <= code <= 0x2037) || code == 0x2057
    return is_continue_cat || is_prime
end
16128

17129
function is_invisible_char(c::Char)
18130
# These are the chars considered invisible by the reference parser.
@@ -33,15 +145,15 @@ end
33145
# Chars that we will never allow to be part of a valid non-operator identifier
34146
function is_never_id_char(ch::Char)
35147
isvalid(ch) || return true
36-
cat = Unicode.category_code(ch)
148+
cat = UnicodeNext.category_code(ch)
37149
c = UInt32(ch)
38150
return (
39151
# spaces and control characters:
40-
(cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) ||
152+
(cat >= CATEGORY_ZS && cat <= CATEGORY_CS) ||
41153

42154
# ASCII and Latin1 non-connector punctuation
43155
(c < 0xff &&
44-
cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) ||
156+
cat >= CATEGORY_PD && cat <= CATEGORY_PO) ||
45157

46158
c == UInt32('`') ||
47159

@@ -61,7 +173,7 @@ end
61173
# Read the next `Char` from `io`, or yield `EOF_CHAR` once the stream is exhausted.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
62174

63175
# Some unicode operators are normalized by the tokenizer into their equivalent
64-
# kinds. See also Unicode.normalize_identifier()
176+
# kinds. See also normalize_identifier()
65177
const _ops_with_unicode_aliases = [
66178
# \minus '−' is normalized into K"-",
67179
'−' => K"-"
@@ -126,10 +238,10 @@ end
126238
if (u < 0xa1 || u > 0x10ffff)
127239
return false
128240
end
129-
cat = Unicode.category_code(u)
130-
if (cat == Unicode.UTF8PROC_CATEGORY_MN ||
131-
cat == Unicode.UTF8PROC_CATEGORY_MC ||
132-
cat == Unicode.UTF8PROC_CATEGORY_ME)
241+
cat = UnicodeNext.category_code(u)
242+
if (cat == CATEGORY_MN ||
243+
cat == CATEGORY_MC ||
244+
cat == CATEGORY_ME)
133245
return true
134246
end
135247
# Additional allowed cases
@@ -215,7 +327,7 @@ end
215327
# True for the ASCII hexadecimal digits 0-9, a-f and A-F.
@inline function ishex(c::Char)
    return isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
end
216328
# True for the binary digits '0' and '1'.
@inline function isbinary(c::Char)
    return c in ('0', '1')
end
217329
# True for the octal digits '0' through '7'.
@inline function isoctal(c::Char)
    return c >= '0' && c <= '7'
end
218-
@inline iswhitespace(c::Char) = (isvalid(c) && Unicode.isspace(c)) || c === '\ufeff'
330+
# Whitespace as seen by the tokenizer: any valid character that
# `UnicodeNext.isspace` accepts, plus U+FEFF.
@inline function iswhitespace(c::Char)
    if isvalid(c) && UnicodeNext.isspace(c)
        return true
    end
    return c === '\ufeff'
end
219331

220332
struct StringState
221333
triplestr::Bool
@@ -1278,25 +1390,27 @@ function lex_identifier(l::Lexer, c)
12781390
h = simple_hash(c, UInt64(0))
12791391
n = 1
12801392
ascii = isascii(c)
1281-
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
1282-
graphemestate_peek = Ref(zero(Int32))
1393+
graphemestate = UnicodeNext.GraphemeState(c)
12831394
while true
12841395
pc, ppc = dpeekchar(l)
1285-
pc_byte = Unicode.ascii_byte(pc)
1396+
pc_byte = _ascii_byte(pc)
12861397
ascii = ascii && pc_byte != 0xff
12871398
if ascii # fast path
12881399
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
12891400
break
12901401
end
1291-
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
1292-
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1293-
break
1294-
end
1295-
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1296-
# ZWJ/ZWNJ only within grapheme sequences, not at end
1297-
graphemestate_peek[] = graphemestate[]
1298-
if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
1299-
break
1402+
else
1403+
graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc)
1404+
if isbreak
1405+
if ((pc == '!' && ppc == '=') || !is_identifier_char(pc))
1406+
break
1407+
end
1408+
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1409+
# ZWJ/ZWNJ only within grapheme sequences, not at end
1410+
_, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc)
1411+
if isbreak_peek
1412+
break
1413+
end
13001414
end
13011415
end
13021416
c = readchar(l)

0 commit comments

Comments
 (0)