Skip to content

Commit 01cdfcd

Browse files
committed
Work on table based lookup for grapheme segmentation
1 parent 56b49c4 commit 01cdfcd

File tree

1 file changed

+197
-35
lines changed

1 file changed

+197
-35
lines changed

gen/wcwidth.py

Lines changed: 197 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
33

4+
# Imports {{{
45
import json
56
import os
67
import re
@@ -16,11 +17,13 @@
1617
from typing import (
1718
Callable,
1819
DefaultDict,
20+
Iterator,
1921
Literal,
2022
NamedTuple,
2123
Optional,
2224
Protocol,
2325
Sequence,
26+
TypedDict,
2427
Union,
2528
)
2629
from urllib.request import urlopen
@@ -29,8 +32,9 @@
2932
import __main__
3033
__main__.__package__ = 'gen'
3134
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
35+
# }}}
3236

33-
37+
# Fetching data {{{
3438
non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
3539
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
3640
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
@@ -63,8 +67,9 @@ def unicode_version() -> tuple[int, int, int]:
6367
if m is not None:
6468
return int(m.group(1)), int(m.group(2)), int(m.group(3))
6569
raise ValueError('Could not find Unicode Version')
70+
# }}}
6671

67-
72+
# Parsing Unicode databases {{{
6873
# Map of class names to set of codepoints in class
6974
class_maps: dict[str, set[int]] = {}
7075
all_symbols: set[int] = set()
@@ -269,9 +274,13 @@ def parse_eaw() -> None:
269274

270275
def parse_grapheme_segmentation() -> None:
271276
global extended_pictographic
277+
grapheme_segmentation_maps['AtStart'] # this is used by the segmentation algorithm, no character has it
278+
grapheme_segmentation_maps['None'] # this is used by the segmentation algorithm, no character has it
272279
for line in get_data('ucd/auxiliary/GraphemeBreakProperty.txt'):
273280
chars, category = split_two(line)
274281
grapheme_segmentation_maps[category] |= chars
282+
grapheme_segmentation_maps['Private_Expecting_RI'] # this is used by the segmentation algorithm, no character has it
283+
incb_map['None'] # used by segmentation algorithm no character has it
275284
for line in get_data('ucd/DerivedCoreProperties.txt'):
276285
spec, rest = line.split(';', 1)
277286
category = rest.strip().split(' ', 1)[0].strip().rstrip(';')
@@ -287,6 +296,36 @@ def parse_grapheme_segmentation() -> None:
287296
extended_pictographic |= chars
288297

289298

299+
class GraphemeSegmentationTest(TypedDict):
    """One grapheme segmentation test case, as stored by parse_test_data()."""

    # The expected grapheme clusters, in order. Joining them gives the text to segment.
    data: tuple[str, ...]
    # The text that followed the '#' on the test line, stripped of surrounding whitespace.
    comment: str


# Populated by parse_test_data()
grapheme_segmentation_tests: list[GraphemeSegmentationTest] = []
305+
306+
307+
def parse_test_data() -> None:
    """Parse ucd/auxiliary/GraphemeBreakTest.txt into grapheme_segmentation_tests.

    On every line the part before '#' is a sequence of hexadecimal codepoints
    separated by ÷ (a cluster boundary) or × (no boundary); the part after
    '#' is kept as the test's comment. One entry is appended per line, holding
    the expected clusters as a tuple of strings.

    Raises:
        ValueError: if a token between the separators is not a hexadecimal number.
    """
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        # Split on the first '#' only, so that a '#' inside the description
        # (or a line without any description) cannot break the unpacking.
        spec, _, comment = line.partition('#')
        # Drop the boundary markers at the very start and end of the text,
        # otherwise they would produce empty leading/trailing clusters.
        spec = spec.lstrip('÷').strip().rstrip('÷').strip()
        clusters: list[list[str]] = [[]]
        # The capturing group makes re.split() return the separators as well
        for token in re.split(r'([÷×])', spec):
            token = token.strip()
            if token == '÷':
                clusters.append([])
            elif token and token != '×':
                clusters[-1].append(chr(int(token, 16)))
        data = tuple(''.join(cluster) for cluster in clusters)
        grapheme_segmentation_tests.append({'data': data, 'comment': comment.strip()})
326+
# }}}
327+
328+
290329
def write_case(spec: Union[tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
291330
if isinstance(spec, tuple):
292331
if for_go:
@@ -386,27 +425,8 @@ def print_range() -> None:
386425

387426

388427
def gen_test_data() -> None:
389-
tests = []
390-
for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
391-
t, comment = line.split('#')
392-
t = t.lstrip('÷').strip().rstrip('÷').strip()
393-
chars: list[list[str]] = [[]]
394-
for x in re.split(r'([÷×])', t):
395-
x = x.strip()
396-
match x:
397-
case '÷':
398-
chars.append([])
399-
case '×':
400-
pass
401-
case '':
402-
pass
403-
case _:
404-
ch = chr(int(x, 16))
405-
chars[-1].append(ch)
406-
c = [''.join(c) for c in chars]
407-
tests.append({'data': c, 'comment': comment.strip()})
408428
with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
409-
f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())
429+
f.write(json.dumps(grapheme_segmentation_tests, indent=2, ensure_ascii=False).encode())
410430

411431

412432
def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
@@ -519,6 +539,148 @@ def bitsize(maxval: int) -> int: # number of bits needed to store maxval
519539
return ceil(log(maxval, 2))
520540

521541

542+
def clamped_bitsize(val: int) -> int:
    """Round a number of bits up to the nearest of 8, 16, 32 or 64.

    Raises:
        ValueError: if val is larger than 64.
    """
    for width in (8, 16, 32, 64):
        if val <= width:
            return width
    raise ValueError('Too many fields')
552+
553+
554+
class GraphemeSegmentationProps(NamedTuple):
    """The properties of a single character that grapheme segmentation looks at.

    The field defaults are not meant as values: bitsize() converts each one
    to an int and sums them as the number of bits the field needs (a bool
    counts as one bit). The string defaults are placeholders that must be
    replaced in _field_defaults before bitsize() is called.
    """

    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def bitsize(cls) -> int:
        """Return the total number of bits of all fields, rounded up to 8, 16, 32 or 64."""
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
        return clamped_bitsize(ans)
564+
565+
566+
control_grapheme_breaks = 'CR', 'LF', 'Control'
linker_or_extend = 'Linker', 'Extend'


class GraphemeSegmentationState(NamedTuple):
    """What the segmentation algorithm remembers about the characters seen so far.

    The field defaults are not initial values: bitsize() converts each one to
    an int and sums them as the number of bits the field needs (a bool counts
    as one bit). Use make() to get the state for the start of a text.
    """

    grapheme_break: str = ''  # set at runtime
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}*
    incb_consonant_extended: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}* linker
    incb_consonant_extended_linker: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}* linker {extend|linker}*
    incb_consonant_extended_linker_extended: bool = True
    # True if the last character ends an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence: bool = True
    # True if the last character was immediately preceded by an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence_before_last_char: bool = True

    @classmethod
    def make(cls) -> 'GraphemeSegmentationState':
        """Return the state before any character has been seen."""
        return GraphemeSegmentationState('AtStart', False, False, False, False, False)

    @classmethod
    def bitsize(cls) -> int:
        """Return the total number of bits of all fields, rounded up to 8, 16, 32 or 64."""
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
        return clamped_bitsize(ans)

    def add_to_current_cell(self, p: GraphemeSegmentationProps) -> 'GraphemeSegmentationResult':
        """Advance the state by one character.

        p holds the properties of the next character. The result contains the
        state after that character and a flag that is True when there is no
        grapheme cluster break before it, i.e. it belongs to the current cell.
        """
        prev = self.grapheme_break
        prop = p.grapheme_break
        incb = p.indic_conjunct_break
        add_to_cell = False
        if prev == 'AtStart':
            add_to_cell = True
            if prop == 'Regional_Indicator':
                prop = 'Private_Expecting_RI'
        else:
            # No break between CR and LF (GB3).
            if prev == 'CR' and prop == 'LF':
                add_to_cell = True
            # Break before and after controls (GB4, GB5).
            elif prev in control_grapheme_breaks or prop in control_grapheme_breaks:
                pass
            # No break between Hangul syllable sequences (GB6, GB7, GB8).
            elif (
                (prev == 'L' and prop in ('L', 'V', 'LV', 'LVT')) or
                (prev in ('LV', 'V') and prop in ('V', 'T')) or
                (prev in ('LVT', 'T') and prop == 'T')
            ):
                add_to_cell = True
            # No break before: extending characters or ZWJ (GB9), SpacingMarks (GB9a), Prepend characters (GB9b).
            # prev must be compared with ==, "in" on a string is a substring test.
            elif prop in ('Extend', 'ZWJ', 'SpacingMark') or prev == 'Prepend':
                add_to_cell = True
                # A Regional_Indicator joined to a Prepend is still the first
                # of a potential pair, so a following RI must not break (GB13).
                if prop == 'Regional_Indicator':
                    prop = 'Private_Expecting_RI'
            # No break within certain combinations of Indic_Conjunct_Break values
            # Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).
            elif self.incb_consonant_extended_linker_extended and incb == 'Consonant':
                add_to_cell = True
            # No break within emoji modifier sequences or emoji zwj sequences (GB11).
            elif prev == 'ZWJ' and self.emoji_modifier_sequence_before_last_char and p.is_extended_pictographic:
                add_to_cell = True
            # No break between RI if there is an odd number of RI characters before (GB12, GB13).
            elif prop == 'Regional_Indicator':
                if prev == 'Private_Expecting_RI':
                    add_to_cell = True
                else:
                    prop = 'Private_Expecting_RI'
            # Break everywhere else GB999

        # All of these are computed from the old state (self) before the new state is built
        incb_consonant_extended_linker = self.incb_consonant_extended and incb == 'Linker'
        incb_consonant_extended_linker_extended = incb_consonant_extended_linker or (
            self.incb_consonant_extended_linker_extended and incb in linker_or_extend)
        incb_consonant_extended = incb == 'Consonant' or (
            self.incb_consonant_extended and incb in linker_or_extend)
        emoji_modifier_sequence_before_last_char = self.emoji_modifier_sequence
        emoji_modifier_sequence = (self.emoji_modifier_sequence and prop == 'Extend') or p.is_extended_pictographic

        return GraphemeSegmentationResult(GraphemeSegmentationState(
            grapheme_break=prop, incb_consonant_extended=incb_consonant_extended,
            incb_consonant_extended_linker=incb_consonant_extended_linker,
            incb_consonant_extended_linker_extended=incb_consonant_extended_linker_extended,
            emoji_modifier_sequence=emoji_modifier_sequence, emoji_modifier_sequence_before_last_char=emoji_modifier_sequence_before_last_char
        ), add_to_cell)
647+
648+
649+
def split_into_graphemes(text: str, props: Sequence[GraphemeSegmentationProps]) -> Iterator[str]:
    """Yield the grapheme clusters of text, in order.

    props is indexed by codepoint and supplies the segmentation properties
    of every character in text. Nothing is yielded for an empty text.
    """
    s = GraphemeSegmentationState.make()
    pos = 0  # index in text at which the current cluster starts
    for i, ch in enumerate(text):
        s, add_to_cell = s.add_to_current_cell(props[ord(ch)])
        if not add_to_cell:
            # There is a break before ch: emit the finished cluster, ch starts the next one
            yield text[pos:i]
            pos = i
    if pos < len(text):
        yield text[pos:]
660+
661+
662+
def test_grapheme_segmentation(props: Sequence[GraphemeSegmentationProps]) -> None:
    """Run split_into_graphemes() over every entry of grapheme_segmentation_tests.

    props is indexed by codepoint and is passed through to split_into_graphemes().

    Raises:
        SystemExit: on the first test whose clusters differ from the expected
            ones. The message shows both as strings and as hex codepoints.
    """
    def as_codepoints(text: str) -> str:
        return ' '.join(hex(ord(x))[2:] for x in text)

    for test in grapheme_segmentation_tests:
        expected = test['data']
        actual = tuple(split_into_graphemes(''.join(expected), props))
        if expected != actual:
            qe = tuple(map(as_codepoints, expected))
            qa = tuple(map(as_codepoints, actual))
            raise SystemExit(f'Failed to split graphemes for: {test["comment"]}\n{expected!r} {qe} != {actual!r} {qa}')
672+
673+
674+
class GraphemeSegmentationKey(NamedTuple):
    """A segmentation state paired with the properties of the next character.

    NOTE(review): presumably the key of the lookup table that is to replace
    calling GraphemeSegmentationState.add_to_current_cell() - confirm once
    the table generation code exists.
    """

    state: GraphemeSegmentationState
    next_char: GraphemeSegmentationProps
677+
678+
679+
class GraphemeSegmentationResult(NamedTuple):
    """The value returned by GraphemeSegmentationState.add_to_current_cell()."""

    # The state after the character has been processed
    new_state: GraphemeSegmentationState
    # True if there is no grapheme cluster break before the character
    add_to_current_cell: bool
682+
683+
522684
class CharProps(NamedTuple):
523685

524686
width: int = 3
@@ -540,15 +702,7 @@ class CharProps(NamedTuple):
540702
@classmethod
541703
def bitsize(cls) -> int:
542704
ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
543-
if ans <= 8:
544-
return 8
545-
if ans <= 16:
546-
return 16
547-
if ans <= 32:
548-
return 32
549-
if ans <= 64:
550-
return 64
551-
raise ValueError('Too many fields')
705+
return clamped_bitsize(ans)
552706

553707
@property
554708
def go_fields(self) -> Iterable[str]:
@@ -650,9 +804,12 @@ def top_level_category(q: str) -> set[int]:
650804

651805

652806
def gen_char_props() -> None:
653-
CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps) + 2))
654-
CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map) + 1))
807+
CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps)))
808+
CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map)))
655809
CharProps._field_defaults['category'] = str(bitsize(len(class_maps) + 1))
810+
GraphemeSegmentationProps._field_defaults['grapheme_break'] = CharProps._field_defaults['grapheme_break']
811+
GraphemeSegmentationProps._field_defaults['indic_conjunct_break'] = CharProps._field_defaults['indic_conjunct_break']
812+
GraphemeSegmentationState._field_defaults['grapheme_break'] = GraphemeSegmentationProps._field_defaults['grapheme_break']
656813
invalid = class_maps['Cc'] | class_maps['Cs']
657814
non_printing = invalid | class_maps['Cf']
658815
is_word_char = top_level_category('LN')
@@ -691,6 +848,10 @@ def aw(s: Iterable[int], width: int) -> None:
691848
is_combining_char=ch in marks, category=cat_map.get(ch, 'Cn'), is_word_char=ch in is_word_char,
692849
is_punctuation=ch in is_punctuation,
693850
) for ch in range(sys.maxunicode + 1))
851+
gsprops = tuple(GraphemeSegmentationProps(
852+
grapheme_break=x.grapheme_break, indic_conjunct_break=x.indic_conjunct_break,
853+
is_extended_pictographic=x.is_extended_pictographic) for x in prop_array)
854+
test_grapheme_segmentation(gsprops)
694855
t1, t2, t3, shift, mask, bytesz = splitbins(prop_array, CharProps.bitsize() // 8)
695856
print(f'Size of character properties table: {bytesz/1024:.1f}KB')
696857

@@ -700,8 +861,8 @@ def aw(s: Iterable[int], width: int) -> None:
700861
with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
701862
gp = partial(print, file=gof)
702863
gp('package wcswidth')
703-
generate_enum(c, gp, 'GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
704-
generate_enum(c, gp, 'IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
864+
generate_enum(c, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_')
865+
generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_')
705866
cen('// UCBDeclaration')
706867
generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_')
707868
cen('// EndUCBDeclaration')
@@ -730,6 +891,7 @@ def main(args: list[str]=sys.argv) -> None:
730891
parse_emoji()
731892
parse_eaw()
732893
parse_grapheme_segmentation()
894+
parse_test_data()
733895
gen_names()
734896
gen_rowcolumn_diacritics()
735897
gen_test_data()

0 commit comments

Comments
 (0)