1
1
#!/usr/bin/env python
2
2
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
3
3
4
+ # Imports {{{
4
5
import json
5
6
import os
6
7
import re
16
17
from typing import (
17
18
Callable ,
18
19
DefaultDict ,
20
+ Iterator ,
19
21
Literal ,
20
22
NamedTuple ,
21
23
Optional ,
22
24
Protocol ,
23
25
Sequence ,
26
+ TypedDict ,
24
27
Union ,
25
28
)
26
29
from urllib .request import urlopen
29
32
import __main__
30
33
__main__ .__package__ = 'gen'
31
34
sys .path .insert (0 , os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
35
+ # }}}
32
36
33
-
37
+ # Fetching data {{{
34
38
non_characters = frozenset (range (0xfffe , 0x10ffff , 0x10000 ))
35
39
non_characters |= frozenset (range (0xffff , 0x10ffff + 1 , 0x10000 ))
36
40
non_characters |= frozenset (range (0xfdd0 , 0xfdf0 ))
@@ -63,8 +67,9 @@ def unicode_version() -> tuple[int, int, int]:
63
67
if m is not None :
64
68
return int (m .group (1 )), int (m .group (2 )), int (m .group (3 ))
65
69
raise ValueError ('Could not find Unicode Version' )
70
+ # }}}
66
71
67
-
72
+ # Parsing Unicode databases {{{
68
73
# Map of class names to set of codepoints in class
69
74
class_maps : dict [str , set [int ]] = {}
70
75
all_symbols : set [int ] = set ()
@@ -269,9 +274,13 @@ def parse_eaw() -> None:
269
274
270
275
def parse_grapheme_segmentation () -> None :
271
276
global extended_pictographic
277
+ grapheme_segmentation_maps ['AtStart' ] # this is used by the segmentation algorithm, no character has it
278
+ grapheme_segmentation_maps ['None' ] # this is used by the segmentation algorithm, no character has it
272
279
for line in get_data ('ucd/auxiliary/GraphemeBreakProperty.txt' ):
273
280
chars , category = split_two (line )
274
281
grapheme_segmentation_maps [category ] |= chars
282
+ grapheme_segmentation_maps ['Private_Expecting_RI' ] # this is used by the segmentation algorithm, no character has it
283
+ incb_map ['None' ] # used by segmentation algorithm no character has it
275
284
for line in get_data ('ucd/DerivedCoreProperties.txt' ):
276
285
spec , rest = line .split (';' , 1 )
277
286
category = rest .strip ().split (' ' , 1 )[0 ].strip ().rstrip (';' )
@@ -287,6 +296,36 @@ def parse_grapheme_segmentation() -> None:
287
296
extended_pictographic |= chars
288
297
289
298
299
class GraphemeSegmentationTest(TypedDict):
    """One grapheme segmentation test case, as stored in grapheme_segmentation_tests."""
    # The expected grapheme clusters, in order, one string per cluster
    data: tuple[str, ...]
    # The text following the '#' on the test line, stripped of surrounding whitespace
    comment: str


# Filled in by parse_test_data(), serialized by gen_test_data() and checked by
# test_grapheme_segmentation()
grapheme_segmentation_tests: list[GraphemeSegmentationTest] = []
305
+
306
+
307
def parse_test_data() -> None:
    """Load the grapheme break tests into grapheme_segmentation_tests.

    Every line yielded by get_data() for ucd/auxiliary/GraphemeBreakTest.txt
    has the form ``÷ XXXX × YYYY ÷ ... # comment`` where ÷ marks a cluster
    boundary, × marks the absence of one and the remaining tokens are
    hexadecimal codepoints. Each line is converted into a tuple holding one
    string per expected grapheme cluster and appended, together with its
    comment, to the module level list.

    Raises ValueError if a token is not a valid hexadecimal number or lies
    outside the Unicode range.
    """
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        # Split at the first '#' only: a '#' inside the comment (or a line with
        # no comment at all) must not break the unpacking.
        spec, _, comment = line.partition('#')
        # Drop the boundary markers that open and close every test, regardless
        # of any whitespace around them, so no empty leading cluster is created.
        spec = spec.strip().removeprefix('÷').removesuffix('÷').strip()
        clusters: list[list[str]] = [[]]
        for token in re.split(r'([÷×])', spec):
            token = token.strip()
            if token == '÷':
                clusters.append([])
            elif token and token != '×':
                clusters[-1].append(chr(int(token, 16)))
        data = tuple(''.join(cluster) for cluster in clusters)
        grapheme_segmentation_tests.append({'data': data, 'comment': comment.strip()})
326
+ # }}}
327
+
328
+
290
329
def write_case (spec : Union [tuple [int , ...], int ], p : Callable [..., None ], for_go : bool = False ) -> None :
291
330
if isinstance (spec , tuple ):
292
331
if for_go :
@@ -386,27 +425,8 @@ def print_range() -> None:
386
425
387
426
388
427
def gen_test_data() -> None:
    """Write the parsed grapheme break tests to kitty_tests/GraphemeBreakTest.json.

    The contents of grapheme_segmentation_tests are serialized as indented
    JSON without ASCII escaping and written out as encoded bytes.
    """
    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as out:
        serialized = json.dumps(grapheme_segmentation_tests, indent=2, ensure_ascii=False)
        out.write(serialized.encode())
410
430
411
431
412
432
def getsize (data : Iterable [int ]) -> Literal [1 , 2 , 4 ]:
@@ -519,6 +539,148 @@ def bitsize(maxval: int) -> int: # number of bits needed to store maxval
519
539
return ceil (log (maxval , 2 ))
520
540
521
541
542
def clamped_bitsize(val: int) -> int:
    """Round a bit count up to the smallest of 8, 16, 32 or 64 that can hold it.

    Raises ValueError when val is larger than 64.
    """
    for width in (8, 16, 32, 64):
        if val <= width:
            return width
    raise ValueError('Too many fields')
552
+
553
+
554
class GraphemeSegmentationProps(NamedTuple):
    # The per character properties consumed by the grapheme segmentation
    # algorithm. bitsize() sums int() of every field default, so the defaults
    # act as field widths in bits.

    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def bitsize(cls) -> int:
        # Total of the per field widths, rounded up by clamped_bitsize()
        total = 0
        for name in cls._fields:
            total += int(cls._field_defaults[name])
        return clamped_bitsize(total)
564
+
565
+
566
# Grapheme break values that always have a break before and after them (GB4, GB5)
control_grapheme_breaks = 'CR', 'LF', 'Control'
# Indic_Conjunct_Break values that may follow a consonant without ending the conjunct sequence (GB9c)
linker_or_extend = 'Linker', 'Extend'
568
+
569
+
570
class GraphemeSegmentationState(NamedTuple):
    """State carried from one character to the next while splitting text into graphemes.

    bitsize() sums int() of every field default, so the defaults act as field
    widths in bits rather than as meaningful initial values. Use make() to
    obtain the state that precedes the first character of a text.
    """
    grapheme_break: str = ''  # set at runtime
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}*
    incb_consonant_extended: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker
    incb_consonant_extended_linker: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values:  consonant {extend|linker}* linker {extend|linker}*
    incb_consonant_extended_linker_extended: bool = True
    # True if the last character ends an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence: bool = True
    # True if the last character was immediately preceded by an emoji modifier sequence   \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence_before_last_char: bool = True

    @classmethod
    def make(cls) -> 'GraphemeSegmentationState':
        """Return the state in effect before any character has been processed."""
        return cls('AtStart', False, False, False, False, False)

    @classmethod
    def bitsize(cls) -> int:
        """Return the sum of the per field widths rounded up by clamped_bitsize()."""
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
        return clamped_bitsize(ans)

    def add_to_current_cell(self, p: 'GraphemeSegmentationProps') -> 'GraphemeSegmentationResult':
        """Decide whether the next character continues the current grapheme cluster.

        p holds the segmentation properties of the next character. Returns a
        GraphemeSegmentationResult made of the state to use for the character
        after that and a bool that is True when there is no cluster boundary
        before the character described by p.
        """
        prev = self.grapheme_break
        prop = p.grapheme_break
        incb = p.indic_conjunct_break
        add_to_cell = False
        # The very first character always starts the first cluster
        if prev == 'AtStart':
            add_to_cell = True
        # No break between CR and LF (GB3).
        elif prev == 'CR' and prop == 'LF':
            add_to_cell = True
        # Break before and after controls (GB4, GB5).
        elif prev in control_grapheme_breaks or prop in control_grapheme_breaks:
            pass
        # No break between Hangul syllable sequences (GB6, GB7, GB8).
        elif (
                (prev == 'L' and prop in ('L', 'V', 'LV', 'LVT')) or
                (prev in ('LV', 'V') and prop in ('V', 'T')) or
                (prev in ('LVT', 'T') and prop == 'T')
        ):
            add_to_cell = True
        # No break before: extending characters or ZWJ (GB9), SpacingMarks (GB9a), Prepend characters (GB9b).
        # The comparison with Prepend must be an equality test, a substring
        # test would also match the empty string.
        elif prop in ('Extend', 'ZWJ', 'SpacingMark') or prev == 'Prepend':
            add_to_cell = True
        # No break within certain combinations of Indic_Conjunct_Break values
        # Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).
        elif self.incb_consonant_extended_linker_extended and incb == 'Consonant':
            add_to_cell = True
        # No break within emoji modifier sequences or emoji zwj sequences (GB11).
        elif prev == 'ZWJ' and self.emoji_modifier_sequence_before_last_char and p.is_extended_pictographic:
            add_to_cell = True
        # No break between RI if there is an odd number of RI characters before (GB12, GB13).
        elif prop == 'Regional_Indicator' and prev == 'Private_Expecting_RI':
            add_to_cell = True
        # Break everywhere else GB999

        # A regional indicator that does not complete a pair starts a new one.
        # This has to be recorded no matter which rule above decided the break,
        # otherwise the second RI of a pair that follows a control or a Prepend
        # character would be split off.
        if prop == 'Regional_Indicator' and prev != 'Private_Expecting_RI':
            prop = 'Private_Expecting_RI'

        incb_consonant_extended_linker = self.incb_consonant_extended and incb == 'Linker'
        incb_consonant_extended_linker_extended = incb_consonant_extended_linker or (
                self.incb_consonant_extended_linker_extended and incb in linker_or_extend)
        incb_consonant_extended = incb == 'Consonant' or (
            self.incb_consonant_extended and incb in linker_or_extend)
        emoji_modifier_sequence_before_last_char = self.emoji_modifier_sequence
        emoji_modifier_sequence = (self.emoji_modifier_sequence and prop == 'Extend') or p.is_extended_pictographic

        return GraphemeSegmentationResult(GraphemeSegmentationState(
            grapheme_break=prop, incb_consonant_extended=incb_consonant_extended,
            incb_consonant_extended_linker=incb_consonant_extended_linker,
            incb_consonant_extended_linker_extended=incb_consonant_extended_linker_extended,
            emoji_modifier_sequence=emoji_modifier_sequence, emoji_modifier_sequence_before_last_char=emoji_modifier_sequence_before_last_char
        ), add_to_cell)
647
+
648
+
649
def split_into_graphemes(text: str, props: Sequence[GraphemeSegmentationProps]) -> Iterator[str]:
    """Yield the grapheme clusters of text one after the other.

    props is indexed by codepoint and supplies the segmentation properties of
    every character. Nothing is yielded for an empty text.
    """
    state = GraphemeSegmentationState.make()
    start = 0
    for idx, ch in enumerate(text):
        state, joins_current = state.add_to_current_cell(props[ord(ch)])
        if joins_current:
            continue
        # A boundary lies just before this character: emit what was collected so far
        yield text[start:idx]
        start = idx
    # Emit the trailing cluster, if any
    if start < len(text):
        yield text[start:]
660
+
661
+
662
def test_grapheme_segmentation(props: Sequence[GraphemeSegmentationProps]) -> None:
    """Check split_into_graphemes() against every entry of grapheme_segmentation_tests.

    Exits via SystemExit, reporting the expected and actual clusters both as
    text and as hexadecimal codepoints, on the first test that does not match.
    """

    def as_codepoints(text: str) -> str:
        return ' '.join(f'{ord(x):x}' for x in text)

    for test in grapheme_segmentation_tests:
        expected = test['data']
        actual = tuple(split_into_graphemes(''.join(expected), props))
        if expected == actual:
            continue
        qe = tuple(as_codepoints(cluster) for cluster in expected)
        qa = tuple(as_codepoints(cluster) for cluster in actual)
        raise SystemExit(f'Failed to split graphemes for: {test["comment"]}\n{expected!r} {qe} != {actual!r} {qa}')
672
+
673
+
674
class GraphemeSegmentationKey(NamedTuple):
    # Pairs a segmentation state with the properties of the character that follows it.
    # NOTE(review): presumably the key of a precomputed state transition lookup, confirm against its users
    state: GraphemeSegmentationState
    next_char: GraphemeSegmentationProps
677
+
678
+
679
class GraphemeSegmentationResult(NamedTuple):
    # What GraphemeSegmentationState.add_to_current_cell() returns
    # The state to use when processing the following character
    new_state: GraphemeSegmentationState
    # True when the processed character continues the current grapheme cluster
    add_to_current_cell: bool
682
+
683
+
522
684
class CharProps (NamedTuple ):
523
685
524
686
width : int = 3
@@ -540,15 +702,7 @@ class CharProps(NamedTuple):
540
702
@classmethod
541
703
def bitsize (cls ) -> int :
542
704
ans = sum (int (cls ._field_defaults [f ]) for f in cls ._fields )
543
- if ans <= 8 :
544
- return 8
545
- if ans <= 16 :
546
- return 16
547
- if ans <= 32 :
548
- return 32
549
- if ans <= 64 :
550
- return 64
551
- raise ValueError ('Too many fields' )
705
+ return clamped_bitsize (ans )
552
706
553
707
@property
554
708
def go_fields (self ) -> Iterable [str ]:
@@ -650,9 +804,12 @@ def top_level_category(q: str) -> set[int]:
650
804
651
805
652
806
def gen_char_props () -> None :
653
- CharProps ._field_defaults ['grapheme_break' ] = str (bitsize (len (grapheme_segmentation_maps ) + 2 ))
654
- CharProps ._field_defaults ['indic_conjunct_break' ] = str (bitsize (len (incb_map ) + 1 ))
807
+ CharProps ._field_defaults ['grapheme_break' ] = str (bitsize (len (grapheme_segmentation_maps )))
808
+ CharProps ._field_defaults ['indic_conjunct_break' ] = str (bitsize (len (incb_map )))
655
809
CharProps ._field_defaults ['category' ] = str (bitsize (len (class_maps ) + 1 ))
810
+ GraphemeSegmentationProps ._field_defaults ['grapheme_break' ] = CharProps ._field_defaults ['grapheme_break' ]
811
+ GraphemeSegmentationProps ._field_defaults ['indic_conjunct_break' ] = CharProps ._field_defaults ['indic_conjunct_break' ]
812
+ GraphemeSegmentationState ._field_defaults ['grapheme_break' ] = GraphemeSegmentationProps ._field_defaults ['grapheme_break' ]
656
813
invalid = class_maps ['Cc' ] | class_maps ['Cs' ]
657
814
non_printing = invalid | class_maps ['Cf' ]
658
815
is_word_char = top_level_category ('LN' )
@@ -691,6 +848,10 @@ def aw(s: Iterable[int], width: int) -> None:
691
848
is_combining_char = ch in marks , category = cat_map .get (ch , 'Cn' ), is_word_char = ch in is_word_char ,
692
849
is_punctuation = ch in is_punctuation ,
693
850
) for ch in range (sys .maxunicode + 1 ))
851
+ gsprops = tuple (GraphemeSegmentationProps (
852
+ grapheme_break = x .grapheme_break , indic_conjunct_break = x .indic_conjunct_break ,
853
+ is_extended_pictographic = x .is_extended_pictographic ) for x in prop_array )
854
+ test_grapheme_segmentation (gsprops )
694
855
t1 , t2 , t3 , shift , mask , bytesz = splitbins (prop_array , CharProps .bitsize () // 8 )
695
856
print (f'Size of character properties table: { bytesz / 1024 :.1f} KB' )
696
857
@@ -700,8 +861,8 @@ def aw(s: Iterable[int], width: int) -> None:
700
861
with create_header ('kitty/char-props-data.h' , include_data_types = False ) as c , open ('tools/wcswidth/char-props-data.go' , 'w' ) as gof :
701
862
gp = partial (print , file = gof )
702
863
gp ('package wcswidth' )
703
- generate_enum (c , gp , 'GraphemeBreakProperty' , 'AtStart' , 'None' , * grapheme_segmentation_maps , prefix = 'GBP_' )
704
- generate_enum (c , gp , 'IndicConjunctBreak' , 'None' , * incb_map , prefix = 'ICB_' )
864
+ generate_enum (c , gp , 'GraphemeBreakProperty' , * grapheme_segmentation_maps , prefix = 'GBP_' )
865
+ generate_enum (c , gp , 'IndicConjunctBreak' , * incb_map , prefix = 'ICB_' )
705
866
cen ('// UCBDeclaration' )
706
867
generate_enum (cen , gp , 'UnicodeCategory' , 'Cn' , * class_maps , prefix = 'UC_' )
707
868
cen ('// EndUCBDeclaration' )
@@ -730,6 +891,7 @@ def main(args: list[str]=sys.argv) -> None:
730
891
parse_emoji ()
731
892
parse_eaw ()
732
893
parse_grapheme_segmentation ()
894
+ parse_test_data ()
733
895
gen_names ()
734
896
gen_rowcolumn_diacritics ()
735
897
gen_test_data ()
0 commit comments