88// option. This file may not be copied, modified, or distributed
99// except according to those terms.
1010
11- use core:: cmp;
12-
1311use crate :: tables:: grapheme:: GraphemeCat ;
12+ use core:: cmp;
1413
1514/// External iterator for grapheme clusters and byte offsets.
1615///
@@ -177,7 +176,11 @@ enum GraphemeState {
177176 Regional ,
178177 /// The codepoint after is Extended_Pictographic,
179178 /// so whether it's a boundary depends on pre-context according to GB11.
180- Emoji ,
179+ Emoji {
180+ /// Whether the ZWJ char has already been seen, so that only the
181+ /// "\p{Extended_Pictographic} Extend*" part of GB11 remains to be checked.
182+ seen_zwj : bool ,
183+ } ,
181184}
182185
183186/// Cursor-based segmenter for grapheme clusters.
@@ -424,7 +427,7 @@ impl GraphemeCursor {
424427 match self . state {
425428 GraphemeState :: InCbConsonant => self . handle_incb_consonant ( chunk, chunk_start) ,
426429 GraphemeState :: Regional => self . handle_regional ( chunk, chunk_start) ,
427- GraphemeState :: Emoji => self . handle_emoji ( chunk, chunk_start) ,
430+ GraphemeState :: Emoji { seen_zwj } => self . handle_emoji ( chunk, chunk_start, seen_zwj ) ,
428431 _ => {
429432 if self . cat_before . is_none ( ) && self . offset == chunk. len ( ) + chunk_start {
430433 let ch = chunk. chars ( ) . next_back ( ) . unwrap ( ) ;
@@ -532,13 +535,18 @@ impl GraphemeCursor {
532535 }
533536
534537 #[ inline]
535- fn handle_emoji ( & mut self , chunk : & str , chunk_start : usize ) {
538+ fn handle_emoji ( & mut self , chunk : & str , chunk_start : usize , mut seen_zwj : bool ) {
539+ // \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
536540 use crate :: tables:: grapheme as gr;
537541 let mut iter = chunk. chars ( ) . rev ( ) ;
538- if let Some ( ch) = iter. next ( ) {
539- if self . grapheme_category ( ch) != gr:: GC_ZWJ {
540- self . decide ( true ) ;
541- return ;
542+ if !seen_zwj {
543+ if let Some ( ch) = iter. next ( ) {
544+ if self . grapheme_category ( ch) != gr:: GC_ZWJ {
545+ self . decide ( true ) ;
546+ return ;
547+ } else {
548+ seen_zwj = true ;
549+ }
542550 }
543551 }
544552 for ch in iter {
@@ -558,7 +566,7 @@ impl GraphemeCursor {
558566 self . decide ( true ) ;
559567 } else {
560568 self . pre_context_offset = Some ( chunk_start) ;
561- self . state = GraphemeState :: Emoji ;
569+ self . state = GraphemeState :: Emoji { seen_zwj } ;
562570 }
563571 }
564572
@@ -616,7 +624,9 @@ impl GraphemeCursor {
616624 match self . cat_after . unwrap ( ) {
617625 gr:: GC_InCB_Consonant => self . state = GraphemeState :: InCbConsonant ,
618626 gr:: GC_Regional_Indicator => self . state = GraphemeState :: Regional ,
619- gr:: GC_Extended_Pictographic => self . state = GraphemeState :: Emoji ,
627+ gr:: GC_Extended_Pictographic => {
628+ self . state = GraphemeState :: Emoji { seen_zwj : false }
629+ }
620630 _ => need_pre_context = self . cat_before . is_none ( ) ,
621631 }
622632 if need_pre_context {
@@ -647,7 +657,7 @@ impl GraphemeCursor {
647657 self . is_boundary_result ( )
648658 }
649659 PairResult :: Emoji => {
650- self . handle_emoji ( & chunk[ ..offset_in_chunk] , chunk_start) ;
660+ self . handle_emoji ( & chunk[ ..offset_in_chunk] , chunk_start, false ) ;
651661 self . is_boundary_result ( )
652662 }
653663 }
@@ -882,3 +892,105 @@ fn test_grapheme_cursor_prev_boundary_chunk_start() {
882892 ) ;
883893 assert_eq ! ( c. prev_boundary( & s[ ..2 ] , 0 ) , Ok ( Some ( 1 ) ) ) ;
884894}
895+
/// Regression test: an emoji ZWJ sequence whose ZWJ is the first character of
/// the second chunk must be reported as one cluster ending at the end of the text.
///
/// The cursor may either answer directly or first ask for pre-context; both
/// paths must end with the same boundary.
#[test]
fn test_grapheme_cursor_boundary_with_zwj_on_chunk_start() {
    use GraphemeIncomplete::*;

    let chunk0 = "👩"; // 4 bytes
    // The pictograph must follow the ZWJ directly: any character in between
    // would break the "ZWJ × \p{Extended_Pictographic}" sequence and change
    // the byte offsets asserted below.
    let chunk1 = "\u{200d}🔬"; // 3 bytes + 4 bytes

    let full_len = chunk0.len() + chunk1.len();
    // Guard the fixture layout that the expected boundary (11) relies on.
    assert_eq!(full_len, 11);

    let mut cur = GraphemeCursor::new(0, full_len, true);
    assert_eq!(cur.next_boundary(chunk0, 0), Err(NextChunk));
    match cur.next_boundary(chunk1, chunk0.len()) {
        Ok(res) => assert_eq!(res, Some(11)),
        Err(PreContext(_)) => {
            // The cursor needs to look back into the first chunk before deciding.
            cur.provide_context(chunk0, 0);
            assert_eq!(cur.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
        }
        other => panic!("unexpected result from next_boundary: {:?}", other),
    }
}
916+
/// Two pictographic emoji with no ZWJ between them, split one per chunk:
/// each emoji is its own cluster, so boundaries fall at 4 and 8.
#[test]
fn test_grapheme_cursor_emoji_no_zwj() {
    use GraphemeIncomplete::*;

    let first = "🍒"; // 4 bytes
    let second = "🥑"; // 4 bytes
    let second_start = first.len();
    let total = second_start + second.len();

    let mut cursor = GraphemeCursor::new(0, total, true);

    // The first chunk alone cannot settle the boundary.
    assert_eq!(cursor.next_boundary(first, 0), Err(NextChunk));

    // The second chunk starts with a pictograph, so the cursor asks to look back.
    assert_eq!(
        cursor.next_boundary(second, second_start),
        Err(PreContext(second_start))
    );
    cursor.provide_context(first, 0);

    // With the context supplied, the remaining boundaries come out in order.
    for expected in &[Some(4), Some(8), None] {
        assert_eq!(cursor.next_boundary(second, second_start), Ok(*expected));
    }
}
935+
/// Emoji ZWJ sequence with the chunk split placed just before the ZWJ:
/// the whole sequence is one cluster, so the only boundary is at byte 11.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_before_zwj() {
    use GraphemeIncomplete::*;

    let chunk0 = "🍒"; // 4 bytes
    // The pictograph must follow the ZWJ directly, with nothing in between,
    // otherwise the sequence is broken and the offsets below no longer hold.
    let chunk1 = "\u{200d}🥑"; // 3 + 4 bytes
    let full_len = chunk0.len() + chunk1.len();
    // Guard the fixture layout that the expected boundary relies on.
    assert_eq!(full_len, 11);

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    // The decision depends on what precedes the ZWJ, which lives in chunk0.
    assert_eq!(
        c.next_boundary(chunk1, chunk0.len()),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None));
}
953+
/// Emoji ZWJ sequence with the chunk split placed just after the ZWJ:
/// the whole sequence is one cluster, so the only boundary is at byte 11.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_after_zwj() {
    use GraphemeIncomplete::*;

    // The ZWJ must be the last character of the first chunk, with no trailing
    // whitespace, so that it is immediately followed by the pictograph.
    let chunk0 = "🍒\u{200d}"; // 4 + 3 bytes
    let chunk1 = "🥑"; // 4 bytes
    let full_len = chunk0.len() + chunk1.len();
    // Guard the fixture layout that the expected boundary relies on.
    assert_eq!(chunk0.len(), 7);
    assert_eq!(full_len, 11);

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    // chunk1 starts with a pictograph, so the cursor asks to look back.
    assert_eq!(
        c.next_boundary(chunk1, chunk0.len()),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None));
}
971+
/// Emoji ZWJ sequence split into three chunks (pictograph, ZWJ, pictograph).
///
/// The cursor has to request pre-context twice: first the chunk holding the
/// ZWJ, then the chunk holding the leading pictograph. Only then can it
/// report the single boundary at byte 11.
#[test]
fn test_grapheme_cursor_emoji_zwj_across_chunks() {
    use GraphemeIncomplete::*;

    let chunk0 = "🍒"; // 4 bytes
    // The middle chunk holds the ZWJ and nothing else; any extra character
    // would break the sequence and shift every offset asserted below.
    let chunk1 = "\u{200d}"; // 3 bytes
    let chunk2 = "🥑"; // 4 bytes
    let full_len = chunk0.len() + chunk1.len() + chunk2.len();
    let chunk2_start = chunk0.len() + chunk1.len();
    // Guard the fixture layout that the expected offsets rely on.
    assert_eq!(chunk2_start, 7);
    assert_eq!(full_len, 11);

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Err(NextChunk));

    // First look-back: the chunk directly before the trailing pictograph.
    assert_eq!(
        c.next_boundary(chunk2, chunk2_start),
        Err(PreContext(chunk2_start))
    );
    c.provide_context(chunk1, chunk0.len());

    // Second look-back: the ZWJ alone is not enough, the cursor still needs
    // to see what precedes it.
    assert_eq!(
        c.next_boundary(chunk2, chunk2_start),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);

    assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(None));
}
0 commit comments