Skip to content

Commit 9a42b9d

Browse files
authored
Fix GB11 case on a chunk boundary (#172)
1 parent 3874b9a commit 9a42b9d

1 file changed

Lines changed: 124 additions & 12 deletions

File tree

src/grapheme.rs

Lines changed: 124 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
use core::cmp;
12-
1311
use crate::tables::grapheme::GraphemeCat;
12+
use core::cmp;
1413

1514
/// External iterator for grapheme clusters and byte offsets.
1615
///
@@ -177,7 +176,11 @@ enum GraphemeState {
177176
Regional,
178177
/// The codepoint after is Extended_Pictographic,
179178
/// so whether it's a boundary depends on pre-context according to GB11.
180-
Emoji,
179+
Emoji {
180+
/// Whether the ZWJ char has already been seen, so that only the "\p{Extended_Pictographic} Extend*"
181+
/// part of GB11 has to be checked
182+
seen_zwj: bool,
183+
},
181184
}
182185

183186
/// Cursor-based segmenter for grapheme clusters.
@@ -424,7 +427,7 @@ impl GraphemeCursor {
424427
match self.state {
425428
GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
426429
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
427-
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
430+
GraphemeState::Emoji { seen_zwj } => self.handle_emoji(chunk, chunk_start, seen_zwj),
428431
_ => {
429432
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
430433
let ch = chunk.chars().next_back().unwrap();
@@ -532,13 +535,18 @@ impl GraphemeCursor {
532535
}
533536

534537
#[inline]
535-
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
538+
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize, mut seen_zwj: bool) {
539+
// \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
536540
use crate::tables::grapheme as gr;
537541
let mut iter = chunk.chars().rev();
538-
if let Some(ch) = iter.next() {
539-
if self.grapheme_category(ch) != gr::GC_ZWJ {
540-
self.decide(true);
541-
return;
542+
if !seen_zwj {
543+
if let Some(ch) = iter.next() {
544+
if self.grapheme_category(ch) != gr::GC_ZWJ {
545+
self.decide(true);
546+
return;
547+
} else {
548+
seen_zwj = true;
549+
}
542550
}
543551
}
544552
for ch in iter {
@@ -558,7 +566,7 @@ impl GraphemeCursor {
558566
self.decide(true);
559567
} else {
560568
self.pre_context_offset = Some(chunk_start);
561-
self.state = GraphemeState::Emoji;
569+
self.state = GraphemeState::Emoji { seen_zwj };
562570
}
563571
}
564572

@@ -616,7 +624,9 @@ impl GraphemeCursor {
616624
match self.cat_after.unwrap() {
617625
gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
618626
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
619-
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
627+
gr::GC_Extended_Pictographic => {
628+
self.state = GraphemeState::Emoji { seen_zwj: false }
629+
}
620630
_ => need_pre_context = self.cat_before.is_none(),
621631
}
622632
if need_pre_context {
@@ -647,7 +657,7 @@ impl GraphemeCursor {
647657
self.is_boundary_result()
648658
}
649659
PairResult::Emoji => {
650-
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
660+
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start, false);
651661
self.is_boundary_result()
652662
}
653663
}
@@ -882,3 +892,105 @@ fn test_grapheme_cursor_prev_boundary_chunk_start() {
882892
);
883893
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
884894
}
895+
896+
#[test]
897+
fn test_grapheme_cursor_boundary_with_zwj_on_chunk_start() { // Pictograph | ZWJ + pictograph: the chunk split falls right before the ZWJ.
898+
use GraphemeIncomplete::*;
899+
900+
let chunk0 = "👩"; // 4 bytes
901+
let chunk1 = "\u{200d}🔬"; // 3 bytes + 4 bytes
902+
903+
let full_len = chunk0.len() + chunk1.len(); // 4 + 7 = 11
904+
905+
let mut cur = GraphemeCursor::new(0, full_len, true);
906+
assert_eq!(cur.next_boundary(chunk0, 0), Err(NextChunk)); // chunk0 alone is not enough to decide; the cursor asks for the next chunk
907+
match cur.next_boundary(chunk1, chunk0.len()) {
908+
Ok(res) => assert_eq!(res, Some(11)), // accepted path 1: answered without asking for pre-context
909+
Err(PreContext(_)) => { // accepted path 2: the cursor first asks for earlier text
910+
cur.provide_context(chunk0, 0);
911+
assert_eq!(cur.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); // the only boundary is at full_len, i.e. one single cluster
912+
}
913+
_ => unreachable!(), // any other result fails the test
914+
}
915+
}
916+
917+
#[test]
918+
fn test_grapheme_cursor_emoji_no_zwj() { // Two pictographs in separate chunks with no ZWJ between them.
919+
use GraphemeIncomplete::*;
920+
let chunk0 = "🍒"; // 4 bytes
921+
let chunk1 = "🥑"; // 4 bytes
922+
let full_len = chunk0.len() + chunk1.len(); // 4 + 4 = 8
923+
924+
let mut c = GraphemeCursor::new(0, full_len, true);
925+
assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); // chunk0 alone is not enough to decide
926+
assert_eq!(
927+
c.next_boundary(chunk1, chunk0.len()),
928+
Err(PreContext(chunk0.len())) // the cursor asks for the text that ends at offset 4
929+
);
930+
c.provide_context(chunk0, 0);
931+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(4))); // boundary between the two pictographs
932+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(8))); // boundary at the end of the text
933+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); // nothing left after the end
934+
}
935+
936+
#[test]
937+
fn test_grapheme_cursor_emoji_chunk_boundary_before_zwj() { // Pictograph | ZWJ + pictograph: the ZWJ is the first char of the second chunk.
938+
use GraphemeIncomplete::*;
939+
let chunk0 = "🍒"; // 4 bytes
940+
let chunk1 = "\u{200d}🥑"; // 3 + 4 bytes
941+
let full_len = chunk0.len() + chunk1.len(); // 11
942+
943+
let mut c = GraphemeCursor::new(0, full_len, true);
944+
assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); // chunk0 alone is not enough to decide
945+
assert_eq!(
946+
c.next_boundary(chunk1, chunk0.len()),
947+
Err(PreContext(chunk0.len())) // the cursor asks for the text that ends at offset 4
948+
);
949+
c.provide_context(chunk0, 0);
950+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); // no boundary before full_len: the sequence is one cluster
951+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); // nothing left after the end
952+
}
953+
954+
#[test]
955+
fn test_grapheme_cursor_emoji_chunk_boundary_after_zwj() { // Pictograph + ZWJ | pictograph: the ZWJ is the last char of the first chunk.
956+
use GraphemeIncomplete::*;
957+
let chunk0 = "🍒\u{200d}"; // 4 + 3 bytes
958+
let chunk1 = "🥑"; // 4 bytes
959+
let full_len = chunk0.len() + chunk1.len(); // 11
960+
961+
let mut c = GraphemeCursor::new(0, full_len, true);
962+
assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); // chunk0 alone is not enough to decide
963+
assert_eq!(
964+
c.next_boundary(chunk1, chunk0.len()),
965+
Err(PreContext(chunk0.len())) // the cursor asks for the text that ends at offset 7
966+
);
967+
c.provide_context(chunk0, 0);
968+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); // no boundary before full_len: the sequence is one cluster
969+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); // nothing left after the end
970+
}
971+
972+
#[test]
973+
fn test_grapheme_cursor_emoji_zwj_across_chunks() { // Pictograph | ZWJ | pictograph: each char sits in its own chunk.
974+
use GraphemeIncomplete::*;
975+
let chunk0 = "🍒"; // 4 bytes
976+
let chunk1 = "\u{200d}"; // 3 bytes
977+
let chunk2 = "🥑"; // 4 bytes
978+
let full_len = chunk0.len() + chunk1.len() + chunk2.len(); // 11
979+
let chunk2_start = chunk0.len() + chunk1.len(); // 4 + 3 = 7
980+
981+
let mut c = GraphemeCursor::new(0, full_len, true);
982+
assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); // chunk0 alone is not enough to decide
983+
assert_eq!(c.next_boundary(chunk1, chunk0.len()), Err(NextChunk)); // still undecided after the ZWJ-only chunk
984+
assert_eq!(
985+
c.next_boundary(chunk2, chunk2_start),
986+
Err(PreContext(chunk2_start)) // first request: the text that ends at offset 7 (the ZWJ chunk)
987+
);
988+
c.provide_context(chunk1, chunk0.len());
989+
assert_eq!(
990+
c.next_boundary(chunk2, chunk2_start),
991+
Err(PreContext(chunk0.len())) // second request: the text that ends at offset 4, since the ZWJ chunk alone did not settle it
992+
);
993+
c.provide_context(chunk0, 0);
994+
assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(Some(11))); // no boundary before full_len: the sequence is one cluster
995+
assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(None)); // nothing left after the end
996+
}

0 commit comments

Comments
 (0)