Skip to content

Commit 0126cbd

Browse files
committed
fix: out-of-bounds read in NumericUnkMaker::checkPeriod for trailing digit+period
`checkPeriod` bounded the lookahead with `pos + 1 < codepoints.size()`, but the index it actually read was `posPeriod + 1 = start + pos + 1`. On the second pass of `spawnNodes` (start > 0) the two diverge, so inputs like `10.` or `ほげ4.` read one past the end of the codepoint vector and caused the crash reported in #157. Bound the lookahead against the absolute index instead. Dropped the now-dead `pos + 1` check so the condition reflects what is actually being guarded. Added regression tests for `10.` and `ほげ4.`; both abort on the prior code under libstdc++ debug mode / ASan. Closes #157.
1 parent c2b8b59 commit 0126cbd

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

src/core/analysis/numeric_creator.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ size_t NumericUnkMaker::checkPeriod(const CodepointStorage &codepoints,
245245
if (pos == 0) return 0;
246246
if (!codepoints[posPeriod].hasClass(PeriodClass)) return 0;
247247
if (!codepoints[posPeriod - 1].hasClass(charClass_)) return 0;
248-
if (pos + 1 < codepoints.size() &&
248+
if (posPeriod + 1 < codepoints.size() &&
249249
codepoints[posPeriod + 1].hasClass(charClass_))
250250
return 1;
251251
return 0;

src/core/analysis/numeric_creator_test.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,27 @@ TEST_CASE("do not make numeric unk nodes ends with period") {
243243
CHECK(env.numNodeSeeds() == 1);
244244
}
245245

246+
// Regression for ku-nlp/jumanpp#157: trailing digit+period caused a read
247+
// past the end of the codepoint vector during the second spawnNodes pass
248+
// (start > 0), because checkPeriod bounded the lookahead against `pos`
249+
// instead of the absolute position `start + pos`.
250+
TEST_CASE("multi-digit number followed by trailing period does not crash") {
251+
NumericTestEnv env{"x,l1\nほげ,l2\n"};
252+
env.analyze("10.");
253+
CHECK(env.contains("10", 0, "l1"));
254+
CHECK(env.contains("", 1, "l1"));
255+
CHECK(!env.contains("10.", 0, "l1"));
256+
CHECK(!env.contains("0.", 1, "l1"));
257+
CHECK(env.numNodeSeeds() == 2);
258+
}
259+
260+
TEST_CASE("digit+period preceded by non-numeric context does not crash") {
261+
NumericTestEnv env{"x,l1\nほげ,l2\n"};
262+
env.analyze("ほげ4.");
263+
CHECK(env.contains("", 2, "l1"));
264+
CHECK(!env.contains("4.", 2, "l1"));
265+
}
266+
246267
TEST_CASE("do not make numeric unk nodes starts with period") {
247268
NumericTestEnv env{"x,l1\nほげ,l2\n"};
248269
env.analyze(".4");

0 commit comments

Comments
 (0)