diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 400224cf..8f9a86df 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -52,7 +52,7 @@ jobs: shell: bash steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Install dependencies (Ubuntu) if: startsWith(matrix.os, 'ubuntu-') diff --git a/src/core/analysis/numeric_creator.cc b/src/core/analysis/numeric_creator.cc index 9dd3d16c..52cbd47e 100644 --- a/src/core/analysis/numeric_creator.cc +++ b/src/core/analysis/numeric_creator.cc @@ -245,7 +245,7 @@ size_t NumericUnkMaker::checkPeriod(const CodepointStorage &codepoints, if (pos == 0) return 0; if (!codepoints[posPeriod].hasClass(PeriodClass)) return 0; if (!codepoints[posPeriod - 1].hasClass(charClass_)) return 0; - if (pos + 1 < codepoints.size() && + if (posPeriod + 1 < codepoints.size() && codepoints[posPeriod + 1].hasClass(charClass_)) return 1; return 0; diff --git a/src/core/analysis/numeric_creator_test.cc b/src/core/analysis/numeric_creator_test.cc index 92c78fe0..1e87e0df 100644 --- a/src/core/analysis/numeric_creator_test.cc +++ b/src/core/analysis/numeric_creator_test.cc @@ -243,6 +243,27 @@ TEST_CASE("do not make numeric unk nodes ends with period") { CHECK(env.numNodeSeeds() == 1); } +// Regression for ku-nlp/jumanpp#157: trailing digit+period caused a read +// past the end of the codepoint vector during the second spawnNodes pass +// (start > 0), because checkPeriod bounded the lookahead against `pos` +// instead of the absolute position `start + pos`. 
+TEST_CASE("multi-digit number followed by trailing period does not crash") {/* Analyzes a two-digit number with a trailing period and expects the l1 seeds to stop before that period. */ + NumericTestEnv env{"x,l1\nほげ,l2\n"}; + env.analyze("10."); + CHECK(env.contains("10", 0, "l1")); + CHECK(env.contains("0", 1, "l1"));/* NOTE(review): the second argument looks like the start position of the seed - confirm against NumericTestEnv. */ + CHECK(!env.contains("10.", 0, "l1")); + CHECK(!env.contains("0.", 1, "l1")); + CHECK(env.numNodeSeeds() == 2);/* Presumably exactly the two seeds asserted present above - verify. */ +} + +TEST_CASE("digit+period preceded by non-numeric context does not crash") {/* Same trailing-period shape, but the digit follows two non-numeric characters. */ + NumericTestEnv env{"x,l1\nほげ,l2\n"}; + env.analyze("ほげ4."); + CHECK(env.contains("4", 2, "l1"));/* Position 2 presumably counts codepoints rather than bytes - TODO confirm. */ + CHECK(!env.contains("4.", 2, "l1")); +} + TEST_CASE("do not make numeric unk nodes starts with period") { NumericTestEnv env{"x,l1\nほげ,l2\n"}; env.analyze(".4");