From ee8e6d92fa1316d91c2d1a36c2399c253d2af18d Mon Sep 17 00:00:00 2001 From: James Date: Mon, 8 Jun 2026 16:15:08 +0100 Subject: [PATCH 1/3] fix(config): detect overlapping-alternation ReDoS in the regex safety guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `isSafeRegex` is the ReDoS guard for every regex compiled from config and route input (redirect/rewrite/header `source`, `has`/`missing` `value`, and middleware `matcher`), all of which are matched against attacker-controlled pathnames, headers, cookies, and query values. It only detected *nested* quantifiers (`(a+)+`), so it let through the complementary "overlapping alternation under an unbounded quantifier" class — e.g. `(a|a)*`, `(a|ab)+`, `(\d|\d\d){2,}`. Those contain no inner quantifier but still backtrack exponentially: matching `(a|a)*` against `"aaaa…!"` already takes ~9s at 29 characters and roughly doubles per added character. On the Node production server this blocks the event loop (DoS); on Workers it burns the request's CPU budget. Add a second, conservative check: a group directly repeated by an unbounded quantifier (`*`, `+`, `{n,}`) whose top-level alternation has overlapping branches — a branch equal to, or a textual prefix of, another — is rejected. Bounded repetition (`{n,m}`), the optional `?`, and disjoint-token alternations (`(foo|bar)*`, `(GET|POST)+`) are unaffected, so legitimate config patterns keep working; a rejected pattern is ignored with a warning, the same safe failure mode as the existing nested-quantifier check. Adds focused `isSafeRegex` unit tests for the new class and confirms the existing safe cases (`(foo|bar)*`, `(ab)+`, `(?:/.*)?`, bounded/optional repetition) still pass. --- packages/vinext/src/config/config-matchers.ts | 150 +++++++++++++++++- tests/shims.test.ts | 56 +++++++ 2 files changed, 198 insertions(+), 8 deletions(-) diff --git a/packages/vinext/src/config/config-matchers.ts b/packages/vinext/src/config/config-matchers.ts index 1f8f67733..c0dc4a54f 100644 --- a/packages/vinext/src/config/config-matchers.ts +++ b/packages/vinext/src/config/config-matchers.ts @@ -259,19 +259,152 @@ function stripHopByHopRequestHeaders(headers: Headers): void { } } +/** + * Read an unbounded quantifier at `idx`. Returns true for `*`, `+`, and the + * open-ended brace form `{n,}`. Bounded forms (`{n}`, `{n,m}`) and the optional + * `?` are NOT unbounded and so cannot drive exponential backtracking. + */ +function isUnboundedQuantifierAt(pattern: string, idx: number): boolean { + const c = pattern[idx]; + if (c === "*" || c === "+") return true; + if (c !== "{") return false; + let j = idx + 1; + while (j < pattern.length && pattern[j] >= "0" && pattern[j] <= "9") j++; + // `{n,}` (comma immediately followed by `}`) is unbounded; `{n}`/`{n,m}` are not. + return pattern[j] === "," && pattern[j + 1] === "}"; +} + +/** + * Strip a group-body prefix so the remaining text is a plain alternation. + * Non-capturing groups (`?:`) are unwrapped. Lookarounds, named groups, and + * inline-flag groups (`?=`, `?!`, `?<=`, `?`, `?i:`, …) are skipped + * (returns null) — analysing them textually would be unreliable, so we simply + * don't flag them here (the nested-quantifier pass still applies). + */ +function alternationBodyOf(groupBody: string): string | null { + if (!groupBody.startsWith("?")) return groupBody; + if (groupBody.startsWith("?:")) return groupBody.slice(2); + return null; +} + +/** Split a group body into its top-level (depth-0) `|` alternatives. */ +function splitTopLevelAlternatives(body: string): string[] { + const branches: string[] = []; + let current = ""; + let depth = 0; + let inClass = false; + for (let i = 0; i < body.length; i++) { + const ch = body[i]; + if (ch === "\\") { + current += ch + (body[i + 1] ?? ""); + i++; + continue; + } + if (inClass) { + current += ch; + if (ch === "]") inClass = false; + continue; + } + if (ch === "[") { + inClass = true; + current += ch; + continue; + } + if (ch === "(") depth++; + else if (ch === ")") depth--; + else if (ch === "|" && depth === 0) { + branches.push(current); + current = ""; + continue; + } + current += ch; + } + branches.push(current); + return branches; +} + +/** + * Conservatively detect overlapping alternatives. Two branches overlap when one + * is equal to, or a textual prefix of, the other (e.g. `a|a`, `a|ab`, + * `\d|\d\d`). Distinct-token alternations like `foo|bar` or `GET|POST` do not + * overlap and are left alone. + */ +function hasOverlappingAlternatives(branches: readonly string[]): boolean { + const trimmed = branches.map((b) => b.trim()).filter((b) => b.length > 0); + for (let a = 0; a < trimmed.length; a++) { + for (let b = a + 1; b < trimmed.length; b++) { + const longer = trimmed[a].length >= trimmed[b].length ? trimmed[a] : trimmed[b]; + const shorter = trimmed[a].length >= trimmed[b].length ? trimmed[b] : trimmed[a]; + if (longer.startsWith(shorter)) return true; + } + } + return false; +} + +/** + * Detect an alternation with overlapping branches that is directly repeated by + * an UNBOUNDED quantifier, e.g. `(a|a)*`, `(a|ab)+`, `(\d|\d\d){2,}`. + * + * This is the complement of the nested-quantifier check below: such a group + * contains no inner quantifier, yet still backtracks exponentially because the + * ambiguous alternatives give the engine multiple ways to consume the same + * input on every repetition (`(a|a)*` against `"aaaa…!"` is exponential). + */ +function hasAmbiguousUnboundedAlternation(pattern: string): boolean { + const groupStartStack: number[] = []; + let inClass = false; + for (let i = 0; i < pattern.length; i++) { + const ch = pattern[i]; + if (ch === "\\") { + i++; + continue; + } + if (inClass) { + if (ch === "]") inClass = false; + continue; + } + if (ch === "[") { + inClass = true; + continue; + } + if (ch === "(") { + groupStartStack.push(i + 1); + continue; + } + if (ch === ")") { + const start = groupStartStack.pop(); + if (start === undefined) continue; + if (!isUnboundedQuantifierAt(pattern, i + 1)) continue; + const body = alternationBodyOf(pattern.slice(start, i)); + if (body === null) continue; + const branches = splitTopLevelAlternatives(body); + if (branches.length >= 2 && hasOverlappingAlternatives(branches)) return true; + } + } + return false; +} + /** * Detect regex patterns vulnerable to catastrophic backtracking (ReDoS). * - * Uses a lightweight heuristic: scans the pattern string for nested quantifiers - * (a quantifier applied to a group that itself contains a quantifier). This - * catches the most common pathological patterns like `(a+)+`, `(.*)*`, - * `([^/]+)+`, `(a|a+)+` without needing a full regex parser. + * Uses a lightweight heuristic without a full regex parser. Two pathological + * shapes are rejected: + * + * 1. Nested quantifiers — a quantifier applied to a group that itself + * contains a quantifier, e.g. `(a+)+`, `(.*)*`, `([^/]+)+`, `(a|a+)+`. + * 2. An overlapping alternation under an UNBOUNDED quantifier, e.g. `(a|a)*`, + * `(a|ab)+`, `(\d|\d\d){2,}`. These contain no inner quantifier but still + * backtrack exponentially because the ambiguous branches give the engine + * multiple ways to consume the same input on each repetition. * * Returns true if the pattern appears safe, false if it's potentially dangerous. */ export function isSafeRegex(pattern: string): boolean { - // Track parenthesis nesting depth and whether we've seen a quantifier - // at each depth level. + // (2) Ambiguous alternation repeated by an unbounded quantifier. + if (hasAmbiguousUnboundedAlternation(pattern)) return false; + + // (1) Nested quantifiers — track parenthesis nesting depth and whether we've + // seen a quantifier at each depth level. const quantifierAtDepth: boolean[] = []; let depth = 0; let i = 0; @@ -383,8 +516,9 @@ export function safeRegExp(pattern: string, flags?: string): RegExp | null { if (!isSafeRegex(pattern)) { console.warn( `[vinext] Ignoring potentially unsafe regex pattern (ReDoS risk): ${pattern}\n` + - ` Patterns with nested quantifiers (e.g. (a+)+) can cause catastrophic backtracking.\n` + - ` Simplify the pattern to avoid nested repetition.`, + ` Nested quantifiers (e.g. (a+)+) and overlapping alternations repeated by an\n` + + ` unbounded quantifier (e.g. (a|a)*, (a|ab)+) can cause catastrophic backtracking.\n` + + ` Simplify the pattern to avoid nested repetition and ambiguous alternatives.`, ); return null; } diff --git a/tests/shims.test.ts b/tests/shims.test.ts index e1eddf79f..bdf88d311 100644 --- a/tests/shims.test.ts +++ b/tests/shims.test.ts @@ -9057,6 +9057,62 @@ describe("isSafeRegex", () => { expect(isSafeRegex("(foo|bar)*")).toBe(true); }); + // Overlapping alternation under an unbounded quantifier — no inner quantifier, + // but still exponential because the ambiguous branches give the engine + // multiple ways to consume the same input on each repetition. `(a|a)*` matched + // against "aaaa…!" takes seconds at ~30 chars. These complement the + // nested-quantifier checks above (which `(a|a)*` slips past). + it("rejects overlapping alternation repeated by *: (a|a)*", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("(a|a)*")).toBe(false); + expect(isSafeRegex("(a|a)*$")).toBe(false); + expect(isSafeRegex("^(a|a)*!")).toBe(false); + }); + + it("rejects overlapping alternation repeated by +: (a|a|a)+, (foo|foo)+", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("(a|a|a)+")).toBe(false); + expect(isSafeRegex("(foo|foo)+")).toBe(false); + }); + + it("rejects prefix-overlapping alternation: (a|ab)*, (\\d|\\d\\d)+", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("(a|ab)*")).toBe(false); + expect(isSafeRegex("(\\d|\\d\\d)+")).toBe(false); + expect(isSafeRegex("(ab|abc)+")).toBe(false); + }); + + it("rejects overlapping alternation repeated by unbounded brace: (a|a){2,}", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("(a|a){2,}")).toBe(false); + }); + + it("rejects overlapping alternation in a non-capturing group: (?:a|a)*", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("(?:a|a)*")).toBe(false); + }); + + it("accepts disjoint alternation under a quantifier (no overlap)", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + // Distinct-token alternations are unambiguous and safe. + expect(isSafeRegex("(foo|bar)*")).toBe(true); + expect(isSafeRegex("(foo|bar|baz)+")).toBe(true); + expect(isSafeRegex("(en|fr|de)*")).toBe(true); + expect(isSafeRegex("(a|b)*")).toBe(true); + expect(isSafeRegex("(GET|POST|PUT)+")).toBe(true); + }); + + it("accepts overlapping alternation only under BOUNDED repetition", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + // Bounded repetition cannot blow up exponentially. + expect(isSafeRegex("(a|a){2,5}")).toBe(true); + expect(isSafeRegex("(a|a){3}")).toBe(true); + // Optional (zero-or-one) is only 2 paths. + expect(isSafeRegex("(a|a)?")).toBe(true); + // No quantifier at all. + expect(isSafeRegex("(a|a)")).toBe(true); + }); + it("treats escaped characters as safe", async () => { const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); // \\+ is a literal +, not a quantifier From 812e3ce9353b78cb1a6ffab6264367dead0ba004 Mon Sep 17 00:00:00 2001 From: James Date: Mon, 8 Jun 2026 20:01:51 +0100 Subject: [PATCH 2/3] fix(config): catch wrapped/nested overlapping-alternation ReDoS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overlapping-alternation detector only split the top-level alternation of a directly-quantified group's body, so wrapping the ambiguous group in a redundant capturing or non-capturing group bypassed it while staying exponential: - ((a|a))* — body is the sole child group (a|a) - ((a|a)|x)* — branch (a|a) is a nested overlapping group - (x|(a|a))* — branch (a|a) is a nested overlapping group Analyse the quantified body recursively: split into top-level alternatives, then peel any branch that is wholly a single group (unwrapping the parens and normalising a ?: prefix) and recurse. A depth cap bounds work on adversarial nesting. Disjoint/bounded alternations ((ab|cd)*, ((ab|cd))*) recurse to non-overlapping branches and stay safe — no new false positives. --- packages/vinext/src/config/config-matchers.ts | 94 ++++++++++++++++++- tests/shims.test.ts | 34 +++++++ 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/packages/vinext/src/config/config-matchers.ts b/packages/vinext/src/config/config-matchers.ts index c0dc4a54f..0587fb26f 100644 --- a/packages/vinext/src/config/config-matchers.ts +++ b/packages/vinext/src/config/config-matchers.ts @@ -341,6 +341,95 @@ function hasOverlappingAlternatives(branches: readonly string[]): boolean { return false; } +/** + * If `branch` is, in its entirety, a single parenthesised group (its first + * non-space char is `(` whose matching `)` is the last non-space char), return + * that group's inner body. Otherwise return null. + * + * Used to peel a redundant wrapper group off a quantified body so a wrapped + * alternation is still analysed. For example the body of `((a|a))*` is the + * single branch `(a|a)`; unwrapping it yields `a|a`, which is overlapping. + * `?:`/named/lookaround prefixes are normalised via `alternationBodyOf` by the + * caller after unwrapping; here we only strip the outer parentheses. + */ +function unwrapSoleGroup(branch: string): string | null { + const trimmed = branch.trim(); + if (trimmed.length < 2 || trimmed[0] !== "(" || trimmed[trimmed.length - 1] !== ")") { + return null; + } + // Verify the opening `(` matches the trailing `)` (i.e. it's a single group + // spanning the whole branch, not e.g. `(a)(b)` where the first `)` closes + // early). Track depth, character classes, and escapes the same way the + // tokenisers above do. + let depth = 0; + let inClass = false; + for (let i = 0; i < trimmed.length; i++) { + const ch = trimmed[i]; + if (ch === "\\") { + i++; + continue; + } + if (inClass) { + if (ch === "]") inClass = false; + continue; + } + if (ch === "[") { + inClass = true; + continue; + } + if (ch === "(") depth++; + else if (ch === ")") { + depth--; + // If the group closes before the final char, this is not a sole group. + if (depth === 0 && i !== trimmed.length - 1) return null; + } + } + if (depth !== 0) return null; + return trimmed.slice(1, -1); +} + +/** + * Decide whether a quantified group's body contains an overlapping alternation, + * accounting for redundant wrapper groups. The detector that calls this only + * inspects the *directly* quantified group, so without unwrapping, trivial + * transforms of `(a|a)*` slip through while staying exponential: + * + * - `((a|a))*` — body is the sole group `(a|a)` + * - `((a|a)|x)*` — branch `(a|a)` is a nested overlapping group + * - `(x|(a|a))*` — branch `(a|a)` is a nested overlapping group + * + * Strategy: split the body into top-level alternatives. If two or more of them + * overlap, it's ambiguous. Otherwise, recurse into any branch that is wholly a + * single group (peeling the wrapper, then normalising any `?:` prefix) — a + * nested ambiguous alternation under the same unbounded quantifier is just as + * exponential as a direct one. + * + * Precision is preserved: disjoint/bounded alternations (`(ab|cd)*`, + * `((ab|cd))*`) recurse to non-overlapping branches and stay safe. A bounded + * recursion depth caps work on adversarial nesting. + */ +function bodyHasOverlappingAlternation(body: string, depth = 0): boolean { + // Cap recursion so deeply/adversarially nested wrappers can't blow up the + // scan. Real config regexes nest a handful of levels at most. + if (depth > 32) return false; + + const branches = splitTopLevelAlternatives(body); + if (branches.length >= 2 && hasOverlappingAlternatives(branches)) return true; + + // Recurse into branches that are wholly a single group. This catches the + // wrapped (`((a|a))*`) and nested (`(x|(a|a))*`) variants. Note a single + // top-level branch that is a sole group (the `((a|a))*` case) is handled here + // even though the `>= 2` overlap check above didn't fire. + for (const branch of branches) { + const inner = unwrapSoleGroup(branch); + if (inner === null) continue; + const normalised = alternationBodyOf(inner); + if (normalised === null) continue; + if (bodyHasOverlappingAlternation(normalised, depth + 1)) return true; + } + return false; +} + /** * Detect an alternation with overlapping branches that is directly repeated by * an UNBOUNDED quantifier, e.g. `(a|a)*`, `(a|ab)+`, `(\d|\d\d){2,}`. @@ -377,8 +466,9 @@ function hasAmbiguousUnboundedAlternation(pattern: string): boolean { if (!isUnboundedQuantifierAt(pattern, i + 1)) continue; const body = alternationBodyOf(pattern.slice(start, i)); if (body === null) continue; - const branches = splitTopLevelAlternatives(body); - if (branches.length >= 2 && hasOverlappingAlternatives(branches)) return true; + // Analyse the body recursively so wrapped/nested alternations are caught + // too — `((a|a))*`, `((a|a)|x)*`, `(x|(a|a))*` are all exponential. + if (bodyHasOverlappingAlternation(body)) return true; } } return false; diff --git a/tests/shims.test.ts b/tests/shims.test.ts index bdf88d311..b428c9e44 100644 --- a/tests/shims.test.ts +++ b/tests/shims.test.ts @@ -9113,6 +9113,40 @@ describe("isSafeRegex", () => { expect(isSafeRegex("(a|a)")).toBe(true); }); + // Wrapping an overlapping alternation in a redundant group must NOT let it + // slip past the unbounded-quantifier check. These are trivial transforms of + // `(a|a)*` and remain exponential (~7s on ~26 chars). Without unwrapping the + // sole/nested child group, the top-level split sees a single non-overlapping + // branch and rates them safe. + it("rejects a wrapped overlapping alternation: ((a|a))*", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("((a|a))*")).toBe(false); + expect(isSafeRegex("((a|a))+")).toBe(false); + expect(isSafeRegex("((a|a)){2,}")).toBe(false); + // Multiple redundant wrapper layers and a non-capturing wrapper. + expect(isSafeRegex("(((a|a)))*")).toBe(false); + expect(isSafeRegex("(?:(?:a|a))*")).toBe(false); + }); + + it("rejects a nested overlapping alternation branch: ((a|a)|x)* and (x|(a|a))*", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("((a|a)|x)*")).toBe(false); + expect(isSafeRegex("(x|(a|a))*")).toBe(false); + // Prefix-overlap nested in a branch, repeated by +. + expect(isSafeRegex("(x|(a|ab)|y)+")).toBe(false); + }); + + it("keeps disjoint wrapped/nested alternations SAFE (no false positives)", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + // Distinct-token branches are unambiguous even when wrapped or nested. + expect(isSafeRegex("(ab|cd)*")).toBe(true); + expect(isSafeRegex("((ab|cd))*")).toBe(true); + expect(isSafeRegex("((ab|cd)|ef)*")).toBe(true); + expect(isSafeRegex("(x|(ab|cd))*")).toBe(true); + // A wrapped locale alternation (common in real config) stays safe. + expect(isSafeRegex("((en|fr|de))*")).toBe(true); + }); + it("treats escaped characters as safe", async () => { const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); // \\+ is a literal +, not a quantifier From 76830c1c8d5eb81d3dbf8e853c961c3fe75c79bb Mon Sep 17 00:00:00 2001 From: James Date: Mon, 8 Jun 2026 22:00:43 +0100 Subject: [PATCH 3/3] fix(config): guard concatenated-group overlapping-alternation ReDoS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wrapped/nested ReDoS detector only recursed into branches that were wholly a single group, so an overlapping alternation concatenated with another token inside the quantified body slipped through while staying exponential: ((a|a)b)*, (b(a|a))*, (b(a|a)c)* all backtrack exponentially because the inner overlapping group is still repeated by the outer unbounded quantifier. Generalise the recursion: descend into every top-level sub-group of each branch (not just sole groups). topLevelGroupBodies replaces unwrapSoleGroup and returns a sole-group branch's body just as readily as a concatenated one. Precision is preserved — only an overlapping inner alternation trips the guard, so disjoint config shapes like ((en|fr|de)x)* and (/(en|fr)/foo)* stay safe. --- packages/vinext/src/config/config-matchers.ts | 100 ++++++++++-------- tests/shims.test.ts | 30 ++++++ 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/packages/vinext/src/config/config-matchers.ts b/packages/vinext/src/config/config-matchers.ts index 0587fb26f..8519924cf 100644 --- a/packages/vinext/src/config/config-matchers.ts +++ b/packages/vinext/src/config/config-matchers.ts @@ -342,29 +342,29 @@ function hasOverlappingAlternatives(branches: readonly string[]): boolean { } /** - * If `branch` is, in its entirety, a single parenthesised group (its first - * non-space char is `(` whose matching `)` is the last non-space char), return - * that group's inner body. Otherwise return null. + * Extract the inner body of every TOP-LEVEL (depth-1) parenthesised group that + * appears anywhere in `branch`, regardless of what surrounds it. A group that + * spans the whole branch (`(a|a)` from the wrapped `((a|a))*`) is returned just + * as readily as one concatenated with other tokens — the "concatenated group" + * shape — e.g. `(a|a)b`, `b(a|a)`, `b(a|a)c`. * - * Used to peel a redundant wrapper group off a quantified body so a wrapped - * alternation is still analysed. For example the body of `((a|a))*` is the - * single branch `(a|a)`; unwrapping it yields `a|a`, which is overlapping. - * `?:`/named/lookaround prefixes are normalised via `alternationBodyOf` by the - * caller after unwrapping; here we only strip the outer parentheses. + * Such a group is still repeated by the same outer unbounded quantifier, so an + * overlapping alternation inside it is just as exponential as a directly + * quantified one: `((a|a)b)*` against `"abab…!"` backtracks exponentially even + * though `(a|a)` is glued to a literal `b`. Returns each group's inner text + * (parentheses stripped); nested deeper groups are reached by the caller's + * recursion, not flattened here. + * + * Escapes and character classes are tracked so `\(` and `[(]` are never treated + * as group openers. */ -function unwrapSoleGroup(branch: string): string | null { - const trimmed = branch.trim(); - if (trimmed.length < 2 || trimmed[0] !== "(" || trimmed[trimmed.length - 1] !== ")") { - return null; - } - // Verify the opening `(` matches the trailing `)` (i.e. it's a single group - // spanning the whole branch, not e.g. `(a)(b)` where the first `)` closes - // early). Track depth, character classes, and escapes the same way the - // tokenisers above do. +function topLevelGroupBodies(branch: string): string[] { + const bodies: string[] = []; let depth = 0; + let groupStart = -1; let inClass = false; - for (let i = 0; i < trimmed.length; i++) { - const ch = trimmed[i]; + for (let i = 0; i < branch.length; i++) { + const ch = branch[i]; if (ch === "\\") { i++; continue; @@ -377,36 +377,44 @@ function unwrapSoleGroup(branch: string): string | null { inClass = true; continue; } - if (ch === "(") depth++; - else if (ch === ")") { + if (ch === "(") { + if (depth === 0) groupStart = i + 1; + depth++; + } else if (ch === ")") { depth--; - // If the group closes before the final char, this is not a sole group. - if (depth === 0 && i !== trimmed.length - 1) return null; + if (depth === 0 && groupStart !== -1) { + bodies.push(branch.slice(groupStart, i)); + groupStart = -1; + } } } - if (depth !== 0) return null; - return trimmed.slice(1, -1); + return bodies; } /** * Decide whether a quantified group's body contains an overlapping alternation, - * accounting for redundant wrapper groups. The detector that calls this only - * inspects the *directly* quantified group, so without unwrapping, trivial - * transforms of `(a|a)*` slip through while staying exponential: + * accounting for redundant wrapper groups and concatenated sub-groups. The + * detector that calls this only inspects the *directly* quantified group, so + * without descending, trivial transforms of `(a|a)*` slip through while staying + * exponential: * - * - `((a|a))*` — body is the sole group `(a|a)` - * - `((a|a)|x)*` — branch `(a|a)` is a nested overlapping group - * - `(x|(a|a))*` — branch `(a|a)` is a nested overlapping group + * - `((a|a))*` — body is the sole group `(a|a)` + * - `((a|a)|x)*` — branch `(a|a)` is a nested overlapping group + * - `(x|(a|a))*` — branch `(a|a)` is a nested overlapping group + * - `((a|a)b)*` — branch `(a|a)b` is a group concatenated with a literal + * - `(b(a|a)c)*` — overlapping group concatenated on both sides * * Strategy: split the body into top-level alternatives. If two or more of them - * overlap, it's ambiguous. Otherwise, recurse into any branch that is wholly a - * single group (peeling the wrapper, then normalising any `?:` prefix) — a + * overlap, it's ambiguous. Otherwise, recurse into every top-level sub-group of + * each branch (peeling its parentheses, then normalising any `?:` prefix) — a * nested ambiguous alternation under the same unbounded quantifier is just as - * exponential as a direct one. + * exponential as a direct one, whether the sub-group is the whole branch + * (`((a|a))*`) or merely concatenated within it (`((a|a)b)*`). * * Precision is preserved: disjoint/bounded alternations (`(ab|cd)*`, - * `((ab|cd))*`) recurse to non-overlapping branches and stay safe. A bounded - * recursion depth caps work on adversarial nesting. + * `((ab|cd))*`, `((en|fr|de)x)*`, `(/(en|fr)/foo)*`) recurse to non-overlapping + * branches and stay safe — only an *overlapping* inner alternation trips the + * guard. A bounded recursion depth caps work on adversarial nesting. */ function bodyHasOverlappingAlternation(body: string, depth = 0): boolean { // Cap recursion so deeply/adversarially nested wrappers can't blow up the @@ -416,16 +424,18 @@ function bodyHasOverlappingAlternation(body: string, depth = 0): boolean { const branches = splitTopLevelAlternatives(body); if (branches.length >= 2 && hasOverlappingAlternatives(branches)) return true; - // Recurse into branches that are wholly a single group. This catches the - // wrapped (`((a|a))*`) and nested (`(x|(a|a))*`) variants. Note a single - // top-level branch that is a sole group (the `((a|a))*` case) is handled here - // even though the `>= 2` overlap check above didn't fire. + // Recurse into every top-level sub-group of each branch. This catches the + // wrapped (`((a|a))*`), nested (`(x|(a|a))*`), and concatenated-group + // (`((a|a)b)*`, `(b(a|a)c)*`) variants: each inner group is repeated by the + // same outer unbounded quantifier, so an overlapping alternation inside it is + // exponential too. `topLevelGroupBodies` returns the body of a sole-group + // branch (`(a|a)` from `((a|a))*`) just as readily as a concatenated one. for (const branch of branches) { - const inner = unwrapSoleGroup(branch); - if (inner === null) continue; - const normalised = alternationBodyOf(inner); - if (normalised === null) continue; - if (bodyHasOverlappingAlternation(normalised, depth + 1)) return true; + for (const inner of topLevelGroupBodies(branch)) { + const normalised = alternationBodyOf(inner); + if (normalised === null) continue; + if (bodyHasOverlappingAlternation(normalised, depth + 1)) return true; + } } return false; } diff --git a/tests/shims.test.ts b/tests/shims.test.ts index b428c9e44..81722bb44 100644 --- a/tests/shims.test.ts +++ b/tests/shims.test.ts @@ -9147,6 +9147,36 @@ describe("isSafeRegex", () => { expect(isSafeRegex("((en|fr|de))*")).toBe(true); }); + // Concatenating an overlapping alternation group with another token inside the + // quantified body keeps it exponential: each inner group is still repeated by + // the same outer unbounded quantifier. `((a|a)b)*` against "abab…!" backtracks + // exponentially (~30ms at n=22, exponential thereafter) even though the + // overlapping `(a|a)` is glued to a literal `b`. A sole-group unwrap misses + // these because `(a|a)b` is not wholly a single group. + it("rejects a concatenated overlapping-alternation group: ((a|a)b)* and (b(a|a))*", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + expect(isSafeRegex("((a|a)b)*")).toBe(false); + expect(isSafeRegex("(b(a|a))*")).toBe(false); + expect(isSafeRegex("(b(a|a)c)*")).toBe(false); + expect(isSafeRegex("((a|a)b)+")).toBe(false); + expect(isSafeRegex("((a|a)b){2,}")).toBe(false); + // Wrapped around a concatenated group, and prefix-overlap concatenated. + expect(isSafeRegex("(((a|a)b))*")).toBe(false); + expect(isSafeRegex("((a|ab)x)+")).toBe(false); + // Overlapping group with a bounded inner quantifier is still exponential + // under the outer unbounded `*` (the ambiguity per outer repetition remains). + expect(isSafeRegex("((a|a){2}b)*")).toBe(false); + }); + + it("keeps disjoint concatenated-group alternations SAFE (no false positives)", async () => { + const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); + // A disjoint locale group concatenated with a literal — common config shape. + expect(isSafeRegex("((en|fr|de)x)*")).toBe(true); + expect(isSafeRegex("(/(en|fr)/foo)*")).toBe(true); + expect(isSafeRegex("(x(ab|cd)y)*")).toBe(true); + expect(isSafeRegex("((foo|bar)/(baz|qux))+")).toBe(true); + }); + it("treats escaped characters as safe", async () => { const { isSafeRegex } = await import("../packages/vinext/src/config/config-matchers.js"); // \\+ is a literal +, not a quantifier