From d3fbefbeb982f47a02983335ef0cbcbccf94aaba Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Fri, 26 Jun 2026 03:55:21 +0300 Subject: [PATCH] perf(seq): right-associate concats in derivative; restore union subsumption; fix mk_regex_concat sort bug The symbolic derivative of a left-associated concat (a*b)*r2 recurses through the entire left spine, exceeding m_max_depth and emitting stuck symbolic re.derivative terms that accumulate across NFA states and cause an exponential blow-up on contains-pattern intersections. Right- associate the concat (via derive::mk_concat, a single linear pass that does not touch the derivative depth counter) before deriving, so the head stays atomic and recursion is shallow. Also restore the L(a)subset L(b) subsumption in mk_union_core (collapses semantically-equal union states in antimirov intersection derivatives), and fix a latent sort bug in mk_regex_concat where the '..* = .+' rewrite passed the element sort instead of the regex sort to mk_full_char, triggering a 're.+' sort-mismatch exception once exercised by the derivative path. Result on QF_S/20250410-matching wildcard-matching-regex set: 02 15s->0.6s, 04 15s->0.7s, 05 timeout->4.2s, 59/62 timeout->0.1-0.3s (vs master). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ast/rewriter/seq_derive.cpp | 19 +++++++++++++++++++ src/ast/rewriter/seq_rewriter.cpp | 4 ++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp index 64248fa2ca..f99abb382f 100644 --- a/src/ast/rewriter/seq_derive.cpp +++ b/src/ast/rewriter/seq_derive.cpp @@ -189,6 +189,18 @@ namespace seq { // δ(r1 · r2) = δ(r1) · r2 ∪ (if nullable(r1) then δ(r2) else ∅) if (re().is_concat(r, r1, r2)) { + // Ensure right-associative form first. A left-nested concat + // (a·b)·r2 makes the head r1 a large sub-concat, so deriving it + // recurses through the whole left spine and can exceed + // m_max_depth, producing stuck symbolic re.derivative terms that + // accumulate across states and blow up. mk_concat right- + // associates in a single linear pass (without touching the + // derivative depth counter), keeping the head atomic. + if (re().is_concat(r1)) { + expr_ref rr = mk_concat(r1, r2); + if (rr != r) + return derive_rec(rr); + } expr_ref d1 = derive_rec(r1); expr_ref d1_r2 = mk_deriv_concat(d1, r2); expr_ref nullable_r1 = is_nullable(r1); @@ -748,6 +760,13 @@ namespace seq { return mk_deriv_concat(a1, tail); } + // Subsumption: L(a) ⊆ L(b) ⇒ a ∪ b = b (and symmetrically). + // This collapses semantically-equal union states that arise in + // antimirov intersection derivatives, which is essential to keep the + // emptiness/bisim worklist from blowing up on contains-patterns. + if (is_subset(a, b)) return expr_ref(b, m); + if (is_subset(b, a)) return expr_ref(a, m); + return m_re.mk_union(a, b); } diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index fe932ec782..774664c097 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -3969,10 +3969,10 @@ expr_ref seq_rewriter::mk_regex_concat(expr *r, expr *s) { result = r; else if (re().is_full_char(r) && re().is_full_seq(s)) // ..* = .+ - result = re().mk_plus(re().mk_full_char(ele_sort)); + result = re().mk_plus(re().mk_full_char(r->get_sort())); else if (re().is_full_seq(r) && re().is_full_char(s)) // .*. = .+ - result = re().mk_plus(re().mk_full_char(ele_sort)); + result = re().mk_plus(re().mk_full_char(r->get_sort())); else if (re().is_concat(r, r1, r2)) // create the resulting concatenation in right-associative form except for the following case // TODO: maintain the following invariant for A ++ B{m,n} + C