From 0b8bb98656ca0defb3297e85613b529a0de8da82 Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Mon, 15 Jun 2026 02:28:21 -0700 Subject: [PATCH] fix(seq::derive): symbolize top-level cache key to avoid concrete-ele poisoning `seq::derive::operator()(ele, r)` looks up `m_top_cache` keyed only by the regex `r`, but on a miss it used to set `m_ele = ele` (a concrete char) before calling `derive_rec(r)`. The resulting ITE-tree contained constant-folded `(= ele c)` conditions, so the "symbolic" derivative stored in the cache was actually specialized to that one ele. Subsequent calls with the same `r` but a different ele hit the stale cached answer and the substitution at the bottom was a no-op (no `v0` left to replace). Simplest victim: (str.in_re "aP" (re.++ (re.* "a") "P")) returned `unsat`. The first call D_'a'(a*P) computed `a*P` and cached it under key `a*P`; the next call D_'P'(a*P) hit that cache entry and returned `a*P` instead of epsilon, so the membership check ended on a non-nullable state. Fix: set `m_ele = v` (the canonical fresh var) so the derivative is genuinely symbolic. Concrete-ele callers go through the existing substitution at the bottom of `operator()`. Adds a regression test in src/test/seq_regex_bisim.cpp checking that D_'a'(a*P) is not nullable while D_'P'(a*P) is. Note: this is independent of the mut_0013 bisim-level unsoundness; that case still fails and is being tracked separately. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ast/rewriter/seq_derive.cpp | 7 +++- src/test/seq_regex_bisim.cpp | 59 +++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp index 2b62f4551..7c6ddf991 100644 --- a/src/ast/rewriter/seq_derive.cpp +++ b/src/ast/rewriter/seq_derive.cpp @@ -80,7 +80,12 @@ namespace seq { result = cached; } else { - m_ele = ele; + // Always compute the SYMBOLIC derivative wrt the canonical + // variable v (so the cached result is reusable for any + // concrete ele via substitution below). Using the concrete + // `ele` here would bake it into the cached ITE-tree and + // poison future lookups for the same r with a different ele. + m_ele = v; m_depth = 0; // Initialize path state for inline pruning m_path.reset(); diff --git a/src/test/seq_regex_bisim.cpp b/src/test/seq_regex_bisim.cpp index bfe3f02e9..83404439b 100644 --- a/src/test/seq_regex_bisim.cpp +++ b/src/test/seq_regex_bisim.cpp @@ -7,14 +7,22 @@ // D(a|b). That made the bisimulation procedure conclude bogus equalities // like a* == (a|b)*. // -// This test asserts the basic shape of the derivative and the bisim result -// for that minimal case, so the bug cannot silently regress. +// This file also covers the seq::derive top-level-cache poisoning bug. +// `m_top_cache` is keyed only by the regex; the routine used to populate it +// while `m_ele` was set to a *concrete* character, baking that character +// into the cached "symbolic" derivative. Subsequent calls with the same +// regex but a different ele then returned a stale concrete answer instead +// of the true symbolic derivative. The simplest victim is +// (str.in_re "aP" (re.++ (re.* "a") "P")) +// which used to return false because the derivative wrt 'a' was cached and +// re-used as the derivative wrt 'P'. #include "ast/ast.h" #include "ast/ast_pp.h" #include "ast/reg_decl_plugins.h" #include "ast/seq_decl_plugin.h" #include "ast/rewriter/seq_rewriter.h" #include "ast/rewriter/seq_regex_bisim.h" +#include "ast/rewriter/th_rewriter.h" #include static void test_a_star_neq_ab_star() { @@ -67,6 +75,53 @@ static void test_a_star_neq_ab_star() { ENSURE(eq == l_false); } +// Regression for the derive top-level-cache poisoning bug. +// Take r = (re.* "a") ++ "P" and check str.in_re "aP" r. Before the fix +// the first per-char derivative call (wrt 'a') populated m_top_cache with +// 'a' baked into the symbolic ITE-tree, so the next call (wrt 'P') returned +// that stale cached value instead of computing D_P(r) = epsilon, making +// str.in_re wrongly return false. +static void test_derive_cache_per_ele() { + ast_manager m; + reg_decl_plugins(m); + seq_util u(m); + seq_rewriter rw(m); + + sort_ref str_sort(u.str.mk_string_sort(), m); + + zstring sa("a"), sP("P"), s_aP("aP"); + expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m); + expr_ref re_P(u.re.mk_to_re(u.str.mk_string(sP)), m); + expr_ref a_star(u.re.mk_star(re_a), m); + expr_ref r(u.re.mk_concat(a_star, re_P), m); + expr_ref aP(u.str.mk_string(s_aP), m); + + // Compute D_'a'(a*P) and D_'P'(a*P) directly via mk_derivative. + // Before the fix, m_top_cache was populated while m_ele = ele (the + // concrete char), so the second call hit the stale cached answer from + // the first. After the fix the cache is keyed by a symbolic var, so + // each concrete-ele substitution produces the right answer. + expr_ref ch_a(u.mk_char('a'), m); + expr_ref ch_P(u.mk_char('P'), m); + expr_ref d_a = rw.mk_derivative(ch_a, r); + expr_ref d_P = rw.mk_derivative(ch_P, r); + std::cout << "D_a(a*P) = " << mk_pp(d_a, m) << "\n"; + std::cout << "D_P(a*P) = " << mk_pp(d_P, m) << "\n"; + + // D_P(a*P) must be nullable (it accepts the empty suffix), while + // D_a(a*P) must not be (it still needs a trailing 'P'). + expr_ref n_a = rw.is_nullable(d_a); + expr_ref n_P = rw.is_nullable(d_P); + th_rewriter trw(m); + trw(n_a); + trw(n_P); + std::cout << "nullable(D_a) = " << mk_pp(n_a, m) << "\n"; + std::cout << "nullable(D_P) = " << mk_pp(n_P, m) << "\n"; + ENSURE(m.is_false(n_a)); + ENSURE(m.is_true(n_P)); +} + void tst_seq_regex_bisim() { test_a_star_neq_ab_star(); + test_derive_cache_per_ele(); }