3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 15:16:29 +00:00

fix(seq::derive): symbolize top-level cache key to avoid concrete-ele poisoning

`seq::derive::operator()(ele, r)` looks up `m_top_cache` keyed only by the
regex `r`, but on a miss it used to set `m_ele = ele` (a concrete char)
before calling `derive_rec(r)`. The resulting ITE-tree contained
constant-folded `(= ele c)` conditions, so the "symbolic" derivative
stored in the cache was actually specialized to that one ele. Subsequent
calls with the same `r` but a different ele hit the stale cached answer
and the substitution at the bottom was a no-op (no `v0` left to replace).

Simplest victim:
  (str.in_re "aP" (re.++ (re.* "a") "P"))
returned `unsat`. The first call D_'a'(a*P) computed `a*P` and cached it
under key `a*P`; the next call D_'P'(a*P) hit that cache entry and
returned `a*P` instead of epsilon, so the membership check ended on a
non-nullable state.

Fix: set `m_ele = v` (the canonical fresh var) so the derivative is
genuinely symbolic. Concrete-ele callers go through the existing
substitution at the bottom of `operator()`.

Adds a regression test in src/test/seq_regex_bisim.cpp checking that
D_'a'(a*P) is not nullable while D_'P'(a*P) is.

Note: this is independent of the mut_0013 bisim-level unsoundness;
that case still fails and is being tracked separately.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Margus Veanes 2026-06-15 02:28:21 -07:00
parent fb6470a1a1
commit 0b8bb98656
2 changed files with 63 additions and 3 deletions

View file

@ -80,7 +80,12 @@ namespace seq {
result = cached;
}
else {
m_ele = ele;
// Always compute the SYMBOLIC derivative wrt the canonical
// variable v (so the cached result is reusable for any
// concrete ele via substitution below). Using the concrete
// `ele` here would bake it into the cached ITE-tree and
// poison future lookups for the same r with a different ele.
m_ele = v;
m_depth = 0;
// Initialize path state for inline pruning
m_path.reset();

View file

@ -7,14 +7,22 @@
// D(a|b). That made the bisimulation procedure conclude bogus equalities
// like a* == (a|b)*.
//
// This test asserts the basic shape of the derivative and the bisim result
// for that minimal case, so the bug cannot silently regress.
// This file also covers the seq::derive top-level-cache poisoning bug.
// `m_top_cache` is keyed only by the regex; the routine used to populate it
// while `m_ele` was set to a *concrete* character, baking that character
// into the cached "symbolic" derivative. Subsequent calls with the same
// regex but a different ele then returned a stale concrete answer instead
// of the true symbolic derivative. The simplest victim is
// (str.in_re "aP" (re.++ (re.* "a") "P"))
// which used to return false because the derivative wrt 'a' was cached and
// re-used as the derivative wrt 'P'.
#include "ast/ast.h"
#include "ast/ast_pp.h"
#include "ast/reg_decl_plugins.h"
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/rewriter/seq_regex_bisim.h"
#include "ast/rewriter/th_rewriter.h"
#include <iostream>
static void test_a_star_neq_ab_star() {
@ -67,6 +75,53 @@ static void test_a_star_neq_ab_star() {
ENSURE(eq == l_false);
}
// Regression for the derive top-level-cache poisoning bug.
// Take r = (re.* "a") ++ "P" and check str.in_re "aP" r. Before the fix
// the first per-char derivative call (wrt 'a') populated m_top_cache with
// 'a' baked into the symbolic ITE-tree, so the next call (wrt 'P') returned
// that stale cached value instead of computing D_P(r) = epsilon, making
// str.in_re wrongly return false.
static void test_derive_cache_per_ele() {
ast_manager m;
reg_decl_plugins(m);
seq_util u(m);
seq_rewriter rw(m);
sort_ref str_sort(u.str.mk_string_sort(), m);
zstring sa("a"), sP("P"), s_aP("aP");
expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
expr_ref re_P(u.re.mk_to_re(u.str.mk_string(sP)), m);
expr_ref a_star(u.re.mk_star(re_a), m);
expr_ref r(u.re.mk_concat(a_star, re_P), m);
expr_ref aP(u.str.mk_string(s_aP), m);
// Compute D_'a'(a*P) and D_'P'(a*P) directly via mk_derivative.
// Before the fix, m_top_cache was populated while m_ele = ele (the
// concrete char), so the second call hit the stale cached answer from
// the first. After the fix the cache is keyed by a symbolic var, so
// each concrete-ele substitution produces the right answer.
expr_ref ch_a(u.mk_char('a'), m);
expr_ref ch_P(u.mk_char('P'), m);
expr_ref d_a = rw.mk_derivative(ch_a, r);
expr_ref d_P = rw.mk_derivative(ch_P, r);
std::cout << "D_a(a*P) = " << mk_pp(d_a, m) << "\n";
std::cout << "D_P(a*P) = " << mk_pp(d_P, m) << "\n";
// D_P(a*P) must be nullable (it accepts the empty suffix), while
// D_a(a*P) must not be (it still needs a trailing 'P').
expr_ref n_a = rw.is_nullable(d_a);
expr_ref n_P = rw.is_nullable(d_P);
th_rewriter trw(m);
trw(n_a);
trw(n_P);
std::cout << "nullable(D_a) = " << mk_pp(n_a, m) << "\n";
std::cout << "nullable(D_P) = " << mk_pp(n_P, m) << "\n";
ENSURE(m.is_false(n_a));
ENSURE(m.is_true(n_P));
}
void tst_seq_regex_bisim() {
test_a_star_neq_ab_star();
test_derive_cache_per_ele();
}