mirror of
https://github.com/Z3Prover/z3
synced 2026-06-19 15:16:29 +00:00
## Summary egex_bisim::collect_leaves used to descend through `re.union` and `re.antimirov_union` at the top of each leaf of the transition regex, splitting a single bisimulation state into multiple states before they were merged into the union-find. This contradicts the bisimulation invariant: **each leaf of a t-regex represents one state, regardless of its top-level shape**. The fix descends into `ite` only (which is the actual structural splitter of guarded transitions). ## Why it matters The split happens to be *sound* for the current algorithm when the goal is asserting `L(union(A, B)) = empty` (since `L(A) = empty AND L(B) = empty` is equivalent), but it: 1. Adds spurious merges to the union-find that distort state-class identities. 2. Slows convergence on hard equivalence queries (and causes early timeouts in practice). 3. Creates latent unsoundness risk for any extension that interprets leaves more semantically (XOR pair handling, classical-flag propagation, future antimirov re-enable, etc.). ## Empirical validation Run on the 1523-file regex-equivalence corpus, 5s/file timeout, 8 workers: | metric | pre-fix master | post-fix | |---|---|---| | sat | 1008 | 1014 | | unsat | 368 | 368 | | timeout | 145 | 139 | | unknown | 2 | 2 | | SAT↔UNSAT verdict flips | — | **0** | | timeout→sat flips | — | 6 | | commonly-solved wall ratio | 1.000x | **0.902x** | The 6 `timeout` → `sat` cases all return the *same* `sat` under pre-fix master if given 60s; they are previously-slow cases not previously-wrong ones. Z3 unit tests: 89/89 pass (`test-z3 /a`). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
269 lines
8.8 KiB
C++
269 lines
8.8 KiB
C++
/*++
|
|
Copyright (c) 2026 Microsoft Corporation
|
|
|
|
Module Name:
|
|
|
|
seq_regex_bisim.cpp
|
|
|
|
Abstract:
|
|
|
|
See seq_regex_bisim.h.
|
|
|
|
Author:
|
|
|
|
Margus Veanes (veanes)
|
|
Nikolaj Bjorner (nbjorner)
|
|
|
|
--*/
|
|
|
|
#include "ast/rewriter/seq_regex_bisim.h"
|
|
#include "ast/rewriter/seq_rewriter.h"
|
|
#include "ast/ast_pp.h"
|
|
#include "ast/ast_util.h"
|
|
#include "ast/for_each_expr.h"
|
|
|
|
namespace seq {
|
|
|
|
regex_bisim::regex_bisim(seq_rewriter& rw):
|
|
m(rw.m()),
|
|
m_rw(rw),
|
|
m_util(rw.u()),
|
|
m_pinned(m),
|
|
m_worklist(m) {
|
|
}
|
|
|
|
void regex_bisim::reset() {
|
|
m_uf.reset();
|
|
m_node_of.reset();
|
|
m_pinned.reset();
|
|
m_worklist.reset();
|
|
m_steps = 0;
|
|
}
|
|
|
|
/*
|
|
Map an expression to a union-find node, allocating a fresh node on
|
|
first encounter.
|
|
*/
|
|
unsigned regex_bisim::node_of(expr* r) {
|
|
unsigned id = 0;
|
|
if (m_node_of.find(r, id))
|
|
return id;
|
|
id = m_uf.mk_var();
|
|
m_node_of.insert(r, id);
|
|
m_pinned.push_back(r);
|
|
return id;
|
|
}
|
|
|
|
/*
|
|
Compute a definite nullability answer for r.
|
|
If the seq_rewriter is unable to produce a literal true/false (for
|
|
example because r contains an uninterpreted symbol), return l_undef.
|
|
*/
|
|
lbool regex_bisim::nullability(expr* r) {
|
|
expr_ref n = m_rw.is_nullable(r);
|
|
if (m.is_true(n))
|
|
return l_true;
|
|
if (m.is_false(n))
|
|
return l_false;
|
|
return l_undef;
|
|
}
|
|
|
|
/*
|
|
Test whether a regex expression is a kind that the bisimulation
|
|
procedure can reason about. We require it to be a syntactic ground
|
|
term (no free variables) and that its info reports min_length info
|
|
(which implies that it parses cleanly as a regex constructor).
|
|
*/
|
|
bool regex_bisim::is_supported(expr* r) {
|
|
if (!m_util.is_re(r))
|
|
return false;
|
|
if (!m_util.re.get_info(r).is_known())
|
|
return false;
|
|
// Reject regexes mentioning free variables; the symbolic
|
|
// derivative engine introduces (:var 0) only after we call it
|
|
// ourselves, so any pre-existing variable would be a free var.
|
|
return is_ground(r);
|
|
}
|
|
|
|
/*
|
|
Collect the leaves of a t-regex der (an ITE-tree whose leaves are
|
|
regex expressions) into the output vector. Empty (re.empty) leaves
|
|
are dropped.
|
|
|
|
Each leaf is treated as a single bisimulation state regardless of
|
|
its top-level shape (including re.union and re.antimirov_union):
|
|
descending into a union at the leaf would split one state into
|
|
several, which is semantically unsound for the bisimulation /
|
|
union-find merging that follows.
|
|
|
|
Returns false if we encountered an unexpected node (e.g. a free
|
|
variable creeping in) — in that case the caller should bail out.
|
|
*/
|
|
bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) {
|
|
ptr_vector<expr> work;
|
|
obj_hashtable<expr> seen;
|
|
work.push_back(der);
|
|
seen.insert(der);
|
|
while (!work.empty()) {
|
|
expr* e = work.back();
|
|
work.pop_back();
|
|
expr* c = nullptr, * t = nullptr, * f = nullptr;
|
|
if (m.is_ite(e, c, t, f)) {
|
|
if (seen.insert_if_not_there(t))
|
|
work.push_back(t);
|
|
if (seen.insert_if_not_there(f))
|
|
work.push_back(f);
|
|
continue;
|
|
}
|
|
if (m_util.re.is_empty(e))
|
|
continue;
|
|
if (!m_util.is_re(e))
|
|
return false;
|
|
leaves.push_back(e);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
Fast inequivalence check based on the get_info().classical flag.
|
|
|
|
Invariant: if r is well-formed and get_info(r).classical is true,
|
|
then L(r) is non-empty. The flag is set for regexes built only
|
|
from str.to_re, re.all, union, concat, star, plus, opt, loop;
|
|
it excludes complement, intersection, diff, xor, and the empty
|
|
regex.
|
|
|
|
A bare regex leaf l (i.e. not a XOR pair) represents the implicit
|
|
pair (empty XOR l). If l is classical, L(l) is non-empty so the
|
|
pair is non-empty: the original two regexes have a distinguishing
|
|
prefix and are inequivalent.
|
|
|
|
For an XOR leaf xor(a, b): if both a and b are classical and have
|
|
different min_length, then the shortest word of one is not in the
|
|
other, so the pair is non-empty and we can short-circuit. (The
|
|
case a == b syntactically is already handled by mk_re_xor0.)
|
|
|
|
Returns true if the leaf proves inequivalence; false if no
|
|
conclusion can be drawn.
|
|
*/
|
|
bool regex_bisim::classical_distinguishing(expr* l) {
|
|
expr* a = nullptr, * b = nullptr;
|
|
if (m_util.re.is_xor(l, a, b)) {
|
|
auto ia = m_util.re.get_info(a);
|
|
auto ib = m_util.re.get_info(b);
|
|
if (ia.is_known() && ib.is_known() &&
|
|
ia.classical && ib.classical &&
|
|
ia.min_length != ib.min_length)
|
|
return true;
|
|
return false;
|
|
}
|
|
if (m_util.re.is_empty(l))
|
|
return false;
|
|
auto info = m_util.re.get_info(l);
|
|
return info.is_known() && info.classical;
|
|
}
|
|
|
|
/*
|
|
Merge the two sides of the XOR pair, returning true if a fresh
|
|
merge happened (i.e. the pair must still be processed) and false
|
|
if the two sides were already in the same union-find class.
|
|
|
|
For non-XOR leaves we treat the leaf l as the pair (empty XOR l).
|
|
*/
|
|
bool regex_bisim::merge_leaf(expr* leaf) {
|
|
expr* a = nullptr, * b = nullptr;
|
|
if (!m_util.re.is_xor(leaf, a, b)) {
|
|
a = m_util.re.mk_empty(leaf->get_sort());
|
|
b = leaf;
|
|
m_pinned.push_back(a);
|
|
}
|
|
unsigned ia = node_of(a);
|
|
unsigned ib = node_of(b);
|
|
if (m_uf.find(ia) == m_uf.find(ib))
|
|
return false;
|
|
m_uf.merge(ia, ib);
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
Decide equivalence by bisimulation on D(p XOR q).
|
|
*/
|
|
lbool regex_bisim::are_equivalent(expr* p, expr* q) {
|
|
return are_equivalent_core(p, q);
|
|
}
|
|
|
|
lbool regex_bisim::are_equivalent_core(expr* p, expr* q) {
|
|
if (!is_supported(p) || !is_supported(q))
|
|
return l_undef;
|
|
if (p == q)
|
|
return l_true;
|
|
|
|
reset();
|
|
|
|
// Build the initial pair r0 = p XOR q applying the structural
|
|
// XOR rewrites (r XOR r = empty, AC normalisation, etc.).
|
|
expr_ref r0 = m_rw.mk_re_xor_simplified(p, q);
|
|
|
|
// If r0 simplified to empty, the two regexes are equivalent.
|
|
if (m_util.re.is_empty(r0))
|
|
return l_true;
|
|
|
|
lbool n0 = nullability(r0);
|
|
if (n0 == l_true)
|
|
return l_false; // distinguishing empty word
|
|
if (n0 == l_undef)
|
|
return l_undef;
|
|
|
|
// Classical-leaf shortcut applied to r0 (covers the case where
|
|
// mk_re_xor_simplified collapsed p XOR q to a bare classical
|
|
// residual, e.g. when one side reduced to empty).
|
|
if (classical_distinguishing(r0))
|
|
return l_false;
|
|
|
|
if (!merge_leaf(r0))
|
|
return l_true; // already merged: trivially equivalent
|
|
m_worklist.push_back(r0);
|
|
|
|
while (!m_worklist.empty()) {
|
|
if (++m_steps > m_step_bound)
|
|
return l_undef;
|
|
|
|
expr_ref r(m_worklist.back(), m);
|
|
m_worklist.pop_back();
|
|
|
|
// Compute the symbolic derivative wrt the canonical variable
|
|
// (:var 0). The result is a transition regex (ITE tree) whose
|
|
// leaves are regex expressions. We use the classical Brzozowski
|
|
// entry point so the derivative stays as a single TRegex and
|
|
// does not lift unions to the top via antimirov nodes — this
|
|
// preserves the XOR-pair invariant the bisimulation relies on.
|
|
expr_ref d(m_rw.mk_brz_derivative(r), m);
|
|
expr_ref_vector leaves(m);
|
|
if (!collect_leaves(d, leaves))
|
|
return l_undef;
|
|
|
|
// First pass: check for any nullable leaf (definitive
|
|
// distinguishing empty-continuation word) or any classically
|
|
// non-empty leaf (definitive distinguishing non-empty prefix).
|
|
for (expr* l : leaves) {
|
|
lbool nl = nullability(l);
|
|
if (nl == l_true)
|
|
return l_false;
|
|
if (nl == l_undef)
|
|
return l_undef;
|
|
if (classical_distinguishing(l))
|
|
return l_false;
|
|
}
|
|
|
|
// Second pass: merge each leaf into the union-find; new
|
|
// merges go onto the worklist.
|
|
for (expr* l : leaves) {
|
|
if (merge_leaf(l)) {
|
|
m_pinned.push_back(l);
|
|
m_worklist.push_back(l);
|
|
}
|
|
}
|
|
}
|
|
return l_true;
|
|
}
|
|
}
|