3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 15:16:29 +00:00
z3/src/ast/rewriter/seq_regex_bisim.cpp
Margus Veanes c67bb140dc
Treat each transition-regex leaf as a single bisimulation state (#9871)
## Summary

egex_bisim::collect_leaves used to descend through `re.union` and
`re.antimirov_union` at the top of each leaf of the transition regex,
splitting a single bisimulation state into multiple states before they
were merged into the union-find. This contradicts the bisimulation
invariant: **each leaf of a t-regex represents one state, regardless of
its top-level shape**. The fix descends into `ite` only (which is the
actual structural splitter of guarded transitions).

## Why it matters

The split happens to be *sound* for the current algorithm when the goal
is asserting `L(union(A, B)) = empty` (since `L(A) = empty AND L(B) =
empty` is equivalent), but it:

1. Adds spurious merges to the union-find that distort state-class
identities.
2. Slows convergence on hard equivalence queries (and causes early
timeouts in practice).
3. Creates latent unsoundness risk for any extension that interprets
leaves more semantically (XOR pair handling, classical-flag propagation,
future antimirov re-enable, etc.).

## Empirical validation

Run on the 1523-file regex-equivalence corpus, 5s/file timeout, 8
workers:

| metric | pre-fix master | post-fix |
|---|---|---|
| sat | 1008 | 1014 |
| unsat | 368 | 368 |
| timeout | 145 | 139 |
| unknown | 2 | 2 |
| SAT↔UNSAT verdict flips | — | **0** |
| timeout→sat flips | — | 6 |
| commonly-solved wall ratio | 1.000x | **0.902x** |

The 6 `timeout` → `sat` cases all return the *same* `sat` under
pre-fix master if given 60s; they are previously-slow cases not
previously-wrong ones.

Z3 unit tests: 89/89 pass (`test-z3 /a`).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-15 10:59:41 -07:00

269 lines
8.8 KiB
C++

/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_regex_bisim.cpp
Abstract:
See seq_regex_bisim.h.
Author:
Margus Veanes (veanes)
Nikolaj Bjorner (nbjorner)
--*/
#include "ast/rewriter/seq_regex_bisim.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/ast_pp.h"
#include "ast/ast_util.h"
#include "ast/for_each_expr.h"
namespace seq {
regex_bisim::regex_bisim(seq_rewriter& rw):
m(rw.m()),
m_rw(rw),
m_util(rw.u()),
m_pinned(m),
m_worklist(m) {
}
void regex_bisim::reset() {
m_uf.reset();
m_node_of.reset();
m_pinned.reset();
m_worklist.reset();
m_steps = 0;
}
/*
Map an expression to a union-find node, allocating a fresh node on
first encounter.
*/
unsigned regex_bisim::node_of(expr* r) {
unsigned id = 0;
if (m_node_of.find(r, id))
return id;
id = m_uf.mk_var();
m_node_of.insert(r, id);
m_pinned.push_back(r);
return id;
}
/*
Compute a definite nullability answer for r.
If the seq_rewriter is unable to produce a literal true/false (for
example because r contains an uninterpreted symbol), return l_undef.
*/
lbool regex_bisim::nullability(expr* r) {
expr_ref n = m_rw.is_nullable(r);
if (m.is_true(n))
return l_true;
if (m.is_false(n))
return l_false;
return l_undef;
}
/*
Test whether a regex expression is a kind that the bisimulation
procedure can reason about. We require it to be a syntactic ground
term (no free variables) and that its info reports min_length info
(which implies that it parses cleanly as a regex constructor).
*/
bool regex_bisim::is_supported(expr* r) {
if (!m_util.is_re(r))
return false;
if (!m_util.re.get_info(r).is_known())
return false;
// Reject regexes mentioning free variables; the symbolic
// derivative engine introduces (:var 0) only after we call it
// ourselves, so any pre-existing variable would be a free var.
return is_ground(r);
}
/*
Collect the leaves of a t-regex der (an ITE-tree whose leaves are
regex expressions) into the output vector. Empty (re.empty) leaves
are dropped.
Each leaf is treated as a single bisimulation state regardless of
its top-level shape (including re.union and re.antimirov_union):
descending into a union at the leaf would split one state into
several, which is semantically unsound for the bisimulation /
union-find merging that follows.
Returns false if we encountered an unexpected node (e.g. a free
variable creeping in) — in that case the caller should bail out.
*/
bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) {
ptr_vector<expr> work;
obj_hashtable<expr> seen;
work.push_back(der);
seen.insert(der);
while (!work.empty()) {
expr* e = work.back();
work.pop_back();
expr* c = nullptr, * t = nullptr, * f = nullptr;
if (m.is_ite(e, c, t, f)) {
if (seen.insert_if_not_there(t))
work.push_back(t);
if (seen.insert_if_not_there(f))
work.push_back(f);
continue;
}
if (m_util.re.is_empty(e))
continue;
if (!m_util.is_re(e))
return false;
leaves.push_back(e);
}
return true;
}
/*
Fast inequivalence check based on the get_info().classical flag.
Invariant: if r is well-formed and get_info(r).classical is true,
then L(r) is non-empty. The flag is set for regexes built only
from str.to_re, re.all, union, concat, star, plus, opt, loop;
it excludes complement, intersection, diff, xor, and the empty
regex.
A bare regex leaf l (i.e. not a XOR pair) represents the implicit
pair (empty XOR l). If l is classical, L(l) is non-empty so the
pair is non-empty: the original two regexes have a distinguishing
prefix and are inequivalent.
For an XOR leaf xor(a, b): if both a and b are classical and have
different min_length, then the shortest word of one is not in the
other, so the pair is non-empty and we can short-circuit. (The
case a == b syntactically is already handled by mk_re_xor0.)
Returns true if the leaf proves inequivalence; false if no
conclusion can be drawn.
*/
bool regex_bisim::classical_distinguishing(expr* l) {
expr* a = nullptr, * b = nullptr;
if (m_util.re.is_xor(l, a, b)) {
auto ia = m_util.re.get_info(a);
auto ib = m_util.re.get_info(b);
if (ia.is_known() && ib.is_known() &&
ia.classical && ib.classical &&
ia.min_length != ib.min_length)
return true;
return false;
}
if (m_util.re.is_empty(l))
return false;
auto info = m_util.re.get_info(l);
return info.is_known() && info.classical;
}
/*
Merge the two sides of the XOR pair, returning true if a fresh
merge happened (i.e. the pair must still be processed) and false
if the two sides were already in the same union-find class.
For non-XOR leaves we treat the leaf l as the pair (empty XOR l).
*/
bool regex_bisim::merge_leaf(expr* leaf) {
expr* a = nullptr, * b = nullptr;
if (!m_util.re.is_xor(leaf, a, b)) {
a = m_util.re.mk_empty(leaf->get_sort());
b = leaf;
m_pinned.push_back(a);
}
unsigned ia = node_of(a);
unsigned ib = node_of(b);
if (m_uf.find(ia) == m_uf.find(ib))
return false;
m_uf.merge(ia, ib);
return true;
}
/*
Decide equivalence by bisimulation on D(p XOR q).
*/
lbool regex_bisim::are_equivalent(expr* p, expr* q) {
return are_equivalent_core(p, q);
}
lbool regex_bisim::are_equivalent_core(expr* p, expr* q) {
if (!is_supported(p) || !is_supported(q))
return l_undef;
if (p == q)
return l_true;
reset();
// Build the initial pair r0 = p XOR q applying the structural
// XOR rewrites (r XOR r = empty, AC normalisation, etc.).
expr_ref r0 = m_rw.mk_re_xor_simplified(p, q);
// If r0 simplified to empty, the two regexes are equivalent.
if (m_util.re.is_empty(r0))
return l_true;
lbool n0 = nullability(r0);
if (n0 == l_true)
return l_false; // distinguishing empty word
if (n0 == l_undef)
return l_undef;
// Classical-leaf shortcut applied to r0 (covers the case where
// mk_re_xor_simplified collapsed p XOR q to a bare classical
// residual, e.g. when one side reduced to empty).
if (classical_distinguishing(r0))
return l_false;
if (!merge_leaf(r0))
return l_true; // already merged: trivially equivalent
m_worklist.push_back(r0);
while (!m_worklist.empty()) {
if (++m_steps > m_step_bound)
return l_undef;
expr_ref r(m_worklist.back(), m);
m_worklist.pop_back();
// Compute the symbolic derivative wrt the canonical variable
// (:var 0). The result is a transition regex (ITE tree) whose
// leaves are regex expressions. We use the classical Brzozowski
// entry point so the derivative stays as a single TRegex and
// does not lift unions to the top via antimirov nodes — this
// preserves the XOR-pair invariant the bisimulation relies on.
expr_ref d(m_rw.mk_brz_derivative(r), m);
expr_ref_vector leaves(m);
if (!collect_leaves(d, leaves))
return l_undef;
// First pass: check for any nullable leaf (definitive
// distinguishing empty-continuation word) or any classically
// non-empty leaf (definitive distinguishing non-empty prefix).
for (expr* l : leaves) {
lbool nl = nullability(l);
if (nl == l_true)
return l_false;
if (nl == l_undef)
return l_undef;
if (classical_distinguishing(l))
return l_false;
}
// Second pass: merge each leaf into the union-find; new
// merges go onto the worklist.
for (expr* l : leaves) {
if (merge_leaf(l)) {
m_pinned.push_back(l);
m_worklist.push_back(l);
}
}
}
return l_true;
}
}