3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-27 19:08:49 +00:00

Add OP_RE_XOR and union-find bisimulation for ground regex equivalence (#9804)

Implements the algorithm of Eq(p,q) = Empty(p XOR q)' using a union-find
driven bisimulation closure (per the CAV'26 ERE paper).

### What's added

* **New primitive OP_RE_XOR (re.xor)** wired through seq_decl_plugin:
parser signature, info propagation (nullable, min_length), and
pretty-printer.
* **seq_rewriter**: structural XOR rewrites ( XOR r = empty, XOR empty =
r, ull XOR r = comp(r), comp/comp absorption, complement push, AC
normalisation), nullability (Null(p XOR q) = Null(p) != Null(q)),
derivative (D_a(p XOR q) = D_a(p) XOR D_a(q)), reverse, antimirov
derivative, and `check_deriv_normal_form` coverage.
* **New class seq::regex_bisim** in
`src/ast/rewriter/seq_regex_bisim.{h,cpp}` to keep the bisim logic out
of the already-large `seq_rewriter.cpp`. Uses `basic_union_find` from
`util/union_find.h`, an `obj_map` for the node assignment, and a
50000-step bound (returns `l_undef` on overrun).
* **Integration** in `seq_rewriter::reduce_re_eq` (with a re-entry
guard) and in `seq_regex::propagate_eq` / `propagate_ne` for ground
regexes; on `l_undef` we fall back to the existing axiomatisation.
* **`sls_seq_plugin`**: extend `OP_RE_DIFF` switch arms to also cover
`OP_RE_XOR`.

### Validation

* Full release build with MSVC + Ninja.
* `./test-z3 /a` -- 89/89 tests passing.
* `./test-z3 /seq smt2print_parse` -- PASS.
* Smoke tests with `(a|b)*` vs `(a*b*)*` (equal) and `a*` vs `(a|b)*`
(not equal) return the expected `sat`/`unsat` quickly.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Margus Veanes 2026-06-10 14:58:20 -07:00 committed by GitHub
parent 589bd9e6f5
commit 513b81253b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 664 additions and 20 deletions

View file

@ -0,0 +1,267 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_regex_bisim.cpp
Abstract:
See seq_regex_bisim.h.
Author:
Nikolaj Bjorner (nbjorner)
Margus Veanes (veanes)
--*/
#include "ast/rewriter/seq_regex_bisim.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/ast_pp.h"
#include "ast/ast_util.h"
#include "ast/for_each_expr.h"
namespace seq {
regex_bisim::regex_bisim(seq_rewriter& rw):
m(rw.m()),
m_rw(rw),
m_util(rw.u()),
m_pinned(m),
m_worklist(m) {
}
void regex_bisim::reset() {
m_uf.reset();
m_node_of.reset();
m_pinned.reset();
m_worklist.reset();
m_steps = 0;
}
/*
Map an expression to a union-find node, allocating a fresh node on
first encounter.
*/
unsigned regex_bisim::node_of(expr* r) {
unsigned id = 0;
if (m_node_of.find(r, id))
return id;
id = m_uf.mk_var();
m_node_of.insert(r, id);
m_pinned.push_back(r);
return id;
}
/*
Compute a definite nullability answer for r.
If the seq_rewriter is unable to produce a literal true/false (for
example because r contains an uninterpreted symbol), return l_undef.
*/
lbool regex_bisim::nullability(expr* r) {
expr_ref n = m_rw.is_nullable(r);
if (m.is_true(n))
return l_true;
if (m.is_false(n))
return l_false;
return l_undef;
}
/*
Test whether a regex expression is a kind that the bisimulation
procedure can reason about. We require it to be a syntactic ground
term (no free variables) and that its info reports min_length info
(which implies that it parses cleanly as a regex constructor).
*/
bool regex_bisim::is_supported(expr* r) {
if (!m_util.is_re(r))
return false;
if (!m_util.re.get_info(r).is_known())
return false;
// Reject regexes mentioning free variables; the symbolic
// derivative engine introduces (:var 0) only after we call it
// ourselves, so any pre-existing variable would be a free var.
if (!is_ground(r))
return false;
return true;
}
/*
Collect the leaves of a t-regex der (an ITE / antimirov union /
union-tree with regex leaves) into the output vector. Empty
(re.empty) leaves are dropped.
Returns false if we encountered an unexpected node (e.g. a free
variable creeping in) in that case the caller should bail out.
*/
bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) {
ptr_vector<expr> work;
obj_hashtable<expr> seen;
work.push_back(der);
seen.insert(der);
while (!work.empty()) {
expr* e = work.back();
work.pop_back();
expr* c = nullptr, * t = nullptr, * f = nullptr;
if (m.is_ite(e, c, t, f) ||
m_util.re.is_union(e, t, f) ||
m_util.re.is_antimirov_union(e, t, f)) {
if (seen.insert_if_not_there(t))
work.push_back(t);
if (seen.insert_if_not_there(f))
work.push_back(f);
continue;
}
if (m_util.re.is_empty(e))
continue;
if (!m_util.is_re(e))
return false;
leaves.push_back(e);
}
return true;
}
/*
Fast inequivalence check based on the get_info().classical flag.
Invariant: if r is well-formed and get_info(r).classical is true,
then L(r) is non-empty. The flag is set for regexes built only
from str.to_re, re.all, union, concat, star, plus, opt, loop;
it excludes complement, intersection, diff, xor, and the empty
regex.
A bare regex leaf l (i.e. not a XOR pair) represents the implicit
pair (empty XOR l). If l is classical, L(l) is non-empty so the
pair is non-empty: the original two regexes have a distinguishing
prefix and are inequivalent.
For an XOR leaf xor(a, b): if both a and b are classical and have
different min_length, then the shortest word of one is not in the
other, so the pair is non-empty and we can short-circuit. (The
case a == b syntactically is already handled by mk_re_xor0.)
Returns true if the leaf proves inequivalence; false if no
conclusion can be drawn.
*/
bool regex_bisim::classical_distinguishing(expr* l) {
expr* a = nullptr, * b = nullptr;
if (m_util.re.is_xor(l, a, b)) {
auto ia = m_util.re.get_info(a);
auto ib = m_util.re.get_info(b);
if (ia.is_known() && ib.is_known() &&
ia.classical && ib.classical &&
ia.min_length != ib.min_length)
return true;
return false;
}
if (m_util.re.is_empty(l))
return false;
auto info = m_util.re.get_info(l);
return info.is_known() && info.classical;
}
/*
Merge the two sides of the XOR pair, returning true if a fresh
merge happened (i.e. the pair must still be processed) and false
if the two sides were already in the same union-find class.
For non-XOR leaves we treat the leaf l as the pair (empty XOR l).
*/
bool regex_bisim::merge_leaf(expr* leaf) {
expr* a = nullptr, * b = nullptr;
if (!m_util.re.is_xor(leaf, a, b)) {
a = m_util.re.mk_empty(leaf->get_sort());
b = leaf;
m_pinned.push_back(a);
}
unsigned ia = node_of(a);
unsigned ib = node_of(b);
if (m_uf.find(ia) == m_uf.find(ib))
return false;
m_uf.merge(ia, ib);
return true;
}
/*
Decide equivalence by bisimulation on D(p XOR q).
*/
lbool regex_bisim::are_equivalent(expr* p, expr* q) {
return are_equivalent_core(p, q);
}
lbool regex_bisim::are_equivalent_core(expr* p, expr* q) {
if (!is_supported(p) || !is_supported(q))
return l_undef;
if (p == q)
return l_true;
reset();
// Build the initial pair r0 = p XOR q applying the structural
// XOR rewrites (r XOR r = empty, AC normalisation, etc.).
expr_ref r0 = m_rw.mk_re_xor_simplified(p, q);
// If r0 simplified to empty, the two regexes are equivalent.
if (m_util.re.is_empty(r0))
return l_true;
lbool n0 = nullability(r0);
if (n0 == l_true)
return l_false; // distinguishing empty word
if (n0 == l_undef)
return l_undef;
// Classical-leaf shortcut applied to r0 (covers the case where
// mk_re_xor_simplified collapsed p XOR q to a bare classical
// residual, e.g. when one side reduced to empty).
if (classical_distinguishing(r0))
return l_false;
if (!merge_leaf(r0))
return l_true; // already merged: trivially equivalent
m_worklist.push_back(r0);
while (!m_worklist.empty()) {
if (++m_steps > m_step_bound)
return l_undef;
expr_ref r(m_worklist.back(), m);
m_worklist.pop_back();
// Compute the symbolic derivative wrt the canonical variable
// (:var 0). The result is a transition regex (ITE tree) whose
// leaves are regex expressions. We use the classical Brzozowski
// entry point so the derivative stays as a single TRegex and
// does not lift unions to the top via antimirov nodes — this
// preserves the XOR-pair invariant the bisimulation relies on.
expr_ref d(m_rw.mk_brz_derivative(r), m);
expr_ref_vector leaves(m);
if (!collect_leaves(d, leaves))
return l_undef;
// First pass: check for any nullable leaf (definitive
// distinguishing empty-continuation word) or any classically
// non-empty leaf (definitive distinguishing non-empty prefix).
for (expr* l : leaves) {
lbool nl = nullability(l);
if (nl == l_true)
return l_false;
if (nl == l_undef)
return l_undef;
if (classical_distinguishing(l))
return l_false;
}
// Second pass: merge each leaf into the union-find; new
// merges go onto the worklist.
for (expr* l : leaves) {
if (merge_leaf(l)) {
m_pinned.push_back(l);
m_worklist.push_back(l);
}
}
}
return l_true;
}
}