From 513b81253b7b8f45a796b6563029bc26a657e0a1 Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Wed, 10 Jun 2026 14:58:20 -0700 Subject: [PATCH] Add OP_RE_XOR and union-find bisimulation for ground regex equivalence (#9804) Implements the algorithm of Eq(p,q) = Empty(p XOR q)' using a union-find driven bisimulation closure (per the CAV'26 ERE paper). ### What's added * **New primitive OP_RE_XOR (re.xor)** wired through seq_decl_plugin: parser signature, info propagation (nullable, min_length), and pretty-printer. * **seq_rewriter**: structural XOR rewrites ( XOR r = empty, XOR empty = r, ull XOR r = comp(r), comp/comp absorption, complement push, AC normalisation), nullability (Null(p XOR q) = Null(p) != Null(q)), derivative (D_a(p XOR q) = D_a(p) XOR D_a(q)), reverse, antimirov derivative, and `check_deriv_normal_form` coverage. * **New class seq::regex_bisim** in `src/ast/rewriter/seq_regex_bisim.{h,cpp}` to keep the bisim logic out of the already-large `seq_rewriter.cpp`. Uses `basic_union_find` from `util/union_find.h`, an `obj_map` for the node assignment, and a 50000-step bound (returns `l_undef` on overrun). * **Integration** in `seq_rewriter::reduce_re_eq` (with a re-entry guard) and in `seq_regex::propagate_eq` / `propagate_ne` for ground regexes; on `l_undef` we fall back to the existing axiomatisation. * **`sls_seq_plugin`**: extend `OP_RE_DIFF` switch arms to also cover `OP_RE_XOR`. ### Validation * Full release build with MSVC + Ninja. * `./test-z3 /a` -- 89/89 tests passing. * `./test-z3 /seq smt2print_parse` -- PASS. * Smoke tests with `(a|b)*` vs `(a*b*)*` (equal) and `a*` vs `(a|b)*` (not equal) return the expected `sat`/`unsat` quickly. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ast/rewriter/CMakeLists.txt | 1 + src/ast/rewriter/seq_regex_bisim.cpp | 267 +++++++++++++++++++++++++++ src/ast/rewriter/seq_regex_bisim.h | 99 ++++++++++ src/ast/rewriter/seq_rewriter.cpp | 217 ++++++++++++++++++++-- src/ast/rewriter/seq_rewriter.h | 29 ++- src/ast/seq_decl_plugin.cpp | 32 ++++ src/ast/seq_decl_plugin.h | 5 + src/ast/sls/sls_seq_plugin.cpp | 2 + src/smt/seq_regex.cpp | 32 ++++ 9 files changed, 664 insertions(+), 20 deletions(-) create mode 100644 src/ast/rewriter/seq_regex_bisim.cpp create mode 100644 src/ast/rewriter/seq_regex_bisim.h diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index ba7eda277..cfcc179bc 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -41,6 +41,7 @@ z3_add_component(rewriter seq_eq_solver.cpp seq_subset.cpp seq_rewriter.cpp + seq_regex_bisim.cpp seq_skolem.cpp th_rewriter.cpp value_sweep.cpp diff --git a/src/ast/rewriter/seq_regex_bisim.cpp b/src/ast/rewriter/seq_regex_bisim.cpp new file mode 100644 index 000000000..833c36dca --- /dev/null +++ b/src/ast/rewriter/seq_regex_bisim.cpp @@ -0,0 +1,267 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_regex_bisim.cpp + +Abstract: + + See seq_regex_bisim.h. + +Author: + + Nikolaj Bjorner (nbjorner) + Margus Veanes (veanes) + +--*/ + +#include "ast/rewriter/seq_regex_bisim.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/ast_pp.h" +#include "ast/ast_util.h" +#include "ast/for_each_expr.h" + +namespace seq { + + regex_bisim::regex_bisim(seq_rewriter& rw): + m(rw.m()), + m_rw(rw), + m_util(rw.u()), + m_pinned(m), + m_worklist(m) { + } + + void regex_bisim::reset() { + m_uf.reset(); + m_node_of.reset(); + m_pinned.reset(); + m_worklist.reset(); + m_steps = 0; + } + + /* + Map an expression to a union-find node, allocating a fresh node on + first encounter. + */ + unsigned regex_bisim::node_of(expr* r) { + unsigned id = 0; + if (m_node_of.find(r, id)) + return id; + id = m_uf.mk_var(); + m_node_of.insert(r, id); + m_pinned.push_back(r); + return id; + } + + /* + Compute a definite nullability answer for r. + If the seq_rewriter is unable to produce a literal true/false (for + example because r contains an uninterpreted symbol), return l_undef. + */ + lbool regex_bisim::nullability(expr* r) { + expr_ref n = m_rw.is_nullable(r); + if (m.is_true(n)) + return l_true; + if (m.is_false(n)) + return l_false; + return l_undef; + } + + /* + Test whether a regex expression is a kind that the bisimulation + procedure can reason about. We require it to be a syntactic ground + term (no free variables) and that its info reports min_length info + (which implies that it parses cleanly as a regex constructor). + */ + bool regex_bisim::is_supported(expr* r) { + if (!m_util.is_re(r)) + return false; + if (!m_util.re.get_info(r).is_known()) + return false; + // Reject regexes mentioning free variables; the symbolic + // derivative engine introduces (:var 0) only after we call it + // ourselves, so any pre-existing variable would be a free var. + if (!is_ground(r)) + return false; + return true; + } + + /* + Collect the leaves of a t-regex der (an ITE / antimirov union / + union-tree with regex leaves) into the output vector. Empty + (re.empty) leaves are dropped. + + Returns false if we encountered an unexpected node (e.g. a free + variable creeping in) — in that case the caller should bail out. + */ + bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) { + ptr_vector work; + obj_hashtable seen; + work.push_back(der); + seen.insert(der); + while (!work.empty()) { + expr* e = work.back(); + work.pop_back(); + expr* c = nullptr, * t = nullptr, * f = nullptr; + if (m.is_ite(e, c, t, f) || + m_util.re.is_union(e, t, f) || + m_util.re.is_antimirov_union(e, t, f)) { + if (seen.insert_if_not_there(t)) + work.push_back(t); + if (seen.insert_if_not_there(f)) + work.push_back(f); + continue; + } + if (m_util.re.is_empty(e)) + continue; + if (!m_util.is_re(e)) + return false; + leaves.push_back(e); + } + return true; + } + + /* + Fast inequivalence check based on the get_info().classical flag. + + Invariant: if r is well-formed and get_info(r).classical is true, + then L(r) is non-empty. The flag is set for regexes built only + from str.to_re, re.all, union, concat, star, plus, opt, loop; + it excludes complement, intersection, diff, xor, and the empty + regex. + + A bare regex leaf l (i.e. not a XOR pair) represents the implicit + pair (empty XOR l). If l is classical, L(l) is non-empty so the + pair is non-empty: the original two regexes have a distinguishing + prefix and are inequivalent. + + For an XOR leaf xor(a, b): if both a and b are classical and have + different min_length, then the shortest word of one is not in the + other, so the pair is non-empty and we can short-circuit. (The + case a == b syntactically is already handled by mk_re_xor0.) + + Returns true if the leaf proves inequivalence; false if no + conclusion can be drawn. + */ + bool regex_bisim::classical_distinguishing(expr* l) { + expr* a = nullptr, * b = nullptr; + if (m_util.re.is_xor(l, a, b)) { + auto ia = m_util.re.get_info(a); + auto ib = m_util.re.get_info(b); + if (ia.is_known() && ib.is_known() && + ia.classical && ib.classical && + ia.min_length != ib.min_length) + return true; + return false; + } + if (m_util.re.is_empty(l)) + return false; + auto info = m_util.re.get_info(l); + return info.is_known() && info.classical; + } + + /* + Merge the two sides of the XOR pair, returning true if a fresh + merge happened (i.e. the pair must still be processed) and false + if the two sides were already in the same union-find class. + + For non-XOR leaves we treat the leaf l as the pair (empty XOR l). + */ + bool regex_bisim::merge_leaf(expr* leaf) { + expr* a = nullptr, * b = nullptr; + if (!m_util.re.is_xor(leaf, a, b)) { + a = m_util.re.mk_empty(leaf->get_sort()); + b = leaf; + m_pinned.push_back(a); + } + unsigned ia = node_of(a); + unsigned ib = node_of(b); + if (m_uf.find(ia) == m_uf.find(ib)) + return false; + m_uf.merge(ia, ib); + return true; + } + + /* + Decide equivalence by bisimulation on D(p XOR q). + */ + lbool regex_bisim::are_equivalent(expr* p, expr* q) { + return are_equivalent_core(p, q); + } + + lbool regex_bisim::are_equivalent_core(expr* p, expr* q) { + if (!is_supported(p) || !is_supported(q)) + return l_undef; + if (p == q) + return l_true; + + reset(); + + // Build the initial pair r0 = p XOR q applying the structural + // XOR rewrites (r XOR r = empty, AC normalisation, etc.). + expr_ref r0 = m_rw.mk_re_xor_simplified(p, q); + + // If r0 simplified to empty, the two regexes are equivalent. + if (m_util.re.is_empty(r0)) + return l_true; + + lbool n0 = nullability(r0); + if (n0 == l_true) + return l_false; // distinguishing empty word + if (n0 == l_undef) + return l_undef; + + // Classical-leaf shortcut applied to r0 (covers the case where + // mk_re_xor_simplified collapsed p XOR q to a bare classical + // residual, e.g. when one side reduced to empty). + if (classical_distinguishing(r0)) + return l_false; + + if (!merge_leaf(r0)) + return l_true; // already merged: trivially equivalent + m_worklist.push_back(r0); + + while (!m_worklist.empty()) { + if (++m_steps > m_step_bound) + return l_undef; + + expr_ref r(m_worklist.back(), m); + m_worklist.pop_back(); + + // Compute the symbolic derivative wrt the canonical variable + // (:var 0). The result is a transition regex (ITE tree) whose + // leaves are regex expressions. We use the classical Brzozowski + // entry point so the derivative stays as a single TRegex and + // does not lift unions to the top via antimirov nodes — this + // preserves the XOR-pair invariant the bisimulation relies on. + expr_ref d(m_rw.mk_brz_derivative(r), m); + expr_ref_vector leaves(m); + if (!collect_leaves(d, leaves)) + return l_undef; + + // First pass: check for any nullable leaf (definitive + // distinguishing empty-continuation word) or any classically + // non-empty leaf (definitive distinguishing non-empty prefix). + for (expr* l : leaves) { + lbool nl = nullability(l); + if (nl == l_true) + return l_false; + if (nl == l_undef) + return l_undef; + if (classical_distinguishing(l)) + return l_false; + } + + // Second pass: merge each leaf into the union-find; new + // merges go onto the worklist. + for (expr* l : leaves) { + if (merge_leaf(l)) { + m_pinned.push_back(l); + m_worklist.push_back(l); + } + } + } + return l_true; + } +} diff --git a/src/ast/rewriter/seq_regex_bisim.h b/src/ast/rewriter/seq_regex_bisim.h new file mode 100644 index 000000000..561088270 --- /dev/null +++ b/src/ast/rewriter/seq_regex_bisim.h @@ -0,0 +1,99 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_regex_bisim.h + +Abstract: + + Union-find based bisimulation algorithm for symbolic regular expression + equivalence, based on the construction described in: + + "Symbolic Extended Regular Expression Equivalence" + Veanes, Bjorner et al., CAV'26 (see \git\ere\cav26\paper.tex) + + The algorithm decides equivalence of two regexes p, q by performing + a bisimulation search on the symbolic derivative of p XOR q. A + union-find structure tracks pairs of regex states that should be + bisimilar, and the worklist is initialised with the singleton + {p XOR q}. + + The algorithm returns: + l_true p and q are equivalent (bisimilar) + l_false p and q are inequivalent (a distinguishing prefix exists) + l_undef the algorithm could not decide within the configured bound + or encountered a non-ground regex it cannot reason about + +Author: + + Nikolaj Bjorner (nbjorner) + Margus Veanes (veanes) + +--*/ +#pragma once + +#include "ast/seq_decl_plugin.h" +#include "util/lbool.h" +#include "util/obj_hashtable.h" +#include "util/union_find.h" + +class seq_rewriter; + +namespace seq { + + /* + Union-find based bisimulation algorithm for deciding equivalence + of two ground (closed) regular expressions in ERE. + + Usage: + + seq::regex_bisim bisim(rewriter); + lbool r = bisim.are_equivalent(p, q); + if (r == l_true) // p ≡ q + if (r == l_false) // p ≢ q + if (r == l_undef) // cannot decide + + The implementation only attempts the equivalence check on ground + regexes (no free variables) that use the standard symbolic regex + constructors. For non-ground or unsupported inputs the procedure + conservatively returns l_undef. + */ + class regex_bisim { + private: + ast_manager& m; + seq_rewriter& m_rw; + seq_util m_util; + basic_union_find m_uf; + obj_map m_node_of; + expr_ref_vector m_pinned; + expr_ref_vector m_worklist; + unsigned m_step_bound { 50000 }; + unsigned m_steps { 0 }; + + unsigned node_of(expr* r); + bool merge_leaf(expr* xor_pair); + bool collect_leaves(expr* der, expr_ref_vector& leaves); + lbool nullability(expr* r); + bool is_supported(expr* r); + // Returns true if the leaf l proves that the original pair is + // inequivalent and the bisim can short-circuit to l_false. + // Uses the get_info().classical invariant: a classical regex + // has non-empty language, so a bare classical leaf (implicitly + // empty XOR r) is a distinguishing prefix. Similarly an XOR of + // two classical regexes with different min_length is + // distinguishing. + bool classical_distinguishing(expr* l); + void reset(); + + lbool are_equivalent_core(expr* p, expr* q); + + public: + regex_bisim(seq_rewriter& rw); + + void set_step_bound(unsigned n) { m_step_bound = n; } + + lbool are_equivalent(expr* p, expr* q); + }; + +} diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 46da49cf2..1654c5777 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -20,6 +20,7 @@ Authors: #include "util/uint_set.h" #include "ast/rewriter/seq_rewriter.h" +#include "ast/rewriter/seq_regex_bisim.h" #include "ast/arith_decl_plugin.h" #include "ast/array_decl_plugin.h" #include "ast/ast_pp.h" @@ -267,6 +268,14 @@ br_status seq_rewriter::mk_app_core(func_decl * f, unsigned num_args, expr * con st = BR_DONE; } break; + case OP_RE_XOR: + if (num_args == 2) + st = mk_re_xor(args[0], args[1], result); + else if (num_args == 1) { + result = args[0]; + st = BR_DONE; + } + break; case OP_RE_INTERSECT: if (num_args == 1) { result = args[0]; @@ -2622,6 +2631,23 @@ expr_ref seq_rewriter::is_nullable_rec(expr* r) { m_br.mk_not(is_nullable(r2), result); m_br.mk_and(result, is_nullable(r1), result); } + else if (re().is_xor(r, r1, r2)) { + // Null(r1 XOR r2) = Null(r1) XOR Null(r2) + expr_ref n1(is_nullable(r1), m()); + expr_ref n2(is_nullable(r2), m()); + // Simplify when either operand is a boolean literal so the + // bisimulation procedure can use the answer directly. + if (m().is_true(n1)) + result = mk_not(m(), n2); + else if (m().is_false(n1)) + result = n2; + else if (m().is_true(n2)) + result = mk_not(m(), n1); + else if (m().is_false(n2)) + result = n1; + else + result = m().mk_xor(n1, n2); + } else if (re().is_star(r) || re().is_opt(r) || re().is_full_seq(r) || @@ -2742,6 +2768,12 @@ br_status seq_rewriter::mk_re_reverse(expr* r, expr_ref& result) { result = re().mk_diff(a, b); return BR_REWRITE2; } + else if (re().is_xor(r, r1, r2)) { + auto a = re().mk_reverse(r1); + auto b = re().mk_reverse(r2); + result = re().mk_xor(a, b); + return BR_REWRITE2; + } else if (m().is_ite(r, p, r1, r2)) { result = m().mk_ite(p, re().mk_reverse(r1), re().mk_reverse(r2)); return BR_REWRITE2; @@ -2882,6 +2914,7 @@ bool seq_rewriter::check_deriv_normal_form(expr* r, int level) { re().is_concat(r, r1, r2) || re().is_union(r, r1, r2) || re().is_intersection(r, r1, r2) || + re().is_xor(r, r1, r2) || m().is_ite(r, p, r1, r2)) { check_deriv_normal_form(r1, new_level); check_deriv_normal_form(r2, new_level); @@ -2921,6 +2954,14 @@ expr_ref seq_rewriter::mk_derivative(expr* r) { return mk_antimirov_deriv(v, r, m().mk_true()); } +expr_ref seq_rewriter::mk_brz_derivative(expr* r) { + sort* seq_sort = nullptr, * ele_sort = nullptr; + VERIFY(m_util.is_re(r, seq_sort)); + VERIFY(m_util.is_seq(seq_sort, ele_sort)); + expr_ref v(m().mk_var(0, ele_sort), m()); + return mk_derivative_rec(v, r); +} + expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) { return mk_antimirov_deriv(ele, r, m().mk_true()); } @@ -3120,6 +3161,10 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref result = mk_antimirov_deriv_intersection(e, mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv_negate(e, mk_antimirov_deriv(e, r2, path)), m().mk_true()); + else if (re().is_xor(r, r1, r2)) + // D(e, r1 XOR r2) = D(e, r1) XOR D(e, r2) + result = mk_der_xor(mk_antimirov_deriv(e, r1, path), + mk_antimirov_deriv(e, r2, path)); else if (re().is_of_pred(r, r1)) { array_util array(m()); expr* args[2] = { r1, e }; @@ -3451,6 +3496,11 @@ expr_ref seq_rewriter::mk_regex_reverse(expr* r) { auto b1 = mk_regex_reverse(r2); result = re().mk_diff(a1, b1); } + else if (re().is_xor(r, r1, r2)) { + auto a1 = mk_regex_reverse(r1); + auto b1 = mk_regex_reverse(r2); + result = re().mk_xor(a1, b1); + } else if (re().is_star(r, r1)) result = re().mk_star(mk_regex_reverse(r1)); else if (re().is_plus(r, r1)) @@ -3546,7 +3596,6 @@ expr_ref seq_rewriter::simplify_path(expr* elem, expr* path) { expr_ref seq_rewriter::mk_der_antimirov_union(expr* r1, expr* r2) { - verbose_stream() << "union " << r1->get_id() << " " << r2->get_id() << "\n"; return mk_der_op(_OP_RE_ANTIMIROV_UNION, r1, r2); } @@ -3558,6 +3607,10 @@ expr_ref seq_rewriter::mk_der_inter(expr* r1, expr* r2) { return mk_der_op(OP_RE_INTERSECT, r1, r2); } +expr_ref seq_rewriter::mk_der_xor(expr* r1, expr* r2) { + return mk_der_op(OP_RE_XOR, r1, r2); +} + expr_ref seq_rewriter::mk_der_concat(expr* r1, expr* r2) { return mk_der_op(OP_RE_CONCAT, r1, r2); } @@ -3729,7 +3782,7 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) { return result; } // Order with higher IDs on the outside - bool is_symmetric = k == OP_RE_UNION || k == OP_RE_INTERSECT; + bool is_symmetric = k == OP_RE_UNION || k == OP_RE_INTERSECT || k == OP_RE_XOR; if (is_symmetric && get_id(ca) < get_id(cb)) { std::swap(a, b); std::swap(ca, cb); @@ -3776,6 +3829,10 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) { if (BR_FAILED == mk_re_concat(a, b, result)) result = re().mk_concat(a, b); break; + case OP_RE_XOR: + if (BR_FAILED == mk_re_xor(a, b, result)) + result = re().mk_xor(a, b); + break; default: UNREACHABLE(); break; @@ -3806,6 +3863,10 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) { if (BR_FAILED != mk_re_concat(a, b, result)) return result; break; + case OP_RE_XOR: + if (BR_FAILED != mk_re_xor0(a, b, result)) + return result; + break; default: break; } @@ -3909,6 +3970,13 @@ expr_ref seq_rewriter::mk_der_cond(expr* cond, expr* ele, sort* seq_sort) { return result; } +/* + Classical Brzozowski derivative used by the regex_bisim equivalence + procedure. Unlike `mk_antimirov_deriv`, this variant never creates + _OP_RE_ANTIMIROV_UNION nodes — it stays in a classical (single regex + tree) form. The bisimulation algorithm relies on this so that each + leaf of D(p XOR q) is a coherent XOR pair (D_v p) XOR (D_v q). +*/ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { expr_ref result(m()); sort* seq_sort = nullptr, *ele_sort = nullptr; @@ -3920,57 +3988,59 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { unsigned lo = 0, hi = 0; if (re().is_concat(r, r1, r2)) { expr_ref is_n = is_nullable(r1); - expr_ref dr1 = mk_derivative(ele, r1); + expr_ref dr1 = mk_derivative_rec(ele, r1); result = mk_der_concat(dr1, r2); if (m().is_false(is_n)) { return result; } - expr_ref dr2 = mk_derivative(ele, r2); + expr_ref dr2 = mk_derivative_rec(ele, r2); is_n = re_predicate(is_n, seq_sort); if (re().is_empty(dr2)) { //do not concatenate [], it is a deade-end return result; } else { - // Instead of mk_der_union here, we use mk_der_antimirov_union to - // force the two cases to be considered separately and lifted to - // the top level. This avoids blowup in cases where determinization - // is expensive. - return mk_der_antimirov_union(result, mk_der_concat(is_n, dr2)); + // Classical Brzozowski union: keep the derivative tree free of + // antimirov-union nodes so the bisimulation procedure sees a + // single regex tree whose leaves are XOR pairs. + return mk_der_union(result, mk_der_concat(is_n, dr2)); } } else if (re().is_star(r, r1)) { - return mk_der_concat(mk_derivative(ele, r1), r); + return mk_der_concat(mk_derivative_rec(ele, r1), r); } else if (re().is_plus(r, r1)) { expr_ref star(re().mk_star(r1), m()); - return mk_derivative(ele, star); + return mk_derivative_rec(ele, star); } else if (re().is_union(r, r1, r2)) { - return mk_der_union(mk_derivative(ele, r1), mk_derivative(ele, r2)); + return mk_der_union(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } else if (re().is_intersection(r, r1, r2)) { - return mk_der_inter(mk_derivative(ele, r1), mk_derivative(ele, r2)); + return mk_der_inter(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } else if (re().is_diff(r, r1, r2)) { - return mk_der_inter(mk_derivative(ele, r1), mk_der_compl(mk_derivative(ele, r2))); + return mk_der_inter(mk_derivative_rec(ele, r1), mk_der_compl(mk_derivative_rec(ele, r2))); + } + else if (re().is_xor(r, r1, r2)) { + return mk_der_xor(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } else if (m().is_ite(r, p, r1, r2)) { // there is no BDD normalization here - result = m().mk_ite(p, mk_derivative(ele, r1), mk_derivative(ele, r2)); + result = m().mk_ite(p, mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); return result; } else if (re().is_opt(r, r1)) { - return mk_derivative(ele, r1); + return mk_derivative_rec(ele, r1); } else if (re().is_complement(r, r1)) { - return mk_der_compl(mk_derivative(ele, r1)); + return mk_der_compl(mk_derivative_rec(ele, r1)); } else if (re().is_loop(r, r1, lo)) { if (lo > 0) { lo--; } - result = mk_derivative(ele, r1); + result = mk_derivative_rec(ele, r1); //do not concatenate with [] (emptyset) if (re().is_empty(result)) { return result; @@ -3988,7 +4058,7 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { if (lo > 0) { lo--; } - result = mk_derivative(ele, r1); + result = mk_derivative_rec(ele, r1); //do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain if (re().is_empty(result) || hi == 0) { return result; @@ -4756,6 +4826,91 @@ br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { return BR_REWRITE2; } +/* + Symmetric difference / XOR of regexes. + LANG(a XOR b) = (LANG(a) \ LANG(b)) U (LANG(b) \ LANG(a)) + + Equivalence preserving rewrites applied here (paper Section 5): + r XOR r = [] + r XOR [] = r + [] XOR r = r + comp(r) XOR comp(s) = r XOR s + r XOR comp(s) = comp(r XOR s) + comp(r) XOR s = comp(r XOR s) + full_seq XOR r = comp(r) + r XOR full_seq = comp(r) + We also normalize the argument order using expression ids so that the + structure is canonical for AC. +*/ +br_status seq_rewriter::mk_re_xor0(expr* a, expr* b, expr_ref& result) { + // Reduction-only variant of mk_re_xor for use inside mk_der_op. + // Avoids any transformation that would create a top-level re.xor + // node (e.g. AC normalisation or complement absorption), because + // mk_der_op needs to keep distributing the operation through ITE + // BDDs. Only structural simplifications that produce a non-XOR + // result are applied here. + if (a == b) { + result = re().mk_empty(a->get_sort()); + return BR_DONE; + } + if (re().is_empty(a)) { + result = b; + return BR_DONE; + } + if (re().is_empty(b)) { + result = a; + return BR_DONE; + } + return BR_FAILED; +} + +br_status seq_rewriter::mk_re_xor(expr* a, expr* b, expr_ref& result) { + if (a == b) { + result = re().mk_empty(a->get_sort()); + return BR_DONE; + } + if (re().is_empty(a)) { + result = b; + return BR_DONE; + } + if (re().is_empty(b)) { + result = a; + return BR_DONE; + } + if (re().is_full_seq(a)) { + result = re().mk_complement(b); + return BR_REWRITE1; + } + if (re().is_full_seq(b)) { + result = re().mk_complement(a); + return BR_REWRITE1; + } + expr* ra = nullptr, * rb = nullptr; + bool ca = re().is_complement(a, ra); + bool cb = re().is_complement(b, rb); + if (ca && cb) { + // comp(ra) XOR comp(rb) = ra XOR rb + result = re().mk_xor(ra, rb); + return BR_REWRITE1; + } + if (ca) { + // comp(ra) XOR b = comp(ra XOR b) + result = re().mk_complement(re().mk_xor(ra, b)); + return BR_REWRITE2; + } + if (cb) { + // a XOR comp(rb) = comp(a XOR rb) + result = re().mk_complement(re().mk_xor(a, rb)); + return BR_REWRITE2; + } + // Normalize order using expression ids (AC normalization). + if (a->get_id() > b->get_id()) { + result = re().mk_xor(b, a); + return BR_DONE; + } + return BR_FAILED; +} + br_status seq_rewriter::mk_re_loop(func_decl* f, unsigned num_args, expr* const* args, expr_ref& result) { rational n1, n2; @@ -5177,6 +5332,30 @@ br_status seq_rewriter::reduce_re_eq(expr* l, expr* r, expr_ref& result) { if (re().is_empty(r)) { return reduce_re_is_empty(l, result); } + if (l == r) { + result = m().mk_true(); + return BR_DONE; + } + /* + * Try the union-find bisimulation procedure for ground regex equality. + * Guarded against re-entry because the bisim may construct equalities + * indirectly. On l_undef the rewriter falls through to the existing + * axiomatisation path. + */ + if (!m_in_bisim && is_ground(l) && is_ground(r)) { + flet _block(m_in_bisim, true); + seq::regex_bisim bisim(*this); + switch (bisim.are_equivalent(l, r)) { + case l_true: + result = m().mk_true(); + return BR_DONE; + case l_false: + result = m().mk_false(); + return BR_DONE; + case l_undef: + break; + } + } return BR_FAILED; } diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 4ebd770a7..618124e10 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -135,7 +135,8 @@ class seq_rewriter { // re2automaton m_re2aut; op_cache m_op_cache; expr_ref_vector m_es, m_lhs, m_rhs; - bool m_coalesce_chars; + bool m_coalesce_chars; + bool m_in_bisim { false }; enum length_comparison { shorter_c, @@ -180,6 +181,7 @@ class seq_rewriter { expr_ref mk_der_concat(expr* a, expr* b); expr_ref mk_der_union(expr* a, expr* b); expr_ref mk_der_inter(expr* a, expr* b); + expr_ref mk_der_xor(expr* a, expr* b); expr_ref mk_der_compl(expr* a); expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort); expr_ref mk_der_antimirov_union(expr* r1, expr* r2); @@ -262,6 +264,8 @@ class seq_rewriter { br_status mk_re_complement(expr* a, expr_ref& result); br_status mk_re_star(expr* a, expr_ref& result); br_status mk_re_diff(expr* a, expr* b, expr_ref& result); + br_status mk_re_xor(expr* a, expr* b, expr_ref& result); + br_status mk_re_xor0(expr* a, expr* b, expr_ref& result); br_status mk_re_plus(expr* a, expr_ref& result); br_status mk_re_opt(expr* a, expr_ref& result); br_status mk_re_power(func_decl* f, expr* a, expr_ref& result); @@ -381,6 +385,18 @@ public: return result; } + /* + * Construct r1 XOR r2 applying the structural rewrites in + * mk_re_xor (r XOR r = empty, comp/empty/full normalisation, AC + * ordering). Used by the bisimulation procedure. + */ + expr_ref mk_re_xor_simplified(expr* r1, expr* r2) { + expr_ref result(m()); + if (mk_re_xor(r1, r2, result) == BR_FAILED) + result = re().mk_xor(r1, r2); + return result; + } + /** * check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences */ @@ -410,6 +426,17 @@ public: */ expr_ref mk_derivative(expr* r); + /* + Classical (non-antimirov) Brzozowski derivative wrt the canonical + variable v0 = (:var 0). Unlike `mk_derivative` this entry point keeps + the symbolic derivative as a single transition regex (TRegex): boolean + operators are pushed into the ITE leaves rather than lifted to the top + via _OP_RE_ANTIMIROV_UNION. Used by the regex_bisim equivalence + procedure which relies on each leaf of D(p XOR q) being a coherent + XOR pair (D_v p) XOR (D_v q). + */ + expr_ref mk_brz_derivative(expr* r); + // heuristic elimination of element from condition that comes form a derivative. // special case optimization for conjunctions of equalities, disequalities and ranges. void elim_condition(expr* elem, expr_ref& cond); diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp index 8986e7ec1..59f5d046c 100644 --- a/src/ast/seq_decl_plugin.cpp +++ b/src/ast/seq_decl_plugin.cpp @@ -230,6 +230,7 @@ void seq_decl_plugin::init() { m_sigs[OP_RE_UNION] = alloc(psig, m, "re.union", 1, 2, reAreA, reA); m_sigs[OP_RE_INTERSECT] = alloc(psig, m, "re.inter", 1, 2, reAreA, reA); m_sigs[OP_RE_DIFF] = alloc(psig, m, "re.diff", 1, 2, reAreA, reA); + m_sigs[OP_RE_XOR] = alloc(psig, m, "re.xor", 1, 2, reAreA, reA); m_sigs[OP_RE_LOOP] = alloc(psig, m, "re.loop", 1, 1, &reA, reA); m_sigs[OP_RE_POWER] = alloc(psig, m, "re.^", 1, 1, &reA, reA); m_sigs[OP_RE_COMPLEMENT] = alloc(psig, m, "re.comp", 1, 1, &reA, reA); @@ -507,6 +508,7 @@ func_decl* seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, p case OP_RE_CONCAT: case OP_RE_INTERSECT: case OP_RE_DIFF: + case OP_RE_XOR: m_has_re = true; return mk_left_assoc_fun(k, arity, domain, range, k, k); @@ -1513,6 +1515,13 @@ std::ostream& seq_util::rex::pp::print(std::ostream& out, expr* e) const { print(out, r2); out << ")"; } + else if (re.is_xor(e, r1, r2)) { + out << "("; + print(out, r1); + out << ")XOR("; + print(out, r2); + out << ")"; + } else if (re.m.is_ite(e, s, r1, r2)) { out << (html_encode ? "(𝐢𝐟 " : "(if "); print(out, s); @@ -1704,6 +1713,10 @@ seq_util::rex::info seq_util::rex::mk_info_rec(app* e) const { i1 = get_info_rec(e->get_arg(0)); i2 = get_info_rec(e->get_arg(1)); return i1.diff(i2); + case OP_RE_XOR: + i1 = get_info_rec(e->get_arg(0)); + i2 = get_info_rec(e->get_arg(1)); + return i1.xor_(i2); } return unknown_info; } @@ -1829,6 +1842,25 @@ seq_util::rex::info seq_util::rex::info::diff(seq_util::rex::info const& rhs) co return *this; } +seq_util::rex::info seq_util::rex::info::xor_(seq_util::rex::info const& rhs) const { + if (is_known()) { + if (rhs.is_known()) { + // Null(p XOR q) = Null(p) XOR Null(q) + lbool xor_nullable = l_undef; + if (nullable != l_undef && rhs.nullable != l_undef) + xor_nullable = (nullable == rhs.nullable) ? l_false : l_true; + return info(interpreted & rhs.interpreted, + xor_nullable, + 0, + false); + } + else + return rhs; + } + else + return *this; +} + seq_util::rex::info seq_util::rex::info::orelse(seq_util::rex::info const& i) const { if (is_known()) { if (i.is_known()) { diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h index 756ec38e4..75995ef7b 100644 --- a/src/ast/seq_decl_plugin.h +++ b/src/ast/seq_decl_plugin.h @@ -68,6 +68,7 @@ enum seq_op_kind { OP_RE_UNION, OP_RE_DIFF, OP_RE_INTERSECT, + OP_RE_XOR, OP_RE_LOOP, OP_RE_POWER, OP_RE_COMPLEMENT, @@ -491,6 +492,7 @@ public: info disj(info const& rhs) const; info conj(info const& rhs) const; info diff(info const& rhs) const; + info xor_(info const& rhs) const; info orelse(info const& rhs) const; info loop(unsigned lower, unsigned upper) const; @@ -523,6 +525,7 @@ public: app* mk_union(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_UNION, r1, r2); } app* mk_inter(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_INTERSECT, r1, r2); } app* mk_diff(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_DIFF, r1, r2); } + app* mk_xor(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_XOR, r1, r2); } app* mk_complement(expr* r) { return m.mk_app(m_fid, OP_RE_COMPLEMENT, r); } app* mk_star(expr* r) { return m.mk_app(m_fid, OP_RE_STAR, r); } app* mk_plus(expr* r) { return m.mk_app(m_fid, OP_RE_PLUS, r); } @@ -546,6 +549,7 @@ public: bool is_union(expr const* n) const { return is_app_of(n, m_fid, OP_RE_UNION); } bool is_intersection(expr const* n) const { return is_app_of(n, m_fid, OP_RE_INTERSECT); } bool is_diff(expr const* n) const { return is_app_of(n, m_fid, OP_RE_DIFF); } + bool is_xor(expr const* n) const { return is_app_of(n, m_fid, OP_RE_XOR); } bool is_complement(expr const* n) const { return is_app_of(n, m_fid, OP_RE_COMPLEMENT); } bool is_star(expr const* n) const { return is_app_of(n, m_fid, OP_RE_STAR); } bool is_plus(expr const* n) const { return is_app_of(n, m_fid, OP_RE_PLUS); } @@ -578,6 +582,7 @@ public: MATCH_BINARY(is_union); MATCH_BINARY(is_intersection); MATCH_BINARY(is_diff); + MATCH_BINARY(is_xor); MATCH_BINARY(is_range); MATCH_UNARY(is_complement); MATCH_UNARY(is_star); diff --git a/src/ast/sls/sls_seq_plugin.cpp b/src/ast/sls/sls_seq_plugin.cpp index 8584741f3..d8dab45a7 100644 --- a/src/ast/sls/sls_seq_plugin.cpp +++ b/src/ast/sls/sls_seq_plugin.cpp @@ -511,6 +511,7 @@ namespace sls { case OP_RE_CONCAT: case OP_RE_UNION: case OP_RE_DIFF: + case OP_RE_XOR: case OP_RE_INTERSECT: case OP_RE_LOOP: case OP_RE_POWER: @@ -1294,6 +1295,7 @@ namespace sls { case OP_RE_CONCAT: case OP_RE_UNION: case OP_RE_DIFF: + case OP_RE_XOR: case OP_RE_INTERSECT: case OP_RE_LOOP: case OP_RE_POWER: diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 64487a21e..0e9a03b63 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -21,6 +21,7 @@ Author: #include "ast/expr_abstract.h" #include "ast/ast_util.h" #include "ast/for_each_expr.h" +#include "ast/rewriter/seq_regex_bisim.h" #include namespace smt { @@ -460,6 +461,23 @@ namespace smt { if (re().is_empty(r)) //trivially true return; + // Try the bisimulation procedure on ground regexes first. If it + // returns a definite answer, dispatch the corresponding axiom and + // bypass the symbolic emptiness/derivative closure. + if (is_ground(r1) && is_ground(r2)) { + seq::regex_bisim bisim(seq_rw()); + switch (bisim.are_equivalent(r1, r2)) { + case l_true: + STRACE(seq_regex_brief, tout << "bisim:eq ";); + return; // trivially true + case l_false: + STRACE(seq_regex_brief, tout << "bisim:neq ";); + th.add_axiom(~th.mk_eq(r1, r2, false), false_literal); + return; + case l_undef: + break; + } + } expr_ref emp(re().mk_empty(r->get_sort()), m); expr_ref f(m.mk_fresh_const("re.char", seq_sort), m); expr_ref is_empty = sk().mk_is_empty(r, r, f); @@ -478,6 +496,20 @@ namespace smt { sort* seq_sort = nullptr; VERIFY(u().is_re(r1, seq_sort)); expr_ref r = symmetric_diff(r1, r2); + if (is_ground(r1) && is_ground(r2)) { + seq::regex_bisim bisim(seq_rw()); + switch (bisim.are_equivalent(r1, r2)) { + case l_true: + STRACE(seq_regex_brief, tout << "bisim:eq ";); + th.add_axiom(th.mk_eq(r1, r2, false), false_literal); + return; + case l_false: + STRACE(seq_regex_brief, tout << "bisim:neq ";); + return; // trivially satisfied + case l_undef: + break; + } + } expr_ref emp(re().mk_empty(r->get_sort()), m); expr_ref n(m.mk_fresh_const("re.char", seq_sort), m); expr_ref is_non_empty = sk().mk_is_non_empty(r, r, n);