diff --git a/.gitignore b/.gitignore index 2d268c988f..8e5bd7294d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ *~ rebase.cmd +reports/ +crashes/ *.pyc *.pyo # Ignore callgrind files callgrind.out.* +gmon.out # .hpp files are automatically generated *.hpp .env diff --git a/gmon.out b/gmon.out deleted file mode 100644 index cba10a19b2..0000000000 Binary files a/gmon.out and /dev/null differ diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index 6db1320ab9..c118501926 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -39,7 +39,11 @@ z3_add_component(rewriter rewriter.cpp seq_axioms.cpp seq_eq_solver.cpp + seq_derive.cpp seq_subset.cpp + seq_derive.cpp + seq_range_collapse.cpp + seq_range_predicate.cpp seq_rewriter.cpp seq_regex_bisim.cpp seq_skolem.cpp diff --git a/src/ast/rewriter/bool_rewriter.cpp b/src/ast/rewriter/bool_rewriter.cpp index 321bd6f47d..945aa297ee 100644 --- a/src/ast/rewriter/bool_rewriter.cpp +++ b/src/ast/rewriter/bool_rewriter.cpp @@ -19,6 +19,7 @@ Notes: #include "ast/rewriter/bool_rewriter.h" #include "params/bool_rewriter_params.hpp" #include "ast/rewriter/rewriter_def.h" +#include "ast/rewriter/expr_safe_replace.h" #include "ast/ast_lt.h" #include "ast/for_each_expr.h" #include @@ -1185,4 +1186,30 @@ void bool_rewriter::mk_ge2(expr* a, expr* b, expr* c, expr_ref& r) { } -template class rewriter_tpl; +bool bool_rewriter::decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el) { + expr *cond = nullptr, *r1 = nullptr, *r2 = nullptr; + if (m().is_ite(r, cond, r1, r2)) { + c = cond; + th = r1; + el = r2; + return true; + } + for (expr *e : subterms::ground(expr_ref(r, m()))) { + if (m().is_ite(e, cond, r1, r2)) { + m_rep1.reset(); + m_rep2.reset(); + m_rep1.insert(e, r1); + m_rep2.insert(e, r2); + c = cond; + th = r; + el = r; + m_rep1(th); + m_rep2(el); + return true; + } + } + return false; +} + + +template class rewriter_tpl; \ No newline at end of file diff --git a/src/ast/rewriter/bool_rewriter.h b/src/ast/rewriter/bool_rewriter.h index 2b52404e50..87c50f171e 100644 --- a/src/ast/rewriter/bool_rewriter.h +++ b/src/ast/rewriter/bool_rewriter.h @@ -20,6 +20,7 @@ Notes: #include "ast/ast.h" #include "ast/rewriter/rewriter.h" +#include "ast/rewriter/expr_safe_replace.h" #include "util/params.h" /** @@ -64,6 +65,7 @@ class bool_rewriter { ptr_vector m_todo1, m_todo2; unsigned_vector m_counts1, m_counts2; expr_mark m_marked; + expr_safe_replace m_rep1, m_rep2; br_status mk_flat_and_core(unsigned num_args, expr * const * args, expr_ref & result); br_status mk_flat_or_core(unsigned num_args, expr * const * args, expr_ref & result); @@ -87,7 +89,7 @@ class bool_rewriter { expr_ref simplify_eq_ite(expr* value, expr* ite); public: - bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0) { + bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0), m_rep1(m), m_rep2(m) { updt_params(p); } ast_manager & m() const { return m_manager; } @@ -242,6 +244,11 @@ public: void mk_nand(expr * arg1, expr * arg2, expr_ref & result); void mk_nor(expr * arg1, expr * arg2, expr_ref & result); void mk_ge2(expr* a, expr* b, expr* c, expr_ref& result); + + // If r is, or contains, an if-then-else, decompose it into a top-level + // ite by hoisting the (first) inner ite condition: returns c, th, el such + // that r is equivalent to (ite c th el). Returns false if r has no ite. + bool decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el); }; struct bool_rewriter_cfg : public default_rewriter_cfg { diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp new file mode 100644 index 0000000000..f99abb382f --- /dev/null +++ b/src/ast/rewriter/seq_derive.cpp @@ -0,0 +1,1416 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_derive.cpp + +Abstract: + + Symbolic derivative computation for regular expressions. + Produces an ITE-tree (transition regex) representation following + the approach of RE# (Varatalu, Veanes, Ernits - POPL 2025). + + The symbolic derivative δ(r) maps each character to the resulting + derivative state via an ITE-tree. The free variable (:var 0) represents + the input character. + +Authors: + + Nikolaj Bjorner (nbjorner) 2026-06-03 + +--*/ + +#include "ast/rewriter/seq_derive.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/rewriter/var_subst.h" +#include "ast/ast_pp.h" +#include "ast/array_decl_plugin.h" +#include "ast/rewriter/bool_rewriter.h" +#include "util/util.h" +#include + +namespace seq { + + derive::derive(ast_manager& m, seq_rewriter& re) : + m(m), + m_util(m), + m_autil(m), + m_br(m), + m_re(re), + m_trail(m), + m_ele(m), + m_path_expr(m) { + m_br.set_flat_and_or(false); + } + + void derive::reset() { + m_acache.reset(); + m_bcache.reset(); + m_atop_cache.reset(); + m_btop_cache.reset(); + reset_op_caches(); + m_trail.reset(); + m_ele = nullptr; + } + + // Reset only operation caches (union/inter/concat/complement) + // while preserving derivative caches (m_cache, m_top_cache) + // The op cache does index on m_ele so it has to be reset if m_ele changes. + void derive::reset_op_caches() { + m_aunion_cache.reset(); + m_ainter_cache.reset(); + m_aconcat_cache.reset(); + m_acomplement_cache.reset(); + m_bunion_cache.reset(); + m_binter_cache.reset(); + m_bconcat_cache.reset(); + m_bcomplement_cache.reset(); + m_ele = nullptr; + } + + expr_ref derive::operator()(derivative_kind k, expr* ele, expr* r) { + m_derivative_kind = k; + SASSERT(m_util.is_re(r)); + if (m_trail.size() > 500000) + reset(); + else if (m_trail.size() > 100000 || ele != m_ele) + reset_op_caches(); + sort *seq_sort = nullptr, *ele_sort = nullptr; + VERIFY(m_util.is_re(r, seq_sort)); + VERIFY(m_util.is_seq(seq_sort, ele_sort)); + // Check top-level cache (post-simplify result) + expr* cached = nullptr; + expr_ref result(m); + if (top_cache().find(ele, r, cached)) { + result = cached; + return result; + } + // Pin ele and r + m_trail.push_back(ele); + m_trail.push_back(r); + + // Always compute the SYMBOLIC derivative wrt the canonical + // variable v (so the cached result is reusable for any + // concrete ele via substitution below). Using the concrete + // `ele` here would bake it into the cached ITE-tree and + // poison future lookups for the same r with a different ele. + m_ele = ele; + m_depth = 0; + // Initialize path state for inline pruning + m_path.reset(); + m_intervals.reset(); + m_intervals.push_back({0u, u().max_char()}); + m_intervals_start = 0; + m_path_expr = m.mk_true(); + result = derive_rec(r); + top_cache().insert(ele, r, result); + + // pin the final result + m_trail.push_back(result); + return result; + } + + expr_ref derive::operator()(derivative_kind k, expr* r) { + SASSERT(m_util.is_re(r)); + sort* seq_sort = nullptr, * ele_sort = nullptr; + VERIFY(m_util.is_re(r, seq_sort)); + VERIFY(m_util.is_seq(seq_sort, ele_sort)); + expr_ref v(m.mk_var(0, ele_sort), m); + return (*this)(k,v, r); + } + + // ------------------------------------------------------- + // Core derivative computation + // ------------------------------------------------------- + + expr_ref derive::derive_rec(expr* r) { + SASSERT(m_util.is_re(r)); + + // Check cache (indexed by both m_ele and r) + expr* cached = nullptr; + if (cache().find(m_ele, r, cached)) + return expr_ref(cached, m); + + // Depth check + if (m_depth >= m_max_depth) { + // Return stuck derivative (the derivative operator applied symbolically) + return expr_ref(re().mk_derivative(m_ele, r), m); + } + + flet _scoped_depth(m_depth, m_depth + 1); + expr_ref result = derive_core(r); + + // Cache the result + cache().insert(m_ele, r, result); + m_trail.push_back(m_ele); + m_trail.push_back(r); + m_trail.push_back(result); + return result; + } + + // Forward declaration helper + expr_ref derive::derive_core(expr* r) { + sort* s = nullptr; + VERIFY(m_util.is_re(r, s)); + + auto nothing = [&]() { return expr_ref(re().mk_empty(r->get_sort()), m); }; + auto epsilon = [&]() { return expr_ref(re().mk_to_re(u().str.mk_empty(s)), m); }; + auto dotstar = [&]() { return expr_ref(re().mk_full_seq(r->get_sort()), m); }; + + expr* r1 = nullptr; + expr* r2 = nullptr; + expr* cond = nullptr; + unsigned lo = 0, hi = 0; + + // δ(∅) = ∅, δ(ε) = ∅ + if (re().is_empty(r) || re().is_epsilon(r)) + return nothing(); + + // δ(Σ*) = Σ*, δ(.+) = Σ* + if (re().is_full_seq(r) || re().is_dot_plus(r)) + return dotstar(); + + // δ(.) = ε (full char accepts any single character) + if (re().is_full_char(r)) + return epsilon(); + + // δ(str.to_re(s)) - derivative of a literal string + if (re().is_to_re(r, r1)) + return derive_to_re(r1, s); + + // δ(re.range(lo, hi)) - character range + if (re().is_range(r, r1, r2)) + return derive_range(r1, r2, s); + + // δ(re.of_pred(p)) - predicate-based regex + if (re().is_of_pred(r, r1)) + return derive_of_pred(r1, s); + + // δ(r1 · r2) = δ(r1) · r2 ∪ (if nullable(r1) then δ(r2) else ∅) + if (re().is_concat(r, r1, r2)) { + // Ensure right-associative form first. A left-nested concat + // (a·b)·r2 makes the head r1 a large sub-concat, so deriving it + // recurses through the whole left spine and can exceed + // m_max_depth, producing stuck symbolic re.derivative terms that + // accumulate across states and blow up. mk_concat right- + // associates in a single linear pass (without touching the + // derivative depth counter), keeping the head atomic. + if (re().is_concat(r1)) { + expr_ref rr = mk_concat(r1, r2); + if (rr != r) + return derive_rec(rr); + } + expr_ref d1 = derive_rec(r1); + expr_ref d1_r2 = mk_deriv_concat(d1, r2); + expr_ref nullable_r1 = is_nullable(r1); + if (m.is_true(nullable_r1)) + return mk_union(d1_r2, derive_rec(r2)); + if (m.is_false(nullable_r1)) + return d1_r2; + // Conditional: nullable is a Boolean expression + expr_ref d2 = derive_rec(r2); + expr_ref guarded = mk_ite(nullable_r1, d2, nothing()); + return mk_union(d1_r2, guarded); + } + + // δ(r1 ∪ r2) = δ(r1) ∪ δ(r2) + if (re().is_union(r, r1, r2)) { + expr_ref d1 = derive_rec(r1); + expr_ref d2 = derive_rec(r2); + return mk_union(d1, d2); + } + + // δ(r1 x r2) = δ(r1) x δ(r2) + if (re().is_xor(r, r1, r2)) { + expr_ref d1 = derive_rec(r1); + expr_ref d2 = derive_rec(r2); + return mk_xor(d1, d2); + } + + // δ(r1 ∩ r2) = δ(r1) ∩ δ(r2) + if (re().is_intersection(r, r1, r2)) { + expr_ref d1 = derive_rec(r1); + expr_ref d2 = derive_rec(r2); + return mk_inter(d1, d2); + } + + // δ(~r1) = ~δ(r1) + if (re().is_complement(r, r1)) { + expr_ref d1 = derive_rec(r1); + return mk_complement(d1); + } + + // δ(r1*) = δ(r1) · r1* + if (re().is_star(r, r1)) { + expr_ref d1 = derive_rec(r1); + expr_ref star_r1(re().mk_star(r1), m); + return mk_deriv_concat(d1, star_r1); + } + + // δ(r1+) = δ(r1) · r1* + if (re().is_plus(r, r1)) { + expr_ref d1 = derive_rec(r1); + expr_ref star_r1(re().mk_star(r1), m); + return mk_deriv_concat(d1, star_r1); + } + + // δ(r1?) = δ(r1) + if (re().is_opt(r, r1)) + return derive_rec(r1); + + // δ(r1{lo,hi}) + if (re().is_loop(r, r1, lo, hi)) { + if (hi == 0 || hi < lo) + return nothing(); + expr_ref d1 = derive_rec(r1); + expr_ref tail(re().mk_loop_proper(r1, (lo == 0 ? 0 : lo - 1), hi - 1), m); + return mk_deriv_concat(d1, tail); + } + + // δ(r1{lo,}) - unbounded loop + if (re().is_loop(r, r1, lo)) { + expr_ref d1 = derive_rec(r1); + expr_ref tail(re().mk_loop(r1, (lo == 0 ? 0 : lo - 1)), m); + return mk_deriv_concat(d1, tail); + } + + // δ(r1 \ r2) = δ(r1) ∩ ~δ(r2) + if (re().is_diff(r, r1, r2)) { + expr_ref d1 = derive_rec(r1); + expr_ref d2 = derive_rec(r2); + expr_ref neg_d2 = mk_complement(d2); + return mk_inter(d1, neg_d2); + } + + // δ(ite(c, r1, r2)) = ite(c, δ(r1), δ(r2)) + if (m.is_ite(r, cond, r1, r2)) { + expr_ref d1 = derive_rec(r1); + expr_ref d2 = derive_rec(r2); + return mk_ite(cond, d1, d2); + } + + // δ(reverse(r1)) - normalize by pushing reverse inward, then derive + if (re().is_reverse(r, r1)) { + expr_ref norm = mk_regex_reverse(r1); + if (norm != r) + return derive_rec(norm); + return expr_ref(re().mk_derivative(m_ele, r), m); + } + + // Stuck/uninterpreted case + return expr_ref(re().mk_derivative(m_ele, r), m); + } + + // ------------------------------------------------------- + // Derivative of specific regex constructs + // ------------------------------------------------------- + + expr_ref derive::derive_to_re(expr* s, sort* seq_sort) { + sort* re_sort = re().mk_re(seq_sort); + // δ(to_re("")) = ∅ + if (u().str.is_empty(s)) + return expr_ref(re().mk_empty(re_sort), m); + + // δ(to_re("c₁c₂...cₙ")) = ite(ele = c₁, to_re("c₂...cₙ"), ∅) + zstring zs; + if (u().str.is_string(s, zs)) { + if (zs.length() == 0) + return expr_ref(re().mk_empty(re_sort), m); + // First character + expr_ref head(m_util.mk_char(zs[0]), m); + expr_ref cond(m.mk_eq(m_ele, head), m); + // Tail string + expr_ref tail_str(u().str.mk_string(zs.extract(1, zs.length() - 1)), m); + expr_ref tail_re(re().mk_to_re(tail_str), m); + expr_ref empty(re().mk_empty(re_sort), m); + return mk_ite(cond, tail_re, empty); + } + + // δ(to_re(unit(c))) = ite(ele = c, ε, ∅) + expr* ch = nullptr; + if (u().str.is_unit(s, ch)) { + expr_ref eps(re().mk_to_re(u().str.mk_empty(seq_sort)), m); + expr_ref empty(re().mk_empty(re_sort), m); + expr_ref cond(m.mk_eq(m_ele, ch), m); + return mk_ite(cond, eps, empty); + } + + // δ(to_re(s1 ++ s2)) = ite(head matches, to_re(tail ++ s2), ∅) + expr* s1 = nullptr, * s2 = nullptr; + if (u().str.is_concat(s, s1, s2)) { + expr_ref hd(m), tl(m); + if (get_head_tail(s1, s2, hd, tl)) { + expr_ref cond(m.mk_eq(m_ele, hd), m); + expr_ref tail_re(re().mk_to_re(tl), m); + expr_ref empty(re().mk_empty(re_sort), m); + return mk_ite(cond, tail_re, empty); + } + } + + // δ(to_re(itos(n))) - derivative of integer-to-string + // itos(n) produces digits '0'-'9' when n >= 0, empty when n < 0 + expr* n = nullptr; + if (u().str.is_itos(s, n)) { + expr_ref empty(re().mk_empty(re_sort), m); + // Guard: n >= 0 and element is a digit and element = s[0] + expr_ref n_ge_0(m_autil.mk_ge(n, m_autil.mk_int(0)), m); + expr_ref char_0(m_util.mk_char('0'), m); + expr_ref char_9(m_util.mk_char('9'), m); + expr_ref ge_0(m_util.mk_le(char_0, m_ele), m); + expr_ref le_9(m_util.mk_le(m_ele, char_9), m); + expr_ref is_digit(m.mk_and(ge_0, le_9), m); + // First character of itos(n) matches ele + expr_ref zero_idx(m_autil.mk_int(0), m); + expr_ref first(u().str.mk_nth_i(s, zero_idx), m); + expr_ref eq_first(m.mk_eq(m_ele, first), m); + // Guard = n >= 0 && is_digit && ele = s[0] + expr_ref guard(m.mk_and(n_ge_0, m.mk_and(is_digit, eq_first)), m); + // Tail: to_re(substr(itos(n), 1, len(itos(n)) - 1)) + expr_ref one(m_autil.mk_int(1), m); + expr_ref len(u().str.mk_length(s), m); + expr_ref rest_len(m_autil.mk_sub(len, one), m); + expr_ref rest(u().str.mk_substr(s, one, rest_len), m); + expr_ref rest_re(re().mk_to_re(rest), m); + return mk_ite(guard, rest_re, empty); + } + + // Non-ground sequence: δ(to_re(s)) = ite(s ≠ "" ∧ ele = s[0], to_re(s[1:]), ∅) + expr_ref empty_seq(u().str.mk_empty(seq_sort), m); + expr_ref is_non_empty(m.mk_not(m.mk_eq(s, empty_seq)), m); + expr_ref zero(m_autil.mk_int(0), m); + expr_ref first(u().str.mk_nth_i(s, zero), m); + expr_ref eq_first(m.mk_eq(m_ele, first), m); + expr_ref guard(m.mk_and(is_non_empty, eq_first), m); + expr_ref one(m_autil.mk_int(1), m); + expr_ref len(u().str.mk_length(s), m); + expr_ref rest_len(m_autil.mk_sub(len, one), m); + expr_ref rest(u().str.mk_substr(s, one, rest_len), m); + expr_ref rest_re(re().mk_to_re(rest), m); + expr_ref empty(re().mk_empty(re_sort), m); + return mk_ite(guard, rest_re, empty); + } + + expr_ref derive::derive_range(expr* lo, expr* hi, sort* seq_sort) { + sort* re_sort = re().mk_re(seq_sort); + expr_ref empty(re().mk_empty(re_sort), m); + expr_ref eps(re().mk_to_re(u().str.mk_empty(seq_sort)), m); + + // Extract character values from unit strings + expr_ref c_lo(m), c_hi(m); + if (u().str.is_unit_string(lo, c_lo) && u().str.is_unit_string(hi, c_hi)) { + // Build range condition, simplifying trivial bounds + unsigned lo_val = 0, hi_val = 0; + bool lo_trivial = m_util.is_const_char(c_lo, lo_val) && lo_val == 0; + bool hi_trivial = m_util.is_const_char(c_hi, hi_val) && hi_val == u().max_char(); + + if (lo_trivial && hi_trivial) + return eps; // full charset range — always matches + + expr_ref in_range(m); + if (lo_trivial) + in_range = m_util.mk_le(m_ele, c_hi); + else if (hi_trivial) + in_range = m_util.mk_le(c_lo, m_ele); + else + in_range = m.mk_and(m_util.mk_le(c_lo, m_ele), m_util.mk_le(m_ele, c_hi)); + + return mk_ite(in_range, eps, empty); + } + + // Fallback: stuck derivative + return expr_ref(re().mk_derivative(m_ele, re().mk_range(lo, hi)), m); + } + + expr_ref derive::derive_of_pred(expr* pred, sort* seq_sort) { + sort* re_sort = re().mk_re(seq_sort); + expr_ref empty(re().mk_empty(re_sort), m); + expr_ref eps(re().mk_to_re(u().str.mk_empty(seq_sort)), m); + + // Apply predicate to the element + array_util autil(m); + expr* args[2] = { pred, m_ele }; + expr_ref cond(autil.mk_select(2, args), m); + return mk_ite(cond, eps, empty); + } + + // Extract head character and remaining tail from a sequence + // s1 is the first part, s2 is the continuation (from str.concat(s1, s2)) + bool derive::get_head_tail(expr* s1, expr* s2, expr_ref& hd, expr_ref& tl) { + expr* ch = nullptr; + expr* a = nullptr, * b = nullptr; + if (u().str.is_unit(s1, ch)) { + hd = ch; + tl = s2; + return true; + } + if (u().str.is_concat(s1, a, b)) { + expr_ref new_s2(u().str.mk_concat(b, s2), m); + return get_head_tail(a, new_s2, hd, tl); + } + zstring zs; + if (u().str.is_string(s1, zs) && zs.length() > 0) { + hd = m_util.mk_char(zs[0]); + if (zs.length() == 1) + tl = s2; + else { + expr_ref rest(u().str.mk_string(zs.extract(1, zs.length() - 1)), m); + tl = u().str.mk_concat(rest, s2); + } + return true; + } + return false; + } + + // ------------------------------------------------------- + // Normalize reverse + // ------------------------------------------------------- + + expr_ref derive::mk_regex_reverse(expr* r) { + expr* r1 = nullptr, * r2 = nullptr, * c = nullptr; + unsigned lo = 0, hi = 0; + expr_ref result(m); + if (re().is_empty(r) || re().is_range(r) || re().is_epsilon(r) || re().is_full_seq(r) || + re().is_full_char(r) || re().is_dot_plus(r) || re().is_of_pred(r)) + result = r; + else if (re().is_to_re(r)) + result = re().mk_reverse(r); + else if (re().is_reverse(r, r1)) + result = r1; + else if (re().is_concat(r, r1, r2)) + result = re().mk_concat(mk_regex_reverse(r2), mk_regex_reverse(r1)); + else if (m.is_ite(r, c, r1, r2)) + result = m.mk_ite(c, mk_regex_reverse(r1), mk_regex_reverse(r2)); + else if (re().is_union(r, r1, r2)) { + auto a1 = mk_regex_reverse(r1); + auto b1 = mk_regex_reverse(r2); + result = re().mk_union(a1, b1); + } + else if (re().is_intersection(r, r1, r2)) { + auto a1 = mk_regex_reverse(r1); + auto b1 = mk_regex_reverse(r2); + result = re().mk_inter(a1, b1); + } + else if (re().is_diff(r, r1, r2)) { + auto a1 = mk_regex_reverse(r1); + auto b1 = mk_regex_reverse(r2); + result = re().mk_diff(a1, b1); + } + else if (re().is_star(r, r1)) + result = re().mk_star(mk_regex_reverse(r1)); + else if (re().is_plus(r, r1)) + result = re().mk_plus(mk_regex_reverse(r1)); + else if (re().is_loop(r, r1, lo)) + result = re().mk_loop(mk_regex_reverse(r1), lo); + else if (re().is_loop(r, r1, lo, hi)) + result = re().mk_loop_proper(mk_regex_reverse(r1), lo, hi); + else if (re().is_opt(r, r1)) + result = re().mk_opt(mk_regex_reverse(r1)); + else if (re().is_complement(r, r1)) + result = re().mk_complement(mk_regex_reverse(r1)); + else + result = re().mk_reverse(r); + return result; + } + + // ------------------------------------------------------- + // Nullability - uses info class from seq_decl_plugin.h + // ------------------------------------------------------- + + expr_ref derive::is_nullable(expr* r) { + SASSERT(m_util.is_re(r) || m_util.is_seq(r)); + expr* r1 = nullptr, * r2 = nullptr, * cond = nullptr; + sort* seq_sort = nullptr; + unsigned lo = 0, hi = 0; + zstring s1; + if (m_util.is_re(r)) { + auto info = re().get_info(r); + switch (info.nullable) { + case l_true: return expr_ref(m.mk_true(), m); + case l_false: return expr_ref(m.mk_false(), m); + default: break; + } + } + expr_ref result(m); + if (re().is_concat(r, r1, r2) || + re().is_intersection(r, r1, r2)) { + m_br.mk_and(is_nullable(r1), is_nullable(r2), result); + } + else if (re().is_union(r, r1, r2) || re().is_antimirov_union(r, r1, r2)) { + m_br.mk_or(is_nullable(r1), is_nullable(r2), result); + } + else if (re().is_diff(r, r1, r2)) { + m_br.mk_not(is_nullable(r2), result); + m_br.mk_and(result, is_nullable(r1), result); + } + else if (re().is_xor(r, r1, r2)) { + m_br.mk_xor(is_nullable(r1), is_nullable(r2), result); + } + else if (re().is_star(r) || + re().is_opt(r) || + re().is_full_seq(r) || + re().is_epsilon(r) || + (re().is_loop(r, r1, lo) && lo == 0) || + (re().is_loop(r, r1, lo, hi) && lo == 0)) { + result = m.mk_true(); + } + else if (re().is_full_char(r) || + re().is_empty(r) || + re().is_of_pred(r) || + re().is_range(r)) { + result = m.mk_false(); + } + else if (re().is_plus(r, r1) || + (re().is_loop(r, r1, lo) && lo > 0) || + (re().is_loop(r, r1, lo, hi) && lo > 0) || + (re().is_reverse(r, r1))) { + result = is_nullable(r1); + } + else if (re().is_complement(r, r1)) { + m_br.mk_not(is_nullable(r1), result); + } + else if (re().is_to_re(r, r1)) { + result = is_nullable(r1); + } + else if (m.is_ite(r, cond, r1, r2)) { + m_br.mk_ite(cond, is_nullable(r1), is_nullable(r2), result); + } + else if (m_util.is_re(r, seq_sort)) { + result = is_nullable_symbolic_regex(r, seq_sort); + } + else if (u().str.is_concat(r, r1, r2)) { + m_br.mk_and(is_nullable(r1), is_nullable(r2), result); + } + else if (u().str.is_empty(r)) { + result = m.mk_true(); + } + else if (u().str.is_unit(r)) { + result = m.mk_false(); + } + else if (u().str.is_string(r, s1)) { + result = m.mk_bool_val(s1.length() == 0); + } + else { + SASSERT(m_util.is_seq(r)); + result = m.mk_eq(u().str.mk_empty(r->get_sort()), r); + } + return result; + } + + expr_ref derive::is_nullable_symbolic_regex(expr* r, sort* seq_sort) { + SASSERT(m_util.is_re(r)); + expr* elem = nullptr, * r1 = r, * r2 = nullptr, * s = nullptr; + expr_ref elems(u().str.mk_empty(seq_sort), m); + expr_ref result(m); + while (re().is_derivative(r1, elem, r2)) { + if (u().str.is_empty(elems)) + elems = u().str.mk_unit(elem); + else + elems = u().str.mk_concat(u().str.mk_unit(elem), elems); + r1 = r2; + } + if (re().is_to_re(r1, s)) { + result = m.mk_eq(elems, s); + return result; + } + result = re().mk_in_re(u().str.mk_empty(seq_sort), r); + return result; + } + + // ------------------------------------------------------- + // Smart constructors with simplification + // ------------------------------------------------------- + + + // Extract character range [lo, hi] from a derivative condition. + // Conditions are of the form: + // ele == c → range [c, c] + // char_le(lo_expr, ele) && char_le(ele, hi_expr) → range [lo, hi] + // char_le(lo_expr, ele) → range [lo, max_char] + // char_le(ele, hi_expr) → range [0, hi] + // Returns false if not a recognizable range condition. + // Predicate implication for character range conditions. + // Returns true if: whenever cond_a is true, cond_b must also be true. + // pred_implies(sign_a, a, sign_b, b): does (sign_a ? ¬a : a) imply (sign_b ? ¬b : b)? + bool derive::pred_implies(bool sign_a, expr* a, bool sign_b, expr* b) { + // Same atom: check sign compatibility + if (a == b) return sign_a == sign_b; + + // Both negated: ¬a → ¬b iff b → a, i.e. pred_implies(false, b, false, a) + if (sign_a && sign_b) + return pred_implies(false, b, false, a); + + unsigned lo_a, hi_a, lo_b, hi_b; + bool neg_a, neg_b; + + if (!sign_a && !sign_b) { + // a → b: range_a ⊆ range_b + if (u().is_char_const_range(m_ele, a, lo_a, hi_a, neg_a) && !neg_a && + u().is_char_const_range(m_ele, b, lo_b, hi_b, neg_b) && !neg_b) + return lo_b <= lo_a && hi_a <= hi_b; + } + else if (!sign_a && sign_b) { + // a → ¬b: range_a ∩ range_b = ∅ + if (u().is_char_const_range(m_ele, a, lo_a, hi_a, neg_a) && !neg_a && + u().is_char_const_range(m_ele, b, lo_b, hi_b, neg_b) && !neg_b) + return hi_a < lo_b || hi_b < lo_a; + } + else if (sign_a && !sign_b) { + // ¬a → b: complement of range_a ⊆ range_b + if (u().is_char_const_range(m_ele, a, lo_a, hi_a, neg_a) && !neg_a && + u().is_char_const_range(m_ele, b, lo_b, hi_b, neg_b) && !neg_b) + return lo_b == 0 && hi_b >= u().max_char(); + } + + return false; + } + + bool derive::pred_implies(expr* a, expr* b) { + bool sign_a = m.is_not(a, a); + bool sign_b = m.is_not(b, b); + return pred_implies(sign_a, a, sign_b, b); + } + + expr_ref derive::mk_xor(expr *a, expr *b) { + return mk_core(OP_RE_XOR, a, b); + } + + expr_ref derive::mk_xor_core(expr *a, expr *b) { + + return m_re.mk_xor0(a, b); + } + + expr_ref derive::mk_core(decl_kind k, expr* a, expr* b) { + expr *pe = get_path_expr(); + expr *cached = nullptr; + auto& cache = k == OP_RE_UNION ? union_cache() : k == OP_RE_INTERSECT ? inter_cache() : xor_cache(); + if (cache.find(a, b, pe, cached)) + return expr_ref(cached, m); + expr_ref result(m); + // ITE handling with path pruning + auto inter_op = [&](expr *x, expr *y) { return mk_inter(x, y); }; + auto union_op = [&](expr *x, expr *y) { return mk_union(x, y); }; + auto xor_op = [&](expr *x, expr *y) { return mk_xor(x, y); }; + switch (k) { + case OP_RE_UNION: + if (m_derivative_kind == derivative_kind::brzozowski_t) + result = hoist_ite(a, b, union_op); + if (!result) + result = mk_union_core(a, b); + break; + case OP_RE_INTERSECT: + result = hoist_ite(a, b, inter_op); + if (!result) + result = mk_inter_core(a, b); + break; + case OP_RE_XOR: + result = hoist_ite(a, b, xor_op); + if (!result) + result = mk_xor_core(a, b); + break; + default: + UNREACHABLE(); + break; + } + // Store in cache + cache.insert(a, b, pe, result); + m_trail.push_back(a); + m_trail.push_back(b); + m_trail.push_back(pe); + m_trail.push_back(result); + return result; + } + + expr_ref derive::mk_union(expr* a, expr* b) { + return mk_core(OP_RE_UNION, a, b); + } + + // Lightweight structural subsumption: checks if L(a) ⊆ L(b) + bool derive::is_subset(expr* a, expr* b) { + return m_re.is_subset(a, b); + } + + bool derive::are_complements(expr* a, expr* b) { + expr* c = nullptr; + if (re().is_complement(a, c) && c == b) return true; + if (re().is_complement(b, c) && c == a) return true; + return false; + } + + expr_ref derive::mk_union_core(expr* a, expr* b) { + + // Identity: none ∪ R = R (none is the unit of union) + // Idempotence: R ∪ R = R + // Absorption: Σ* ∪ R = Σ* + // Without these the derivative of an intersection accumulates + // un-simplified unions such as union(inter, union(none, none)), + // producing many syntactically distinct but semantically equal + // states. That defeats state dedup in the emptiness/bisim closure + // and makes contains-pattern intersections blow up. + if (re().is_empty(a)) return expr_ref(b, m); + if (re().is_empty(b)) return expr_ref(a, m); + if (a == b) return expr_ref(a, m); + if (re().is_full_seq(a) || re().is_full_seq(b)) + return expr_ref(re().mk_full_seq(a->get_sort()), m); + + // Prefix factoring: a·x ∪ a·y = a·(x ∪ y) + expr *a1, *a2, *b1, *b2; + if (re().is_concat(a, a1, a2) && re().is_concat(b, b1, b2) && a1 == b1) { + expr_ref tail = mk_union(a2, b2); + return mk_deriv_concat(a1, tail); + } + + // Subsumption: L(a) ⊆ L(b) ⇒ a ∪ b = b (and symmetrically). + // This collapses semantically-equal union states that arise in + // antimirov intersection derivatives, which is essential to keep the + // emptiness/bisim worklist from blowing up on contains-patterns. + if (is_subset(a, b)) return expr_ref(b, m); + if (is_subset(b, a)) return expr_ref(a, m); + + return m_re.mk_union(a, b); + } + + expr_ref derive::mk_inter(expr* a, expr* b) { + return mk_core(OP_RE_INTERSECT, a, b); + } + + expr_ref derive::mk_inter_core(expr* a, expr* b) { + + // Subsumption covers: a==b, empty(a), empty(b), full(a), full(b), etc. + if (is_subset(a, b)) return expr_ref(a, m); + if (is_subset(b, a)) return expr_ref(b, m); + + // Complement absorption: r ∩ ~r = ∅ + expr *c = nullptr, *d = nullptr; + if (re().is_complement(a, c) && c == b) + return expr_ref(re().mk_empty(a->get_sort()), m); + if (re().is_complement(b, c) && c == a) + return expr_ref(re().mk_empty(a->get_sort()), m); + if (re().is_complement(a, c) && re().is_complement(b, d)) + return expr_ref(re().mk_complement(mk_union_core(c, d)), m); + + + + // Distribution of intersection over union: (x ∪ y) ∩ b → (x ∩ b) ∪ (y ∩ b). + // This is essential for keeping the symbolic derivative a proper + // *transition regex*: with antimirov derivatives the operands of an + // intersection are unions of ITE-branches. If the intersection is left + // *above* the union/ITE structure, the solver's get_derivative_targets + // (which only descends through top-level ITE and union, not + // intersection) cannot decompose the transition regex into ground + // target states. The result is a handful of gigantic states still + // carrying the (:var 0) character conditions, on which the solver's + // cofactor/accept enumeration explodes. Distributing here pushes the + // intersection into the ITE leaves so states stay small and ground. + expr *u1 = nullptr, *u2 = nullptr; + if (re().is_union(a, u1, u2)) + return mk_union(mk_inter(u1, b), mk_inter(u2, b)); + if (re().is_union(b, u1, u2)) + return mk_union(mk_inter(a, u1), mk_inter(a, u2)); + + // Base case: build raw intersection + return m_re.mk_inter(a, b); + } + + + expr_ref derive::mk_concat(expr* a, expr* b) { + sort* seq_s = nullptr, * ele_s = nullptr; + VERIFY(m_util.is_re(a, seq_s)); + VERIFY(u().is_seq(seq_s, ele_s)); + if (re().is_empty(a)) return expr_ref(a, m); + if (re().is_empty(b)) return expr_ref(b, m); + if (re().is_epsilon(a)) return expr_ref(b, m); + if (re().is_epsilon(b)) return expr_ref(a, m); + if (re().is_full_seq(a) && re().is_full_seq(b)) + return expr_ref(a, m); + if (re().is_full_char(a) && re().is_full_seq(b)) + return expr_ref(re().mk_plus(re().mk_full_char(a->get_sort())), m); + if (re().is_full_seq(a) && re().is_full_char(b)) + return expr_ref(re().mk_plus(re().mk_full_char(a->get_sort())), m); + + // to_re(s1) · to_re(s2) → to_re(s1 ++ s2) + expr* s1 = nullptr, * s2 = nullptr; + if (re().is_to_re(a, s1) && re().is_to_re(b, s2)) + return expr_ref(re().mk_to_re(u().str.mk_concat(s1, s2)), m); + + // r* · r* → r* + + expr* a1 = nullptr, *a2 = nullptr, * b1 = nullptr; + + if (re().is_star(a, a1) && re().is_star(b, b1) && a1 == b1) + return expr_ref(a, m); + + // Right-associate: (a · b) · c → a · (b · c) + + if (re().is_concat(a, a1, a2)) + return mk_concat(a1, mk_concat(a2, b)); + + return expr_ref(re().mk_concat(a, b), m); + } + + expr_ref derive::mk_complement(expr* a) { + // Check path-aware op cache + expr* pe = get_path_expr(); + expr* cached = nullptr; + if (complement_cache().find(a, pe, cached)) + return expr_ref(cached, m); + + expr_ref result = mk_complement_core(a); + + // Store in cache + complement_cache().insert(a, pe, result); + m_trail.push_back(a); + m_trail.push_back(pe); + m_trail.push_back(result); + return result; + } + + expr_ref derive::mk_complement_core(expr* a) { + // ~~r → r + expr* r = nullptr; + if (re().is_complement(a, r)) + return expr_ref(r, m); + // ~∅ → Σ* + if (re().is_empty(a)) + return expr_ref(re().mk_full_seq(a->get_sort()), m); + // ~Σ* → ∅ + if (re().is_full_seq(a)) + return expr_ref(re().mk_empty(a->get_sort()), m); + + // Push through ITE with path pruning: ~(ite(c, t, e)) → ite(c, ~t, ~e) + expr* c, * t, * e; + if (m.is_ite(a, c, t, e)) { + auto comp_op = [&](expr* x) { return mk_complement(x); }; + expr_ref r = apply_ite(c, t, e, comp_op); + if (r) return r; + return expr_ref(re().mk_full_seq(a->get_sort()), m); + } + + // ~ε → .+ + sort* s = nullptr; + expr* r1 = nullptr; + if (re().is_to_re(a, r1) && u().str.is_empty(r1)) { + VERIFY(m_util.is_re(a, s)); + return expr_ref(re().mk_plus(re().mk_full_char(a->get_sort())), m); + } + + return expr_ref(re().mk_complement(a), m); + } + + expr_ref derive::mk_ite(expr* c, expr* t, expr* e) { + if (m.is_true(c) || t == e) + return expr_ref(t, m); + if (m.is_false(c)) + return expr_ref(e, m); + // Use path-aware condition evaluation + lbool cond_val = eval_path_cond(c); + if (cond_val == l_true) return expr_ref(t, m); + if (cond_val == l_false) return expr_ref(e, m); + return expr_ref(m.mk_ite(c, t, e), m); + } + + // ------------------------------------------------------- + // Distribute concat through ITE/union structure of derivative + // ------------------------------------------------------- + + expr_ref derive::mk_deriv_concat(expr* d, expr* tail) { + // Check op cache + expr* cached = nullptr; + if (concat_cache().find(d, tail, cached)) + return expr_ref(cached, m); + + expr_ref result = mk_deriv_concat_core(d, tail); + + // Store in cache + concat_cache().insert(d, tail, result); + m_trail.push_back(d); + m_trail.push_back(tail); + m_trail.push_back(result); + return result; + } + + expr_ref derive::mk_deriv_concat_core(expr* d, expr* tail) { + expr_ref _d(d, m), _tail(tail, m); + expr* c, * t, * e; + + if (re().is_empty(d)) + return expr_ref(d, m); + if (re().is_epsilon(d)) + return expr_ref(tail, m); + + if (m.is_ite(d, c, t, e)) { + expr_ref then_r = mk_deriv_concat(t, tail); + expr_ref else_r = mk_deriv_concat(e, tail); + return mk_ite(c, then_r, else_r); + } + + // (t ∪ e) · tail → (t · tail) ∪ (e · tail) + if (m_derivative_kind == derivative_kind::antimirov_t && re().is_union(d, t, e)) { + expr_ref left = mk_deriv_concat(t, tail); + expr_ref right = mk_deriv_concat(e, tail); + return mk_union(left, right); + } + + return mk_concat(d, tail); + } + + // ------------------------------------------------------- + // Path management for inline pruning + // ------------------------------------------------------- + + lbool derive::push(expr* c, bool sign) { + // Check if (c, sign) is already determined by the path + lbool cv = eval_path_cond(c); + if (cv == l_true && !sign) return l_true; // c implied true, push(c,false) is redundant + if (cv == l_false && sign) return l_true; // c implied false, push(c,true) is redundant + if (cv == l_true && sign) return l_false; // c implied true, push(c,true) contradicts + if (cv == l_false && !sign) return l_false; // c implied false, push(c,false) contradicts + + // Save current state + unsigned saved_path_sz = m_path.size(); + unsigned saved_intervals_sz = m_intervals.size(); + unsigned saved_intervals_start = m_intervals_start; + expr* saved_path_expr = m_path_expr; + + // Push atoms onto path and check for contradiction or implication + lbool result = push_path_atoms(c, sign); + if (result != l_undef) { + m_path.shrink(saved_path_sz); + m_intervals.shrink(saved_intervals_sz); + m_intervals_start = saved_intervals_start; + return result; + } + + // Update intervals + result = push_intervals_impl(c, sign); + if (result != l_undef) { + m_path.shrink(saved_path_sz); + m_intervals.shrink(saved_intervals_sz); + m_intervals_start = saved_intervals_start; + return result; + } + + // Update path expression + expr* atom = sign ? m.mk_not(c) : c; + m_path_expr = m.mk_and(m_path_expr, atom); + m_trail.push_back(m_path_expr); + + // Commit: save state for pop() + m_path_stack.push_back({ saved_path_sz, saved_intervals_sz, saved_intervals_start, saved_path_expr }); + return l_undef; + } + + void derive::pop() { + SASSERT(!m_path_stack.empty()); + auto const& saved = m_path_stack.back(); + m_path.shrink(saved.path_sz); + m_intervals.shrink(saved.intervals_sz); + m_intervals_start = saved.intervals_start; + m_path_expr = saved.path_expr; + m_path_stack.pop_back(); + } + + // Binary apply_ite: hoist ite(c, t, e) op r with path pruning + expr_ref derive::apply_ite(expr* c, expr* t, expr* e, expr* r, std::function apply_op) { + expr_ref then_br(m), else_br(m); + switch (push(c, false)) { + case l_true: return apply_op(t, r); + case l_undef: then_br = apply_op(t, r); pop(); break; + case l_false: break; + } + switch (push(c, true)) { + case l_true: return apply_op(e, r); + case l_undef: else_br = apply_op(e, r); pop(); break; + case l_false: break; + } + if (then_br && else_br) return mk_ite(c, then_br, else_br); + if (then_br) return then_br; + if (else_br) return else_br; + return expr_ref(nullptr, m); + } + + // Same-condition merge: ite(c, t1, e1) op ite(c, t2, e2) → ite(c, t1 op t2, e1 op e2) + expr_ref derive::apply_ite(expr* c, expr* t1, expr* e1, expr* t2, expr* e2, std::function apply_op) { + expr_ref then_br(m), else_br(m); + switch (push(c, false)) { + case l_true: return apply_op(t1, t2); + case l_undef: then_br = apply_op(t1, t2); pop(); break; + case l_false: break; + } + switch (push(c, true)) { + case l_true: return apply_op(e1, e2); + case l_undef: else_br = apply_op(e1, e2); pop(); break; + case l_false: break; + } + if (then_br && else_br) return mk_ite(c, then_br, else_br); + if (then_br) return then_br; + if (else_br) return else_br; + return expr_ref(nullptr, m); + } + + // Unary apply_ite: hoist ite(c, t, e) through unary op with path pruning + expr_ref derive::apply_ite(expr* c, expr* t, expr* e, std::function apply_op) { + expr_ref then_br(m), else_br(m); + switch (push(c, false)) { + case l_true: return apply_op(t); + case l_undef: then_br = apply_op(t); pop(); break; + case l_false: break; + } + switch (push(c, true)) { + case l_true: return apply_op(e); + case l_undef: else_br = apply_op(e); pop(); break; + case l_false: break; + } + if (then_br && else_br) return mk_ite(c, then_br, else_br); + if (then_br) return then_br; + if (else_br) return else_br; + return expr_ref(nullptr, m); + } + + // Common ITE dispatch for binary ops (union/inter). + // Returns nullptr if neither a nor b is ITE. + expr_ref derive::hoist_ite(expr* a, expr* b, std::function apply_op) { + expr *c1, *t1, *e1, *c2, *t2, *e2; + if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2) && c1->get_id() > c2->get_id()) + std::swap(a, b); + if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2)) { + expr_ref r(m); + if (c1 == c2) + r = apply_ite(c1, t1, e1, t2, e2, apply_op); + else + r = apply_ite(c1, t1, e1, b, apply_op); + if (r) return r; + return expr_ref(re().mk_empty(a->get_sort()), m); + } + // Single-ITE hoisting: must always recurse to maintain path-aware + // soundness — falling back to a non-path-aware structural rewrite + // here would bake unreachable branches into the result tree. + if (m.is_ite(a, c1, t1, e1)) { + expr_ref r = apply_ite(c1, t1, e1, b, apply_op); + if (r) return r; + return expr_ref(re().mk_empty(a->get_sort()), m); + } + if (m.is_ite(b, c2, t2, e2)) { + expr_ref r = apply_ite(c2, t2, e2, a, apply_op); + if (r) return r; + return expr_ref(re().mk_empty(a->get_sort()), m); + } + return expr_ref(nullptr, m); + } + + // Push signed atoms onto m_path. Returns l_true if implied, l_false if contradicted, l_undef if pushed. + lbool derive::push_path_atoms(expr* c, bool sign) { + // Check if (c, sign) is already determined by the path + for (auto const& [cond, csign] : m_path) { + if (c == cond) + return csign == sign ? l_true : l_false; + expr* lhs1 = nullptr, * rhs1 = nullptr, * lhs2 = nullptr, * rhs2 = nullptr; + // x = v, v != w |-> x != w + if (!csign && m.is_eq(cond, lhs1, rhs1) && m.is_eq(c, lhs2, rhs2)) { + if (m.is_value(lhs1)) std::swap(lhs1, rhs1); + if (m.is_value(lhs2)) std::swap(lhs2, rhs2); + if (lhs1 == lhs2 && m.are_distinct(rhs1, rhs2)) + return sign ? l_true : l_false; + } + } + + // Composite: conjunction assumed true, or disjunction assumed false + if ((!sign && m.is_and(c)) || (sign && m.is_or(c))) { + bool all_implied = true; + for (expr* arg : *to_app(c)) { + lbool r = push_path_atoms(arg, sign); + if (r == l_false) return l_false; + if (r == l_undef) all_implied = false; + } + return all_implied ? l_true : l_undef; + } + + // Atomic: push onto path + m_path.push_back({ c, sign }); + return l_undef; + } + + // Update m_intervals based on the condition. Returns l_true if implied, l_false if inconsistent, l_undef if pushed. + // Operates on the active suffix m_intervals[m_intervals_start..end]. + // On modification, appends new intervals and updates m_intervals_start. + lbool derive::push_intervals_impl(expr* c, bool sign) { + unsigned lo = 0, hi = 0; + bool negated = false; + if (m_util.is_char_const_range(m_ele, c, lo, hi, negated)) { + bool effective_neg = (negated != sign); + if (!effective_neg) { + if (lo <= hi) { + // Check if current intervals already imply [lo,hi] + bool already_subset = true; + for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) { + if (m_intervals[i].first < lo || m_intervals[i].second > hi) { already_subset = false; break; } + } + if (already_subset) return l_true; + intersect_intervals(lo, hi); + } else { + // lo > hi means empty range — contradiction + return l_false; + } + } else { + if (lo <= hi) { + // Check if current intervals already exclude [lo,hi] + bool already_excluded = true; + for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) { + if (m_intervals[i].first <= hi && m_intervals[i].second >= lo) { already_excluded = false; break; } + } + if (already_excluded) return l_true; + exclude_interval(lo, hi); + } + } + } else if ((!sign && m.is_and(c)) || (sign && m.is_or(c))) { + bool all_implied = true; + for (expr* arg : *to_app(c)) { + lbool r = push_intervals_impl(arg, sign); + if (r == l_false) return l_false; + if (r == l_undef) all_implied = false; + } + unsigned n = m_intervals.size() - m_intervals_start; + return all_implied ? l_true : (n == 0 ? l_false : l_undef); + } + unsigned n = m_intervals.size() - m_intervals_start; + return n == 0 ? l_false : l_undef; + } + + // Evaluate a condition against the current path and intervals. + lbool derive::eval_path_cond(expr* c) { + // First try static evaluation (concrete m_ele, tautologies) + lbool v = eval_cond(c); + if (v != l_undef) return v; + + // Check against path atoms + for (auto const& [cond, sign] : m_path) { + if (c == cond) + return sign ? l_false : l_true; + } + + // Check against intervals + v = eval_range_cond(c); + if (v != l_undef) return v; + + // Check pred_implies from path atoms + for (auto const& [cond, sign] : m_path) { + if (pred_implies(sign, cond, false, c)) + return l_true; + if (pred_implies(sign, cond, true, c)) + return l_false; + } + + return l_undef; + } + + // ------------------------------------------------------- + // Condition evaluation helpers + // ------------------------------------------------------- + + lbool derive::eval_cond(expr* cond) { + expr* e1 = nullptr; + + if (m.is_true(cond)) return l_true; + if (m.is_false(cond)) return l_false; + + // Use is_char_const_range to evaluate conditions involving m_ele + unsigned lo = 0, hi = 0, ele_val = 0; + bool negated = false; + if (m_util.is_char_const_range(m_ele, cond, lo, hi, negated) && u().is_const_char(m_ele, ele_val)) { + bool in_range = (lo <= ele_val && ele_val <= hi); + return (in_range != negated) ? l_true : l_false; + } + + // Handle self-equality and constant comparisons not involving m_ele + expr* lhs = nullptr, * rhs = nullptr; + if (m.is_eq(cond, lhs, rhs) && lhs == rhs) + return l_true; + + unsigned vl = 0, vr = 0; + if (u().is_char_le(cond, lhs, rhs)) { + if (u().is_const_char(lhs, vl) && u().is_const_char(rhs, vr)) + return vl <= vr ? l_true : l_false; + if (u().is_const_char(lhs, vl) && vl == 0) + return l_true; + if (u().is_const_char(rhs, vr) && vr == u().max_char()) + return l_true; + } + + // not(e1) + if (m.is_not(cond, e1)) + return ~eval_cond(e1); + + // and(...) + if (m.is_and(cond)) { + lbool r = l_true; + for (expr* arg : *to_app(cond)) { + lbool v = eval_cond(arg); + if (v == l_false) return l_false; + if (v == l_undef) r = l_undef; + } + return r; + } + + // or(...) + if (m.is_or(cond)) { + lbool r = l_false; + for (expr* arg : *to_app(cond)) { + lbool v = eval_cond(arg); + if (v == l_true) return l_true; + if (v == l_undef) r = l_undef; + } + return r; + } + + return l_undef; + } + + lbool derive::eval_range_cond(expr* c) { + unsigned n = m_intervals.size() - m_intervals_start; + if (n == 0) + return l_false; + unsigned lo = 0, hi = 0; + bool negated = false; + if (!m_util.is_char_const_range(m_ele, c, lo, hi, negated)) + return l_undef; + if (lo > hi) { + return negated ? l_true : l_false; + } + // Check if [lo, hi] overlaps with intervals and/or contains all intervals + bool any_overlap = false; + bool all_contained = true; + for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) { + auto [r_lo, r_hi] = m_intervals[i]; + if (std::max(r_lo, lo) <= std::min(r_hi, hi)) + any_overlap = true; + if (r_lo < lo || r_hi > hi) + all_contained = false; + } + if (!negated) { + if (!any_overlap) return l_false; + if (all_contained) return l_true; + } else { + if (all_contained) return l_false; + if (!any_overlap) return l_true; + } + return l_undef; + } + + // Intersect the active suffix m_intervals[m_intervals_start..end] with [lo, hi] + void derive::intersect_intervals(unsigned lo, unsigned hi) { + // Copy active suffix to end, update start, then filter + unsigned old_sz = m_intervals.size(); + for (unsigned i = m_intervals_start; i < old_sz; ++i) { + auto e = m_intervals[i]; + m_intervals.push_back(e); + } + m_intervals_start = old_sz; + // Filter in-place within new suffix: drop intervals disjoint from [lo,hi], + // keep the intersection for overlapping ones. + unsigned j = m_intervals_start; + for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) { + auto [lo1, hi1] = m_intervals[i]; + if (hi < lo1 || lo > hi1) + continue; // disjoint with this interval — drop it + m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)}; + } + m_intervals.shrink(j); + } + + // Exclude [lo, hi] from the active suffix m_intervals[m_intervals_start..end] + void derive::exclude_interval(unsigned lo, unsigned hi) { + unsigned max_char = u().max_char(); + if (lo == 0 && hi >= max_char) { m_intervals_start = m_intervals.size(); return; } + if (lo == 0) { intersect_intervals(hi + 1, max_char); return; } + if (hi >= max_char) { intersect_intervals(0, lo - 1); return; } + // Each interval [ilo, ihi] minus [lo, hi] → up to 2 pieces + // Append new results past the end, then move start + unsigned old_start = m_intervals_start; + unsigned old_sz = m_intervals.size(); + for (unsigned i = old_start; i < old_sz; ++i) { + auto [ilo, ihi] = m_intervals[i]; + if (ihi < lo || ilo > hi) { + auto e = m_intervals[i]; + m_intervals.push_back(e); + } else { + if (ilo < lo) + m_intervals.push_back({ilo, lo - 1}); + if (ihi > hi) + m_intervals.push_back({hi + 1, ihi}); + } + } + m_intervals_start = old_sz; + } + + // ------------------------------------------------------- + // Cofactor enumeration over a transition regex + // ------------------------------------------------------- + + expr_ref derive::clean_leaf(expr* r) { + expr* a = nullptr, * b = nullptr; + if (re().is_union(r, a, b)) + return mk_union(clean_leaf(a), clean_leaf(b)); + if (re().is_intersection(r, a, b)) + return mk_inter(clean_leaf(a), clean_leaf(b)); + return expr_ref(r, m); + } + + void derive::get_cofactors_rec(expr* r, expr_ref_pair_vector& result) { + // Hoist the (first) if-then-else condition to the top of r, splitting it + // into the equivalent ite(c, th, el); when r contains no ite it is a + // leaf of the transition regex. + expr_ref c(m), th(m), el(m); + if (!m_br.decompose_ite(r, c, th, el)) { + // Re-normalize the leaf: decompose_ite substitutes ITE branches + // structurally so the leaf may carry un-simplified union(_, none) + // / inter(_, none) nodes. Cleaning them keeps semantically equal + // states syntactically identical, which is essential for state + // dedup in the emptiness/bisim closure. + expr_ref cr = clean_leaf(r); + if (!re().is_empty(cr)) + result.push_back(get_path_expr(), cr); + return; + } + // Positive branch: c holds. + switch (push(c, false)) { + case l_true: get_cofactors_rec(th, result); break; + case l_undef: get_cofactors_rec(th, result); pop(); break; + case l_false: break; + } + // Negative branch: c does not hold. + switch (push(c, true)) { + case l_true: get_cofactors_rec(el, result); break; + case l_undef: get_cofactors_rec(el, result); pop(); break; + case l_false: break; + } + } + + void derive::get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result) { + SASSERT(m_util.is_re(r)); + if (ele != m_ele) + reset_op_caches(); + m_ele = ele; + m_trail.push_back(ele); + m_trail.push_back(r); + // Initialize a fresh path/interval context for this traversal. + m_path.reset(); + m_path_stack.reset(); + m_intervals.reset(); + m_intervals.push_back({0u, u().max_char()}); + m_intervals_start = 0; + m_path_expr = m.mk_true(); + get_cofactors_rec(r, result); + } + + void derive::derivative_cofactors(expr* r, expr_ref_pair_vector& result) { + // Compute the symbolic derivative wrt the canonical variable + // (:var 0); operator() sets m_ele to that variable. + expr_ref d = (*this)(derivative_kind::brzozowski_t, r); + // Enumerate the reachable, fully ITE-hoisted leaves of the + // transition regex. get_cofactors uses the SAME m_ele set above, + // so the (:var 0) conditions in d are matched and pruned. + get_cofactors(m_ele, d, result); + } + +} + diff --git a/src/ast/rewriter/seq_derive.h b/src/ast/rewriter/seq_derive.h new file mode 100644 index 0000000000..d3288dccf8 --- /dev/null +++ b/src/ast/rewriter/seq_derive.h @@ -0,0 +1,265 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_derive.h + +Abstract: + + Symbolic derivative computation for regular expressions. + Produces an ITE-tree (transition regex) representation where + the free variable is de Bruijn index 0 representing the input character. + + Based on the theory of symbolic derivatives and transition regexes: + - Veanes et al., "On Symbolic Derivatives and Transition Regexes" (LPAR 2024) + - Varatalu, Veanes, Ernits, "RE#" (POPL 2025) + - Stanford, Veanes, Bjørner, "Symbolic Boolean Derivatives" (PLDI 2021) + +Authors: + + Nikolaj Bjorner (nbjorner) 2025-06-03 + +--*/ + +#pragma once + +#include "ast/seq_decl_plugin.h" +#include "ast/arith_decl_plugin.h" +#include "ast/array_decl_plugin.h" +#include "ast/rewriter/bool_rewriter.h" +#include "util/obj_pair_hashtable.h" +#include "util/obj_triple_hashtable.h" +#include + +class seq_rewriter; + +namespace seq { + + enum class derivative_kind { antimirov_t, brzozowski_t }; + /** + * Symbolic derivative engine for regular expressions. + * + * Given a regex r, operator()(r) computes a symbolic derivative δ(r) + * represented as an ITE-tree over character predicates (using de Bruijn + * variable 0 for the character). Evaluating the ITE-tree for a concrete + * character 'a' yields the classical Brzozowski derivative δ_a(r). + * + * The ITE-tree structure implicitly defines minterms (equivalence classes + * of characters indistinguishable by the regex). + * + * Key properties: + * - Results are memoized for termination on cyclic derivative graphs + * - Union/intersection operands are sorted for ACI canonicalization + * - Depth-bounded to prevent stack overflow + */ + class derive { + ast_manager& m; + seq_util m_util; + arith_util m_autil; + bool_rewriter m_br; + seq_rewriter& m_re; + + // Cache: maps (ele, regex) pair to its derivative + obj_pair_map m_acache, m_bcache; + obj_pair_map m_atop_cache, m_btop_cache; // post-simplify cache + expr_ref_vector m_trail; // pin cached results + + // Op cache for ITE-hoisting operations (union, inter, concat, complement) + // Path-aware caches: key is (a, b, path_expr) for binary ops, (a, path_expr) for complement + obj_triple_map m_aunion_cache, m_bunion_cache, m_ainter_cache, m_binter_cache, m_axor_cache, m_bxor_cache; + obj_pair_map m_aconcat_cache, m_bconcat_cache; + obj_pair_map m_acomplement_cache, m_bcomplement_cache; + + // Depth limiting + unsigned m_depth { 0 }; + static const unsigned m_max_depth = 512; + + seq_util::rex& re() { return m_util.re; } + seq_util& u() { return m_util; } + + derivative_kind m_derivative_kind = derivative_kind::antimirov_t; + + // The element (character) for the current derivative computation + expr_ref m_ele; + + // Path state for inline pruning during mk_inter/mk_union/mk_complement + using intervals_t = svector>; + + // Path: vector of signed atoms + svector> m_path; + // Intervals: feasible character ranges under current path (append-only) + intervals_t m_intervals; + unsigned m_intervals_start { 0 }; + // Stack of saved states for push/pop + struct path_save { unsigned path_sz; unsigned intervals_sz; unsigned intervals_start; expr* path_expr; }; + svector m_path_stack; + // Boolean expression encoding of current path (for cache keys) + expr_ref m_path_expr; + + // Path interface + lbool push(expr* c, bool sign); // l_true: implied, l_undef: pushed (must pop), l_false: contradicts + void pop(); // restore state to matching push + expr* get_path_expr() { return m_path_expr; } + + obj_pair_map &cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_acache : m_bcache; + } + + obj_pair_map &top_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_atop_cache : m_btop_cache; + } + + obj_triple_map &union_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_aunion_cache : m_bunion_cache; + } + + obj_triple_map &inter_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_ainter_cache : m_binter_cache; + } + + obj_triple_map &xor_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_axor_cache : m_bxor_cache; + } + + obj_pair_map &concat_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_aconcat_cache : m_bconcat_cache; + } + + obj_pair_map &complement_cache() { + return m_derivative_kind == derivative_kind::antimirov_t ? m_acomplement_cache : m_bcomplement_cache; + } + + // Hoist ITE: apply_op through ite(c, t, e) with path pruning + expr_ref apply_ite(expr* c, expr* t, expr* e, expr* r, std::function apply_op); + expr_ref apply_ite(expr* c, expr* t1, expr* e1, expr* t2, expr* e2, std::function apply_op); + expr_ref apply_ite(expr* c, expr* t, expr* e, std::function apply_op); + // Common ITE dispatch for binary ops (union/inter) + expr_ref hoist_ite(expr* a, expr* b, std::function apply_op); + + // Evaluate a condition against the current path/intervals + lbool eval_path_cond(expr* c); + + // Internal helpers for push + lbool push_path_atoms(expr* c, bool sign); + lbool push_intervals_impl(expr* c, bool sign); + + // Core derivative computation + expr_ref derive_rec(expr* r); + expr_ref derive_core(expr* r); + + // Helpers for specific regex constructs + expr_ref derive_to_re(expr* s, sort* seq_sort); + expr_ref derive_range(expr* lo, expr* hi, sort* seq_sort); + expr_ref derive_of_pred(expr* pred, sort* seq_sort); + + // Nullable check: returns a Boolean expression + expr_ref is_nullable(expr* r); + expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort); + + // Smart constructors with path-aware simplification and ACI canonicalization + expr_ref mk_union(expr* a, expr* b); + bool are_complements(expr* a, expr* b); + unsigned union_id(expr* e); // complement-aware ID for sorting + bool is_subset(expr* a, expr* b); + expr_ref mk_union_core(expr* a, expr* b); + expr_ref mk_inter(expr* a, expr* b); + expr_ref mk_inter_core(expr* a, expr* b); + expr_ref mk_concat(expr* a, expr* b); + expr_ref mk_complement(expr* a); + expr_ref mk_complement_core(expr* a); + expr_ref mk_xor(expr *a, expr *b); + expr_ref mk_xor_core(expr *a, expr *b); + expr_ref mk_core(decl_kind k, expr* a, expr* b); + expr_ref mk_ite(expr* c, expr* t, expr* e); + + // Distribute concatenation through ITE/union in derivative + expr_ref mk_deriv_concat(expr* d, expr* tail); + expr_ref mk_deriv_concat_core(expr* d, expr* tail); + + // Extract head character and tail from a sequence expression + bool get_head_tail(expr* s1, expr* s2, expr_ref& hd, expr_ref& tl); + + // Predicate implication for character range conditions. + bool pred_implies(bool sign_a, expr* a, bool sign_b, expr* b); + bool pred_implies(expr* a, expr* b); + + // Normalize reverse(r) + expr_ref mk_regex_reverse(expr* r); + + // Condition evaluation helpers + lbool eval_cond(expr* cond); + lbool eval_range_cond(expr* c); + void intersect_intervals(unsigned lo, unsigned hi); + void exclude_interval(unsigned lo, unsigned hi); + + // Cofactor enumeration over a transition regex (ITE-tree). + void get_cofactors_rec(expr* r, expr_ref_pair_vector& result); + + // Re-apply union/intersection simplifications bottom-up to a cofactor + // leaf. decompose_ite substitutes ITE branch values structurally + // (no simplification), so leaves can contain un-normalized nodes such + // as union(R, none) or inter(R, none); this rebuilds them through + // mk_union/mk_inter so equal states share a canonical form. + expr_ref clean_leaf(expr* r); + + sort* re_sort(expr* r) { return r->get_sort(); } + sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; } + sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; } + + void reset(); + void reset_op_caches(); + + public: + derive(ast_manager& m, seq_rewriter& re); + + /** + * Compute the derivative of regex r with respect to element ele. + * When ele is a de Bruijn variable, produces a symbolic ITE-tree. + * When ele is a concrete character, produces the concrete derivative. + */ + expr_ref operator()(derivative_kind k, expr* ele, expr* r); + + /** + * Convenience: symbolic derivative using de Bruijn var 0. + */ + expr_ref operator()(derivative_kind k, expr* r); + + /** + * Nullable check: returns a Boolean expression that is true iff r accepts the empty string. + */ + expr_ref nullable(expr* r) { return is_nullable(r); } + + /** + * Enumerate the cofactors (min-terms) of a transition regex r taken with + * respect to element ele. r is an ITE-tree over character predicates on + * ele; for every feasible path through the tree this produces a pair + * (path_condition, leaf_regex). Infeasible character-interval + * combinations are pruned using the same path/interval context that the + * derivative engine uses while hoisting ITEs. + */ + void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result); + + /** + * Compute the symbolic derivative of r and enumerate its reachable + * leaves in fully ITE-hoisted normal form. + * + * Concretely this returns, for every feasible minterm (character + * class) of δ(r), a pair (path_condition, target_regex). Every + * if-then-else over the input character (including ones that would + * otherwise be buried under a concat/union) is hoisted to the top + * via the same path/interval pruning used by the derivative engine, + * so each target_regex is free of (:var 0) and its nullability is + * always decidable. Unions are kept intact as single leaves (a + * union leaf denotes a single bisimulation state). Infeasible + * minterms are pruned, so all returned leaves are reachable. + * + * This is the entry point the regex_bisim equivalence procedure + * uses: it consumes the target_regex of each pair and ignores the + * (redundant) path condition. + */ + void derivative_cofactors(expr* r, expr_ref_pair_vector& result); + + }; + +} diff --git a/src/ast/rewriter/seq_range_collapse.cpp b/src/ast/rewriter/seq_range_collapse.cpp new file mode 100644 index 0000000000..0c739e0c07 --- /dev/null +++ b/src/ast/rewriter/seq_range_collapse.cpp @@ -0,0 +1,168 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_range_collapse.cpp + +Abstract: + + Implementation of regex <-> range_predicate translation for the + boolean-combination-of-ranges fragment. See header for the recognized + grammar and the canonical regex AST emitted by materialization. + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/seq_range_collapse.h" + +namespace seq { + + bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) { + // The range algebra only models sets of single characters over the + // unsigned character domain [0, max_char]. Guard against any regex + // whose element type is not a sequence of characters (e.g. a regex + // over (Seq Int) or (Seq (Seq Char))): for such regexes the + // re.range/re.union/... matchers below would silently fabricate a + // character-class predicate and change semantics. Reject them up + // front so callers fall back to the generic regex path. + sort* seq_sort = nullptr; + if (!u.is_re(r, seq_sort) || !u.is_string(seq_sort)) + return false; + + unsigned const max_char = u.max_char(); + auto& re = u.re; + + if (re.is_empty(r)) { + out = range_predicate::empty(max_char); + return true; + } + if (re.is_full_char(r)) { + out = range_predicate::top(max_char); + return true; + } + unsigned lo = 0, hi = 0; + expr* lo_e = nullptr; + expr* hi_e = nullptr; + if (re.is_range(r, lo_e, hi_e)) { + auto extract_char = [&](expr* e, unsigned& c) -> bool { + if (u.is_const_char(e, c)) return true; + expr* inner = nullptr; + if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true; + zstring s; + if (u.str.is_string(e, s) && s.length() == 1) { + c = s[0]; + return true; + } + return false; + }; + if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi)) + return false; + // Empty/inverted range [lo > hi] is the empty regex. + if (lo > hi) { + out = range_predicate::empty(max_char); + return true; + } + out = range_predicate::range(lo, hi, max_char); + return true; + } + expr *a = nullptr, *b = nullptr, *c = nullptr; + if (re.is_union(r, a, b)) { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) return false; + if (!regex_to_range_predicate(u, b, pb)) return false; + out = pa | pb; + return true; + } + auto mk_diff = [&](expr *a, expr *b) -> bool { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) + return false; + if (!regex_to_range_predicate(u, b, pb)) + return false; + out = pa - pb; + return true; + }; + if (re.is_diff(r, a, b)) + return mk_diff(a, b); + + if (re.is_intersection(r, a, b) && re.is_complement(b, c)) + return mk_diff(a, c); + + if (re.is_intersection(r, a, b) && re.is_complement(a, c)) + return mk_diff(b, c); + + if (re.is_intersection(r, a, b)) { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) return false; + if (!regex_to_range_predicate(u, b, pb)) return false; + out = pa & pb; + return true; + } + + + // NOTE: re.complement is intentionally NOT handled here. + // re.complement is the SEQUENCE-level complement: its language + // includes the empty string, strings of length >= 2, and any + // length-1 string outside the operand. A character-class + // range_predicate can only describe a set of length-1 strings, + // so collapsing re.complement(R) to ~R (character-level + // complement) would change semantics whenever R is wrapped in + // any sequence-level context (e.g. re.diff at the top level, + // or membership tests). De-Morgan equivalences and the + // special cases re.complement(re.empty) / re.complement(re.full) + // are already handled directly in seq_rewriter::mk_re_complement. + return false; + } + + static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) { + return expr_ref(u.str.mk_string(zstring(c)), u.get_manager()); + } + + static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) { + ast_manager& m = u.get_manager(); + if (lo == 0 && hi == u.max_char()) + return expr_ref(u.re.mk_full_char(re_sort), m); + // Use the canonical unit-character form (seq.unit (Char N)) for + // range bounds. This matches the shape used elsewhere in + // seq_rewriter and avoids creating duplicate AST nodes with + // different ids for semantically identical ranges. + expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m); + expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m); + return expr_ref(u.re.mk_range(slo, shi), m); + } + + expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) { + ast_manager& m = u.get_manager(); + sort* re_sort = u.re.mk_re(seq_sort); + if (p.is_empty()) + return expr_ref(u.re.mk_empty(re_sort), m); + unsigned const n = p.num_ranges(); + SASSERT(n > 0); + if (n == 1) { + auto [lo, hi] = p[0]; + return mk_single_range_regex(u, lo, hi, re_sort); + } + // Build single-range AST nodes first, then sort by expression id + // so the resulting right-associated union matches the canonical + // id-sorted shape that seq_rewriter::merge_regex_sets expects. + // Without this the merge algorithm produces incorrect unions + // when it has to combine our materialized output with another + // (id-sorted) regex set. + expr_ref_vector ranges(m); + for (unsigned i = 0; i < n; ++i) { + auto [lo, hi] = p[i]; + ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort)); + } + std::sort(ranges.data(), ranges.data() + ranges.size(), + [](expr* a, expr* b) { return a->get_id() < b->get_id(); }); + expr_ref acc(ranges.get(n - 1), m); + for (unsigned i = n - 1; i-- > 0; ) + acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m); + return acc; + } + +} diff --git a/src/ast/rewriter/seq_range_collapse.h b/src/ast/rewriter/seq_range_collapse.h new file mode 100644 index 0000000000..16cd5fd67b --- /dev/null +++ b/src/ast/rewriter/seq_range_collapse.h @@ -0,0 +1,71 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_range_collapse.h + +Abstract: + + Recognize regexes that are boolean combinations of character-class + primitives (re.empty, re.full_char, re.range with concrete chars, + and re.union/inter/comp/diff over translatable arguments), and + materialize a seq::range_predicate back into a canonical regex AST. + + Together with seq_rewriter integration, this lets any boolean + combination of character-class regexes collapse to a canonical + multi-range form, so that equivalent character classes share AST + identity, and downstream consumers (derivative, OneStep, caching) + can short-circuit them as pure range predicates. + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ +#pragma once + +#include "ast/rewriter/seq_range_predicate.h" +#include "ast/seq_decl_plugin.h" + +namespace seq { + + /** + * If r is a boolean combination of character-class regex primitives + * over the unsigned character domain [0, max_char], compute the + * equivalent range_predicate and return true. Otherwise return false + * with out untouched. + * + * Recognized fragment (all character-class-preserving operations): + * re.empty -> empty + * re.full_char_set -> top + * re.range "c_lo" "c_hi" (concrete) -> [c_lo, c_hi] + * re.union r1 r2 -> p1 | p2 + * re.intersection r1 r2 -> p1 & p2 + * re.diff r1 r2 -> p1 - p2 + * + * Notably re.complement is NOT recognized: it is a SEQUENCE-level + * complement (over all of Σ*), not a character-class complement, so + * collapsing it would change semantics whenever the result is used + * in any non-character-class context. Sequence-level rewrites for + * re.complement (double-comp, deMorgan, etc.) are handled directly + * in seq_rewriter::mk_re_complement. + */ + bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out); + + /** + * Canonical materialization of p as a regex AST over the given + * sequence sort. Two range_predicates with equal canonical + * representations produce structurally identical regex ASTs: + * + * empty -> re.empty + * top -> re.full_char_set + * single range [lo, hi] -> re.range "lo" "hi" + * multiple ranges -> right-associated re.union of single + * ranges, in increasing order of lo + * (matching the canonical range order + * held by range_predicate). + */ + expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort); + +} diff --git a/src/ast/rewriter/seq_range_predicate.cpp b/src/ast/rewriter/seq_range_predicate.cpp new file mode 100644 index 0000000000..7bb9eac821 --- /dev/null +++ b/src/ast/rewriter/seq_range_predicate.cpp @@ -0,0 +1,292 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_range_predicate.cpp + +Abstract: + + Implementation of the specialized range-algebra used by symbolic + derivative computation and regex rewriting. See seq_range_predicate.h + for the algebraic specification. + + All Boolean operations are implemented as single linear sweeps over + the canonical sorted range vectors and produce canonical output + (sorted, disjoint, non-adjacent). + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/seq_range_predicate.h" +#include "util/debug.h" +#include +#include + +namespace seq { + + // ----------------------------------------------------------------------- + // Factories + // ----------------------------------------------------------------------- + + range_predicate range_predicate::empty(unsigned max_char) { + return range_predicate(max_char); + } + + range_predicate range_predicate::top(unsigned max_char) { + range_predicate r(max_char); + r.m_ranges.push_back({0u, max_char}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::singleton(unsigned c, unsigned max_char) { + SASSERT(c <= max_char); + range_predicate r(max_char); + r.m_ranges.push_back({c, c}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::range(unsigned lo, unsigned hi, unsigned max_char) { + range_predicate r(max_char); + if (lo <= hi && lo <= max_char) { + unsigned clipped_hi = hi <= max_char ? hi : max_char; + r.m_ranges.push_back({lo, clipped_hi}); + } + SASSERT(r.well_formed()); + return r; + } + + // ----------------------------------------------------------------------- + // Invariants and observers + // ----------------------------------------------------------------------- + + bool range_predicate::well_formed() const { + for (unsigned i = 0; i < m_ranges.size(); ++i) { + auto [lo, hi] = m_ranges[i]; + if (lo > hi) return false; + if (hi > m_max_char) return false; + if (i > 0) { + unsigned prev_hi = m_ranges[i - 1].second; + // Non-adjacent and sorted: prev_hi + 1 < lo, with care + // around prev_hi == UINT_MAX which we never expect because + // hi <= m_max_char. + if (prev_hi + 1 >= lo) return false; + } + } + return true; + } + + bool range_predicate::contains(unsigned c) const { + // Binary search on first element of pairs. + unsigned lo = 0, hi = m_ranges.size(); + while (lo < hi) { + unsigned mid = lo + (hi - lo) / 2; + auto [a, b] = m_ranges[mid]; + if (c < a) hi = mid; + else if (c > b) lo = mid + 1; + else return true; + } + return false; + } + + uint64_t range_predicate::cardinality() const { + uint64_t n = 0; + for (auto [lo, hi] : m_ranges) + n += static_cast(hi) - static_cast(lo) + 1u; + return n; + } + + // ----------------------------------------------------------------------- + // Equality, ordering, hashing + // ----------------------------------------------------------------------- + + bool range_predicate::equals(range_predicate const& o) const { + if (m_max_char != o.m_max_char) return false; + if (m_ranges.size() != o.m_ranges.size()) return false; + for (unsigned i = 0; i < m_ranges.size(); ++i) + if (m_ranges[i] != o.m_ranges[i]) return false; + return true; + } + + bool range_predicate::operator<(range_predicate const& o) const { + if (m_max_char != o.m_max_char) + return m_max_char < o.m_max_char; + unsigned n = std::min(m_ranges.size(), o.m_ranges.size()); + for (unsigned i = 0; i < n; ++i) { + auto a = m_ranges[i]; + auto b = o.m_ranges[i]; + if (a.first != b.first) return a.first < b.first; + if (a.second != b.second) return a.second < b.second; + } + return m_ranges.size() < o.m_ranges.size(); + } + + unsigned range_predicate::hash() const { + // FNV-1a 32-bit over (max_char, then each (lo, hi)). + uint32_t h = 2166136261u; + auto step = [&](uint32_t x) { + h ^= x; + h *= 16777619u; + }; + step(m_max_char); + for (auto [lo, hi] : m_ranges) { + step(lo); + step(hi); + } + return h; + } + + // ----------------------------------------------------------------------- + // Boolean operations + // ----------------------------------------------------------------------- + + namespace { + // Append (lo, hi) to result, merging with the previous range if + // adjacent or overlapping. Maintains canonical form. + inline void append_merged(svector>& result, + unsigned lo, unsigned hi) { + SASSERT(lo <= hi); + if (!result.empty() && result.back().second + 1 >= lo) { + if (result.back().second < hi) + result.back().second = hi; + } else { + result.push_back({lo, hi}); + } + } + } + + range_predicate range_predicate::operator|(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + range_predicate r(m_max_char); + unsigned i = 0, j = 0; + const unsigned n = m_ranges.size(); + const unsigned m = o.m_ranges.size(); + while (i < n && j < m) { + auto a = m_ranges[i]; + auto b = o.m_ranges[j]; + if (a.first <= b.first) { + append_merged(r.m_ranges, a.first, a.second); + ++i; + } else { + append_merged(r.m_ranges, b.first, b.second); + ++j; + } + } + while (i < n) { + auto a = m_ranges[i++]; + append_merged(r.m_ranges, a.first, a.second); + } + while (j < m) { + auto b = o.m_ranges[j++]; + append_merged(r.m_ranges, b.first, b.second); + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator&(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + range_predicate r(m_max_char); + unsigned i = 0, j = 0; + const unsigned n = m_ranges.size(); + const unsigned m = o.m_ranges.size(); + while (i < n && j < m) { + auto [a_lo, a_hi] = m_ranges[i]; + auto [b_lo, b_hi] = o.m_ranges[j]; + unsigned lo = std::max(a_lo, b_lo); + unsigned hi = std::min(a_hi, b_hi); + if (lo <= hi) + r.m_ranges.push_back({lo, hi}); + // Advance the range that ends first. + if (a_hi < b_hi) ++i; + else if (b_hi < a_hi) ++j; + else { ++i; ++j; } + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator~() const { + range_predicate r(m_max_char); + unsigned cursor = 0; + for (auto [lo, hi] : m_ranges) { + if (cursor < lo) + r.m_ranges.push_back({cursor, lo - 1}); + // Step past hi without overflow: hi <= m_max_char and we + // only step when more characters remain. + if (hi >= m_max_char) { + cursor = m_max_char + 1; // sentinel: no more characters + break; + } + cursor = hi + 1; + } + if (cursor <= m_max_char) + r.m_ranges.push_back({cursor, m_max_char}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator-(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + // A - B by linear sweep: for each range of A, subtract overlapping + // ranges of B. Both inputs are sorted so we advance j monotonically. + range_predicate r(m_max_char); + unsigned j = 0; + const unsigned m = o.m_ranges.size(); + for (auto [a_lo, a_hi] : m_ranges) { + unsigned cursor = a_lo; + while (j < m && o.m_ranges[j].second < cursor) + ++j; + unsigned k = j; + while (k < m && o.m_ranges[k].first <= a_hi) { + auto [b_lo, b_hi] = o.m_ranges[k]; + if (cursor < b_lo) + r.m_ranges.push_back({cursor, std::min(a_hi, b_lo - 1)}); + if (b_hi >= a_hi) { + cursor = a_hi + 1; + break; + } + cursor = b_hi + 1; + ++k; + } + if (cursor <= a_hi) + r.m_ranges.push_back({cursor, a_hi}); + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator^(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + // (A | B) - (A & B), but implemented directly with one linear sweep + // over the union of breakpoints. + return (*this | o) - (*this & o); + } + + // ----------------------------------------------------------------------- + // Display + // ----------------------------------------------------------------------- + + std::ostream& range_predicate::display(std::ostream& out) const { + if (m_ranges.empty()) { + return out << "[]"; + } + out << "["; + bool first = true; + for (auto [lo, hi] : m_ranges) { + if (!first) out << ","; + first = false; + if (lo == hi) + out << lo; + else + out << lo << "-" << hi; + } + return out << "]"; + } + +} diff --git a/src/ast/rewriter/seq_range_predicate.h b/src/ast/rewriter/seq_range_predicate.h new file mode 100644 index 0000000000..4fbf4938f5 --- /dev/null +++ b/src/ast/rewriter/seq_range_predicate.h @@ -0,0 +1,127 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_range_predicate.h + +Abstract: + + Specialized range-algebra over an unsigned character domain [0, max_char]. + + A range_predicate represents a subset of the character domain as a + sorted sequence of non-overlapping, non-adjacent, non-empty ranges: + + [(lo_0, hi_0), (lo_1, hi_1), ...] with hi_i + 1 < lo_{i+1}. + + The representation is canonical, so two range_predicates over the same + domain are extensionally equivalent iff their internal vectors are + elementwise equal. + + All Boolean operations (union, intersection, complement, difference) + are linear in the total number of ranges and produce the canonical + representation. + + Intended use: + * path conditions for symbolic derivative computation, + * OneStep predicates capturing length-1 acceptance, + * smart-constructor side conditions for regex rewrites such as + R & psi --> toregex(OneStep(R) & psi). + + The type is a pure value: no ast_manager allocation occurs in its + construction or its Boolean operations. Conversion to and from + expr* is the responsibility of a separate translator (see callers + in seq_derive / seq_rewriter). + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ +#pragma once + +#include "util/vector.h" +#include +#include + +namespace seq { + + class range_predicate { + using range_t = std::pair; + using ranges_t = svector; + + // Sorted by first; ranges are disjoint and non-adjacent; + // every range satisfies lo <= hi <= m_max_char. + ranges_t m_ranges; + unsigned m_max_char { 0 }; + + // Invariant check used in debug builds. + bool well_formed() const; + + public: + range_predicate() = default; + explicit range_predicate(unsigned max_char) : m_max_char(max_char) {} + + // ---------------- Factory functions ---------------- + + static range_predicate empty(unsigned max_char); + static range_predicate top(unsigned max_char); + static range_predicate singleton(unsigned c, unsigned max_char); + static range_predicate range(unsigned lo, unsigned hi, unsigned max_char); + + // ---------------- Observers ---------------- + + unsigned max_char() const { return m_max_char; } + unsigned num_ranges() const { return m_ranges.size(); } + range_t operator[](unsigned i) const { return m_ranges[i]; } + ranges_t const& ranges() const { return m_ranges; } + + bool is_empty() const { return m_ranges.empty(); } + bool is_top() const { + return m_ranges.size() == 1 + && m_ranges[0].first == 0 + && m_ranges[0].second == m_max_char; + } + bool is_singleton(unsigned& c) const { + if (m_ranges.size() != 1) return false; + if (m_ranges[0].first != m_ranges[0].second) return false; + c = m_ranges[0].first; + return true; + } + bool contains(unsigned c) const; + + // Number of characters in the predicate (well-defined for any domain). + uint64_t cardinality() const; + + // ---------------- Equality, ordering, hashing ---------------- + + bool equals(range_predicate const& o) const; + bool operator==(range_predicate const& o) const { return equals(o); } + bool operator!=(range_predicate const& o) const { return !equals(o); } + + // Total order: lexicographic on the canonical range sequence, + // with shorter sequences ordered before longer prefixes. + // Predicates over different domains compare by max_char first. + bool operator<(range_predicate const& o) const; + bool less_than(range_predicate const& o) const { return *this < o; } + + unsigned hash() const; + + // ---------------- Boolean operations ---------------- + + range_predicate operator|(range_predicate const& o) const; // union + range_predicate operator&(range_predicate const& o) const; // intersection + range_predicate operator-(range_predicate const& o) const; // difference + range_predicate operator^(range_predicate const& o) const; // symmetric diff + range_predicate operator~() const; // complement + + // ---------------- Display ---------------- + + std::ostream& display(std::ostream& out) const; + }; + + inline std::ostream& operator<<(std::ostream& out, range_predicate const& p) { + return p.display(out); + } + +} diff --git a/src/ast/rewriter/seq_regex_bisim.cpp b/src/ast/rewriter/seq_regex_bisim.cpp index e296d5b3e0..68b2907a55 100644 --- a/src/ast/rewriter/seq_regex_bisim.cpp +++ b/src/ast/rewriter/seq_regex_bisim.cpp @@ -85,45 +85,6 @@ namespace seq { return is_ground(r); } - /* - Collect the leaves of a t-regex der (an ITE-tree whose leaves are - regex expressions) into the output vector. Empty (re.empty) leaves - are dropped. - - Each leaf is treated as a single bisimulation state regardless of - its top-level shape (including re.union and re.antimirov_union): - descending into a union at the leaf would split one state into - several, which is semantically unsound for the bisimulation / - union-find merging that follows. - - Returns false if we encountered an unexpected node (e.g. a free - variable creeping in) — in that case the caller should bail out. - */ - bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) { - ptr_vector work; - obj_hashtable seen; - work.push_back(der); - seen.insert(der); - while (!work.empty()) { - expr* e = work.back(); - work.pop_back(); - expr* c = nullptr, * t = nullptr, * f = nullptr; - if (m.is_ite(e, c, t, f)) { - if (seen.insert_if_not_there(t)) - work.push_back(t); - if (seen.insert_if_not_there(f)) - work.push_back(f); - continue; - } - if (m_util.re.is_empty(e)) - continue; - if (!m_util.is_re(e)) - return false; - leaves.push_back(e); - } - return true; - } - /* Fast inequivalence check based on the get_info().classical flag. @@ -232,15 +193,19 @@ namespace seq { m_worklist.pop_back(); // Compute the symbolic derivative wrt the canonical variable - // (:var 0). The result is a transition regex (ITE tree) whose - // leaves are regex expressions. We use the classical Brzozowski - // entry point so the derivative stays as a single TRegex and - // does not lift unions to the top via antimirov nodes — this - // preserves the XOR-pair invariant the bisimulation relies on. - expr_ref d(m_rw.mk_brz_derivative(r), m); + // (:var 0) and enumerate its reachable leaves in fully + // ITE-hoisted normal form. Every if-then-else over the input + // character — even one that would otherwise be buried under a + // concat or union — is hoisted to the top and infeasible + // minterms are pruned, so each leaf is a ground regex free of + // (:var 0) whose nullability is always decidable. Unions are + // kept intact as single leaves (a union leaf denotes a single + // bisimulation state, never a split into separate states). + expr_ref_pair_vector cofs(m); + m_rw.brz_derivative_cofactors(r, cofs); expr_ref_vector leaves(m); - if (!collect_leaves(d, leaves)) - return l_undef; + for (auto const& p : cofs) + leaves.push_back(p.second); // First pass: check for any nullable leaf (definitive // distinguishing empty-continuation word) or any classically diff --git a/src/ast/rewriter/seq_regex_bisim.h b/src/ast/rewriter/seq_regex_bisim.h index d158cc3793..7ec5c30a33 100644 --- a/src/ast/rewriter/seq_regex_bisim.h +++ b/src/ast/rewriter/seq_regex_bisim.h @@ -74,7 +74,6 @@ namespace seq { unsigned node_of(expr* r); bool merge_leaf(expr* xor_pair); - bool collect_leaves(expr* der, expr_ref_vector& leaves); lbool nullability(expr* r); bool is_supported(expr* r); // Returns true if the leaf l proves that the original pair is diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 3dd2d9a364..85adb94d17 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -21,6 +21,7 @@ Authors: #include "util/uint_set.h" #include "ast/rewriter/seq_rewriter.h" #include "ast/rewriter/seq_regex_bisim.h" +#include "ast/rewriter/seq_range_collapse.h" #include "ast/arith_decl_plugin.h" #include "ast/array_decl_plugin.h" #include "ast/ast_pp.h" @@ -2726,7 +2727,7 @@ expr_ref seq_rewriter::is_nullable(expr* r) { << mk_pp(r, m()) << std::endl;); expr_ref result(m_op_cache.find(_OP_RE_IS_NULLABLE, r, nullptr, nullptr), m()); if (!result) { - result = is_nullable_rec(r); + result = m_derive.nullable(r); m_op_cache.insert(_OP_RE_IS_NULLABLE, r, nullptr, nullptr, result); } STRACE(seq_verbose, tout << "is_nullable result: " @@ -2734,117 +2735,6 @@ expr_ref seq_rewriter::is_nullable(expr* r) { return result; } -expr_ref seq_rewriter::is_nullable_rec(expr* r) { - SASSERT(m_util.is_re(r) || m_util.is_seq(r)); - expr* r1 = nullptr, *r2 = nullptr, *cond = nullptr; - sort* seq_sort = nullptr; - unsigned lo = 0, hi = 0; - zstring s1; - expr_ref result(m()); - if (re().is_concat(r, r1, r2) || - re().is_intersection(r, r1, r2)) { - m_br.mk_and(is_nullable(r1), is_nullable(r2), result); - } - else if (re().is_union(r, r1, r2) || re().is_antimirov_union(r, r1, r2)) { - m_br.mk_or(is_nullable(r1), is_nullable(r2), result); - } - else if (re().is_diff(r, r1, r2)) { - m_br.mk_not(is_nullable(r2), result); - m_br.mk_and(result, is_nullable(r1), result); - } - else if (re().is_xor(r, r1, r2)) { - // Null(r1 XOR r2) = Null(r1) XOR Null(r2) - expr_ref n1(is_nullable(r1), m()); - expr_ref n2(is_nullable(r2), m()); - // Simplify when either operand is a boolean literal so the - // bisimulation procedure can use the answer directly. - if (m().is_true(n1)) - result = mk_not(m(), n2); - else if (m().is_false(n1)) - result = n2; - else if (m().is_true(n2)) - result = mk_not(m(), n1); - else if (m().is_false(n2)) - result = n1; - else - result = m().mk_xor(n1, n2); - } - else if (re().is_star(r) || - re().is_opt(r) || - re().is_full_seq(r) || - re().is_epsilon(r) || - (re().is_loop(r, r1, lo) && lo == 0) || - (re().is_loop(r, r1, lo, hi) && lo == 0)) { - result = m().mk_true(); - } - else if (re().is_full_char(r) || - re().is_empty(r) || - re().is_of_pred(r) || - re().is_range(r)) { - result = m().mk_false(); - } - else if (re().is_plus(r, r1) || - (re().is_loop(r, r1, lo) && lo > 0) || - (re().is_loop(r, r1, lo, hi) && lo > 0) || - (re().is_reverse(r, r1))) { - result = is_nullable(r1); - } - else if (re().is_complement(r, r1)) { - m_br.mk_not(is_nullable(r1), result); - } - else if (re().is_to_re(r, r1)) { - result = is_nullable(r1); - } - else if (m().is_ite(r, cond, r1, r2)) { - m_br.mk_ite(cond, is_nullable(r1), is_nullable(r2), result); - } - else if (m_util.is_re(r, seq_sort)) { - result = is_nullable_symbolic_regex(r, seq_sort); - } - else if (str().is_concat(r, r1, r2)) { - m_br.mk_and(is_nullable(r1), is_nullable(r2), result); - } - else if (str().is_empty(r)) { - result = m().mk_true(); - } - else if (str().is_unit(r)) { - result = m().mk_false(); - } - else if (str().is_string(r, s1)) { - result = m().mk_bool_val(s1.length() == 0); - } - else { - SASSERT(m_util.is_seq(r)); - result = m().mk_eq(str().mk_empty(r->get_sort()), r); - } - return result; -} - -expr_ref seq_rewriter::is_nullable_symbolic_regex(expr* r, sort* seq_sort) { - SASSERT(m_util.is_re(r)); - expr* elem = nullptr, *r1 = r, * r2 = nullptr, * s = nullptr; - expr_ref elems(str().mk_empty(seq_sort), m()); - expr_ref result(m()); - while (re().is_derivative(r1, elem, r2)) { - if (str().is_empty(elems)) - elems = str().mk_unit(elem); - else - elems = str().mk_concat(str().mk_unit(elem), elems); - r1 = r2; - } - if (re().is_to_re(r1, s)) { - // r is nullable - // iff after taking the derivatives the remaining sequence is empty - // iff the inner sequence equals to the sequence of derivative elements in reverse - result = m().mk_eq(elems, s); - return result; - } - // the default case when either r is not a derivative - // or when the nested derivatives are not applied to a sequence - result = re().mk_in_re(str().mk_empty(seq_sort), r); - return result; -} - /* Push reverse inwards (whenever possible). */ @@ -3068,370 +2958,18 @@ bool seq_rewriter::check_deriv_normal_form(expr* r, int level) { #endif expr_ref seq_rewriter::mk_derivative(expr* r) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - VERIFY(m_util.is_re(r, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - expr_ref v(m().mk_var(0, ele_sort), m()); - return mk_antimirov_deriv(v, r, m().mk_true()); -} - -expr_ref seq_rewriter::mk_brz_derivative(expr* r) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - VERIFY(m_util.is_re(r, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - expr_ref v(m().mk_var(0, ele_sort), m()); - return mk_derivative_rec(v, r); + auto result = m_derive(seq::derivative_kind::antimirov_t, r); + TRACE(seq, tout << "Derivative of " << mk_pp(r, m()) << "\nis\n" << result << std::endl;); + return result; } expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) { - return mk_antimirov_deriv(ele, r, m().mk_true()); -} - -expr_ref seq_rewriter::mk_antimirov_deriv(expr* e, expr* r, expr* path) { - // Ensure references are owned - expr_ref _e(e, m()), _path(path, m()), _r(r, m()); - expr_ref result(m_op_cache.find(OP_RE_DERIVATIVE, e, r, path), m()); - if (!result) { - mk_antimirov_deriv_rec(e, r, path, result); - m_op_cache.insert(OP_RE_DERIVATIVE, e, r, path, result); - STRACE(seq_regex, tout << "D(" << mk_pp(e, m()) << "," << mk_pp(r, m()) << "," << mk_pp(path, m()) << ")" << std::endl;); - STRACE(seq_regex, tout << "= " << mk_pp(result, m()) << std::endl;); - } + auto result = m_derive(seq::derivative_kind::antimirov_t, ele, r); + TRACE(seq, + tout << "Derivative of " << mk_pp(r, m()) << " w.r.t. " << mk_pp(ele, m()) << "\nis\n" << result << std::endl;); return result; } -void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref& result) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - expr_ref _r(r, m()), _path(path, m()); - VERIFY(m_util.is_re(r, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - SASSERT(ele_sort == e->get_sort()); - expr* r1 = nullptr, * r2 = nullptr, * c = nullptr; - expr_ref c1(m()); - expr_ref c2(m()); - auto nothing = [&]() { return expr_ref(re().mk_empty(r->get_sort()), m()); }; - auto epsilon = [&]() { return expr_ref(re().mk_epsilon(seq_sort), m()); }; - auto dotstar = [&]() { return expr_ref(re().mk_full_seq(r->get_sort()), m()); }; - unsigned lo = 0, hi = 0; - if (re().is_empty(r) || re().is_epsilon(r)) - // D(e,[]) = D(e,()) = [] - result = nothing(); - else if (re().is_full_seq(r) || re().is_dot_plus(r)) - // D(e,.*) = D(e,.+) = .* - result = dotstar(); - else if (re().is_full_char(r)) - // D(e,.) = () - result = epsilon(); - else if (re().is_to_re(r, r1)) { - expr_ref h(m()); - expr_ref t(m()); - // here r1 is a sequence - if (get_head_tail(r1, h, t)) { - if (eq_char(e, h)) - result = re().mk_to_re(t); - else if (neq_char(e, h)) - result = nothing(); - else - result = re().mk_ite_simplify(m().mk_eq(e, h), re().mk_to_re(t), nothing()); - } - else { - // observe that the precondition |r1|>0 is is implied by c1 for use of mk_seq_first - { - auto is_non_empty = m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))); - auto eq_first = m().mk_eq(mk_seq_first(r1), e); - m_br.mk_and(is_non_empty, eq_first, c1); - } - m_br.mk_and(path, c1, c2); - if (m().is_false(c2)) - result = nothing(); - else - // observe that the precondition |r1|>0 is implied by c1 for use of mk_seq_rest - result = m().mk_ite(c1, re().mk_to_re(mk_seq_rest(r1)), nothing()); - } - } - else if (re().is_reverse(r, r2)) - if (re().is_to_re(r2, r1)) { - // here r1 is a sequence - // observe that the precondition |r1|>0 of mk_seq_last is implied by c1 - { - auto is_non_empty = m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))); - auto eq_last = m().mk_eq(mk_seq_last(r1), e); - m_br.mk_and(is_non_empty, eq_last, c1); - } - m_br.mk_and(path, c1, c2); - if (m().is_false(c2)) - result = nothing(); - else - // observe that the precondition |r1|>0 of mk_seq_rest is implied by c1 - result = re().mk_ite_simplify(c1, re().mk_reverse(re().mk_to_re(mk_seq_butlast(r1))), nothing()); - } - else { - result = mk_regex_reverse(r2); - if (result.get() == r) - //r2 is an uninterpreted regex that is stuck - //for example if r = (re.reverse R) where R is a regex variable then - //here result.get() == r - result = re().mk_derivative(e, result); - else - result = mk_antimirov_deriv(e, result, path); - } - else if (re().is_concat(r, r1, r2)) { - expr_ref r1nullable(is_nullable(r1), m()); - c1 = mk_antimirov_deriv_concat(mk_antimirov_deriv(e, r1, path), r2); - expr_ref r1nullable_and_path(m()); - m_br.mk_and(r1nullable, path, r1nullable_and_path); - if (m().is_false(r1nullable_and_path)) - // D(e,r1)r2 - result = c1; - else - // D(e,r1)r2|(ite (r1nullable) (D(e,r2)) []) - // observe that (mk_ite_simplify(true, D(e,r2), []) = D(e,r2) - result = mk_antimirov_deriv_union(c1, re().mk_ite_simplify(r1nullable, mk_antimirov_deriv(e, r2, path), nothing())); - } - else if (m().is_ite(r, c, r1, r2)) { - { - auto cp = m().mk_and(c, path); - c1 = simplify_path(e, cp); - } - { - auto notc = m().mk_not(c); - auto np = m().mk_and(notc, path); - c2 = simplify_path(e, np); - } - if (m().is_false(c1)) - result = mk_antimirov_deriv(e, r2, c2); - else if (m().is_false(c2)) - result = mk_antimirov_deriv(e, r1, c1); - else - result = re().mk_ite_simplify(c, mk_antimirov_deriv(e, r1, c1), mk_antimirov_deriv(e, r2, c2)); - } - else if (re().is_range(r, r1, r2)) { - expr_ref range(m()); - expr_ref psi(m().mk_false(), m()); - if (str().is_unit_string(r1, c1) && str().is_unit_string(r2, c2)) { - // SASSERT(u().is_char(c1)); - // SASSERT(u().is_char(c2)); - // case: c1 <= e <= c2 - // deterministic evaluation for range bounds - auto a_le = u().mk_le(c1, e); - auto b_le = u().mk_le(e, c2); - auto rng_cond = m().mk_and(a_le, b_le); - range = simplify_path(e, rng_cond); - psi = simplify_path(e, m().mk_and(path, range)); - } - else if (!str().is_string(r1) && str().is_unit_string(r2, c2)) { - SASSERT(u().is_char(c2)); - // r1 nonground: |r1|=1 & r1[0] <= e <= c2 - expr_ref one(m_autil.mk_int(1), m()); - expr_ref zero(m_autil.mk_int(0), m()); - expr_ref r1_length_eq_one(m().mk_eq(str().mk_length(r1), one), m()); - expr_ref r1_0(str().mk_nth_i(r1, zero), m()); - range = simplify_path(e, m().mk_and(r1_length_eq_one, m().mk_and(u().mk_le(r1_0, e), u().mk_le(e, c2)))); - psi = simplify_path(e, m().mk_and(path, range)); - } - else if (!str().is_string(r2) && str().is_unit_string(r1, c1)) { - SASSERT(u().is_char(c1)); - // r2 nonground: |r2|=1 & c1 <= e <= r2_0 - expr_ref one(m_autil.mk_int(1), m()); - expr_ref zero(m_autil.mk_int(0), m()); - expr_ref r2_length_eq_one(m().mk_eq(str().mk_length(r2), one), m()); - expr_ref r2_0(str().mk_nth_i(r2, zero), m()); - range = simplify_path(e, m().mk_and(r2_length_eq_one, m().mk_and(u().mk_le(c1, e), u().mk_le(e, r2_0)))); - psi = simplify_path(e, m().mk_and(path, range)); - } - else if (!str().is_string(r1) && !str().is_string(r2)) { - // both r1 and r2 nonground: |r1|=1 & |r2|=1 & r1[0] <= e <= r2[0] - expr_ref one(m_autil.mk_int(1), m()); - expr_ref zero(m_autil.mk_int(0), m()); - expr_ref r1_length_eq_one(m().mk_eq(str().mk_length(r1), one), m()); - expr_ref r1_0(str().mk_nth_i(r1, zero), m()); - expr_ref r2_length_eq_one(m().mk_eq(str().mk_length(r2), one), m()); - expr_ref r2_0(str().mk_nth_i(r2, zero), m()); - range = simplify_path(e, m().mk_and(r1_length_eq_one, m().mk_and(r2_length_eq_one, m().mk_and(u().mk_le(r1_0, e), u().mk_le(e, r2_0))))); - psi = simplify_path(e, m().mk_and(path, range)); - } - if (m().is_false(psi)) - result = nothing(); - else - result = re().mk_ite_simplify(range, epsilon(), nothing()); - } - else if (re().is_union(r, r1, r2)) - result = mk_antimirov_deriv_union(mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv(e, r2, path)); - else if (re().is_intersection(r, r1, r2)) - result = mk_antimirov_deriv_intersection(e, - mk_antimirov_deriv(e, r1, path), - mk_antimirov_deriv(e, r2, path), m().mk_true()); - else if (re().is_star(r, r1) || re().is_plus(r, r1) || (re().is_loop(r, r1, lo) && 0 <= lo && lo <= 1)) - result = mk_antimirov_deriv_concat(mk_antimirov_deriv(e, r1, path), re().mk_star(r1)); - else if (re().is_loop(r, r1, lo)) - result = mk_antimirov_deriv_concat(mk_antimirov_deriv(e, r1, path), re().mk_loop(r1, lo - 1)); - else if (re().is_loop(r, r1, lo, hi)) { - if ((lo == 0 && hi == 0) || hi < lo) - result = nothing(); - else { - expr_ref t(re().mk_loop_proper(r1, (lo == 0 ? 0 : lo - 1), hi - 1), m()); - result = mk_antimirov_deriv_concat(mk_antimirov_deriv(e, r1, path), t); - } - } - else if (re().is_opt(r, r1)) - result = mk_antimirov_deriv(e, r1, path); - else if (re().is_complement(r, r1)) - // D(e,~r1) = ~D(e,r1) - result = mk_antimirov_deriv_negate(e, mk_antimirov_deriv(e, r1, path)); - else if (re().is_diff(r, r1, r2)) - result = mk_antimirov_deriv_intersection(e, - mk_antimirov_deriv(e, r1, path), - mk_antimirov_deriv_negate(e, mk_antimirov_deriv(e, r2, path)), m().mk_true()); - else if (re().is_xor(r, r1, r2)) - // D(e, r1 XOR r2) = D(e, r1) XOR D(e, r2) - result = mk_der_xor(mk_antimirov_deriv(e, r1, path), - mk_antimirov_deriv(e, r2, path)); - else if (re().is_of_pred(r, r1)) { - array_util array(m()); - expr* args[2] = { r1, e }; - result = array.mk_select(2, args); - // Use mk_der_cond to normalize - result = mk_der_cond(result, e, seq_sort); - } - else - // stuck cases - result = re().mk_derivative(e, r); -} - -expr_ref seq_rewriter::mk_antimirov_deriv_intersection(expr* e, expr* d1, expr* d2, expr* path) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - VERIFY(m_util.is_re(d1, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - expr_ref result(m()); - expr* c, * a, * b; - if (re().is_empty(d1)) - result = d1; - else if (re().is_empty(d2)) - result = d2; - else if (m().is_ite(d1, c, a, b)) { - expr_ref path_and_c(simplify_path(e, m().mk_and(path, c)), m()); - expr_ref path_and_notc(simplify_path(e, m().mk_and(path, m().mk_not(c))), m()); - if (m().is_false(path_and_c)) - result = mk_antimirov_deriv_intersection(e, b, d2, path); - else if (m().is_false(path_and_notc)) - result = mk_antimirov_deriv_intersection(e, a, d2, path); - else - result = m().mk_ite(c, mk_antimirov_deriv_intersection(e, a, d2, path_and_c), - mk_antimirov_deriv_intersection(e, b, d2, path_and_notc)); - } - else if (m().is_ite(d2)) - // swap d1 and d2 - result = mk_antimirov_deriv_intersection(e, d2, d1, path); - else if (d1 == d2 || re().is_full_seq(d2)) - result = mk_antimirov_deriv_restrict(e, d1, path); - else if (re().is_full_seq(d1)) - result = mk_antimirov_deriv_restrict(e, d2, path); - else if (re().is_union(d1, a, b)) - // distribute intersection over the union in d1 - result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, a, d2, path), - mk_antimirov_deriv_intersection(e, b, d2, path)); - else if (re().is_union(d2, a, b)) - // distribute intersection over the union in d2 - result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, d1, a, path), - mk_antimirov_deriv_intersection(e, d1, b, path)); - else - result = mk_regex_inter_normalize(d1, d2); - return result; -} - -expr_ref seq_rewriter::mk_antimirov_deriv_concat(expr* d, expr* r) { - expr_ref result(m()); - expr_ref _r(r, m()), _d(d, m()); - expr* c, * t, * e; - if (m().is_ite(d, c, t, e)) { - auto r2 = mk_antimirov_deriv_concat(e, r); - auto r1 = mk_antimirov_deriv_concat(t, r); - result = m().mk_ite(c, r1, r2); - } - else if (re().is_union(d, t, e)) - result = mk_antimirov_deriv_union(mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); - else - result = mk_re_append(d, r); - SASSERT(result.get()); - return result; -} - -expr_ref seq_rewriter::mk_antimirov_deriv_negate(expr* elem, expr* d) { - sort* seq_sort = nullptr; - VERIFY(m_util.is_re(d, seq_sort)); - auto nothing = [&]() { return expr_ref(re().mk_empty(d->get_sort()), m()); }; - auto epsilon = [&]() { return expr_ref(re().mk_epsilon(seq_sort), m()); }; - auto dotstar = [&]() { return expr_ref(re().mk_full_seq(d->get_sort()), m()); }; - auto dotplus = [&]() { return expr_ref(re().mk_plus(re().mk_full_char(d->get_sort())), m()); }; - expr_ref result(m()); - expr* c, * t, * e; - if (re().is_empty(d)) - result = dotstar(); - else if (re().is_epsilon(d)) - result = dotplus(); - else if (re().is_full_seq(d)) - result = nothing(); - else if (re().is_dot_plus(d)) - result = epsilon(); - else if (m().is_ite(d, c, t, e)) - result = m().mk_ite(c, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); - else if (re().is_union(d, t, e)) - result = mk_antimirov_deriv_intersection(elem, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e), m().mk_true()); - else if (re().is_intersection(d, t, e)) - result = mk_antimirov_deriv_union(mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); - else if (re().is_complement(d, t)) - result = t; - else - result = re().mk_complement(d); - return result; -} - -expr_ref seq_rewriter::mk_antimirov_deriv_union(expr* d1, expr* d2) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - VERIFY(m_util.is_re(d1, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - expr_ref result(m()); - expr* c1, * t1, * e1, * c2, * t2, * e2; - if (m().is_ite(d1, c1, t1, e1) && m().is_ite(d2, c2, t2, e2) && c1 == c2) - // eliminate duplicate branching on exactly the same condition - result = m().mk_ite(c1, mk_antimirov_deriv_union(t1, t2), mk_antimirov_deriv_union(e1, e2)); - else - result = mk_regex_union_normalize(d1, d2); - return result; -} - -// restrict the guards of all conditionals id d and simplify the resulting derivative -// restrict(if(c, a, b), cond) = if(c, restrict(a, cond & c), restrict(b, cond & ~c)) -// restrict(a U b, cond) = restrict(a, cond) U restrict(b, cond) -// where {} U X = X, X U X = X -// restrict(R, cond) = R -// -// restrict(d, false) = [] -// -// it is already assumed that the restriction takes place within a branch -// so the condition is not added explicitly but propagated down in order to eliminate -// infeasible cases -expr_ref seq_rewriter::mk_antimirov_deriv_restrict(expr* e, expr* d, expr* cond) { - expr_ref result(d, m()); - expr_ref _cond(cond, m()); - expr* c, * a, * b; - if (m().is_false(cond)) - result = re().mk_empty(d->get_sort()); - else if (re().is_empty(d) || m().is_true(cond)) - result = d; - else if (m().is_ite(d, c, a, b)) { - expr_ref path_and_c(simplify_path(e, m().mk_and(cond, c)), m()); - expr_ref path_and_notc(simplify_path(e, m().mk_and(cond, m().mk_not(c))), m()); - result = re().mk_ite_simplify(c, mk_antimirov_deriv_restrict(e, a, path_and_c), - mk_antimirov_deriv_restrict(e, b, path_and_notc)); - } - else if (re().is_union(d, a, b)) { - expr_ref a1(mk_antimirov_deriv_restrict(e, a, cond), m()); - expr_ref b1(mk_antimirov_deriv_restrict(e, b, cond), m()); - result = mk_antimirov_deriv_union(a1, b1); - } - return result; -} expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) { expr_ref _r1(r1, m()), _r2(r2, m()); @@ -3624,126 +3162,6 @@ expr_ref seq_rewriter::merge_regex_sets(expr* r1, expr* r2, expr* unit, } } -expr_ref seq_rewriter::mk_regex_reverse(expr* r) { - expr* r1 = nullptr, * r2 = nullptr, * c = nullptr; - unsigned lo = 0, hi = 0; - expr_ref result(m()); - if (re().is_empty(r) || re().is_range(r) || re().is_epsilon(r) || re().is_full_seq(r) || - re().is_full_char(r) || re().is_dot_plus(r) || re().is_of_pred(r)) - result = r; - else if (re().is_to_re(r)) - result = re().mk_reverse(r); - else if (re().is_reverse(r, r1)) - result = r1; - else if (re().is_concat(r, r1, r2)) - result = mk_regex_concat(mk_regex_reverse(r2), mk_regex_reverse(r1)); - else if (m().is_ite(r, c, r1, r2)) - result = m().mk_ite(c, mk_regex_reverse(r1), mk_regex_reverse(r2)); - else if (re().is_union(r, r1, r2)) { - // enforce deterministic evaluation order - auto a1 = mk_regex_reverse(r1); - auto b1 = mk_regex_reverse(r2); - result = re().mk_union(a1, b1); - } - else if (re().is_intersection(r, r1, r2)) { - auto a1 = mk_regex_reverse(r1); - auto b1 = mk_regex_reverse(r2); - result = re().mk_inter(a1, b1); - } - else if (re().is_diff(r, r1, r2)) { - auto a1 = mk_regex_reverse(r1); - auto b1 = mk_regex_reverse(r2); - result = re().mk_diff(a1, b1); - } - else if (re().is_xor(r, r1, r2)) { - auto a1 = mk_regex_reverse(r1); - auto b1 = mk_regex_reverse(r2); - result = re().mk_xor(a1, b1); - } - else if (re().is_star(r, r1)) - result = re().mk_star(mk_regex_reverse(r1)); - else if (re().is_plus(r, r1)) - result = re().mk_plus(mk_regex_reverse(r1)); - else if (re().is_loop(r, r1, lo)) - result = re().mk_loop(mk_regex_reverse(r1), lo); - else if (re().is_loop(r, r1, lo, hi)) - result = re().mk_loop_proper(mk_regex_reverse(r1), lo, hi); - else if (re().is_opt(r, r1)) - result = re().mk_opt(mk_regex_reverse(r1)); - else if (re().is_complement(r, r1)) - result = re().mk_complement(mk_regex_reverse(r1)); - else - //stuck cases: such as r being a regex variable - //observe that re().mk_reverse(to_re(s)) is not a stuck case - result = re().mk_reverse(r); - return result; -} - -expr_ref seq_rewriter::mk_regex_concat(expr* r, expr* s) { - sort* seq_sort = nullptr, * ele_sort = nullptr; - VERIFY(m_util.is_re(r, seq_sort)); - VERIFY(u().is_seq(seq_sort, ele_sort)); - SASSERT(r->get_sort() == s->get_sort()); - expr_ref result(m()); - expr* r1, * r2; - if (re().is_epsilon(r) || re().is_empty(s)) - result = s; - else if (re().is_epsilon(s) || re().is_empty(r)) - result = r; - else if (re().is_full_seq(r) && re().is_full_seq(s)) - result = r; - else if (re().is_full_char(r) && re().is_full_seq(s)) - // ..* = .+ - result = re().mk_plus(re().mk_full_char(ele_sort)); - else if (re().is_full_seq(r) && re().is_full_char(s)) - // .*. = .+ - result = re().mk_plus(re().mk_full_char(ele_sort)); - else if (re().is_concat(r, r1, r2)) - // create the resulting concatenation in right-associative form except for the following case - // TODO: maintain the following invariant for A ++ B{m,n} + C - // concat(concat(A, B{m,n}), C) (if A != () and C != ()) - // concat(B{m,n}, C) (if A == () and C != ()) - // where A, B, C are regexes - // Using & below for Intersection and | for Union - // In other words, do not make A ++ B{m,n} into right-assoc form, but keep B{m,n} at the top - // This will help to identify this situation in the merge routine: - // concat(concat(A, B{0,m}), C) | concat(concat(A, B{0,n}), C) - // simplifies to - // concat(concat(A, B{0,max(m,n)}), C) - // analogously: - // concat(concat(A, B{0,m}), C) & concat(concat(A, B{0,n}), C) - // simplifies to - // concat(concat(A, B{0,min(m,n)}), C) - result = mk_regex_concat(r1, mk_regex_concat(r2, s)); - else { - result = re().mk_concat(r, s); - } - return result; -} - -expr_ref seq_rewriter::mk_in_antimirov(expr* s, expr* d){ - expr_ref result(mk_in_antimirov_rec(s, d), m()); - return result; -} - -expr_ref seq_rewriter::mk_in_antimirov_rec(expr* s, expr* d) { - expr* c, * d1, * d2; - expr_ref result(m()); - if (re().is_full_seq(d) || (str().min_length(s) > 0 && re().is_dot_plus(d))) - // s in .* <==> true, also: s in .+ <==> true when |s|>0 - result = m().mk_true(); - else if (re().is_empty(d) || (str().min_length(s) > 0 && re().is_epsilon(d))) - // s in [] <==> false, also: s in () <==> false when |s|>0 - result = m().mk_false(); - else if (m().is_ite(d, c, d1, d2)) - result = re().mk_ite_simplify(c, mk_in_antimirov_rec(s, d1), mk_in_antimirov_rec(s, d2)); - else if (re().is_union(d, d1, d2)) - m_br.mk_or(mk_in_antimirov_rec(s, d1), mk_in_antimirov_rec(s, d2), result); - else - result = re().mk_in_re(s, d); - return result; -} - /* * calls elim_condition */ @@ -3804,6 +3222,10 @@ bool seq_rewriter::le_char(expr* ch1, expr* ch2) { Current cases handled: - a and b are char <= constraints, or negations of char <= constraints + - a and b are equalities (element = constant char), or their negations. + These arise from derivatives of single characters and must be pruned + when combining BDDs so that no unreachable branch such as + ite(x = 'a', ite(x = 'b', ...), ...) (with 'a' != 'b') is created. */ bool seq_rewriter::pred_implies(expr* a, expr* b) { STRACE(seq_verbose, tout << "pred_implies: " @@ -3811,6 +3233,26 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) { << "," << mk_pp(b, m()) << std::endl;); expr *cha1 = nullptr, *cha2 = nullptr, *nota = nullptr, *chb1 = nullptr, *chb2 = nullptr, *notb = nullptr; + // (element = constant char), returning the element and char code. + auto is_char_eq = [&](expr* e, expr*& x, unsigned& v) { + expr* t1 = nullptr, *t2 = nullptr; + if (!m().is_eq(e, t1, t2)) + return false; + if (u().is_const_char(t2, v)) { x = t1; return true; } + if (u().is_const_char(t1, v)) { x = t2; return true; } + return false; + }; + expr *xa = nullptr, *xb = nullptr; + unsigned va = 0, vb = 0; + if (is_char_eq(a, xa, va)) { + // a is (xa = va) + if (is_char_eq(b, xb, vb) && xa == xb) + // (x = va) => (x = vb) iff va == vb + return va == vb; + if (m().is_not(b, notb) && is_char_eq(notb, xb, vb) && xa == xb) + // (x = va) => not (x = vb) iff va != vb + return va != vb; + } if (m().is_not(a, nota) && m().is_not(b, notb)) { return pred_implies(notb, nota); @@ -4008,26 +3450,33 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) { // transformations hide ite sub-terms, // Rewriting that changes associativity of // operators may hide ite sub-terms. - - switch (k) { - case OP_RE_INTERSECT: - if (BR_FAILED != mk_re_inter0(a, b, result)) - return result; - break; - case OP_RE_UNION: - if (BR_FAILED != mk_re_union0(a, b, result)) - return result; - break; - case OP_RE_CONCAT: - if (BR_FAILED != mk_re_concat(a, b, result)) - return result; - break; - case OP_RE_XOR: - if (BR_FAILED != mk_re_xor0(a, b, result)) - return result; - break; - default: - break; + // + // When either operand is an ite (a derivative BDD), skip the + // pre-simplification: its blind ite-hoisting would bypass the + // pred_implies-based pruning in mk_der_op_rec and create unreachable + // branches such as ite(x = 'a', ite(x = 'b', ...), ...) with 'a' != 'b'. + bool has_ite = m().is_ite(a) || m().is_ite(b); + if (!has_ite) { + switch (k) { + case OP_RE_INTERSECT: + if (BR_FAILED != mk_re_inter0(a, b, result)) + return result; + break; + case OP_RE_UNION: + if (BR_FAILED != mk_re_union0(a, b, result)) + return result; + break; + case OP_RE_CONCAT: + if (BR_FAILED != mk_re_concat(a, b, result)) + return result; + break; + case OP_RE_XOR: + if (BR_FAILED != mk_re_xor0(a, b, result)) + return result; + break; + default: + break; + } } result = m_op_cache.find(k, a, b, nullptr); if (!result) { @@ -4129,230 +3578,6 @@ expr_ref seq_rewriter::mk_der_cond(expr* cond, expr* ele, sort* seq_sort) { return result; } -/* - Classical Brzozowski derivative used by the regex_bisim equivalence - procedure. Unlike `mk_antimirov_deriv`, this variant never creates - _OP_RE_ANTIMIROV_UNION nodes — it stays in a classical (single regex - tree) form. The bisimulation algorithm relies on this so that each - leaf of D(p XOR q) is a coherent XOR pair (D_v p) XOR (D_v q). -*/ -expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { - expr_ref result(m()); - sort* seq_sort = nullptr, *ele_sort = nullptr; - VERIFY(m_util.is_re(r, seq_sort)); - VERIFY(m_util.is_seq(seq_sort, ele_sort)); - SASSERT(ele_sort == ele->get_sort()); - expr* r1 = nullptr, *r2 = nullptr, *p = nullptr; - auto mk_empty = [&]() { return expr_ref(re().mk_empty(r->get_sort()), m()); }; - unsigned lo = 0, hi = 0; - if (re().is_concat(r, r1, r2)) { - expr_ref is_n = is_nullable(r1); - expr_ref dr1 = mk_derivative_rec(ele, r1); - result = mk_der_concat(dr1, r2); - if (m().is_false(is_n)) { - return result; - } - expr_ref dr2 = mk_derivative_rec(ele, r2); - is_n = re_predicate(is_n, seq_sort); - if (re().is_empty(dr2)) { - //do not concatenate [], it is a deade-end - return result; - } - else { - // Classical Brzozowski union: keep the derivative tree free of - // antimirov-union nodes so the bisimulation procedure sees a - // single regex tree whose leaves are XOR pairs. - return mk_der_union(result, mk_der_concat(is_n, dr2)); - } - } - else if (re().is_star(r, r1)) { - return mk_der_concat(mk_derivative_rec(ele, r1), r); - } - else if (re().is_plus(r, r1)) { - expr_ref star(re().mk_star(r1), m()); - return mk_derivative_rec(ele, star); - } - else if (re().is_union(r, r1, r2)) { - return mk_der_union(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); - } - else if (re().is_intersection(r, r1, r2)) { - return mk_der_inter(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); - } - else if (re().is_diff(r, r1, r2)) { - return mk_der_inter(mk_derivative_rec(ele, r1), mk_der_compl(mk_derivative_rec(ele, r2))); - } - else if (re().is_xor(r, r1, r2)) { - return mk_der_xor(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); - } - else if (m().is_ite(r, p, r1, r2)) { - // there is no BDD normalization here - result = m().mk_ite(p, mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); - return result; - } - else if (re().is_opt(r, r1)) { - return mk_derivative_rec(ele, r1); - } - else if (re().is_complement(r, r1)) { - return mk_der_compl(mk_derivative_rec(ele, r1)); - } - else if (re().is_loop(r, r1, lo)) { - if (lo > 0) { - lo--; - } - result = mk_derivative_rec(ele, r1); - //do not concatenate with [] (emptyset) - if (re().is_empty(result)) { - return result; - } - else { - //do not create loop r1{0,}, instead create r1* - return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo))); - } - } - else if (re().is_loop(r, r1, lo, hi)) { - if (hi == 0) { - return mk_empty(); - } - hi--; - if (lo > 0) { - lo--; - } - result = mk_derivative_rec(ele, r1); - //do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain - if (re().is_empty(result) || hi == 0) { - return result; - } - else { - return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi)); - } - } - else if (re().is_full_seq(r) || - re().is_empty(r)) { - return expr_ref(r, m()); - } - else if (re().is_to_re(r, r1)) { - // r1 is a string here (not a regexp) - expr_ref hd(m()), tl(m()); - if (get_head_tail(r1, hd, tl)) { - // head must be equal; if so, derivative is tail - // Use mk_der_cond to normalize - STRACE(seq_verbose, tout << "deriv to_re" << std::endl;); - result = m().mk_eq(ele, hd); - result = mk_der_cond(result, ele, seq_sort); - expr_ref r1(re().mk_to_re(tl), m()); - result = mk_der_concat(result, r1); - return result; - } - else if (str().is_empty(r1)) { - //observe: str().is_empty(r1) checks that r = () = epsilon - //while mk_empty() = [], because deriv(epsilon) = [] = nothing - return mk_empty(); - } - else if (str().is_itos(r1)) { - // - // here r1 = (str.from_int r2) and r2 is non-ground - // or else the expression would have been simplified earlier - // so r1 must be nonempty and must consists of decimal digits - // '0' <= elem <= '9' - // if ((isdigit ele) and (ele = (hd r1))) then (to_re (tl r1)) else [] - // - hd = mk_seq_first(r1); - // isolate nested conjunction for deterministic evaluation - auto a0 = u().mk_le(m_util.mk_char('0'), ele); - auto a1 = u().mk_le(ele, m_util.mk_char('9')); - auto a2 = m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))); - auto a3 = m().mk_eq(hd, ele); - auto inner = m().mk_and(a2, a3); - m_br.mk_and(a0, a1, inner, result); - tl = re().mk_to_re(mk_seq_rest(r1)); - return re_and(result, tl); - } - else { - // recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence - // construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else [])) - hd = mk_seq_first(r1); - m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result); - tl = re().mk_to_re(mk_seq_rest(r1)); - return re_and(result, tl); - } - } - else if (re().is_reverse(r, r1)) { - if (re().is_to_re(r1, r2)) { - // First try to extract hd and tl such that r = hd ++ tl and |tl|=1 - expr_ref hd(m()), tl(m()); - if (get_head_tail_reversed(r2, hd, tl)) { - // Use mk_der_cond to normalize - STRACE(seq_verbose, tout << "deriv reverse to_re" << std::endl;); - result = m().mk_eq(ele, tl); - result = mk_der_cond(result, ele, seq_sort); - result = mk_der_concat(result, re().mk_reverse(re().mk_to_re(hd))); - return result; - } - else if (str().is_empty(r2)) { - return mk_empty(); - } - else { - // construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else [])) - // hd = first of reverse(r2) i.e. last of r2 - // tl = rest of reverse(r2) i.e. butlast of r2 - //hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one())); - hd = mk_seq_last(r2); - // factor nested constructor calls to enforce deterministic argument evaluation order - auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort))); - auto a_eq = m().mk_eq(hd, ele); - m_br.mk_and(a_non_empty, a_eq, result); - tl = re().mk_to_re(mk_seq_butlast(r2)); - return re_and(result, re().mk_reverse(tl)); - } - } - } - else if (re().is_range(r, r1, r2)) { - // r1, r2 are sequences. - zstring s1, s2; - if (str().is_string(r1, s1) && str().is_string(r2, s2)) { - if (s1.length() == 1 && s2.length() == 1) { - expr_ref ch1(m_util.mk_char(s1[0]), m()); - expr_ref ch2(m_util.mk_char(s2[0]), m()); - // Use mk_der_cond to normalize - STRACE(seq_verbose, tout << "deriv range zstring" << std::endl;); - expr_ref p1(u().mk_le(ch1, ele), m()); - p1 = mk_der_cond(p1, ele, seq_sort); - expr_ref p2(u().mk_le(ele, ch2), m()); - p2 = mk_der_cond(p2, ele, seq_sort); - result = mk_der_inter(p1, p2); - return result; - } - else { - return mk_empty(); - } - } - expr* e1 = nullptr, * e2 = nullptr; - if (str().is_unit(r1, e1) && str().is_unit(r2, e2)) { - SASSERT(u().is_char(e1)); - // Use mk_der_cond to normalize - STRACE(seq_verbose, tout << "deriv range str" << std::endl;); - expr_ref p1(u().mk_le(e1, ele), m()); - p1 = mk_der_cond(p1, ele, seq_sort); - expr_ref p2(u().mk_le(ele, e2), m()); - p2 = mk_der_cond(p2, ele, seq_sort); - result = mk_der_inter(p1, p2); - return result; - } - } - else if (re().is_full_char(r)) { - return expr_ref(re().mk_to_re(str().mk_empty(seq_sort)), m()); - } - else if (re().is_of_pred(r, p)) { - array_util array(m()); - expr* args[2] = { p, ele }; - result = array.mk_select(2, args); - // Use mk_der_cond to normalize - STRACE(seq_verbose, tout << "deriv of_pred" << std::endl;); - return mk_der_cond(result, ele, seq_sort); - } - // stuck cases: re.derivative, re variable, - return expr_ref(re().mk_derivative(ele, r), m()); -} /************************************************* ***** End Derivative Code ***** @@ -4780,6 +4005,17 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { result = b; return BR_DONE; } + // Collapse adjacent full_seq factors regardless of concat grouping: + // (R ++ Σ*) ++ Σ* → R ++ Σ* (a ends with Σ*, b is Σ*) + // Σ* ++ (Σ* ++ R) → Σ* ++ R (a is Σ*, b starts with Σ*) + if (re().is_full_seq(b) && ends_with_full_seq(a)) { + result = a; + return BR_DONE; + } + if (re().is_full_seq(a) && starts_with_full_seq(b)) { + result = b; + return BR_DONE; + } expr* u1 = nullptr, *u2 = nullptr; if (re().is_full_seq(a) && re().is_union(b, u1, u2) && (starts_with_full_seq(u1) || starts_with_full_seq(u2))) { @@ -4821,7 +4057,7 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { result = re().mk_to_re(str().mk_concat(a_str, b_str)); return BR_REWRITE2; } - expr* a1 = nullptr; + expr *a1 = nullptr, *a2 = nullptr; expr* b1 = nullptr; if (re().is_to_re(a, a1) && re().is_to_re(b, b1)) { result = re().mk_to_re(str().mk_concat(a1, b1)); @@ -4842,6 +4078,7 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { } unsigned lo1, hi1, lo2, hi2; + if (re().is_loop(a, a1, lo1, hi1) && lo1 <= hi1 && re().is_loop(b, b1, lo2, hi2) && lo2 <= hi2 && a1 == b1) { result = re().mk_loop_proper(a1, lo1 + lo2, hi1 + hi2); return BR_DONE; @@ -4873,9 +4110,68 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { } std::swap(a, b); } + // Hoist ite out of concat: concat(ite(c, r1, r2), b) → ite(c, concat(r1, b), concat(r2, b)) + expr* c = nullptr; + if (m().is_ite(a, c, a1, b1)) { + result = m().mk_ite(c, re().mk_concat(a1, b), re().mk_concat(b1, b)); + return BR_REWRITE3; + } + if (m().is_ite(b, c, a1, b1)) { + result = m().mk_ite(c, re().mk_concat(a, a1), re().mk_concat(a, b1)); + return BR_REWRITE3; + } + if (re().is_concat(a, a1, a2)) { + // Maintain right-associative normal form: re().mk_concat is a raw + // constructor, so re-simplify the result to recursively reassociate + // any concat nested in a2 (and re-apply concat simplifications). + result = re().mk_concat(a1, re().mk_concat(a2, b)); + return BR_DONE; + } return BR_FAILED; } +expr_ref seq_rewriter::mk_regex_concat(expr *r, expr *s) { + sort *seq_sort = nullptr, *ele_sort = nullptr; + VERIFY(m_util.is_re(r, seq_sort)); + VERIFY(u().is_seq(seq_sort, ele_sort)); + SASSERT(r->get_sort() == s->get_sort()); + expr_ref result(m()); + expr *r1, *r2; + if (re().is_epsilon(r) || re().is_empty(s)) + result = s; + else if (re().is_epsilon(s) || re().is_empty(r)) + result = r; + else if (re().is_full_seq(r) && re().is_full_seq(s)) + result = r; + else if (re().is_full_char(r) && re().is_full_seq(s)) + // ..* = .+ + result = re().mk_plus(re().mk_full_char(r->get_sort())); + else if (re().is_full_seq(r) && re().is_full_char(s)) + // .*. = .+ + result = re().mk_plus(re().mk_full_char(r->get_sort())); + else if (re().is_concat(r, r1, r2)) + // create the resulting concatenation in right-associative form except for the following case + // TODO: maintain the following invariant for A ++ B{m,n} + C + // concat(concat(A, B{m,n}), C) (if A != () and C != ()) + // concat(B{m,n}, C) (if A == () and C != ()) + // where A, B, C are regexes + // Using & below for Intersection and | for Union + // In other words, do not make A ++ B{m,n} into right-assoc form, but keep B{m,n} at the top + // This will help to identify this situation in the merge routine: + // concat(concat(A, B{0,m}), C) | concat(concat(A, B{0,n}), C) + // simplifies to + // concat(concat(A, B{0,max(m,n)}), C) + // analogously: + // concat(concat(A, B{0,m}), C) & concat(concat(A, B{0,n}), C) + // simplifies to + // concat(concat(A, B{0,min(m,n)}), C) + result = mk_regex_concat(r1, mk_regex_concat(r2, s)); + else { + result = re().mk_concat(r, s); + } + return result; +} + bool seq_rewriter::are_complements(expr* r1, expr* r2) const { expr* r = nullptr; if (re().is_complement(r1, r) && r == r2) @@ -4892,6 +4188,32 @@ bool seq_rewriter::is_subset(expr* r1, expr* r2) const { return m_subset.is_subset(r1, r2); } +bool seq_rewriter::try_collapse_re_union(expr* a, expr* b, expr_ref& result) { + sort* seq_sort = nullptr; + if (!u().is_re(a->get_sort(), seq_sort)) + return false; + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + if (!seq::regex_to_range_predicate(u(), a, pa)) + return false; + if (!seq::regex_to_range_predicate(u(), b, pb)) + return false; + result = seq::range_predicate_to_regex(u(), pa | pb, seq_sort); + return true; +} + +bool seq_rewriter::try_collapse_re_inter(expr* a, expr* b, expr_ref& result) { + sort* seq_sort = nullptr; + if (!u().is_re(a->get_sort(), seq_sort)) + return false; + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + if (!seq::regex_to_range_predicate(u(), a, pa)) + return false; + if (!seq::regex_to_range_predicate(u(), b, pb)) + return false; + result = seq::range_predicate_to_regex(u(), pa & pb, seq_sort); + return true; +} + br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { if (a == b) { result = a; @@ -4921,11 +4243,30 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { result = b; return BR_DONE; } + // r ∪ ~r → Σ* (complement absorption) + if (are_complements(a, b)) { + result = re().mk_full_seq(a->get_sort()); + return BR_DONE; + } + // Hoist ite out of union: union(ite(c, r1, r2), b) → ite(c, union(r1, b), union(r2, b)) + expr *c = nullptr, *r1 = nullptr, *r2 = nullptr; + if (m().is_ite(a, c, r1, r2)) { + result = m().mk_ite(c, re().mk_union(r1, b), re().mk_union(r2, b)); + return BR_REWRITE3; + } + if (m().is_ite(b, c, r1, r2)) { + result = m().mk_ite(c, re().mk_union(a, r1), re().mk_union(a, r2)); + return BR_REWRITE3; + } + if (try_collapse_re_union(a, b, result)) + return BR_DONE; return BR_FAILED; } /* Creates a normalized union. */ br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) { + if (try_collapse_re_union(a, b, result)) + return BR_DONE; result = mk_regex_union_normalize(a, b); return BR_DONE; } @@ -4963,42 +4304,11 @@ br_status seq_rewriter::mk_re_complement(expr* a, expr_ref& result) { result = re().mk_plus(re().mk_full_char(a->get_sort())); return BR_DONE; } - // Range complement: comp([a,b]) → [0,a-1] ∪ [b+1,max] (or one half when a=0 or b=max) - unsigned lo_v = 0, hi_v = 0; - if (re().is_range(a, lo_v, hi_v)) { - unsigned max_c = u().max_char(); - sort *srt = a->get_sort(), *seq_sort = nullptr; - VERIFY(m_util.is_re(a, seq_sort)); - bool has_left = (lo_v > 0); - bool has_right = (hi_v < max_c); - auto empty_re = [&]() { return re().mk_empty(srt); }; - auto len0_re = [&]() { return re().mk_to_re(str().mk_empty(seq_sort)); }; - auto full_re = [&]() { return re().mk_full_seq(srt); }; - auto len2_plus_re = [&]() { return re().mk_concat(re().mk_full_char(srt), re().mk_plus(re().mk_full_char(srt))); }; - if (!has_left && !has_right) { - // [0, max_c]: complement is empty - result = empty_re(); - return BR_DONE; - } - if (lo_v > hi_v) { - result = full_re(); - return BR_DONE; - } - if (!has_left) { - // [0, b]: complement is [b+1, max] - result = re().mk_union(len0_re(), re().mk_union(re().mk_range(srt, hi_v + 1, max_c), len2_plus_re())); - return BR_DONE; - } - if (!has_right) { - // [a, max]: complement is [0, a-1] - result = re().mk_union(len0_re(), re().mk_union(re().mk_range(srt, 0u, lo_v - 1), len2_plus_re())); - return BR_DONE; - } - // General: [a, b] → [0, a-1] ∪ [b+1, max] - auto left = re().mk_range(srt, 0u, lo_v - 1); - auto right = re().mk_range(srt, hi_v + 1, max_c); - result = re().mk_union(len0_re(), re().mk_union(left, re().mk_union(right, len2_plus_re()))); - return BR_DONE; + // Hoist ite out of complement: ~(ite(c, r1, r2)) → ite(c, ~r1, ~r2) + expr* c = nullptr; + if (m().is_ite(a, c, e1, e2)) { + result = m().mk_ite(c, re().mk_complement(e1), re().mk_complement(e2)); + return BR_REWRITE3; } return BR_FAILED; } @@ -5025,16 +4335,43 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) { result = a; return BR_DONE; } + // r ∩ ~r → ∅ (complement absorption) + if (are_complements(a, b)) { + result = re().mk_empty(a->get_sort()); + return BR_DONE; + } + // Hoist ite out of intersection: inter(ite(c, r1, r2), b) → ite(c, inter(r1, b), inter(r2, b)) + expr *c = nullptr, *r1 = nullptr, *r2 = nullptr; + if (m().is_ite(a, c, r1, r2)) { + result = m().mk_ite(c, re().mk_inter(r1, b), re().mk_inter(r2, b)); + return BR_REWRITE3; + } + if (m().is_ite(b, c, r1, r2)) { + result = m().mk_ite(c, re().mk_inter(a, r1), re().mk_inter(a, r2)); + return BR_REWRITE3; + } + if (try_collapse_re_inter(a, b, result)) + return BR_DONE; return BR_FAILED; } /* Creates a normalized intersection. */ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { + if (try_collapse_re_inter(a, b, result)) + return BR_DONE; result = mk_regex_inter_normalize(a, b); return BR_DONE; } br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + sort* seq_sort = nullptr; + if (u().is_re(a->get_sort(), seq_sort) + && seq::regex_to_range_predicate(u(), a, pa) + && seq::regex_to_range_predicate(u(), b, pb)) { + result = seq::range_predicate_to_regex(u(), pa - pb, seq_sort); + return BR_DONE; + } result = mk_regex_inter_normalize(a, re().mk_complement(b)); return BR_REWRITE2; } @@ -5264,7 +4601,9 @@ br_status seq_rewriter::mk_re_star(expr* a, expr_ref& result) { result = re().mk_full_seq(b1->get_sort()); return BR_REWRITE2; } - + // Hoist ite out of star: (ite c r1 r2)* → ite(c, r1*, r2*) + result = m().mk_ite(c, re().mk_star(b1), re().mk_star(c1)); + return BR_REWRITE3; } return BR_FAILED; } @@ -6449,7 +5788,7 @@ void seq_rewriter::op_cache::cleanup() { lbool seq_rewriter::some_string_in_re(expr* r, zstring& s) { sort* rs; (void)rs; - // SASSERT(re().is_re(r, rs) && m_util.is_string(rs)); + // SASSERT(u().is_re(r, rs) && m_util.is_string(rs)); expr_mark visited; unsigned_vector str; diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 1b693ca3d6..fc47f2c636 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -19,6 +19,7 @@ Notes: #pragma once #include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_derive.h" #include "ast/ast_pp.h" #include "ast/arith_decl_plugin.h" #include "ast/rewriter/rewriter_types.h" @@ -128,15 +129,20 @@ class seq_rewriter { void insert(decl_kind op, expr* a, expr* b, expr* c, expr* r); }; + friend class seq::derive; + seq_util m_util; seq_subset m_subset; arith_util m_autil; bool_rewriter m_br; + seq::derive m_derive; // re2automaton m_re2aut; op_cache m_op_cache; expr_ref_vector m_es, m_lhs, m_rhs; - bool m_coalesce_chars; - bool m_in_bisim { false }; + bool m_coalesce_chars = false; + bool m_in_bisim { false }; + unsigned m_re_deriv_depth { 0 }; + static const unsigned m_max_re_deriv_depth = 512; enum length_comparison { shorter_c, @@ -179,8 +185,6 @@ class seq_rewriter { expr_ref re_replace_char(expr *r, unsigned a_ch, unsigned b_ch, expr *a_str, expr *b_str); // Calculate derivative, memoized and enforcing a normal form - expr_ref is_nullable_rec(expr* r); - expr_ref mk_derivative_rec(expr* ele, expr* r); expr_ref mk_der_op(decl_kind k, expr* a, expr* b); expr_ref mk_der_op_rec(decl_kind k, expr* a, expr* b); expr_ref mk_der_concat(expr* a, expr* b); @@ -191,26 +195,10 @@ class seq_rewriter { expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort); expr_ref mk_der_antimirov_union(expr* r1, expr* r2); bool ite_bdds_compatible(expr* a, expr* b); - /* if r has the form deriv(en..deriv(e1,to_re(s))..) returns 's = [e1..en]' else returns '() in r'*/ - expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort); #ifdef Z3DEBUG bool check_deriv_normal_form(expr* r, int level = 3); #endif - void mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref& result); - - expr_ref mk_antimirov_deriv(expr* e, expr* r, expr* path); - expr_ref mk_in_antimirov_rec(expr* s, expr* d); - expr_ref mk_in_antimirov(expr* s, expr* d); - - expr_ref mk_antimirov_deriv_intersection(expr* elem, expr* d1, expr* d2, expr* path); - expr_ref mk_antimirov_deriv_concat(expr* d, expr* r); - expr_ref mk_antimirov_deriv_negate(expr* elem, expr* d); - expr_ref mk_antimirov_deriv_union(expr* d1, expr* d2); - expr_ref mk_antimirov_deriv_restrict(expr* elem, expr* d1, expr* cond); - expr_ref mk_regex_reverse(expr* r); - expr_ref mk_regex_concat(expr* r1, expr* r2); - expr_ref merge_regex_sets(expr* r1, expr* r2, expr* unit, std::function& decompose, std::function& compose); // elem is (:var 0) and path a condition that may have (:var 0) as a free variable @@ -267,6 +255,14 @@ class seq_rewriter { br_status mk_re_union0(expr* a, expr* b, expr_ref& result); br_status mk_re_inter0(expr* a, expr* b, expr_ref& result); br_status mk_re_complement(expr* a, expr_ref& result); + // Range-set collapse helpers: if the operands form a boolean + // combination of character-class regexes, materialize the result as a + // canonical regex over a single range_predicate. See + // ast/rewriter/seq_range_collapse.h for the recognized fragment. + // NOTE: re.complement is intentionally not in this set because it + // operates at the sequence level, not the character-class level. + bool try_collapse_re_union(expr* a, expr* b, expr_ref& result); + bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result); br_status mk_re_star(expr* a, expr_ref& result); br_status mk_re_diff(expr* a, expr* b, expr_ref& result); br_status mk_re_xor(expr* a, expr* b, expr_ref& result); @@ -351,9 +347,9 @@ class seq_rewriter { public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): - m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m), + m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this), m_op_cache(m), m_es(m), - m_lhs(m), m_rhs(m), m_coalesce_chars(true) { + m_lhs(m), m_rhs(m) { } ast_manager & m() const { return m_util.get_manager(); } family_id get_fid() const { return m_util.get_family_id(); } @@ -364,7 +360,7 @@ public: static void get_param_descrs(param_descrs & r); - bool coalesce_chars() const { return m_coalesce_chars; } + // bool coalesce_chars() const { return m_coalesce_chars; } br_status mk_app_core(func_decl * f, unsigned num_args, expr * const * args, expr_ref & result); br_status mk_eq_core(expr * lhs, expr * rhs, expr_ref & result); @@ -380,6 +376,34 @@ public: return result; } + expr_ref mk_xor0(expr *a, expr *b) { + expr_ref result(m()); + if (mk_re_xor0(a, b, result) == BR_FAILED) + result = re().mk_xor(a, b); + return result; + } + + expr_ref mk_union(expr *a, expr *b) { + expr_ref result(m()); + if (mk_re_union(a, b, result) == BR_FAILED) + result = re().mk_union(a, b); + return result; + } + + expr_ref mk_inter(expr *a, expr *b) { + expr_ref result(m()); + if (mk_re_inter(a, b, result) == BR_FAILED) + result = re().mk_inter(a, b); + return result; + } + + expr_ref mk_complement(expr *a) { + expr_ref result(m()); + if (mk_re_complement(a, result) == BR_FAILED) + result = re().mk_complement(a); + return result; + } + /* * makes concat and simplifies */ @@ -440,7 +464,31 @@ public: procedure which relies on each leaf of D(p XOR q) being a coherent XOR pair (D_v p) XOR (D_v q). */ - expr_ref mk_brz_derivative(expr* r); + expr_ref mk_brz_derivative(expr *r) { + return mk_derivative(r); + } + + /* + Enumerate the cofactors (min-terms) of a transition regex r taken with + respect to ele. Produces (path_condition, leaf_regex) pairs for every + feasible path through the ITE-tree, pruning infeasible character ranges. + Delegates to the derivative engine so the same path/interval context used + while hoisting ITEs is reused for the leaf simplification. + */ + void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result) { + m_derive.get_cofactors(ele, r, result); + } + + /* + Compute the symbolic derivative of r and enumerate its reachable leaves + in fully ITE-hoisted normal form: a list of (path_condition, target) + pairs where every target is free of (:var 0) (so nullability is always + decidable) and unions are kept intact as single states. Used by + regex_bisim, which consumes the targets and ignores the path conditions. + */ + void brz_derivative_cofactors(expr* r, expr_ref_pair_vector& result) { + m_derive.derivative_cofactors(r, result); + } // heuristic elimination of element from condition that comes form a derivative. // special case optimization for conjunctions of equalities, disequalities and ranges. @@ -451,6 +499,8 @@ public: /* Apply simplifications to the intersection to keep it normalized (r1 and r2 are not normalized)*/ expr_ref mk_regex_inter_normalize(expr* r1, expr* r2); + expr_ref mk_regex_concat(expr *r1, expr *r2); + /* * Extract some string that is a member of r. * Return true if a valid string was extracted. diff --git a/src/ast/rewriter/seq_subset.cpp b/src/ast/rewriter/seq_subset.cpp index 2fc4d1f715..1af42d06d8 100644 --- a/src/ast/rewriter/seq_subset.cpp +++ b/src/ast/rewriter/seq_subset.cpp @@ -19,7 +19,7 @@ Author: bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { while (true) { - + if (a == b) return true; if (m_re.is_empty(a)) @@ -30,7 +30,7 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { return true; if (depth >= m_max_depth) - return false; + return false; expr* a1 = nullptr, * a2 = nullptr, * b1 = nullptr, * b2 = nullptr; unsigned la, ua, lb, ub; @@ -39,16 +39,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { if (m_re.is_dot_plus(b) && m_re.get_info(a).nullable == l_false) return true; - // a ⊆ a* - if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth)) - return true; - // e ⊆ a* if (m_re.is_epsilon(a) && m_re.is_star(b, b1)) return true; - // R ⊆ R* - if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth + 1)) + // a ⊆ a*: if b = b1* and a ⊆ b1, then a ⊆ b1* + if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth)) return true; // R1* ⊆ R2* if R1 ⊆ R2 @@ -112,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth)) return true; + // prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*). + // Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'. + // Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*. + if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2)) + return true; + // R ⊆ R'·Σ* if R ⊆ R' if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth)) return true; @@ -144,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { bool seq_subset::is_subset(expr* a, expr* b) const { return is_subset_rec(a, b, 0); } + +bool seq_subset::ends_with(expr* a, expr* suf) const { + if (a == suf) + return true; + // Flatten both regexes into their sequence of concatenation factors + // (independent of left/right associativity) and test list-suffix equality. + ptr_vector af, sf; + flatten_concat(a, af); + flatten_concat(suf, sf); + if (sf.size() > af.size()) + return false; + unsigned off = af.size() - sf.size(); + for (unsigned i = 0; i < sf.size(); ++i) + if (af[off + i] != sf[i]) + return false; + return true; +} + +void seq_subset::flatten_concat(expr* a, ptr_vector& out) const { + expr* a1 = nullptr, * a2 = nullptr; + if (m_re.is_concat(a, a1, a2)) { + flatten_concat(a1, out); + flatten_concat(a2, out); + } + else + out.push_back(a); +} diff --git a/src/ast/rewriter/seq_subset.h b/src/ast/rewriter/seq_subset.h index 7329c898e1..e62333dea3 100644 --- a/src/ast/rewriter/seq_subset.h +++ b/src/ast/rewriter/seq_subset.h @@ -24,6 +24,12 @@ class seq_subset { bool is_subset_rec(expr* a, expr* b, unsigned depth) const; + // true if regex a, viewed as a flattened concatenation, has suf as a + // structural (concatenation) suffix. + bool ends_with(expr* a, expr* suf) const; + + void flatten_concat(expr* a, ptr_vector& out) const; + public: explicit seq_subset(seq_util::rex& re) : m_re(re) {} bool is_subset(expr* a, expr* b) const; diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 0e9a03b633..5a801550cd 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -461,6 +461,24 @@ namespace smt { if (re().is_empty(r)) //trivially true return; + // When one side is re.none the equation is a pure emptiness check on + // the other regex (symmetric_diff already returned it as r). Decide + // it directly by antimirov NFA reachability instead of running the + // bisimulation/XOR closure, which would build large un-canonicalized + // product states for intersections of contains-patterns. + if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) { + switch (re_is_empty(r)) { + case l_true: + STRACE(seq_regex_brief, tout << "empty:eq ";); + return; // languages equal (both empty): trivially true + case l_false: + STRACE(seq_regex_brief, tout << "empty:neq ";); + th.add_axiom(~th.mk_eq(r1, r2, false), false_literal); + return; + case l_undef: + break; + } + } // Try the bisimulation procedure on ground regexes first. If it // returns a definite answer, dispatch the corresponding axiom and // bypass the symbolic emptiness/derivative closure. @@ -562,7 +580,7 @@ namespace smt { lits.push_back(null_lit); expr_ref_pair_vector cofactors(m); - get_cofactors(d, cofactors); + seq_rw().get_cofactors(hd, d, cofactors); for (auto const& p : cofactors) { if (is_member(p.second, u)) continue; @@ -671,6 +689,67 @@ namespace smt { return result; } + /* + Decide emptiness of a ground regex r via antimirov-mode NFA + reachability. + + The symbolic derivative engine runs in antimirov mode, so the + derivative of an intersection distributes into a *set* of individual + product states inter(A_i, B_j) (each a small, ground regex) rather + than one giant union-of-intersections term. get_derivative_targets + enumerates these NFA successor states. + + We short-circuit to l_false (non-empty) as soon as a reachable state + is nullable (accepts the empty word) or classical (a regex built only + from to_re/all/union/concat/star/plus/opt/loop, hence non-empty). An + intersection itself is never classical, but once one operand reduces + to Σ* the intersection collapses (via the derivative's subset + simplification) to the other, classical, operand. + + If the worklist is exhausted with no such state, r is empty (l_true). + Returns l_undef if a step bound is hit, so callers can fall back to + the general procedure. + */ + lbool seq_regex::re_is_empty(expr* r) { + if (re().is_empty(r)) + return l_true; + expr_ref_vector pinned(m); + obj_hashtable visited; + ptr_vector work; + work.push_back(r); + visited.insert(r); + pinned.push_back(r); + unsigned const bound = 100000; + unsigned steps = 0; + while (!work.empty()) { + if (++steps > bound) + return l_undef; + expr* s = work.back(); + work.pop_back(); + auto info = re().get_info(s); + if (!info.is_known()) + return l_undef; + // ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty. + if (info.nullable == l_true || info.classical) + return l_false; + // Dead state: prune (min_length == UINT_MAX means no word is + // accepted from here). + if (info.min_length == UINT_MAX) + continue; + expr_ref_vector targets(m); + get_derivative_targets(s, targets); + for (expr* t : targets) { + if (visited.contains(t)) + continue; + visited.insert(t); + pinned.push_back(t); + work.push_back(t); + } + } + return l_true; + } + + /* Return a list of all target regexes in the derivative of a regex r, ignoring the conditions along each path. @@ -707,53 +786,26 @@ namespace smt { /* Return a list of all (cond, leaf) pairs in a given derivative - expression r. + expression r, where elem is the character symbol the derivative was + taken with respect to. - Note: this implementation is inefficient: it simply collects all expressions under an if and - iterates over all combinations. + The transition regexes produced by the symbolic derivative engine are + ITE-trees over character predicates ci on elem (equalities such as + elem = 'A', and ranges such as 'a' <= elem <= 'z'). These predicates + are typically mutually exclusive, so the number of feasible truth + assignments to {c1,..,ck} ("minterms") is small. - This method is still used by: + The enumeration is delegated to seq::derive (via seq_rw().get_cofactors) + so it reuses the very same path/interval context that the derivative + engine uses while hoisting ITEs: each feasible path through the ITE-tree + yields one (path_condition, leaf) cofactor, infeasible character-range + combinations are pruned, and the leaf is simplified with the path-aware + smart constructors. + + This is used by: propagate_is_empty propagate_is_non_empty */ - void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) { - obj_hashtable ifs; - expr* cond = nullptr, * r1 = nullptr, * r2 = nullptr; - for (expr* e : subterms::ground(expr_ref(r, m))) - if (m.is_ite(e, cond, r1, r2)) - ifs.insert(cond); - - expr_ref_vector rs(m); - vector conds; - conds.push_back(expr_ref_vector(m)); - rs.push_back(r); - for (expr* c : ifs) { - unsigned sz = conds.size(); - expr_safe_replace rep1(m); - expr_safe_replace rep2(m); - rep1.insert(c, m.mk_true()); - rep2.insert(c, m.mk_false()); - expr_ref r2(m); - for (unsigned i = 0; i < sz; ++i) { - expr_ref_vector cs = conds[i]; - cs.push_back(mk_not(m, c)); - conds.push_back(cs); - conds[i].push_back(c); - expr_ref r1(rs.get(i), m); - rep1(r1, r2); - rs[i] = r2; - rep2(r1, r2); - rs.push_back(r2); - } - } - for (unsigned i = 0; i < conds.size(); ++i) { - expr_ref conj = mk_and(conds[i]); - expr_ref r(rs.get(i), m); - ctx.get_rewriter()(r); - if (!m.is_false(conj) && !re().is_empty(r)) - result.push_back(conj, r); - } - } /* is_empty(r, u) => ~is_nullable(r) @@ -781,7 +833,7 @@ namespace smt { d = mk_derivative_wrapper(hd, r); literal_vector lits; expr_ref_pair_vector cofactors(m); - get_cofactors(d, cofactors); + seq_rw().get_cofactors(hd, d, cofactors); for (auto const& p : cofactors) { if (is_member(p.second, u)) continue; diff --git a/src/smt/seq_regex.h b/src/smt/seq_regex.h index 5c3fddd252..dd1c474b31 100644 --- a/src/smt/seq_regex.h +++ b/src/smt/seq_regex.h @@ -164,7 +164,12 @@ namespace smt { // returned by derivative_wrapper expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r); void get_derivative_targets(expr* r, expr_ref_vector& targets); - void get_cofactors(expr* r, expr_ref_pair_vector& result); + + // Decide emptiness of a ground regex by antimirov-mode NFA + // reachability: explore derivative target states, short-circuiting to + // "non-empty" on the first reachable nullable or classical state. + // Returns l_true (empty), l_false (non-empty), l_undef (gave up). + lbool re_is_empty(expr* r); /* Pretty print the regex of the state id to the out stream, diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 404cf45538..e26f17cf44 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -115,15 +115,18 @@ add_executable(test-z3 polynomial_factorization.cpp polynorm.cpp prime_generator.cpp + seq_regex_bisim.cpp proof_checker.cpp qe_arith.cpp mbp_qel.cpp quant_elim.cpp quant_solve.cpp random.cpp + range_predicate.cpp rational.cpp rcf.cpp region.cpp + regex_range_collapse.cpp sat_local_search.cpp sat_lookahead.cpp sat_user_scope.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index b78e387892..dc5854da7c 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -113,6 +113,8 @@ X(api_bug) \ X(api_special_relations) \ X(arith_rewriter) \ + X(range_predicate) \ + X(regex_range_collapse) \ X(seq_rewriter) \ X(check_assumptions) \ X(smt_context) \ @@ -195,6 +197,7 @@ X(finite_set) \ X(finite_set_rewriter) \ X(fpa) \ + X(seq_regex_bisim) \ X(term_enumeration) \ X(lcube) diff --git a/src/test/range_predicate.cpp b/src/test/range_predicate.cpp new file mode 100644 index 0000000000..e526a63302 --- /dev/null +++ b/src/test/range_predicate.cpp @@ -0,0 +1,260 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + test/range_predicate.cpp + +Abstract: + + Unit tests for the range-algebra value type seq::range_predicate. + + The tests exercise: + * factory constructors and canonical-form invariants, + * extensional equality and total ordering, + * Boolean operations (|, &, ~, -, ^) on hand-picked instances, + * exhaustive verification of de-Morgan and lattice laws on a + small character domain, by enumerating every subset. + +Author: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/seq_range_predicate.h" +#include "util/debug.h" +#include +#include +#include + +using seq::range_predicate; + +namespace { + + // Build a range_predicate from a bitmask over [0, max_char] for testing. + range_predicate from_mask(uint64_t mask, unsigned max_char) { + range_predicate r = range_predicate::empty(max_char); + for (unsigned c = 0; c <= max_char; ++c) + if ((mask >> c) & 1u) + r = r | range_predicate::singleton(c, max_char); + return r; + } + + // Convert a range_predicate back to a bitmask for cross-checking. + uint64_t to_mask(range_predicate const& r) { + uint64_t mask = 0; + for (unsigned c = 0; c <= r.max_char(); ++c) + if (r.contains(c)) + mask |= (uint64_t(1) << c); + return mask; + } + + void test_factories() { + auto e = range_predicate::empty(255); + ENSURE(e.is_empty()); + ENSURE(!e.is_top()); + ENSURE(e.num_ranges() == 0); + ENSURE(e.cardinality() == 0); + + auto t = range_predicate::top(255); + ENSURE(!t.is_empty()); + ENSURE(t.is_top()); + ENSURE(t.num_ranges() == 1); + ENSURE(t.cardinality() == 256); + ENSURE(t.contains(0)); + ENSURE(t.contains(255)); + + auto s = range_predicate::singleton(42, 255); + ENSURE(s.num_ranges() == 1); + ENSURE(s.cardinality() == 1); + ENSURE(s.contains(42)); + ENSURE(!s.contains(41)); + unsigned c = 0; + ENSURE(s.is_singleton(c)); + ENSURE(c == 42); + + auto r = range_predicate::range(10, 20, 255); + ENSURE(r.num_ranges() == 1); + ENSURE(r.cardinality() == 11); + ENSURE(r.contains(10)); + ENSURE(r.contains(20)); + ENSURE(!r.contains(9)); + ENSURE(!r.contains(21)); + + // Reversed bounds produce empty. + auto bad = range_predicate::range(20, 10, 255); + ENSURE(bad.is_empty()); + + // Clipping at max_char. + auto clipped = range_predicate::range(200, 1000, 255); + ENSURE(clipped.num_ranges() == 1); + ENSURE(clipped[0] == std::make_pair(200u, 255u)); + } + + void test_equality_and_order() { + auto a = range_predicate::range(1, 5, 31); + auto b = range_predicate::range(1, 5, 31); + auto c = range_predicate::range(1, 6, 31); + ENSURE(a == b); + ENSURE(a != c); + ENSURE(a.hash() == b.hash()); + ENSURE(a < c || c < a); + ENSURE(!(a < a)); + + auto empty = range_predicate::empty(31); + ENSURE(empty < a); + + // Canonical merging of adjacent ranges. + auto d = range_predicate::range(0, 4, 31) | range_predicate::range(5, 10, 31); + auto e = range_predicate::range(0, 10, 31); + ENSURE(d == e); + } + + void test_union_intersection_hand() { + unsigned const M = 31; + auto a = range_predicate::range(0, 4, M) | range_predicate::range(10, 14, M); + auto b = range_predicate::range(3, 11, M); + + auto u = a | b; // [0,14] + ENSURE(u.num_ranges() == 1); + ENSURE(u[0] == std::make_pair(0u, 14u)); + + auto i = a & b; // [3,4] U [10,11] + ENSURE(i.num_ranges() == 2); + ENSURE(i[0] == std::make_pair(3u, 4u)); + ENSURE(i[1] == std::make_pair(10u, 11u)); + + auto d = a - b; // [0,2] U [12,14] + ENSURE(d.num_ranges() == 2); + ENSURE(d[0] == std::make_pair(0u, 2u)); + ENSURE(d[1] == std::make_pair(12u, 14u)); + + auto x = a ^ b; // [0,2] U [5,9] U [12,14] + ENSURE(x.num_ranges() == 3); + ENSURE(x[0] == std::make_pair(0u, 2u)); + ENSURE(x[1] == std::make_pair(5u, 9u)); + ENSURE(x[2] == std::make_pair(12u, 14u)); + } + + void test_complement_hand() { + unsigned const M = 10; + auto e = range_predicate::empty(M); + ENSURE((~e).is_top()); + auto t = range_predicate::top(M); + ENSURE((~t).is_empty()); + + // ~([2,3] U [7,8]) = [0,1] U [4,6] U [9,10] + auto a = range_predicate::range(2, 3, M) | range_predicate::range(7, 8, M); + auto na = ~a; + ENSURE(na.num_ranges() == 3); + ENSURE(na[0] == std::make_pair(0u, 1u)); + ENSURE(na[1] == std::make_pair(4u, 6u)); + ENSURE(na[2] == std::make_pair(9u, 10u)); + + // ~([0,4]) = [5,10] + auto b = range_predicate::range(0, 4, M); + auto nb = ~b; + ENSURE(nb.num_ranges() == 1); + ENSURE(nb[0] == std::make_pair(5u, 10u)); + + // ~([5,10]) = [0,4] + auto cnb = ~nb; + ENSURE(cnb == b); + } + + // Exhaustively verify the lattice / de-Morgan laws on a small domain + // by enumerating every possible subset (bitmask). + void test_exhaustive_laws() { + unsigned const M = 5; // 6 characters -> 64 subsets + unsigned const N = 1u << (M + 1); + for (unsigned i = 0; i < N; ++i) { + range_predicate A = from_mask(i, M); + ENSURE(to_mask(A) == i); + // ~ ~ A == A + ENSURE(~~A == A); + // A | ~A == top + ENSURE((A | ~A).is_top()); + // A & ~A == empty + ENSURE((A & ~A).is_empty()); + // cardinality matches popcount + unsigned pop = 0; + for (unsigned k = 0; k <= M; ++k) if ((i >> k) & 1u) ++pop; + ENSURE(A.cardinality() == pop); + } + for (unsigned i = 0; i < N; ++i) { + range_predicate A = from_mask(i, M); + for (unsigned j = 0; j < N; ++j) { + range_predicate B = from_mask(j, M); + // Bitmask reference semantics. + ENSURE(to_mask(A | B) == (i | j)); + ENSURE(to_mask(A & B) == (i & j)); + ENSURE(to_mask(A - B) == (i & ~j & ((1u << (M + 1)) - 1u))); + ENSURE(to_mask(A ^ B) == (i ^ j)); + // de-Morgan + ENSURE(~(A | B) == (~A & ~B)); + ENSURE(~(A & B) == (~A | ~B)); + // Commutativity + ENSURE((A | B) == (B | A)); + ENSURE((A & B) == (B & A)); + // (A - B) == A & ~B + ENSURE((A - B) == (A & ~B)); + // (A ^ B) == (A | B) - (A & B) + ENSURE((A ^ B) == ((A | B) - (A & B))); + // Extensional equality is reflexive on equal masks. + if (i == j) { + ENSURE(A == B); + ENSURE(A.hash() == B.hash()); + } + } + } + } + + void test_total_order_strict() { + unsigned const M = 5; + unsigned const N = 1u << (M + 1); + // Strict total order: for any distinct A, B exactly one of A + +namespace { + + using seq::range_predicate; + using seq::regex_to_range_predicate; + using seq::range_predicate_to_regex; + + static void check(bool ok, char const* what) { + if (!ok) { + std::cerr << "regex_range_collapse FAILED: " << what << "\n"; + ENSURE(false); + } + } + + static expr_ref mk_singleton_str(seq_util& u, unsigned c) { + return expr_ref(u.str.mk_string(zstring(c)), u.get_manager()); + } + + static bool extract_range_chars(seq_util& u, expr* e, unsigned& lo, unsigned& hi) { + expr* lo_e = nullptr; expr* hi_e = nullptr; + if (!u.re.is_range(e, lo_e, hi_e)) + return false; + // Accept either string-constant or (seq.unit (Char N)) bound form. + if (u.re.is_range(e, lo, hi)) + return true; + expr* lc = nullptr; expr* hc = nullptr; + if (u.str.is_unit(lo_e, lc) && u.is_const_char(lc, lo) && + u.str.is_unit(hi_e, hc) && u.is_const_char(hc, hi)) + return true; + return false; + } + + static void run() { + ast_manager m; + reg_decl_plugins(m); + seq_util u(m); + unsigned const M = u.max_char(); + + sort* str_sort = u.str.mk_string_sort(); + sort* re_sort = u.re.mk_re(str_sort); + + // primitives + { + range_predicate p(M); + check(regex_to_range_predicate(u, u.re.mk_empty(re_sort), p) && p.is_empty(), + "re.empty -> empty"); + check(regex_to_range_predicate(u, u.re.mk_full_char(re_sort), p) && p.is_top(), + "re.full_char -> top"); + } + // re.range "a" "z" + { + range_predicate p(M); + expr_ref a = mk_singleton_str(u, 'a'); + expr_ref z = mk_singleton_str(u, 'z'); + expr_ref r(u.re.mk_range(a, z), m); + check(regex_to_range_predicate(u, r, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'z', + "re.range a z -> [a,z]"); + } + // Disjoint union: (a..z) | (0..9) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 2, + "(a-z)|(0-9) -> 2 ranges"); + // canonical order: lower lo first + check(p[0].first == '0' && p[0].second == '9' && p[1].first == 'a' && p[1].second == 'z', + "(a-z)|(0-9) ranges in canonical order"); + } + // Overlapping union: (a..c) | (b..f) -> (a..f) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'b'), mk_singleton_str(u, 'f')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'f', + "(a-c)|(b-f) -> (a-f)"); + } + // Adjacent union: (a..c) | (d..f) -> (a..f) (canonical predicate merges adjacent) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'd'), mk_singleton_str(u, 'f')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'f', + "(a-c)|(d-f) -> (a-f) via adjacency"); + } + // Disjoint intersection: (a..z) & (0..9) -> empty + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m); + expr_ref ix(u.re.mk_inter(r1, r2), m); + check(regex_to_range_predicate(u, ix, p) && p.is_empty(), + "(a-z)&(0-9) -> empty"); + } + // Overlapping intersection: (a..f) & (c..z) -> (c..f) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m); + expr_ref ix(u.re.mk_inter(r1, r2), m); + check(regex_to_range_predicate(u, ix, p) && p.num_ranges() == 1 && + p[0].first == 'c' && p[0].second == 'f', + "(a-f)&(c-z) -> (c-f)"); + } + // Complement: re.complement is intentionally NOT a char-class op + // (it operates over Σ*), so it must NOT be translated. + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref cmp(u.re.mk_complement(r1), m); + check(!regex_to_range_predicate(u, cmp, p), + "re.comp of range is NOT translatable (sequence-level complement)"); + } + // Diff: (a..f) \ (c..z) -> (a..b) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m); + expr_ref df(u.re.mk_diff(r1, r2), m); + check(regex_to_range_predicate(u, df, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'b', + "(a-f) \\ (c-z) -> (a-b)"); + } + // Negative: re.* of a range is NOT a char class + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref star(u.re.mk_star(r1), m); + check(!regex_to_range_predicate(u, star, p), + "re.* of range not translatable"); + } + + // Negative: a regex whose element type is NOT a sequence of + // characters (here (Seq Int)) must be rejected outright, even for + // shapes that structurally resemble char-class operators. + { + range_predicate p(M); + arith_util a(m); + sort* int_seq = u.str.mk_seq(a.mk_int()); + sort* int_re = u.re.mk_re(int_seq); + check(!regex_to_range_predicate(u, u.re.mk_empty(int_re), p), + "re.empty over (Seq Int) is NOT a char class"); + check(!regex_to_range_predicate(u, u.re.mk_full_char(int_re), p), + "re.full_char over (Seq Int) is NOT a char class"); + } + + // ---- materialization round-trip ---- + + // empty -> re.empty + { + range_predicate p = range_predicate::empty(M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + check(u.re.is_empty(e), "empty -> re.empty"); + } + // top -> re.full_char + { + range_predicate p = range_predicate::top(M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + check(u.re.is_full_char(e), "top -> re.full_char"); + } + // single range -> re.range + { + range_predicate p = range_predicate::range('a', 'z', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, e, lo, hi) && lo == 'a' && hi == 'z', + "[a-z] -> re.range a z"); + } + // singleton -> re.range c c + { + range_predicate p = range_predicate::singleton('A', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, e, lo, hi) && lo == 'A' && hi == 'A', + "{A} -> re.range A A"); + } + // 2 ranges -> re.union(range_0, range_1) in canonical order + { + range_predicate p = range_predicate::range('0', '9', M) + | range_predicate::range('a', 'z', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + expr* a = nullptr; expr* b = nullptr; + check(u.re.is_union(e, a, b), "2-range -> union"); + unsigned lo0 = 0, hi0 = 0, lo1 = 0, hi1 = 0; + check(extract_range_chars(u, a, lo0, hi0) && lo0 == '0' && hi0 == '9', + "union arg0 = (0-9) (canonical: lower lo first)"); + check(extract_range_chars(u, b, lo1, hi1) && lo1 == 'a' && hi1 == 'z', + "union arg1 = (a-z)"); + } + // 3 ranges -> right-associated union + { + range_predicate p = range_predicate::range(0, 5, M) + | range_predicate::range(10, 15, M) + | range_predicate::range(20, 25, M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + expr* a = nullptr; expr* rest = nullptr; + check(u.re.is_union(e, a, rest), "3-range -> union(...)"); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, a, lo, hi) && lo == 0 && hi == 5, "first arg = (0-5)"); + expr* b = nullptr; expr* c = nullptr; + check(u.re.is_union(rest, b, c), "rest is union(...,...)"); + check(extract_range_chars(u, b, lo, hi) && lo == 10 && hi == 15, "second range"); + check(extract_range_chars(u, c, lo, hi) && lo == 20 && hi == 25, "third range"); + } + // Round-trip identity for an arbitrary range-set + { + range_predicate p_in = range_predicate::range('a', 'c', M) + | range_predicate::range('m', 'p', M) + | range_predicate::range('x', 'z', M); + expr_ref e = range_predicate_to_regex(u, p_in, str_sort); + range_predicate p_out(M); + check(regex_to_range_predicate(u, e, p_out), "round-trip translatable"); + check(p_in == p_out, "round-trip equal"); + } + + std::cerr << "regex_range_collapse tests passed\n"; + } +} + +void tst_regex_range_collapse() { + run(); +} diff --git a/src/test/seq_regex_bisim.cpp b/src/test/seq_regex_bisim.cpp new file mode 100644 index 0000000000..83404439b5 --- /dev/null +++ b/src/test/seq_regex_bisim.cpp @@ -0,0 +1,127 @@ +// Regression test for the seq::derive::intersect_intervals bug. +// +// Background: derive uses a path-tracking interval set to compute symbolic +// derivatives. The intersect_intervals routine used to react to a single +// disjoint interval by dropping the entire kept suffix and skipping the rest +// of the list, which silently killed valid branches in derivatives such as +// D(a|b). That made the bisimulation procedure conclude bogus equalities +// like a* == (a|b)*. +// +// This file also covers the seq::derive top-level-cache poisoning bug. +// `m_top_cache` is keyed only by the regex; the routine used to populate it +// while `m_ele` was set to a *concrete* character, baking that character +// into the cached "symbolic" derivative. Subsequent calls with the same +// regex but a different ele then returned a stale concrete answer instead +// of the true symbolic derivative. The simplest victim is +// (str.in_re "aP" (re.++ (re.* "a") "P")) +// which used to return false because the derivative wrt 'a' was cached and +// re-used as the derivative wrt 'P'. +#include "ast/ast.h" +#include "ast/ast_pp.h" +#include "ast/reg_decl_plugins.h" +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/rewriter/seq_regex_bisim.h" +#include "ast/rewriter/th_rewriter.h" +#include + +static void test_a_star_neq_ab_star() { + ast_manager m; + reg_decl_plugins(m); + seq_util u(m); + seq_rewriter rw(m); + + sort_ref str_sort(u.str.mk_string_sort(), m); + + zstring sa("a"), sb("b"); + expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m); + expr_ref re_b(u.re.mk_to_re(u.str.mk_string(sb)), m); + expr_ref a_star(u.re.mk_star(re_a), m); + expr_ref ab(u.re.mk_union(re_a, re_b), m); + expr_ref ab_star(u.re.mk_star(ab), m); + + expr_ref d_ab = rw.mk_brz_derivative(ab); + std::cout << "D(a|b) = " << mk_pp(d_ab, m) << "\n"; + + // Both the 'a' branch and the 'b' branch of D(a|b) must reach epsilon. + // Collect the regex leaves of the symbolic derivative and require at + // least two distinct accepting leaves (one for 'a' and one for 'b'). + expr_ref_vector leaves(m); + auto collect = [&](expr* e, auto&& self) -> void { + expr* c, *t, *f; + if (m.is_ite(e, c, t, f) || u.re.is_union(e, t, f) || u.re.is_antimirov_union(e, t, f)) { + self(t, self); + self(f, self); + return; + } + if (u.re.is_empty(e)) return; + leaves.push_back(e); + }; + collect(d_ab, collect); + unsigned nullable_leaves = 0; + for (expr* l : leaves) { + expr_ref n = rw.is_nullable(l); + if (m.is_true(n)) ++nullable_leaves; + } + std::cout << "D(a|b) leaves=" << leaves.size() + << " nullable=" << nullable_leaves << "\n"; + ENSURE(nullable_leaves >= 2); + + // Bisim must report the two languages are not equivalent. + seq::regex_bisim bisim(rw); + lbool eq = bisim.are_equivalent(a_star, ab_star); + std::cout << "bisim(a*, (a|b)*) = " + << (eq == l_true ? "true" : eq == l_false ? "false" : "undef") << "\n"; + ENSURE(eq == l_false); +} + +// Regression for the derive top-level-cache poisoning bug. +// Take r = (re.* "a") ++ "P" and check str.in_re "aP" r. Before the fix +// the first per-char derivative call (wrt 'a') populated m_top_cache with +// 'a' baked into the symbolic ITE-tree, so the next call (wrt 'P') returned +// that stale cached value instead of computing D_P(r) = epsilon, making +// str.in_re wrongly return false. +static void test_derive_cache_per_ele() { + ast_manager m; + reg_decl_plugins(m); + seq_util u(m); + seq_rewriter rw(m); + + sort_ref str_sort(u.str.mk_string_sort(), m); + + zstring sa("a"), sP("P"), s_aP("aP"); + expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m); + expr_ref re_P(u.re.mk_to_re(u.str.mk_string(sP)), m); + expr_ref a_star(u.re.mk_star(re_a), m); + expr_ref r(u.re.mk_concat(a_star, re_P), m); + expr_ref aP(u.str.mk_string(s_aP), m); + + // Compute D_'a'(a*P) and D_'P'(a*P) directly via mk_derivative. + // Before the fix, m_top_cache was populated while m_ele = ele (the + // concrete char), so the second call hit the stale cached answer from + // the first. After the fix the cache is keyed by a symbolic var, so + // each concrete-ele substitution produces the right answer. + expr_ref ch_a(u.mk_char('a'), m); + expr_ref ch_P(u.mk_char('P'), m); + expr_ref d_a = rw.mk_derivative(ch_a, r); + expr_ref d_P = rw.mk_derivative(ch_P, r); + std::cout << "D_a(a*P) = " << mk_pp(d_a, m) << "\n"; + std::cout << "D_P(a*P) = " << mk_pp(d_P, m) << "\n"; + + // D_P(a*P) must be nullable (it accepts the empty suffix), while + // D_a(a*P) must not be (it still needs a trailing 'P'). + expr_ref n_a = rw.is_nullable(d_a); + expr_ref n_P = rw.is_nullable(d_P); + th_rewriter trw(m); + trw(n_a); + trw(n_P); + std::cout << "nullable(D_a) = " << mk_pp(n_a, m) << "\n"; + std::cout << "nullable(D_P) = " << mk_pp(n_P, m) << "\n"; + ENSURE(m.is_false(n_a)); + ENSURE(m.is_true(n_P)); +} + +void tst_seq_regex_bisim() { + test_a_star_neq_ab_star(); + test_derive_cache_per_ele(); +}