From dbb3f70873ad7fba8df919685466ae7d8c1e442a Mon Sep 17 00:00:00 2001 From: CEisenhofer Date: Wed, 10 Jun 2026 15:00:42 +0200 Subject: [PATCH] Moved the regex splitting into rewriter Added some simplifications --- src/ast/rewriter/CMakeLists.txt | 1 + src/ast/rewriter/seq_rewriter.h | 10 +- src/ast/rewriter/seq_split.cpp | 366 ++++++++++++++++++++++++++++++++ src/ast/rewriter/seq_split.h | 93 ++++++++ src/smt/seq/seq_nielsen.cpp | 302 +------------------------- src/smt/seq/seq_nielsen.h | 24 --- src/smt/theory_nseq.cpp | 12 +- 7 files changed, 484 insertions(+), 324 deletions(-) create mode 100644 src/ast/rewriter/seq_split.cpp create mode 100644 src/ast/rewriter/seq_split.h diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index ba7eda277..ccbc23be7 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -40,6 +40,7 @@ z3_add_component(rewriter seq_axioms.cpp seq_eq_solver.cpp seq_subset.cpp + seq_split.cpp seq_rewriter.cpp seq_skolem.cpp th_rewriter.cpp diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 843f32bbf..d57a50fb1 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -24,6 +24,7 @@ Notes: #include "ast/rewriter/rewriter_types.h" #include "ast/rewriter/bool_rewriter.h" #include "ast/rewriter/seq_subset.h" +#include "ast/rewriter/seq_split.h" #include "util/params.h" #include "util/lbool.h" #include "util/sign.h" @@ -130,6 +131,7 @@ class seq_rewriter { seq_util m_util; seq_subset m_subset; + seq_split m_split; arith_util m_autil; bool_rewriter m_br; // re2automaton m_re2aut; @@ -342,7 +344,7 @@ class seq_rewriter { public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): - m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m), + m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m), m_op_cache(m), m_es(m), m_lhs(m), m_rhs(m), m_coalesce_chars(true) { } @@ -381,6 +383,12 @@ public: return result; } + // Split decomposition (sigma) of a regex; see seq_split.h. + bool split(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong) { + return m_split.compute(r, out, threshold, mode); + } + void simplify_split(split_set& s) { m_split.simplify(s); } + expr_ref mk_symmetric_diff(expr *r1, expr *r2); /** diff --git a/src/ast/rewriter/seq_split.cpp b/src/ast/rewriter/seq_split.cpp new file mode 100644 index 000000000..a74030301 --- /dev/null +++ b/src/ast/rewriter/seq_split.cpp @@ -0,0 +1,366 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.cpp + +Abstract: + + Regex split decomposition (the split function sigma). See seq_split.h. + +Author: + + Nikolaj Bjorner (nbjorner) 2026-6-10 + Clemens Eisenhofer 2026-6-10 + +--*/ + +#include "ast/rewriter/seq_split.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/ast_pp.h" +#include "util/obj_hashtable.h" + +seq_split::seq_split(seq_rewriter& rw) : + m_rw(rw), m_subset(rw.u().re) {} + +ast_manager& seq_split::m() const { return m_rw.m(); } +seq_util& seq_split::seq() const { return m_rw.u(); } +seq_util::rex& seq_split::re() const { return m_rw.u().re; } + +// Cross-product intersection of two split-sets (split algebra): +// S1 cap S2 = { | in S1, in S2 }. +// Pairs where any component is bottom (the empty regex) are dropped. +bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold) { + seq_util::rex& r = re(); + for (auto const& p1 : s1) { + for (auto const& p2 : s2) { + if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) || + r.is_empty(p1.m_n) || r.is_empty(p2.m_n)) + continue; + result.push_back(split_pair(r.mk_inter(p1.m_d, p2.m_d), + r.mk_inter(p1.m_n, p2.m_n), m())); + if (result.size() > threshold) + return false; + } + } + return true; +} + +// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with +// ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. +// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold +// overrun must abort entirely: a partial fold is a strictly weaker (unsound) +// split-set, since each ~sp[i] further constrains ~S. +bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold) { + seq_util::rex& r = re(); + sort* re_sort = r.mk_re(seq_sort); + const expr_ref full(r.mk_full_seq(re_sort), m()); // .* + if (sp.empty()) { // ~{} = <.*, .*> + result.push_back(split_pair(full, full, m())); + return true; + } + split_set acc; + acc.push_back(split_pair(r.mk_complement(sp[0].m_d), full, m())); + acc.push_back(split_pair(full, r.mk_complement(sp[0].m_n), m())); + for (unsigned i = 1; i < sp.size(); ++i) { + split_set next; + next.push_back(split_pair(r.mk_complement(sp[i].m_d), full, m())); + next.push_back(split_pair(full, r.mk_complement(sp[i].m_n), m())); + split_set tmp; + if (!intersect(acc, next, tmp, threshold)) + return false; + acc = std::move(tmp); + if (acc.empty()) // intersection empty => ~S is empty + break; + if (acc.size() > threshold) + return false; + } + result.append(acc); + return true; +} + +bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode) { + SASSERT(r); + seq_util& sq = seq(); + seq_util::rex& rex = re(); + ast_manager& mm = m(); + + sort* seq_sort = nullptr; + if (!sq.is_re(r, seq_sort)) + return false; + + // bottom: sigma(empty) = {} (the empty split-set) + if (rex.is_empty(r)) + return true; + + // epsilon: sigma(eps) = { } + if (rex.is_epsilon(r)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + result.push_back(split_pair(eps, eps, mm)); + return true; + } + + expr* a = nullptr, *b = nullptr; + + // to_re(s): split the literal word s at every position. + expr* s = nullptr; + if (rex.is_to_re(r, s)) { + zstring str; + if (sq.str.is_string(s, str)) { + for (unsigned i = 0; i <= str.length(); ++i) { + const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm); + const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm); + result.push_back(split_pair(p, q, mm)); + } + return true; + } + // a single symbolic unit behaves like one token: { , } + if (sq.str.is_unit(s)) { + const expr_ref ex(r, mm); + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + result.push_back(split_pair(eps, ex, mm)); + result.push_back(split_pair(ex, eps, mm)); + return true; + } + // to_re over a non-literal sequence: not handled. + return false; + } + + // single-character class alpha (., [lo-hi], of_pred): + // sigma(alpha) = { , } + if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) { + const expr_ref ex(r, mm); + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + result.push_back(split_pair(eps, ex, mm)); + result.push_back(split_pair(ex, eps, mm)); + return true; + } + + // .* : sigma(.*) = { <.*, .*> } + if (rex.is_full_seq(r)) { + const expr_ref ex(r, mm); + result.push_back(split_pair(ex, ex, mm)); + return true; + } + + // union: sigma(r0 | ... | r_{n-1}) = U sigma(ri) (re.union may be n-ary) + if (rex.is_union(r)) { + app* ap = to_app(r); + for (unsigned i = 0; i < ap->get_num_args(); ++i) + if (!compute(ap->get_arg(i), result, threshold, mode)) + return false; + return true; + } + + // concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1}) + // (re.++ may be n-ary) + if (rex.is_concat(r)) { + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + for (unsigned i = 0; i < n; ++i) { + split_set sigma_arg; + if (!compute(ap->get_arg(i), sigma_arg, threshold, mode)) + return false; + + expr_ref left(mm), right(mm); + if (i == 0) + left = rex.mk_epsilon(seq_sort); + else + for (unsigned j = 0; j < i; ++j) { + expr* arg = ap->get_arg(j); + left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm); + } + if (i == n - 1) + right = rex.mk_epsilon(seq_sort); + else + for (unsigned j = i + 1; j < n; ++j) { + expr* arg = ap->get_arg(j); + right = right ? expr_ref(rex.mk_concat(right, arg), mm) : expr_ref(arg, mm); + } + + for (auto const& [d, nn] : sigma_arg) { + const expr_ref p = m_rw.mk_re_append(left, d); + const expr_ref q = m_rw.mk_re_append(nn, right); + result.push_back(split_pair(p, q, mm)); + } + } + return true; + } + + // star: sigma(a*) = { } cup a*.sigma(a).a* + if (rex.is_star(r, a)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + result.push_back(split_pair(eps, eps, mm)); + split_set sa; + if (!compute(a, sa, threshold, mode)) + return false; + for (auto const& [d, n] : sa) { + const expr_ref p = m_rw.mk_re_append(r, d); // a*.D + const expr_ref q = m_rw.mk_re_append(n, r); // N.a* + result.push_back(split_pair(p, q, mm)); + } + return true; + } + + // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without ) + if (rex.is_plus(r, a)) { + const expr_ref star(rex.mk_star(a), mm); // a* + split_set sa; + if (!compute(a, sa, threshold, mode)) + return false; + for (auto const& [d, n] : sa) { + const expr_ref p = m_rw.mk_re_append(star, d); + const expr_ref q = m_rw.mk_re_append(n, star); + result.push_back(split_pair(p, q, mm)); + } + return true; + } + + // intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri) (re.inter may be n-ary) + if (rex.is_intersection(r)) { + if (mode == split_mode::weak) + return false; + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + split_set current; + if (!compute(ap->get_arg(0), current, threshold, mode)) + return false; + // A give-up on any conjunct must propagate as a give-up: silently treating + // it as the empty split-set would collapse the whole intersection to bottom + // and be misreported as an (unsound) conflict. + for (unsigned i = 1; i < n && !current.empty(); ++i) { + split_set arg_i, tmp; + if (!compute(ap->get_arg(i), arg_i, threshold, mode)) + return false; + if (!intersect(current, arg_i, tmp, threshold)) + return false; + current = std::move(tmp); + } + result.append(current); + return true; + } + + // complement: sigma(~a) = ~sigma(a) + if (rex.is_complement(r, a)) { + if (mode == split_mode::weak) + return false; + split_set sa; + if (!compute(a, sa, threshold, mode)) + return false; + return complement(seq_sort, sa, result, threshold); + } + + // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b) + if (rex.is_diff(r, a, b)) { + if (mode == split_mode::weak) + return false; + split_set sa, sb, sb_compl, tmp; + if (!compute(a, sa, threshold, mode)) + return false; + if (!compute(b, sb, threshold, mode)) + return false; + if (!complement(seq_sort, sb, sb_compl, threshold)) + return false; + if (!intersect(sa, sb_compl, tmp, threshold)) + return false; + result.append(tmp); + return true; + } + + // bounded loop / ite / other: not handled (paper "v1: bail"). + TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";); + return false; +} + +// same-D / same-N merge (paper eqs. 1 & 2): +// { , } -> (by_left = true, group by D) +// { , } -> (by_left = false, group by N) +// Only fires on syntactically-identical (perfectly-shared) key components, so +// it is a conservative instance of the rule. +void seq_split::merge_by(split_set& pairs, bool by_left) { + seq_util::rex& r = re(); + ast_manager& mm = m(); + obj_map idx; // key component -> position in `out` + split_set out; + for (auto const& p : pairs) { + expr* key = by_left ? p.m_d.get() : p.m_n.get(); + expr* other = by_left ? p.m_n.get() : p.m_d.get(); + unsigned pos; + if (idx.find(key, pos)) { + expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get(); + const expr_ref u(r.mk_union(prev, other), mm); + if (by_left) + out[pos].m_n = u; + else + out[pos].m_d = u; + } + else { + idx.insert(key, out.size()); + out.push_back(p); + } + } + pairs.swap(out); +} + +void seq_split::simplify(split_set& pairs) { + seq_util::rex& r = re(); + + // 1. drop pairs with a bottom (empty-language) component. + unsigned w = 0; + for (unsigned i = 0; i < pairs.size(); ++i) { + if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n)) + continue; + if (w != i) + pairs[w] = pairs[i]; + ++w; + } + pairs.shrink(w); + if (pairs.size() <= 1) + return; + + // 2. same-D / same-N merge rules. + merge_by(pairs, true); + merge_by(pairs, false); + if (pairs.size() <= 1) + return; + + // 3. subsumption: drop when L(D_i) subseteq L(D_j) and + // L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative + // (returns true only for definite containment), so we never drop a + // needed split. Size-capped to bound the O(n^2) subset checks. + if (pairs.size() > 64) + return; + + struct row { expr* d; expr* n; unsigned idx; }; + vector rows; + for (unsigned i = 0; i < pairs.size(); ++i) + rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i }); + + auto subsumes = [&](row const& a, row const& b) { + return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n); + }; + + vector kept; + for (row const& row_r : rows) { + bool redundant = false; + for (row const& k : kept) + if (subsumes(k, row_r)) { redundant = true; break; } + if (redundant) + continue; + // drop already-kept rows strictly subsumed by row_r + unsigned kw = 0; + for (unsigned t = 0; t < kept.size(); ++t) { + if (subsumes(row_r, kept[t])) + continue; + kept[kw++] = kept[t]; + } + kept.shrink(kw); + kept.push_back(row_r); + } + + split_set result; + for (row const& k : kept) + result.push_back(pairs[k.idx]); + pairs.swap(result); +} diff --git a/src/ast/rewriter/seq_split.h b/src/ast/rewriter/seq_split.h new file mode 100644 index 000000000..308ab338a --- /dev/null +++ b/src/ast/rewriter/seq_split.h @@ -0,0 +1,93 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.h + +Abstract: + + Regex split decomposition: the split function sigma from the paper + "Solving by Splitting". For a regular expression r, sigma(r) is a finite + "split-set" of pairs { } such that + + u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i). + + This lifts the decomposition previously buried in the nseq solver + (seq::compute_sigma over euf::snode) into a self-contained, expr*-based + engine that can be reused anywhere a seq_rewriter is available (the legacy + seq solver, nseq, and the regex rewriter itself). + + The split algebra (intersection, De Morgan complement, left/right + concatenation with a regex) and the cardinality-reducing simplification + heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset) + follow the paper. + +Author: + + Nikolaj Bjorner (nbjorner) 2026-6-10 + Clemens Eisenhofer 2026-6-10 + +--*/ +#pragma once + +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_subset.h" + +class seq_rewriter; + +// An individual split : the left (prefix) regex D and right (suffix) +// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N). +struct split_pair { + expr_ref m_d; + expr_ref m_n; + split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) { + SASSERT(d && n); + } +}; + +// A split-set is a union of individual splits. +typedef vector split_set; + +// Controls how aggressively sigma expands the Boolean-closure cases: +// strong - fully expand complement / intersection via the split algebra +// (De Morgan / cross product). This is the behaviour the nseq +// solver relies on. +// weak - do not perform the (potentially 2^k) Boolean-closure expansion; +// give up (return false) on complement / intersection instead. +enum class split_mode { weak, strong }; + +class seq_split { + seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access + seq_subset m_subset; // language-subset checks for subsumption + + ast_manager& m() const; + seq_util& seq() const; + seq_util::rex& re() const; + + // S1 cap S2 = { } dropping any pair with a bottom + // component. Returns false on threshold overrun. + bool intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold); + + // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with + // ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. + bool complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold); + + // same-D / same-N merge: groups pairs that share a (syntactically identical) + // left (resp. right) component and unions the other component. + void merge_by(split_set& pairs, bool by_left); + +public: + explicit seq_split(seq_rewriter& rw); + + // Compute sigma(r), appending to `out` (does not clear it). `threshold` + // bounds the number of produced splits; an overrun, an unsupported regex + // shape (bounded loop / ite), or a Boolean-closure case in weak mode makes + // it return false ("give up"). + bool compute(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong); + + // In-place simplification of a split-set: drop bottom components, apply the + // same-D / same-N merge rules, and drop splits subsumed by another (using + // seq_subset). Size-capped to keep the O(n^2) subsumption affordable. + void simplify(split_set& s); +}; diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index b818243c8..e0ee27240 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -3576,291 +3576,6 @@ namespace seq { // Modifier: apply_regex_factorization (Boolean Closure) // ----------------------------------------------------------------------- - // Cross-product intersection of two split-sets (split algebra): - // S1 ⊓ S2 = { ⟨Δ1⊓Δ2, ∇1⊓∇2⟩ | ⟨Δ1,∇1⟩∈S1, ⟨Δ2,∇2⟩∈S2 } - // Pairs where either component is the empty regex are dropped (∅⊓r ≡ ∅). - static bool intersect_sigma_pairs(ast_manager& m, seq_util& seq, - sigma_pairs const& s1, sigma_pairs const& s2, sigma_pairs& result, unsigned threshold) { - for (auto const& p1 : s1) { - for (auto const& p2 : s2) { - if (seq.re.is_empty(p1.m_p) || seq.re.is_empty(p2.m_p) || - seq.re.is_empty(p1.m_q) || seq.re.is_empty(p2.m_q)) - continue; - result.push_back(sigma_pair(seq.re.mk_inter(p1.m_p, p2.m_p), - seq.re.mk_inter(p1.m_q, p2.m_q), m)); - if (result.size() > threshold) - return false; - } - } - return true; - } - - // Complement of a split-set via De Morgan: ~S = ⊓_{s∈S} ~s, - // ~⟨Δ,∇⟩ = { ⟨~Δ, .*⟩, ⟨.*, ~∇⟩ } and ~{} = { ⟨.*, .*⟩ }. - // str_sort is the sequence-element sort; mk_full_seq needs the regex sort. - // May produce up to 2^|sp| pairs (bounded downstream by the factorization threshold). - static bool complement_sigma_pairs(ast_manager& m, seq_util& seq, sort* str_sort, - sigma_pairs const& sp, sigma_pairs& result, unsigned threshold) { - - sort* re_sort = seq.re.mk_re(str_sort); - const expr_ref full(seq.re.mk_full_seq(re_sort), m); // .* - if (sp.empty()) { // ~{} = ⟨.*, .*⟩ - result.push_back(sigma_pair(full, full, m)); - return true; - } - sigma_pairs acc; - acc.push_back(sigma_pair(seq.re.mk_complement(sp[0].m_p), full, m)); - acc.push_back(sigma_pair(full, seq.re.mk_complement(sp[0].m_q), m)); - for (unsigned i = 1; i < sp.size(); ++i) { - sigma_pairs next; - next.push_back(sigma_pair(seq.re.mk_complement(sp[i].m_p), full, m)); - next.push_back(sigma_pair(full, seq.re.mk_complement(sp[i].m_q), m)); - sigma_pairs tmp; - // De Morgan fold: acc := acc ⊓ ~sp[i]. intersect_sigma_pairs returns - // false ONLY when it overruns the threshold; in that case we must give - // up entirely (a partial fold is a strictly weaker — hence unsound — - // split set, since each ~sp[i] further constrains ~S). - if (!intersect_sigma_pairs(m, seq, acc, next, tmp, threshold)) - return false; - acc = std::move(tmp); - if (acc.empty()) // intersection empty ⇒ ~S is empty - break; - if (acc.size() > threshold) - return false; - } - result.append(acc); - return true; - } - - bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold) { - SASSERT(r); - sort* str_sort = nullptr; - if (!seq.is_re(r->get_expr(), str_sort)) - return false; - // std::cout << "Computing sigma of " << snode_label_html(r, m, false) << std::endl; - - if (r->is_empty()) { - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(sigma_pair(eps, eps, m)); - return true; - } - if (r->is_to_re()) { - const euf::snode* const c = r->arg0(); - if (c->is_range()) { - const expr_ref ex(c->get_expr(), m); - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - - result.push_back(sigma_pair(eps, ex, m)); - result.push_back(sigma_pair(ex, eps, m)); - return true; - } - if (c->is_empty()) { - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(sigma_pair(eps, eps, m)); - return true; - } - if (c->is_char()) { - unsigned val; - VERIFY(seq.is_const_char(c->arg0()->get_expr(), val)); - const expr_ref ex(seq.re.mk_to_re(seq.str.mk_string(zstring(val))), m); - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - - result.push_back(sigma_pair(eps, ex, m)); - result.push_back(sigma_pair(ex, eps, m)); - return true; - } - zstring s; - if (c->is_string(s, seq)) { - for (unsigned i = 0; i <= s.length(); ++i) { - expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m); - expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m); - result.push_back(sigma_pair(p, q, m)); - } - return true; - } - std::cout << mk_pp(c->get_expr(), m) << std::endl; - UNREACHABLE(); - return false; - } - if (r->is_full_char()) { - const expr_ref ex(r->get_expr(), m); - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - - result.push_back(sigma_pair(eps, ex, m)); - result.push_back(sigma_pair(ex, eps, m)); - return true; - } - if (r->is_full_seq()) { - const expr_ref ex(r->get_expr(), m); - result.push_back(sigma_pair(ex, ex, m)); - return true; - } - if (r->is_union()) { - // σ(r₁ ⊔ r₂) = σ(r₁) ∪ σ(r₂) - SASSERT(r->num_args() >= 2); - for (unsigned i = 0; i < r->num_args(); i++) { - if (!compute_sigma(m, seq, rw, r->arg(i), result, threshold)) - return false; - } - return true; - } - if (r->is_intersect()) { - // σ(r₁ ⊓ r₂ ⊓ …) = σ(r₁) ⊓ σ(r₂) ⊓ …; empty intersection (0 args) = {⟨.*,.*⟩} - const unsigned n = r->num_args(); - SASSERT(n >= 2); - sigma_pairs current; - if (!compute_sigma(m, seq, rw, r->arg0(), current, threshold)) - return false; - for (unsigned i = 1; i < n && !current.empty(); ++i) { - sigma_pairs arg_i; - // A give-up on any conjunct must propagate as a give-up: silently - // treating arg_i as the empty split-set would collapse the whole - // intersection to ∅ and be misreported as an (unsound) conflict. - if (!compute_sigma(m, seq, rw, r->arg(i), arg_i, threshold)) - return false; - sigma_pairs tmp; - if (!intersect_sigma_pairs(m, seq, current, arg_i, tmp, threshold)) - return false; - current = std::move(tmp); - } - result.append(current); - return true; - } - if (r->is_complement()) { - // σ(~r) = ~σ(r) - sigma_pairs body_pairs; - if (!compute_sigma(m, seq, rw, r->arg0(), body_pairs, threshold)) - return false; - return complement_sigma_pairs(m, seq, str_sort, body_pairs, result, threshold); - } - if (r->is_concat()) { - const unsigned n = r->num_args(); - SASSERT(n >= 2); - for (unsigned i = 0; i < n; ++i) { - sigma_pairs sigma_arg; - if (!compute_sigma(m, seq, rw, r->arg(i), sigma_arg, threshold)) - return false; - - expr_ref left(m); - expr_ref right(m); - - if (i == 0) - left = seq.re.mk_epsilon(str_sort); - else { - for (unsigned j = 0; j < i; ++j) { - const euf::snode* arg = r->arg(j); - left = left ? seq.re.mk_concat(left, arg->get_expr()) : arg->get_expr(); - } - } - - if (i == n - 1) - right = seq.re.mk_epsilon(str_sort); - else { - for (unsigned j = i + 1; j < n; ++j) { - const euf::snode* arg = r->arg(j); - right = right ? seq.re.mk_concat(right, arg->get_expr()) : arg->get_expr(); - } - } - - for (auto const &[tp, tq] : sigma_arg) { - expr_ref p = rw.mk_re_append(left, tp); - expr_ref q = rw.mk_re_append(tq, right); - result.push_back(sigma_pair(p, q, m)); - } - } - return true; - } - if (r->is_star()) { - const euf::snode* body = r->arg0(); - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(sigma_pair(eps, eps, m)); - - sigma_pairs sigma_body; - if (!compute_sigma(m, seq, rw, body, sigma_body, threshold)) - return false; - for (auto const &[tp, tq] : sigma_body) { - auto p = rw.mk_re_append(r->get_expr(), tp); - auto q = rw.mk_re_append(tq, r->get_expr()); - result.push_back(sigma_pair(p, q, m)); - } - return true; - } - if (r->is_plus()) { - // r⁺ = r·r* ; by Kleene factorization σ(r⁺) = r*·σ(r)·r*. - // Same shape as the star rule but with the surrounding factor being - // body* without the {⟨ε,ε⟩} pair - const euf::snode* body = r->arg0(); - const expr_ref star(seq.re.mk_star(body->get_expr()), m); // body* - - sigma_pairs sigma_body; - if (!compute_sigma(m, seq, rw, body, sigma_body, threshold)) - return false; - for (auto const &[tp, tq] : sigma_body) { - auto p = rw.mk_re_append(star, tp); // body* · tp - auto q = rw.mk_re_append(tq, star); // tq · body* - result.push_back(sigma_pair(p, q, m)); - } - return true; - } - // the simplifier should have eliminated most of them already - // TODO: so far, we are, however, still missing bounded repetitions and ite - std::cout << "Unknown element " << mk_pp(r->get_expr(), m) << std::endl; - return false; - } - - void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg) { - return; // For now - if (pairs.size() <= 1) - return; - // Guard against pathological cost: subsumption is O(n^2) language-subset - // BFS checks. Large split-sets are left to the factorization threshold. - if (pairs.size() > 64) - return; - - struct row { euf::snode* p; euf::snode* q; unsigned idx; }; - - // Materialise snodes once; drop pairs with a structurally-empty component. - vector rows; - for (unsigned i = 0; i < pairs.size(); ++i) { - euf::snode* p = sg.mk(pairs[i].m_p); - euf::snode* q = sg.mk(pairs[i].m_q); - if (sr.is_empty_regex(p) || sr.is_empty_regex(q)) - continue; - rows.push_back({ p, q, i }); - } - - // a subsumes b iff L(b.p) ⊆ L(a.p) and L(b.q) ⊆ L(a.q). - // is_language_subset may return l_undef (inconclusive); only treat a - // definite l_true as subsumption, so we never drop a needed split. - auto subsumes = [&](row const& a, row const& b) { - return sr.is_language_subset(b.p, a.p) == l_true && - sr.is_language_subset(b.q, a.q) == l_true; - }; - - vector kept; - for (row const& r : rows) { - bool redundant = false; - for (row const& k : kept) - if (subsumes(k, r)) { redundant = true; break; } - if (redundant) - continue; - // drop already-kept rows strictly subsumed by r - unsigned w = 0; - for (unsigned t = 0; t < kept.size(); ++t) { - if (subsumes(r, kept[t])) - continue; - kept[w++] = kept[t]; - } - kept.shrink(w); - kept.push_back(r); - } - - sigma_pairs result; - for (row const& k : kept) - result.push_back(pairs[k.idx]); - pairs.swap(result); - } - bool nielsen_graph::apply_regex_factorization(nielsen_node* node) { if (m_regex_factorization_threshold == 0) return false; @@ -3874,14 +3589,14 @@ namespace seq { for (str_mem const& mem : node->str_mems()) { SASSERT(mem.well_formed()); - // compute_sigma handles all regex forms (incl. complement / intersection), + // split() handles all regex forms (incl. complement / intersection), // so the classical restriction is no longer needed. if (mem.m_str->is_empty() || mem.is_primitive()) continue; - // compute_sigma / compute_tau do not understand the projection - // operator (re.proj) — they would recurse into it and hit an - // UNREACHABLE. Projection-constrained memberships are handled by the + // The split engine works on plain regex AST and does not understand the + // projection operator (re.proj) — it would give up on it anyway. + // Projection-constrained memberships are handled by the // cycle-decomposition path, so skip them here. if (mem.m_regex->has_projection()) continue; @@ -3892,13 +3607,14 @@ namespace seq { euf::snode* tail = m_sg.drop_first(mem.m_str); SASSERT(tail); - sigma_pairs pairs; + // Decompose the regex into a split-set via the shared seq_split engine + // (sigma from the paper): first ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩. + split_set pairs; seq_rewriter rw(m); - if (!compute_sigma(m, m_seq, rw, mem.m_regex, pairs, m_regex_factorization_threshold)) + if (!rw.split(mem.m_regex->get_expr(), pairs, m_regex_factorization_threshold)) continue; - if (m_seq_regex) - simplify_sigma_pairs(pairs, *m_seq_regex, m_sg); + rw.simplify_split(pairs); vector feasible; dep_tracker eliminated_dep = mem.m_dep; diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 76d568b85..a30a80442 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -262,30 +262,6 @@ namespace seq { std::string snode_label_html(euf::snode const* n, ast_manager& m, bool html_escape); - // Split-pair produced by compute_sigma: uv |= r iff exists i: u |= m_p[i] and v |= m_q[i] - struct sigma_pair { - expr_ref m_p; - expr_ref m_q; - sigma_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) { - SASSERT(p && q); - } - }; - typedef vector sigma_pairs; - - // Compute the split-set sigma(r) per the splitting rules of the paper - // "Extended Regular Expression Membership". Generalises the classical compute_tau - // with intersection and complement cases via the split-set algebra. - // `result` is appended to (not cleared). - bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold); - - // Simplify a split-set in place using the split algebra's language-level rules - // (paper section "split-set simplification heuristics"): drop pairs with an - // empty-language component, and drop any split subsumed by another - // ( is subsumed by iff L(D_i) subseteq L(D_j) and L(N_i) subseteq L(N_j)). - // Subsumption tames the 2^k blow-up of sigma(~r) (e.g. sigma(~a*): 8 -> 2). - // Requires the regex emptiness/subset checker and an sgraph to build snodes. - void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg); - // simplification result for constraint processing // mirrors ZIPT's SimplifyResult enum enum class simplify_result { diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index 93831655e..256a3469b 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -555,7 +555,7 @@ namespace smt { // Eager sigma factorization (token-level): when enabled, split a non-primitive // membership s ∈ r at the boundary between the first concat argument (head) and - // the rest (tail), using compute_sigma. This mirrors the lazy Nielsen + // the rest (tail), using the shared seq_split engine. This mirrors the lazy Nielsen // apply_regex_factorization and the paper's Reduce rule for x·u'. // (s ∈ r) → ⋁_{⟨Δ,∇⟩∈σ(r)} ( head ∈ Δ ∧ tail ∈ ∇ ) // Only fires for a concatenation s (single-variable s is already primitive). @@ -570,12 +570,12 @@ namespace smt { const expr_ref tail(m_seq.str.mk_concat(na - 1, a->get_args() + 1, s->get_sort()), m); const unsigned threshold = get_fparams().m_nseq_regex_factorization_threshold; - seq::sigma_pairs pairs; - if (!seq::compute_sigma(m, m_seq, m_rewriter, mem.m_regex, pairs, threshold)) + split_set pairs; + if (!m_rewriter.split(mem.m_regex->get_expr(), pairs, threshold)) // we give up return; - seq::simplify_sigma_pairs(pairs, m_regex, m_sgraph); + m_rewriter.simplify_split(pairs); if (pairs.empty()) { // no viable splits @@ -598,8 +598,8 @@ namespace smt { lits.push_back(~mem.lit); //std::cout << "Decomposing into:\n"; for (auto const& sp : pairs) { - expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_p), m); - expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_q), m); + expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_d), m); + expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_n), m); expr_ref conj(m.mk_and(mem_head, mem_tail), m); lits.push_back(mk_literal(conj)); //seq::dep_tracker dep = nullptr;