From b3143e759bd9e83a695e719dfc40e608c2a376cf Mon Sep 17 00:00:00 2001 From: Clemens Eisenhofer <56730610+CEisenhofer@users.noreply.github.com> Date: Tue, 30 Jun 2026 19:18:28 +0200 Subject: [PATCH] Porting seq_split to master (#9840) Co-authored-by: Nikolaj Bjorner --- src/ast/rewriter/CMakeLists.txt | 1 + src/ast/rewriter/seq_rewriter.h | 18 +- src/ast/rewriter/seq_split.cpp | 820 +++++++++++++++++++++++++++++++ src/ast/rewriter/seq_split.h | 225 +++++++++ src/params/smt_params_helper.pyg | 2 + src/params/theory_seq_params.cpp | 2 + src/params/theory_seq_params.h | 2 + src/smt/seq_regex.cpp | 24 + src/test/CMakeLists.txt | 1 + src/test/main.cpp | 1 + src/test/seq_split.cpp | 450 +++++++++++++++++ 11 files changed, 1545 insertions(+), 1 deletion(-) create mode 100644 src/ast/rewriter/seq_split.cpp create mode 100644 src/ast/rewriter/seq_split.h create mode 100644 src/test/seq_split.cpp diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index c118501926..df06cedfe3 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -41,6 +41,7 @@ z3_add_component(rewriter seq_eq_solver.cpp seq_derive.cpp seq_subset.cpp + seq_split.cpp seq_derive.cpp seq_range_collapse.cpp seq_range_predicate.cpp diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 695e9ad876..7cd5bf7153 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -18,6 +18,7 @@ Notes: --*/ #pragma once +#include "seq_split.h" #include "ast/seq_decl_plugin.h" #include "ast/rewriter/seq_derive.h" #include "ast/ast_pp.h" @@ -133,6 +134,7 @@ class seq_rewriter { seq_util m_util; seq_subset m_subset; + seq_split m_split; arith_util m_autil; bool_rewriter m_br; seq::derive m_derive; @@ -332,7 +334,7 @@ class seq_rewriter { public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): - m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this), + m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), m_derive(m, *this), // m_re2aut(m), m_op_cache(m), m_es(m), m_lhs(m), m_rhs(m) { } @@ -411,6 +413,20 @@ public: return result; } + // Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional) + // prunes non-viable splits during generation. + bool split(expr* r, split_set& out, unsigned threshold, + const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) { + return m_split.compute(r, out, threshold, mode, oracle); + } + + void simplify_split(split_set& s) { m_split.simplify(s); } + + // decompose a membership constraint into a set of pairs of regex splits + std::pair split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const { + return m_split.split_membership(str, regex, threshold, result); + } + /** * check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences */ diff --git a/src/ast/rewriter/seq_split.cpp b/src/ast/rewriter/seq_split.cpp new file mode 100644 index 0000000000..8977065fc0 --- /dev/null +++ b/src/ast/rewriter/seq_split.cpp @@ -0,0 +1,820 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.cpp + +Abstract: + + Regex split decomposition (the split function sigma). See seq_split.h. + +Author: + + Clemens Eisenhofer 2026-6-10 + +--*/ + +#include "ast/rewriter/seq_split.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/ast_pp.h" +#include "util/obj_hashtable.h" +#include "util/stack.h" + +seq_split::seq_split(seq_rewriter& rw) : + m(rw.m()), m_rw(rw), m_subset(rw.u().re), + m_set_sort(m), + m_d_empty(m), m_d_single(m), m_d_fromre(m), m_d_union(m), + m_d_inter(m), m_d_compl(m), m_d_lcat(m), m_d_rcat(m), + m_empty_app(m) {} + +// --------------------------------------------------------------------------- +// Suspended split-set representation (split algebra over `expr`). +// --------------------------------------------------------------------------- + +void seq_split::ensure_decls(sort* seq_sort) { + SASSERT(seq_sort); + if (m_seq_sort == seq_sort) + return; + sort* re_sort = re().mk_re(seq_sort); + m_set_sort = m.mk_uninterpreted_sort(symbol("seq.split.set")); + sort* ss = m_set_sort; + m_d_empty = m.mk_func_decl(symbol("seq.split.empty"), 0u, nullptr, ss); + m_d_single = m.mk_func_decl(symbol("seq.split.single"), re_sort, re_sort, ss); + m_d_fromre = m.mk_func_decl(symbol("seq.split.from_re"), re_sort, ss); + m_d_union = m.mk_func_decl(symbol("seq.split.union"), ss, ss, ss); + m_d_inter = m.mk_func_decl(symbol("seq.split.inter"), ss, ss, ss); + m_d_compl = m.mk_func_decl(symbol("seq.split.compl"), ss, ss); + m_d_lcat = m.mk_func_decl(symbol("seq.split.lcat"), re_sort, ss, ss); + m_d_rcat = m.mk_func_decl(symbol("seq.split.rcat"), ss, re_sort, ss); + m_empty_app = m.mk_const(m_d_empty); + m_seq_sort = seq_sort; +} + +// --- smart constructors ---------------------------------------------------- + +expr_ref seq_split::mk_empty() { + SASSERT(m_empty_app); + return m_empty_app; +} + +expr_ref seq_split::mk_single(expr* d, expr* n) { + SASSERT(d && n); + if (re().is_empty(d) || re().is_empty(n)) + return mk_empty(); + return expr_ref(m.mk_app(m_d_single, d, n), m); +} + +expr_ref seq_split::mk_fromre(expr* r) { + SASSERT(r); + sort* seq_sort = nullptr; + VERIFY(seq().is_re(r, seq_sort)); + ensure_decls(seq_sort); + if (re().is_empty(r)) + return mk_empty(); + return expr_ref(m.mk_app(m_d_fromre, r), m); +} + +expr_ref seq_split::mk_union(expr* a, expr* b) { + SASSERT(a && b); + if (is_empty_ss(a)) + return expr_ref(b, m); + if (is_empty_ss(b)) + return expr_ref(a, m); + return expr_ref(m.mk_app(m_d_union, a, b), m); +} + +expr_ref seq_split::mk_inter(expr* a, expr* b) { + SASSERT(a && b); + if (is_empty_ss(a) || is_empty_ss(b)) + return mk_empty(); + return expr_ref(m.mk_app(m_d_inter, a, b), m); +} + +expr_ref seq_split::mk_compl(expr* a) { + SASSERT(a); + return expr_ref(m.mk_app(m_d_compl, a), m); +} + +expr_ref seq_split::mk_lcat(expr* r, expr* s) { + SASSERT(r && s); + if (is_empty_ss(s)) + return mk_empty(); + if (re().is_epsilon(r)) // eps . S = S + return expr_ref(s, m); + return expr_ref(m.mk_app(m_d_lcat, r, s), m); +} + +expr_ref seq_split::mk_rcat(expr* s, expr* r) { + SASSERT(r && s); + if (is_empty_ss(s)) + return mk_empty(); + if (re().is_epsilon(r)) // S . eps = S + return expr_ref(s, m); + return expr_ref(m.mk_app(m_d_rcat, s, r), m); +} + +// --- recognizers ----------------------------------------------------------- + +bool seq_split::is_empty_ss(expr* e) const { + return is_app(e) && to_app(e)->get_decl() == m_d_empty; +} +bool seq_split::is_single(expr* e, expr*& d, expr*& n) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_single) + return false; + d = to_app(e)->get_arg(0); + n = to_app(e)->get_arg(1); + return true; +} +bool seq_split::is_fromre(expr* e, expr*& r) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_fromre) + return false; + r = to_app(e)->get_arg(0); + return true; +} +bool seq_split::is_union(expr* e, expr*& a, expr*& b) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_union) + return false; + a = to_app(e)->get_arg(0); + b = to_app(e)->get_arg(1); + return true; +} +bool seq_split::is_inter(expr* e, expr*& a, expr*& b) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_inter) + return false; + a = to_app(e)->get_arg(0); + b = to_app(e)->get_arg(1); + return true; +} +bool seq_split::is_compl(expr* e, expr*& a) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_compl) + return false; + a = to_app(e)->get_arg(0); + return true; +} +bool seq_split::is_lcat(expr* e, expr*& r, expr*& s) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_lcat) + return false; + r = to_app(e)->get_arg(0); + s = to_app(e)->get_arg(1); + return true; +} +bool seq_split::is_rcat(expr* e, expr*& s, expr*& r) const { + if (!is_app(e) || to_app(e)->get_decl() != m_d_rcat) + return false; + s = to_app(e)->get_arg(0); + r = to_app(e)->get_arg(1); + return true; +} +bool seq_split::is_frontier(expr* e) const { + expr *a = nullptr, *b = nullptr; + return is_empty_ss(e) || is_single(e, a, b) || is_union(e, a, b); +} + +seq_util& seq_split::seq() const { return m_rw.u(); } +seq_util::rex& seq_split::re() const { return m_rw.u().re; } + +// Add unless the (optional) lookahead oracle prunes it. +void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const { + if (!oracle || oracle(d, n)) + out.push_back(split_pair(d, n, m)); +} + +// Cross-product intersection of two split-sets (split algebra): +// S1 cap S2 = { | in S1, in S2 }. +// Pairs where any component is bottom (the empty regex) are dropped. +bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle) const { + const seq_util::rex& r = re(); + for (auto const& p1 : s1) { + for (auto const& p2 : s2) { + if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) || + r.is_empty(p1.m_n) || r.is_empty(p2.m_n)) + continue; + const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m); + const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m); + push(result, oracle, di, ni); + if (result.size() > threshold) + return false; + } + } + return true; +} + +// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with +// ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. +// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold +// overrun must abort entirely: a partial fold is a strictly weaker (unsound) +// split-set, since each ~sp[i] further constrains ~S. +bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, + const unsigned threshold, split_oracle const& oracle) const { + + seq_util::rex& r = re(); + sort* re_sort = r.mk_re(seq_sort); + const expr_ref full(r.mk_full_seq(re_sort), m); // .* + if (sp.empty()) { // ~{} = <.*, .*> + push(result, oracle, full, full); + return true; + } + // The acc/next pairs carry genuine output-orientation N components (the De + // Morgan ~ = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and + // keeps the 2^|sp| fold from blowing up. + split_set acc; + push(acc, oracle, r.mk_complement(sp[0].m_d), full); + push(acc, oracle, full, r.mk_complement(sp[0].m_n)); + for (unsigned i = 1; i < sp.size(); ++i) { + split_set next; + push(next, oracle, r.mk_complement(sp[i].m_d), full); + push(next, oracle, full, r.mk_complement(sp[i].m_n)); + split_set tmp; + if (!intersect(acc, next, tmp, threshold, oracle)) + return false; + acc = std::move(tmp); + if (acc.empty()) // intersection empty => ~S is empty + break; + if (acc.size() > threshold) + return false; + } + result.append(acc); + return true; +} + +// One level of the sigma rules. Mirrors the historic eager `compute`, except it +// emits *suspended* split-algebra terms (from_re / lcat / rcat / inter / compl) for +// the subterms instead of recursing. `mode` is irrelevant here: weak vs. strong is +// decided when `head_normalize` reaches an inter / compl node. +expr_ref seq_split::expand_fromre(expr* r, bool& ok) { + ok = true; + seq_util& sq = seq(); + seq_util::rex& rex = re(); + + sort* seq_sort = nullptr; + if (!sq.is_re(r, seq_sort)) { + ok = false; + return expr_ref(m); + } + ensure_decls(seq_sort); + + // bottom: sigma(empty) = {} + if (rex.is_empty(r)) + return mk_empty(); + + // epsilon: sigma(eps) = { } + if (rex.is_epsilon(r)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), m); + return mk_single(eps, eps); + } + + expr* a = nullptr, *b = nullptr; + + // to_re(s): split the literal word s at every position. + expr* s = nullptr; + if (rex.is_to_re(r, s)) { + zstring str; + vector stack; + stack.push_back(s); + + while (!stack.empty()) { + expr* cur = stack.back(); + stack.pop_back(); + if (seq().str.is_concat(cur, a, b)) { + stack.push_back(b); + stack.push_back(a); + } + else { + expr* ch; + unsigned cv; + if (seq().str.is_unit(cur, ch) && seq().is_const_char(ch, cv)) { + str += zstring(cv); + continue; + } + zstring str2; + if (sq.str.is_string(s, str2)) { + str = str2; + continue; + } + // not a constant string; unsupported for now + ok = false; + return expr_ref(m); + } + } + expr_ref acc = mk_empty(); + for (unsigned i = 0; i <= str.length(); ++i) { + const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), m); + const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), m); + acc = mk_union(acc, mk_single(p, q)); + } + return acc; + } + + // single-character class alpha (., [lo-hi], of_pred): + // sigma(alpha) = { , } + if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) { + const expr_ref ex(r, m); + const expr_ref eps(rex.mk_epsilon(seq_sort), m); + return mk_union(mk_single(eps, ex), mk_single(ex, eps)); + } + + // .* : sigma(.*) = { <.*, .*> } + if (rex.is_full_seq(r)) { + const expr_ref ex(r, m); + return mk_single(ex, ex); + } + + // union: sigma(r0 | ... | r_{n-1}) = U from_re(ri) (re.union may be n-ary) + if (rex.is_union(r)) { + app* ap = to_app(r); + expr_ref acc = mk_empty(); + for (expr* arg : *ap) { + acc = mk_union(acc, mk_fromre(arg)); + } + return acc; + } + + // concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1}) + // emitted as U_i lcat(left, rcat(from_re(ri), right)) (re.++ may be n-ary) + if (rex.is_concat(r)) { + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + expr_ref acc = mk_empty(); + for (unsigned i = 0; i < n; ++i) { + expr_ref left(m), right(m); + if (i == 0) + left = rex.mk_epsilon(seq_sort); + else { + for (unsigned j = 0; j < i; ++j) { + expr* arg = ap->get_arg(j); + left = left ? expr_ref(rex.mk_concat(left, arg), m) : expr_ref(arg, m); + } + } + if (i == n - 1) + right = rex.mk_epsilon(seq_sort); + else { + right = ap->get_arg(i + 1); + for (unsigned j = i + 2; j < n; ++j) { + expr* arg = ap->get_arg(j); + right = rex.mk_concat(right, arg); + } + } + expr_ref term = mk_lcat(left, mk_rcat(mk_fromre(ap->get_arg(i)), right)); + acc = mk_union(acc, term); + } + return acc; + } + + // star: sigma(a*) = { } cup a*.sigma(a).a* + if (rex.is_star(r, a)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), m); + expr_ref body = mk_lcat(r, mk_rcat(mk_fromre(a), r)); // a*.from_re(a).a* + return mk_union(mk_single(eps, eps), body); + } + + // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without ) + if (rex.is_plus(r, a)) { + const expr_ref star(rex.mk_star(a), m); // a* + return mk_lcat(star, mk_rcat(mk_fromre(a), star)); + } + + // intersection: sigma(r0 & ... & r_{n-1}) = cap from_re(ri) (re.inter may be n-ary) + if (rex.is_intersection(r)) { + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + expr_ref acc = mk_fromre(ap->get_arg(0)); + for (unsigned i = 1; i < n; ++i) + acc = mk_inter(acc, mk_fromre(ap->get_arg(i))); + return acc; + } + + // complement: sigma(~a) = ~sigma(a). + if (rex.is_complement(r, a)) + return mk_compl(mk_fromre(a)); + + // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b). + if (rex.is_diff(r, a, b)) + return mk_inter(mk_fromre(a), mk_compl(mk_fromre(b))); + + // bounded loop / ite / other: not handled (paper "v1: bail"). + TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, m) << "\n";); + ok = false; + return expr_ref(m); +} + +// r . hs : push the left regex onto the D component of a head-normal split-set. +expr_ref seq_split::distribute_lcat(expr* r, expr* hs) { + expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr; + if (is_empty_ss(hs)) + return mk_empty(); + if (is_single(hs, d, n)) + return mk_single(m_rw.mk_re_append(r, d), n); // r.D + if (is_union(hs, a, b)) + return mk_union(mk_lcat(r, a), mk_lcat(r, b)); + UNREACHABLE(); + return expr_ref(hs, m); +} + +// hs . r : push the right regex onto the N component of a head-normal split-set. +expr_ref seq_split::distribute_rcat(expr* hs, expr* r) { + expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr; + if (is_empty_ss(hs)) + return mk_empty(); + if (is_single(hs, d, n)) + return mk_single(d, m_rw.mk_re_append(n, r)); // N.r + if (is_union(hs, a, b)) + return mk_union(mk_rcat(a, r), mk_rcat(b, r)); + UNREACHABLE(); + return expr_ref(hs, m); +} + +expr_ref seq_split::from_split_set(split_set const& s) { + expr_ref acc = mk_empty(); + for (auto const& p : s) + acc = mk_union(acc, mk_single(p.m_d, p.m_n)); + return acc; +} + +expr_ref seq_split::head_normalize(expr* t, split_mode mode, unsigned threshold, + split_oracle const& oracle, bool& ok) { + ok = true; + expr *a = nullptr, *b = nullptr, *r = nullptr, *s = nullptr; + + // already a frontier node + if (is_frontier(t)) + return expr_ref(t, m); + + // from_re(r): one level of sigma; recurse to settle a non-frontier head + // (plus / inter / compl / diff expand to lcat / inter / compl nodes). + if (is_fromre(t, r)) { + expr_ref e = expand_fromre(r, ok); + if (!ok) + return expr_ref(m); + if (is_frontier(e)) + return e; + return head_normalize(e, mode, threshold, oracle, ok); + } + + // r.S : head-normalize S, then distribute r over the frontier. + if (is_lcat(t, r, s)) { + expr_ref hs = head_normalize(s, mode, threshold, oracle, ok); + if (!ok) + return expr_ref(m); + return distribute_lcat(r, hs); + } + if (is_rcat(t, s, r)) { + expr_ref hs = head_normalize(s, mode, threshold, oracle, ok); + if (!ok) + return expr_ref(m); + return distribute_rcat(hs, r); + } + + // inter / compl are eager by nature: a single split of S1 cap S2 (or ~S) + // cannot be produced without materializing the operand split-sets. + if (is_inter(t, a, b)) { + if (mode == split_mode::weak) { + ok = false; + return expr_ref(m); + } + split_set sa, sb, tmp; + if (!materialize(a, mode, threshold, oracle, sa) || + !materialize(b, mode, threshold, oracle, sb) || + !intersect(sa, sb, tmp, threshold, oracle)) { + ok = false; + return expr_ref(m); + } + return from_split_set(tmp); + } + if (is_compl(t, a)) { + if (mode == split_mode::weak) { + ok = false; + return expr_ref(m); + } + // The body is materialized WITHOUT the oracle (its pairs are inverted, so + // their N is unrelated to the output N); the oracle is re-applied in + // complement(). + split_set sa, res; + if (!materialize(a, mode, threshold, split_oracle{}, sa) || + !complement(m_seq_sort, sa, res, threshold, oracle)) { + ok = false; + return expr_ref(m); + } + return from_split_set(res); + } + + UNREACHABLE(); + ok = false; + return expr_ref(m); +} + +bool seq_split::materialize(expr* node, split_mode mode, unsigned threshold, + split_oracle const& oracle, split_set& out) { + iterator it(*this, node, mode, threshold, oracle); + expr_ref d(m), n(m); + while (it.next(d, n)) + out.push_back(split_pair(d, n, m)); + return !it.gave_up(); +} + +expr_ref seq_split::make(expr* r) { + SASSERT(r); + sort* seq_sort = nullptr; + if (!seq().is_re(r, seq_sort)) + return expr_ref(m); + return mk_fromre(r); +} + +// --- Lazy enumerator -------------------------------------------------------- +// The worklist holds suspended split-sets. Each next() pops a node, head- +// normalizes it to a frontier (empty | single | union), and either returns the +// single split, pushes the two union branches back, or skips an empty. All the +// expansion work happens lazily, one split per next() call. + +seq_split::iterator::iterator(seq_split& engine, expr* node, split_mode mode, + unsigned threshold, split_oracle oracle) : + m_engine(engine), m(engine.m), m_mode(mode), m_threshold(threshold), + m_oracle(std::move(oracle)), m_work(engine.m) { + SASSERT(node); + m_work.push_back(node); +} + +bool seq_split::iterator::next(expr_ref& out_d, expr_ref& out_n) { + if (m_giveup) + return false; // a prior give-up is sticky + while (!m_work.empty()) { + expr_ref t(m_work.back(), m); + m_work.pop_back(); + + bool ok = true; + expr_ref hn = m_engine.head_normalize(t, m_mode, m_threshold, m_oracle, ok); + if (!ok) { + m_giveup = true; // unsupported / weak Boolean / overrun + return false; + } + + expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr; + if (m_engine.is_empty_ss(hn)) + continue; + if (m_engine.is_single(hn, d, n)) { + if (m_oracle && !m_oracle(d, n)) + continue; // pruned by lookahead + if (++m_count > m_threshold) { + m_giveup = true; // safety cap against space bloat + return false; + } + out_d = d; + out_n = n; + return true; + } + if (m_engine.is_union(hn, a, b)) { + m_work.push_back(a); + m_work.push_back(b); + continue; + } + UNREACHABLE(); + } + return false; // exhausted (m_giveup stays false) +} + +seq_split::iterator seq_split::iterate(expr* node, split_mode mode, unsigned threshold, + split_oracle const& oracle) { + return iterator(*this, node, mode, threshold, oracle); +} + +// Eager wrapper: drain the lazy enumeration into `out`. Semantics (give-up cases, +// oracle discipline) match the historic engine. +bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode, + split_oracle const& oracle) { + SASSERT(r); + sort* seq_sort = nullptr; + if (!seq().is_re(r, seq_sort)) + return false; + expr_ref node = mk_fromre(r); + return materialize(node, mode, threshold, oracle, result); +} + +// same-D / same-N merge (paper eqs. 1 & 2): +// { , } -> (by_left = true, group by D) +// { , } -> (by_left = false, group by N) +// Only fires on syntactically-identical (perfectly-shared) key components, so +// it is a conservative instance of the rule. +void seq_split::merge_by(split_set& pairs, const bool by_left) const { + obj_map idx; // key component -> position in `out` + split_set out; + for (auto const& p : pairs) { + expr* key = by_left ? p.m_d.get() : p.m_n.get(); + expr* other = by_left ? p.m_n.get() : p.m_d.get(); + unsigned pos; + if (idx.find(key, pos)) { + expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get(); + const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), m); + if (by_left) + out[pos].m_n = u; + else + out[pos].m_d = u; + } + else { + idx.insert(key, out.size()); + out.push_back(p); + } + } + pairs.swap(out); +} + +void seq_split::simplify(split_set& pairs) const { + seq_util::rex& r = re(); + + // 1. drop pairs with a bottom (empty-language) component. + unsigned w = 0; + for (unsigned i = 0; i < pairs.size(); ++i) { + if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n)) + continue; + if (w != i) + pairs[w] = pairs[i]; + ++w; + } + pairs.shrink(w); + if (pairs.size() <= 1) + return; + + // 2. same-D / same-N merge rules. + merge_by(pairs, true); + merge_by(pairs, false); + if (pairs.size() <= 1) + return; + + // 3. subsumption: drop when L(D_i) subseteq L(D_j) and + // L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative + // (returns true only for definite containment), so we never drop a + // needed split. + //if (pairs.size() > 64) + // return; + + struct row { expr* d; expr* n; unsigned idx; }; + vector rows; + for (unsigned i = 0; i < pairs.size(); ++i) + rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i }); + + auto subsumes = [&](row const& a, row const& b) { + return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n); + }; + + vector kept; + for (row const& row_r : rows) { + bool redundant = false; + for (row const& k : kept) + if (subsumes(k, row_r)) { redundant = true; break; } + if (redundant) + continue; + // drop already-kept rows strictly subsumed by row_r + unsigned kw = 0; + for (unsigned t = 0; t < kept.size(); ++t) { + if (subsumes(row_r, kept[t])) + continue; + kept[kw++] = kept[t]; + } + kept.shrink(kw); + kept.push_back(row_r); + } + + split_set result; + for (row const& k : kept) + result.push_back(pairs[k.idx]); + pairs.swap(result); +} + +std::pair seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const { + expr_ref_vector tokens(m); + vector stack; + stack.push_back(str); + + while (!stack.empty()) { + expr* cur = stack.back(); + stack.pop_back(); + expr* l, *r; + if (seq().str.is_concat(cur, l, r)) { + stack.push_back(r); + stack.push_back(l); + } + else + tokens.push_back(expr_ref(cur, m)); + } + + expr* ch; + unsigned i = 0; + + while (i < tokens.size() && (seq().str.is_string(tokens.get(i)) || (seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch)))) { + zstring s; + if (seq().str.is_string(tokens.get(i), s)) { + if (s.empty()) { + i++; + continue; + } + ch = seq().mk_char(s[0]); + tokens[i] = seq().str.mk_string(s.extract(1, s.length() - 1)); + } + else + i++; + regex = m_rw.mk_derivative(ch, regex); + } + + if (i > 0) { + unsigned j = 0; + for (; i < tokens.size(); i++, j++) { + tokens[j] = tokens.get(i); + } + tokens.shrink(j); + } + + // TODO: Do this for the back as well (also, why did no rule before do that?) + + if (tokens.empty()) + return { expr_ref(m), expr_ref(m) }; + + // Choose the factorization boundary so the tail starts with the + // longest run of concrete characters c. + // This gives the split-engine lookahead oracle the most pruning information. + // head = u' (tokens before the run), tail = c · u''' (tokens from the run onward). + const unsigned total = tokens.size(); + unsigned run_start = 0, run_len = 0; + for (i = 1; i < total; ) { + if (!(seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch))) { + i++; + continue; + } + unsigned j = i; + while (j < total && seq().str.is_unit(tokens.get(j), ch) && seq().is_const_char(ch)) { + j++; + } + if (j - i > run_len) { + run_len = j - i; + run_start = i; + } + i = j; + } + // No constant run => fall back to splitting off the first token. + const unsigned p = run_len == 0 ? 1 : run_start; + SASSERT(p >= 1); + expr* head = tokens.get(0); + for (i = 1; i < p; i++) { + head = seq().str.mk_concat(head, tokens.get(i)); + } + expr* tail = seq().str.mk_empty(head->get_sort()); + if (tokens.size() > p + run_len) { + tail = tokens.get(p + run_len); + for (i = p + run_len + 1; i < tokens.size(); i++) { + tail = seq().str.mk_concat(tail, tokens.get(i)); + } + } + SASSERT(head && tail); + + // Build the constant lookahead c and (if non-empty) an oracle that + // prunes splits whose postfix cannot match c. + zstring c; + for (i = 0; i < run_len; ++i) { + unsigned cv; + VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch)); + VERIFY(seq().is_const_char(ch, cv)); + c = c + zstring(cv); + } + split_oracle oracle; + if (!c.empty()) + oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); }; + + // Decompose the regex into a split-set via the shared seq_split engine + if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) { + result.clear(); + return { expr_ref(m), expr_ref(m) }; + } + + simplify(result); + + // Eagerly consume the constant run c from the tail by taking the c-derivative + // of each postfix + if (!c.empty()) { + unsigned w = 0; + for (i = 0; i < result.size(); ++i) { + expr* d = result[i].m_n; + for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) { + d = m_rw.mk_derivative(seq().mk_char(c[k]), d); + } + SASSERT(d); + if (re().is_empty(d)) + continue; // postfix can't start with c => infeasible split, drop + result[w++] = split_pair(result[i].m_d, d, m); + } + result.shrink(w); + } + + return { expr_ref(head, m), expr_ref(tail, m) }; +} + +bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const { + SASSERT(regex); + for (unsigned i = 0; i < c.length(); i++) { + if (m.is_true(m_rw.is_nullable(regex))) + return true; // N accepts the prefix c[0..i) => a suffix completes it + regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex); + SASSERT(regex); + if (re().is_empty(regex)) + return false; // N went (syntactically) dead before reaching c + } + return !re().is_empty(regex); +} \ No newline at end of file diff --git a/src/ast/rewriter/seq_split.h b/src/ast/rewriter/seq_split.h new file mode 100644 index 0000000000..f3b7a57675 --- /dev/null +++ b/src/ast/rewriter/seq_split.h @@ -0,0 +1,225 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.h + +Abstract: + + Regex split decomposition: the split function sigma from the paper + "Solving by Splitting". For a regular expression r, sigma(r) is a finite + "split-set" of pairs { } such that + + u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i). + + The split algebra (intersection, De Morgan complement, left/right + concatenation with a regex) and the cardinality-reducing simplification + heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset) + follow the paper. + +Author: + + Clemens Eisenhofer 2026-6-10 + +--*/ +#pragma once + +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_subset.h" +#include + +class seq_rewriter; + +// An individual split : the left (prefix) regex D and right (suffix) +// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N). +struct split_pair { + expr_ref m_d; + expr_ref m_n; + split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) { + SASSERT(d && n); + } +}; + +// A split-set is a union of individual splits. +typedef vector split_set; + +// Controls how aggressively sigma expands the Boolean-closure cases: +// strong - fully expand complement / intersection via the split algebra +// (De Morgan / cross product). This is the behaviour the nseq +// solver relies on. +// weak - do not perform the (potentially 2^k) Boolean-closure expansion; +// give up (return false) on complement / intersection instead. +enum class split_mode { weak, strong }; + +// Optional lookahead oracle. Called for each candidate split as it is +// generated; returns true to keep it, false to prune it. An empty oracle (the +// default) keeps everything, so sigma is unchanged. See seq_split::compute. +typedef std::function split_oracle; + +class seq_split { + ast_manager& m; + seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access + seq_subset m_subset; // language-subset checks for subsumption + + // --- Suspended split-set representation ------------------------------- + // A split-set computation is kept as an `expr` term over a small family of + // locally-declared, uninterpreted function symbols (the split algebra of the + // paper / split-algebra.md). Nothing here is ever asserted to the solver; + // the terms are only used as scratch structure to drive lazy expansion. + // + // empty : SplitSet -- {} (bottom) + // single : Re x Re -> SplitSet -- a single split + // from_re : Re -> SplitSet -- the *suspended* sigma(r) + // union : SplitSet x SplitSet -> SplitSet + // inter : SplitSet x SplitSet -> SplitSet + // compl : SplitSet -> SplitSet + // lcat : Re x SplitSet -> SplitSet -- r . S (left-concat onto D) + // rcat : SplitSet x Re -> SplitSet -- S . r (right-concat onto N) + sort* m_seq_sort = nullptr; // sequence sort the decls are built for + sort_ref m_set_sort; // the uninterpreted SplitSet sort + func_decl_ref m_d_empty, m_d_single, m_d_fromre, m_d_union, + m_d_inter, m_d_compl, m_d_lcat, m_d_rcat; + expr_ref m_empty_app; // cached nullary `empty` term + + seq_util& seq() const; + seq_util::rex& re() const; + + // (Re)build the local declarations for `seq_sort` if not already current. + void ensure_decls(sort* seq_sort); + + // Smart constructors: apply the cheap normalizations the eager engine relies + // on (drop-bottom, eps cancellation, union absorption of empty). + expr_ref mk_empty(); + expr_ref mk_single(expr* d, expr* n); + expr_ref mk_fromre(expr* r); + expr_ref mk_union(expr* a, expr* b); + expr_ref mk_inter(expr* a, expr* b); + expr_ref mk_compl(expr* a); + expr_ref mk_lcat(expr* r, expr* s); + expr_ref mk_rcat(expr* s, expr* r); + + // Recognizers over the local decls. + bool is_empty_ss(expr* e) const; + bool is_single(expr* e, expr*& d, expr*& n) const; + bool is_fromre(expr* e, expr*& r) const; + bool is_union (expr* e, expr*& a, expr*& b) const; + bool is_inter (expr* e, expr*& a, expr*& b) const; + bool is_compl (expr* e, expr*& a) const; + bool is_lcat (expr* e, expr*& r, expr*& s) const; + bool is_rcat (expr* e, expr*& s, expr*& r) const; + // A term whose head is empty | single | union (ready for the worklist loop). + bool is_frontier(expr* e) const; + + // One level of the sigma rules: from_re(r) -> a SplitSet term built from the + // immediate subterms. `ok` is set false on an unsupported shape. + expr_ref expand_fromre(expr* r, bool& ok); + // Distribute a left/right concatenation over a head-normal split-set. + expr_ref distribute_lcat(expr* r, expr* hs); + expr_ref distribute_rcat(expr* hs, expr* r); + // Materialized split-set -> a `union` of `single`s. + expr_ref from_split_set(split_set const& s); + // Reduce `t` until its head is empty | single | union (one outermost level + // for the lazy nodes; inter/compl are expanded eagerly via `materialize`, + // since the paper's De Morgan / cross-product cannot yield a split lazily). + // `ok` is set false on a give-up (unsupported shape, weak-mode Boolean, or + // threshold overrun). + expr_ref head_normalize(expr* t, split_mode mode, unsigned threshold, + split_oracle const& oracle, bool& ok); + // Fully drain a suspended split-set into `out` (used for inter/compl bodies). + // Runs an `iterator` to exhaustion; returns false on a give-up. + bool materialize(expr* node, split_mode mode, unsigned threshold, + split_oracle const& oracle, split_set& out); + + // Push onto `out`, unless `oracle` rejects it. + void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const; + + // S1 cap S2 = { } dropping any pair with a bottom + // component (and any rejected by `oracle`). Returns false on threshold overrun. + bool intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle) const; + + // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with + // ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. + bool complement(sort* seq_sort, split_set const& sp, split_set& result, + unsigned threshold, split_oracle const& oracle) const; + + // same-D / same-N merge: groups pairs that share a (syntactically identical) + // left (resp. right) component and unions the other component. + void merge_by(split_set& pairs, bool by_left) const; + +public: + explicit seq_split(seq_rewriter& rw); + + // Lazy split enumerator. Holds the suspended split-set worklist and produces + // the concrete splits one at a time, on demand, instead of computing + // them all up front. Obtain one from seq_split::iterate (or construct it + // directly) and pull splits with next() until it returns false; gave_up() then + // tells a normal exhaustion (false) apart from a give-up (true). + // + // The threshold is supplied by the caller and serves only as a safety cap + // against space bloat (lazy expansion still has to materialize the operands of + // intersection / complement). A threshold overrun, an unsupported regex shape, + // or a Boolean-closure case in weak mode aborts the enumeration: next() returns + // false and gave_up() returns true. To stop early, simply stop calling next(). + // + // `oracle` (optional) prunes non-viable splits as they are produced. It must + // be sound to apply per split: a candidate N can still gain a prefix from a + // factor appended to its right later (concat/star), so the oracle must use a + // "prefix-compatible" test (prune only when N can never match the lookahead, + // even partially), NOT a strict "starts-with" test. The complement body is + // expanded WITHOUT the oracle (inverted orientation); the oracle is re-applied + // to the complement's output fold. + class iterator { + seq_split& m_engine; + ast_manager& m; + split_mode m_mode; + unsigned m_threshold; + split_oracle m_oracle; + expr_ref_vector m_work; // GC-safe worklist of suspended split-sets + unsigned m_count = 0; // splits produced so far (vs. threshold) + bool m_giveup = false; + public: + iterator(seq_split& engine, expr* node, split_mode mode, + unsigned threshold, split_oracle oracle); + // Compute the next split. On success returns true and sets ; on + // exhaustion or give-up returns false (see gave_up()). Calling next() + // again after it has returned false keeps returning false. + bool next(expr_ref& d, expr_ref& n); + // Valid after next() has returned false: true iff the enumeration aborted + // (unsupported regex / weak-mode Boolean / threshold overrun) rather than + // running out of splits. + bool gave_up() const { return m_giveup; } + }; + + // Build the *suspended* sigma(r) as a split-algebra term (no expansion). + // Returns null on a non-regex argument. Drive it with `iterate`. + expr_ref make(expr* r); + + // Create a lazy enumerator over a suspended split-set `node` (typically the + // result of make()). See `iterator` for the meaning of the arguments. + iterator iterate(expr* node, split_mode mode, unsigned threshold, + split_oracle const& oracle = {}); + + // Compute sigma(r), appending to `out` (does not clear it). Thin eager + // wrapper that drains an `iterator` to exhaustion; semantics match the historic + // engine. See `iterator` for the meaning of `threshold`, `mode`, and `oracle`. + bool compute(expr* r, split_set& out, unsigned threshold, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}); + + // In-place simplification of a split-set: drop bottom components, apply the + // same-D / same-N merge rules, and drop splits subsumed by another (using + // seq_subset). Size-capped to keep the O(n^2) subsumption affordable. + void simplify(split_set& s) const; + + // decompose a membership constraint into a set of pairs of regex splits + std::pair split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const; + + // Lookahead oracle for the split engine: is the split's right component + // `n_regex` prefix-compatible with the constant character sequence `c`? + // This is sound to apply during split generation — it never drops a viable split. + // Thus, it might not eliminate all cases in order to stay sound + bool split_lookahead_viable(expr* regex, zstring const& c) const; + + +}; diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg index cd52b989cc..d3f164f3bb 100644 --- a/src/params/smt_params_helper.pyg +++ b/src/params/smt_params_helper.pyg @@ -138,6 +138,8 @@ def_module_params(module_name='smt', ('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'), ('seq.max_unfolding', UINT, 1000000000, 'maximal unfolding depth for checking string equations and regular expressions'), ('seq.min_unfolding', UINT, 1, 'initial bound for strings whose lengths are bounded by iterative deepening. Set this to a higher value if there are only models with larger string lengths'), + ('seq.regex_factorization_threshold', UINT, 10, 'maximum number of cases to factor a regex into in a single step'), + ('seq.regex_factorization_enabled', BOOL, False, 'apply regex factorization (sigma splitting)'), ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'), ('sls.enable', BOOL, False, 'enable sls co-processor with SMT engine'), ('sls.parallel', BOOL, True, 'use sls co-processor in parallel or sequential with SMT engine'), diff --git a/src/params/theory_seq_params.cpp b/src/params/theory_seq_params.cpp index 54bf691620..960f145a66 100644 --- a/src/params/theory_seq_params.cpp +++ b/src/params/theory_seq_params.cpp @@ -23,4 +23,6 @@ void theory_seq_params::updt_params(params_ref const & _p) { m_seq_validate = p.seq_validate(); m_seq_max_unfolding = p.seq_max_unfolding(); m_seq_min_unfolding = p.seq_min_unfolding(); + m_seq_regex_factorization_enabled = p.seq_regex_factorization_enabled(); + m_seq_regex_factorization_threshold = p.seq_regex_factorization_threshold(); } diff --git a/src/params/theory_seq_params.h b/src/params/theory_seq_params.h index f964088eb8..067a65a663 100644 --- a/src/params/theory_seq_params.h +++ b/src/params/theory_seq_params.h @@ -26,6 +26,8 @@ struct theory_seq_params { bool m_seq_validate = false; unsigned m_seq_max_unfolding = UINT_MAX/4; unsigned m_seq_min_unfolding = 1; + bool m_seq_regex_factorization_enabled = false; + unsigned m_seq_regex_factorization_threshold = 1; theory_seq_params(params_ref const & p = params_ref()) { updt_params(p); diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 878629e530..e0423599dc 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -128,6 +128,30 @@ namespace smt { return; } + if (th.get_fparams().m_seq_regex_factorization_enabled) { + unsigned threshold = th.get_fparams().m_seq_regex_factorization_threshold; + if (threshold == 0) + threshold = UINT_MAX; + split_set result; + auto [head, tail] = seq_rw().split_membership(s, r, threshold, result); + if (head) { + SASSERT(tail); + // propagate all cases + expr_ref_vector cases(m); + expr_ref_vector branches(m); + for (auto [pre, post] : result) { + expr_ref mem_head(re().mk_in_re(head, pre), m); + expr_ref mem_tail(re().mk_in_re(tail, post), m); + cases.push_back(m.mk_and(mem_head, mem_tail)); + } + const expr_ref cases_expr(m.mk_or(cases), m); + ctx.internalize(cases_expr, false); + th.propagate_lit(nullptr, 1, &lit, ctx.get_literal(cases_expr)); + return; + } + // fallthrough; decomposition failed + } + // Convert a non-ground sequence into an additional regex and // strengthen the original regex constraint into an intersection // for example: diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index b41175deda..4563752812 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -138,6 +138,7 @@ add_executable(test-z3 simplifier.cpp sls_test.cpp sls_seq_plugin.cpp + seq_split.cpp small_object_allocator.cpp smt2print_parse.cpp smt_context.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index 8b4d9e760b..3a3cab41db 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -197,6 +197,7 @@ X(ho_matcher) \ X(finite_set) \ X(finite_set_rewriter) \ + X(seq_split) \ X(fpa) \ X(seq_regex_bisim) \ X(term_enumeration) \ diff --git a/src/test/seq_split.cpp b/src/test/seq_split.cpp new file mode 100644 index 0000000000..29df0545c7 --- /dev/null +++ b/src/test/seq_split.cpp @@ -0,0 +1,450 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.cpp + +Abstract: + + Unit tests for the regex split engine (the split function sigma) in ast/rewriter/seq_split.cpp. + +Author: + + Clemens Eisenhofer 2026-6-22 + +--*/ + +#include "ast/ast.h" +#include "ast/reg_decl_plugins.h" +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/rewriter/seq_split.h" +#include +#include + + +struct plugin_registrar { + plugin_registrar(ast_manager& m) { reg_decl_plugins(m); } +}; + +class seq_split_test { + ast_manager m; + plugin_registrar m_reg; + seq_rewriter m_rw; + seq_split m_split; + seq_util u; + sort_ref m_str; // the sequence (String) sort + sort_ref m_re; // the RegEx sort over m_str + + seq_util::rex& re() { return u.re; } + + expr_ref eps() { return expr_ref(re().mk_epsilon(m_str), m); } // mk_epsilon takes the seq sort + expr_ref dot() { return expr_ref(re().mk_full_char(m_re), m); } // mk_full_char takes the RegEx sort + expr_ref dotstar() { return expr_ref(re().mk_full_seq(m_re), m); } // .* + expr_ref empty_re() { return expr_ref(re().mk_empty(m_re), m); } // the bottom regex + expr_ref rappend(expr* a, expr* b) { return m_rw.mk_re_append(a, b); } // the engine's regex concat + expr_ref word(char const* s) { return expr_ref(re().mk_to_re(u.str.mk_string(zstring(s))), m); } + expr_ref rng(char lo, char hi) { + return expr_ref(re().mk_range(u.str.mk_string(zstring(std::string(1, lo).c_str())), + u.str.mk_string(zstring(std::string(1, hi).c_str()))), m); + } + + typedef std::set> pair_set; + + pair_set as_set(split_set const& s) { + pair_set out; + for (auto const& p : s) + out.insert({ p.m_d.get(), p.m_n.get() }); + return out; + } + + bool eager(expr* r, split_set& out, unsigned threshold = UINT_MAX, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}) { + return m_split.compute(r, out, threshold, mode, oracle); + } + + bool lazy(expr* r, split_set& out, unsigned threshold = UINT_MAX, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}) { + expr_ref node = m_split.make(r); + ENSURE(node); + seq_split::iterator it = m_split.iterate(node, mode, threshold, oracle); + expr_ref d(m), n(m); + while (it.next(d, n)) + out.push_back(split_pair(d, n, m)); + return !it.gave_up(); + } + + // assert that the eager and lazy engines agree on sigma(r) as a *set* of + // splits, and report the common cardinality. + unsigned check_agree(expr* r) { + split_set se, sl; + bool oke = eager(r, se); + bool okl = lazy(r, sl); + ENSURE(oke == okl); + if (!oke) + return 0; + ENSURE(as_set(se) == as_set(sl)); + return (unsigned)as_set(se).size(); + } + +public: + seq_split_test() : m_reg(m), m_rw(m), m_split(m_rw), u(m), m_str(m), m_re(m) { + m_str = u.str.mk_string_sort(); + m_re = re().mk_re(m_str); + } + + void test_eager_epsilon() { + split_set s; + ENSURE(eager(eps(), s)); + ENSURE(as_set(s) == pair_set({ { eps().get(), eps().get() } })); + } + + void test_eager_char() { + // sigma(.) = { , <., eps> } + expr_ref a = dot(); + split_set s; + ENSURE(eager(a, s)); + pair_set expected({ { eps().get(), a.get() }, { a.get(), eps().get() } }); + ENSURE(as_set(s) == expected); + } + + void test_eager_word() { + // sigma("ab") = { <"", "ab">, <"a","b">, <"ab",""> } + split_set s; + ENSURE(eager(word("ab"), s)); + pair_set expected({ + { word("").get(), word("ab").get() }, + { word("a").get(), word("b").get() }, + { word("ab").get(), word("").get() }, + }); + ENSURE(as_set(s) == expected); + } + + void test_eager_union() { + // sigma(a | b) = sigma(a) cup sigma(b) + expr_ref a = rng('a', 'a'), b = rng('b', 'b'); + expr_ref u_re(re().mk_union(a, b), m); + split_set s; + ENSURE(eager(u_re, s)); + pair_set expected({ + { eps().get(), a.get() }, { a.get(), eps().get() }, + { eps().get(), b.get() }, { b.get(), eps().get() }, + }); + ENSURE(as_set(s) == expected); + } + + void test_agree_all() { + expr_ref a = rng('a', 'a'), b = rng('b', 'b'); + expr_ref star(re().mk_star(a), m); + expr_ref plus(re().mk_plus(a), m); + expr_ref concat(re().mk_concat(a, b), m); + expr_ref uni(re().mk_union(a, b), m); + expr_ref inter(re().mk_inter(re().mk_star(a), re().mk_star(b)), m); + expr_ref compl_(re().mk_complement(re().mk_star(a)), m); + expr_ref diff(re().mk_diff(re().mk_star(a), re().mk_star(b)), m); + + ENSURE(check_agree(eps()) == 1); + ENSURE(check_agree(a) == 2); + ENSURE(check_agree(word("ab")) == 3); + ENSURE(check_agree(uni) == 4); + ENSURE(check_agree(star) == 3); // { , , } + (void)check_agree(plus); + (void)check_agree(concat); + (void)check_agree(inter); // strong-mode intersection + (void)check_agree(compl_); // strong-mode De Morgan complement + (void)check_agree(diff); + } + + void test_lazy_early_stop() { + // a* has 3 splits; pull just the first one and then stop. (Note .* is the + // full_seq special case with a single split, so use a proper char-class body.) + expr_ref star(re().mk_star(rng('a', 'a')), m); + expr_ref node = m_split.make(star); + ENSURE(node); + seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {}); + expr_ref d(m), n(m); + unsigned seen = 0; + if (it.next(d, n)) // pull exactly one split, then walk away + ++seen; + ENSURE(!it.gave_up()); // stopping early is not a give-up + ENSURE(seen == 1); + } + + void test_threshold_giveup() { + expr_ref star(re().mk_star(rng('a', 'a')), m); // 3 splits + split_set s; + ENSURE(!lazy(star, s, /*threshold*/ 1)); + // the eager wrapper honours the same cap + split_set s2; + ENSURE(!eager(star, s2, /*threshold*/ 1)); + } + + void test_weak_vs_strong() { + expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m); + expr_ref compl_(re().mk_complement(re().mk_star(dot())), m); + + split_set s; + ENSURE(!eager(inter, s, UINT_MAX, split_mode::weak)); + s.reset(); + ENSURE(!lazy(inter, s, UINT_MAX, split_mode::weak)); + s.reset(); + ENSURE(!eager(compl_, s, UINT_MAX, split_mode::weak)); + s.reset(); + ENSURE(!lazy(compl_, s, UINT_MAX, split_mode::weak)); + + // strong mode succeeds for both + s.reset(); + ENSURE(eager(inter, s, UINT_MAX, split_mode::strong)); + s.reset(); + ENSURE(eager(compl_, s, UINT_MAX, split_mode::strong)); + } + + void test_make_non_regex() { + expr_ref not_a_regex(u.str.mk_string(zstring("a")), m); // String, not RegEx + expr_ref node = m_split.make(not_a_regex); + ENSURE(!node); + } + + void test_oracle_prunes() { + // sigma(.) without an oracle = { , <.,eps> }; an oracle that keeps + // only splits whose suffix is epsilon must drop one of the two. + expr_ref a = dot(); + expr_ref e = eps(); + split_oracle keep_eps_suffix = [&](expr*, expr* n) { return n == e.get(); }; + + split_set se, sl; + ENSURE(eager(a, se, UINT_MAX, split_mode::strong, keep_eps_suffix)); + ENSURE(lazy(a, sl, UINT_MAX, split_mode::strong, keep_eps_suffix)); + pair_set expected({ { a.get(), e.get() } }); + ENSURE(as_set(se) == expected); + ENSURE(as_set(sl) == expected); + } + + void test_eager_full_seq() { + // sigma(.*) = { <.*, .*> } + expr_ref ds = dotstar(); + split_set s; + ENSURE(eager(ds, s)); + ENSURE(as_set(s) == pair_set({ { ds.get(), ds.get() } })); + } + + void test_eager_bottom() { + // sigma(empty) = {} + split_set s; + ENSURE(eager(empty_re(), s)); + ENSURE(s.empty()); + + split_set sl; + ENSURE(lazy(empty_re(), sl)); + ENSURE(sl.empty()); + } + + void test_eager_empty_word() { + // sigma(to_re("")) = { <"", ""> } (a single, trivial split) + split_set s; + ENSURE(eager(word(""), s)); + ENSURE(as_set(s) == pair_set({ { word("").get(), word("").get() } })); + } + + void test_eager_star_content() { + // sigma(a*) = { , , } + expr_ref a = rng('a', 'a'); + expr_ref as(re().mk_star(a), m); + split_set s; + ENSURE(eager(as, s)); + pair_set expected({ + { eps().get(), eps().get() }, + { rappend(as, eps()).get(), rappend(a, as).get() }, + { rappend(as, a).get(), rappend(eps(), as).get() }, + }); + ENSURE(as_set(s) == expected); + } + + void test_eager_plus_content() { + // sigma(a+) = a*.sigma(a).a* (the star rule without ) + expr_ref a = rng('a', 'a'); + expr_ref as(re().mk_star(a), m); + expr_ref ap(re().mk_plus(a), m); + split_set s; + ENSURE(eager(ap, s)); + pair_set expected({ + { rappend(as, eps()).get(), rappend(a, as).get() }, + { rappend(as, a).get(), rappend(eps(), as).get() }, + }); + ENSURE(as_set(s) == expected); + } + + void test_eager_concat_content() { + // sigma(a.b) = sigma(a).b cup a.sigma(b) + expr_ref a = rng('a', 'a'), b = rng('b', 'b'); + expr_ref ab(re().mk_concat(a, b), m); + split_set s; + ENSURE(eager(ab, s)); + pair_set expected({ + { eps().get(), rappend(a, b).get() }, // + { a.get(), rappend(eps(), b).get() }, // + { rappend(a, eps()).get(), b.get() }, // + { rappend(a, b).get(), eps().get() }, // + }); + ENSURE(as_set(s) == expected); + } + + void test_nary_union() { + // sigma(a|b|c) has 2 splits per char-class + expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c'); + expr_ref u3(re().mk_union(a, re().mk_union(b, c)), m); + ENSURE(check_agree(u3) == 6); + } + + void test_nary_concat() { + // sigma(a.b.c) + expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c'); + expr_ref c3(re().mk_concat(a, re().mk_concat(b, c)), m); + ENSURE(check_agree(c3) >= 4); + } + + void test_nested_complement() { + // sigma(~~(a*)) + expr_ref cc(re().mk_complement(re().mk_complement(re().mk_star(rng('a', 'a')))), m); + (void)check_agree(cc); + } + + void test_determinism() { + expr_ref r(re().mk_concat(rng('a', 'a'), re().mk_star(rng('b', 'b'))), m); + split_set s1, s2; + ENSURE(lazy(r, s1)); + ENSURE(lazy(r, s2)); + ENSURE(as_set(s1) == as_set(s2)); + } + + void test_threshold_boundary() { + expr_ref as(re().mk_star(rng('a', 'a')), m); // exactly 3 splits + split_set s; + ENSURE(eager(as, s)); + unsigned k = (unsigned)as_set(s).size(); + ENSURE(k == 3); + + split_set ok_e, ok_l, bad_e, bad_l; + ENSURE(eager(as, ok_e, k)); + ENSURE(lazy(as, ok_l, k)); + ENSURE(!eager(as, bad_e, k - 1)); // one below threshold; give up + ENSURE(!lazy(as, bad_l, k - 1)); + } + + void test_early_stop_after_two() { + expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits + expr_ref node = m_split.make(as); + ENSURE(node); + seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {}); + expr_ref d(m), n(m); + unsigned seen = 0; + while (seen < 2 && it.next(d, n)) // pull two splits on demand, then stop + ++seen; + ENSURE(!it.gave_up()); + ENSURE(seen == 2); + } + + void test_iterator_exhaustion() { + // Pull every split on demand; gave_up() must stay false on a clean + // exhaustion, and next() must keep returning false once drained. + expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits + expr_ref node = m_split.make(as); + ENSURE(node); + seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {}); + expr_ref d(m), n(m); + unsigned seen = 0; + while (it.next(d, n)) + ++seen; + ENSURE(seen == 3); + ENSURE(!it.gave_up()); + // idempotent past the end + ENSURE(!it.next(d, n)); + ENSURE(!it.gave_up()); + } + + void test_iterator_giveup() { + // A threshold overrun aborts: next() returns false and gave_up() is true. + expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits, cap at 1 + expr_ref node = m_split.make(as); + ENSURE(node); + seq_split::iterator it = m_split.iterate(node, split_mode::strong, /*threshold*/ 1, {}); + expr_ref d(m), n(m); + unsigned seen = 0; + while (it.next(d, n)) + ++seen; + ENSURE(it.gave_up()); // aborted, not a clean exhaustion + ENSURE(seen <= 1); // produced at most the capped number + + // A weak-mode Boolean closure is likewise a give-up. + expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m); + expr_ref inode = m_split.make(inter); + ENSURE(inode); + seq_split::iterator wit = m_split.iterate(inode, split_mode::weak, UINT_MAX, {}); + ENSURE(!wit.next(d, n)); + ENSURE(wit.gave_up()); + } + + void test_simplify() { + expr_ref regs[] = { + expr_ref(re().mk_star(rng('a', 'a')), m), + expr_ref(re().mk_complement(re().mk_star(rng('a', 'a'))), m), + expr_ref(re().mk_concat(rng('a', 'a'), rng('b', 'b')), m), + }; + for (auto& r : regs) { + split_set s; + ENSURE(eager(r, s)); + unsigned before = (unsigned)s.size(); + m_split.simplify(s); + ENSURE(s.size() <= before); + ENSURE(!s.empty()); + // idempotent + split_set s2(s); + m_split.simplify(s2); + ENSURE(as_set(s) == as_set(s2)); + } + } + + void test_trivial_oracle() { + expr_ref r(re().mk_star(rng('a', 'a')), m); + split_oracle keep_all = [](expr*, expr*) { return true; }; + split_set s_no, s_yes; + ENSURE(eager(r, s_no)); + ENSURE(eager(r, s_yes, UINT_MAX, split_mode::strong, keep_all)); + ENSURE(as_set(s_no) == as_set(s_yes)); + } + + void run() { + test_eager_epsilon(); + test_eager_char(); + test_eager_word(); + test_eager_union(); + test_agree_all(); + test_lazy_early_stop(); + test_threshold_giveup(); + test_weak_vs_strong(); + test_make_non_regex(); + test_oracle_prunes(); + test_eager_full_seq(); + test_eager_bottom(); + test_eager_empty_word(); + test_eager_star_content(); + test_eager_plus_content(); + test_eager_concat_content(); + test_nary_union(); + test_nary_concat(); + test_nested_complement(); + test_determinism(); + test_threshold_boundary(); + test_early_stop_after_two(); + test_iterator_exhaustion(); + test_iterator_giveup(); + test_simplify(); + test_trivial_oracle(); + } +}; + +void tst_seq_split() { + seq_split_test t; + t.run(); +}