diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index cfcc179bc..be8449978 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -40,6 +40,7 @@ z3_add_component(rewriter seq_axioms.cpp seq_eq_solver.cpp seq_subset.cpp + seq_split.cpp seq_rewriter.cpp seq_regex_bisim.cpp seq_skolem.cpp diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 2b9d738c0..6750b12c8 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -4037,58 +4037,54 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { //do not concatenate [], it is a deade-end return result; } - else { - // Classical Brzozowski union: keep the derivative tree free of - // antimirov-union nodes so the bisimulation procedure sees a - // single regex tree whose leaves are XOR pairs. - return mk_der_union(result, mk_der_concat(is_n, dr2)); - } + // Classical Brzozowski union: keep the derivative tree free of + // antimirov-union nodes so the bisimulation procedure sees a + // single regex tree whose leaves are XOR pairs. + return mk_der_union(result, mk_der_concat(is_n, dr2)); } - else if (re().is_star(r, r1)) { + if (re().is_star(r, r1)) { return mk_der_concat(mk_derivative_rec(ele, r1), r); } - else if (re().is_plus(r, r1)) { + if (re().is_plus(r, r1)) { expr_ref star(re().mk_star(r1), m()); return mk_derivative_rec(ele, star); } - else if (re().is_union(r, r1, r2)) { + if (re().is_union(r, r1, r2)) { return mk_der_union(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } - else if (re().is_intersection(r, r1, r2)) { + if (re().is_intersection(r, r1, r2)) { return mk_der_inter(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } - else if (re().is_diff(r, r1, r2)) { + if (re().is_diff(r, r1, r2)) { return mk_der_inter(mk_derivative_rec(ele, r1), mk_der_compl(mk_derivative_rec(ele, r2))); } - else if (re().is_xor(r, r1, r2)) { + if (re().is_xor(r, r1, r2)) { return mk_der_xor(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); } - else if (m().is_ite(r, p, r1, r2)) { + if (m().is_ite(r, p, r1, r2)) { // there is no BDD normalization here result = m().mk_ite(p, mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2)); return result; } - else if (re().is_opt(r, r1)) { + if (re().is_opt(r, r1)) { return mk_derivative_rec(ele, r1); } - else if (re().is_complement(r, r1)) { + if (re().is_complement(r, r1)) { return mk_der_compl(mk_derivative_rec(ele, r1)); } - else if (re().is_loop(r, r1, lo)) { + if (re().is_loop(r, r1, lo)) { if (lo > 0) { lo--; } result = mk_derivative_rec(ele, r1); - //do not concatenate with [] (emptyset) + // do not concatenate with [] (emptyset) if (re().is_empty(result)) { return result; } - else { - //do not create loop r1{0,}, instead create r1* - return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo))); - } + // do not create loop r1{0,}, instead create r1* + return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo))); } - else if (re().is_loop(r, r1, lo, hi)) { + if (re().is_loop(r, r1, lo, hi)) { if (hi == 0) { return mk_empty(); } @@ -4097,19 +4093,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { lo--; } result = mk_derivative_rec(ele, r1); - //do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain + // do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain if (re().is_empty(result) || hi == 0) { return result; } - else { - return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi)); - } + return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi)); } - else if (re().is_full_seq(r) || - re().is_empty(r)) { + if (re().is_full_seq(r) || re().is_empty(r)) { return expr_ref(r, m()); } - else if (re().is_to_re(r, r1)) { + if (re().is_to_re(r, r1)) { // r1 is a string here (not a regexp) expr_ref hd(m()), tl(m()); if (get_head_tail(r1, hd, tl)) { @@ -4122,16 +4115,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { result = mk_der_concat(result, r1); return result; } - else if (str().is_empty(r1)) { - //observe: str().is_empty(r1) checks that r = () = epsilon - //while mk_empty() = [], because deriv(epsilon) = [] = nothing + if (str().is_empty(r1)) { + // observe: str().is_empty(r1) checks that r = () = epsilon + // while mk_empty() = [], because deriv(epsilon) = [] = nothing return mk_empty(); } - else if (str().is_itos(r1)) { + if (str().is_itos(r1)) { // - // here r1 = (str.from_int r2) and r2 is non-ground + // here r1 = (str.from_int r2) and r2 is non-ground // or else the expression would have been simplified earlier - // so r1 must be nonempty and must consists of decimal digits + // so r1 must be nonempty and must consists of decimal digits // '0' <= elem <= '9' // if ((isdigit ele) and (ele = (hd r1))) then (to_re (tl r1)) else [] // @@ -4143,19 +4136,17 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { auto a3 = m().mk_eq(hd, ele); auto inner = m().mk_and(a2, a3); m_br.mk_and(a0, a1, inner, result); - tl = re().mk_to_re(mk_seq_rest(r1)); - return re_and(result, tl); - } - else { - // recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence - // construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else [])) - hd = mk_seq_first(r1); - m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result); tl = re().mk_to_re(mk_seq_rest(r1)); return re_and(result, tl); } + // recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence + // construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else [])) + hd = mk_seq_first(r1); + m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result); + tl = re().mk_to_re(mk_seq_rest(r1)); + return re_and(result, tl); } - else if (re().is_reverse(r, r1)) { + if (re().is_reverse(r, r1)) { if (re().is_to_re(r1, r2)) { // First try to extract hd and tl such that r = hd ++ tl and |tl|=1 expr_ref hd(m()), tl(m()); @@ -4167,22 +4158,20 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { result = mk_der_concat(result, re().mk_reverse(re().mk_to_re(hd))); return result; } - else if (str().is_empty(r2)) { + if (str().is_empty(r2)) { return mk_empty(); } - else { - // construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else [])) - // hd = first of reverse(r2) i.e. last of r2 - // tl = rest of reverse(r2) i.e. butlast of r2 - //hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one())); - hd = mk_seq_last(r2); - // factor nested constructor calls to enforce deterministic argument evaluation order - auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort))); - auto a_eq = m().mk_eq(hd, ele); - m_br.mk_and(a_non_empty, a_eq, result); - tl = re().mk_to_re(mk_seq_butlast(r2)); - return re_and(result, re().mk_reverse(tl)); - } + // construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else [])) + // hd = first of reverse(r2) i.e. last of r2 + // tl = rest of reverse(r2) i.e. butlast of r2 + // hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one())); + hd = mk_seq_last(r2); + // factor nested constructor calls to enforce deterministic argument evaluation order + auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort))); + auto a_eq = m().mk_eq(hd, ele); + m_br.mk_and(a_non_empty, a_eq, result); + tl = re().mk_to_re(mk_seq_butlast(r2)); + return re_and(result, re().mk_reverse(tl)); } } else if (re().is_range(r, r1, r2)) { @@ -4201,11 +4190,9 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { result = mk_der_inter(p1, p2); return result; } - else { - return mk_empty(); - } + return mk_empty(); } - expr* e1 = nullptr, * e2 = nullptr; + expr *e1 = nullptr, *e2 = nullptr; if (str().is_unit(r1, e1) && str().is_unit(r2, e2)) { SASSERT(u().is_char(e1)); // Use mk_der_cond to normalize @@ -4223,7 +4210,7 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { } else if (re().is_of_pred(r, p)) { array_util array(m()); - expr* args[2] = { p, ele }; + expr *args[2] = {p, ele}; result = array.mk_select(2, args); // Use mk_der_cond to normalize STRACE(seq_verbose, tout << "deriv of_pred" << std::endl;); diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 618124e10..0d21f94ba 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -18,6 +18,7 @@ Notes: --*/ #pragma once +#include "seq_split.h" #include "ast/seq_decl_plugin.h" #include "ast/ast_pp.h" #include "ast/arith_decl_plugin.h" @@ -130,6 +131,7 @@ class seq_rewriter { seq_util m_util; seq_subset m_subset; + seq_split m_split; arith_util m_autil; bool_rewriter m_br; // re2automaton m_re2aut; @@ -346,7 +348,7 @@ class seq_rewriter { public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): - m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m), + m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m), m_op_cache(m), m_es(m), m_lhs(m), m_rhs(m), m_coalesce_chars(true) { } @@ -397,6 +399,20 @@ public: return result; } + // Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional) + // prunes non-viable splits during generation. + bool split(expr* r, split_set& out, unsigned threshold, + const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) { + return m_split.compute(r, out, threshold, mode, oracle); + } + + void simplify_split(split_set& s) { m_split.simplify(s); } + + // decompose a membership constraint into a set of pairs of regex splits + std::pair split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const { + return m_split.split_membership(str, regex, threshold, result); + } + /** * check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences */ diff --git a/src/ast/rewriter/seq_split.cpp b/src/ast/rewriter/seq_split.cpp new file mode 100644 index 000000000..8095cff95 --- /dev/null +++ b/src/ast/rewriter/seq_split.cpp @@ -0,0 +1,545 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.cpp + +Abstract: + + Regex split decomposition (the split function sigma). See seq_split.h. + +Author: + + Nikolaj Bjorner (nbjorner) 2026-6-10 + Clemens Eisenhofer 2026-6-10 + +--*/ + +#include "ast/rewriter/seq_split.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/ast_pp.h" +#include "util/obj_hashtable.h" +#include "util/stack.h" + +seq_split::seq_split(seq_rewriter& rw) : + m_rw(rw), m_subset(rw.u().re) {} + +ast_manager& seq_split::m() const { return m_rw.m(); } +seq_util& seq_split::seq() const { return m_rw.u(); } +seq_util::rex& seq_split::re() const { return m_rw.u().re; } + +// Add unless the (optional) lookahead oracle prunes it. +void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const { + if (!oracle || oracle(d, n)) + out.push_back(split_pair(d, n, m())); +} + +// Cross-product intersection of two split-sets (split algebra): +// S1 cap S2 = { | in S1, in S2 }. +// Pairs where any component is bottom (the empty regex) are dropped. +bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle) const { + const seq_util::rex& r = re(); + for (auto const& p1 : s1) { + for (auto const& p2 : s2) { + if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) || + r.is_empty(p1.m_n) || r.is_empty(p2.m_n)) + continue; + const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m()); + const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m()); + push(result, oracle, di, ni); + if (result.size() > threshold) + return false; + } + } + return true; +} + +// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with +// ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. +// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold +// overrun must abort entirely: a partial fold is a strictly weaker (unsound) +// split-set, since each ~sp[i] further constrains ~S. +bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, + const unsigned threshold, split_oracle const& oracle) const { + + seq_util::rex& r = re(); + sort* re_sort = r.mk_re(seq_sort); + const expr_ref full(r.mk_full_seq(re_sort), m()); // .* + if (sp.empty()) { // ~{} = <.*, .*> + push(result, oracle, full, full); + return true; + } + // The acc/next pairs carry genuine output-orientation N components (the De + // Morgan ~ = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and + // keeps the 2^|sp| fold from blowing up. + split_set acc; + push(acc, oracle, r.mk_complement(sp[0].m_d), full); + push(acc, oracle, full, r.mk_complement(sp[0].m_n)); + for (unsigned i = 1; i < sp.size(); ++i) { + split_set next; + push(next, oracle, r.mk_complement(sp[i].m_d), full); + push(next, oracle, full, r.mk_complement(sp[i].m_n)); + split_set tmp; + if (!intersect(acc, next, tmp, threshold, oracle)) + return false; + acc = std::move(tmp); + if (acc.empty()) // intersection empty => ~S is empty + break; + if (acc.size() > threshold) + return false; + } + result.append(acc); + return true; +} + +bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode, + split_oracle const& oracle) { + SASSERT(r); + seq_util& sq = seq(); + seq_util::rex& rex = re(); + ast_manager& mm = m(); + + sort* seq_sort = nullptr; + if (!sq.is_re(r, seq_sort)) + return false; + + // bottom: sigma(empty) = {} (the empty split-set) + if (rex.is_empty(r)) + return true; + + // epsilon: sigma(eps) = { } + if (rex.is_epsilon(r)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + push(result, oracle, eps, eps); + return true; + } + + expr* a = nullptr, *b = nullptr; + + // to_re(s): split the literal word s at every position. + expr* s = nullptr; + if (rex.is_to_re(r, s)) { + zstring str; + vector stack; + stack.push_back(s); + + while (!stack.empty()) { + expr* cur = stack.back(); + stack.pop_back(); + if (seq().str.is_concat(cur, a, b)) { + stack.push_back(b); + stack.push_back(a); + } + else { + expr* ch; + unsigned cv; + if (seq().str.is_unit(cur, ch) && seq().is_const_char(ch, cv)) { + str += zstring(cv); + continue; + } + zstring str2; + if (sq.str.is_string(s, str2)) { + str = str2; + continue; + } + // not a constant string; unsupported for now + return false; + } + } + for (unsigned i = 0; i <= str.length(); ++i) { + const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm); + const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm); + push(result, oracle, p, q); + } + return true; + } + + // single-character class alpha (., [lo-hi], of_pred): + // sigma(alpha) = { , } + if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) { + const expr_ref ex(r, mm); + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + push(result, oracle, eps, ex); + push(result, oracle, ex, eps); + return true; + } + + // .* : sigma(.*) = { <.*, .*> } + if (rex.is_full_seq(r)) { + const expr_ref ex(r, mm); + push(result, oracle, ex, ex); + return true; + } + + // union: sigma(r0 | ... | r_{n-1}) = U sigma(ri) (re.union may be n-ary) + if (rex.is_union(r)) { + app* ap = to_app(r); + for (unsigned i = 0; i < ap->get_num_args(); ++i) { + if (!compute(ap->get_arg(i), result, threshold, mode, oracle)) + return false; + } + return true; + } + + // concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1}) + // (re.++ may be n-ary) + if (rex.is_concat(r)) { + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + for (unsigned i = 0; i < n; ++i) { + // Sound to pass the oracle into the sub-computation: N_inner.Sigma* + // over-approximates the final N_inner.right, so a prune here is a + // prune of the final pair too (prefix-compatible test). + split_set sigma_arg; + if (!compute(ap->get_arg(i), sigma_arg, threshold, mode, oracle)) + return false; + + expr_ref left(mm), right(mm); + if (i == 0) + left = rex.mk_epsilon(seq_sort); + else { + for (unsigned j = 0; j < i; ++j) { + expr* arg = ap->get_arg(j); + left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm); + } + } + if (i == n - 1) + right = rex.mk_epsilon(seq_sort); + else { + right = ap->get_arg(i + 1); + for (unsigned j = i + 2; j < n; ++j) { + expr* arg = ap->get_arg(j); + right = rex.mk_concat(right, arg); + } + } + + for (auto const& [d, nn] : sigma_arg) { + const expr_ref p = m_rw.mk_re_append(left, d); + const expr_ref q = m_rw.mk_re_append(nn, right); + push(result, oracle, p, q); + } + } + return true; + } + + // star: sigma(a*) = { } cup a*.sigma(a).a* + if (rex.is_star(r, a)) { + const expr_ref eps(rex.mk_epsilon(seq_sort), mm); + push(result, oracle, eps, eps); + split_set sa; + if (!compute(a, sa, threshold, mode, oracle)) + return false; + for (auto const& [d, n] : sa) { + const expr_ref p = m_rw.mk_re_append(r, d); // a*.D + const expr_ref q = m_rw.mk_re_append(n, r); // N.a* + push(result, oracle, p, q); + } + return true; + } + + // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without ) + if (rex.is_plus(r, a)) { + const expr_ref star(rex.mk_star(a), mm); // a* + split_set sa; + if (!compute(a, sa, threshold, mode, oracle)) + return false; + for (auto const& [d, n] : sa) { + const expr_ref p = m_rw.mk_re_append(star, d); + const expr_ref q = m_rw.mk_re_append(n, star); + push(result, oracle, p, q); + } + return true; + } + + // intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri) (re.inter may be n-ary) + if (rex.is_intersection(r)) { + if (mode == split_mode::weak) + return false; + app* ap = to_app(r); + const unsigned n = ap->get_num_args(); + split_set current; + if (!compute(ap->get_arg(0), current, threshold, mode, oracle)) + return false; + // A give-up on any conjunct must propagate as a give-up: silently treating + // it as the empty split-set would collapse the whole intersection to bottom + // and be misreported as an (unsound) conflict. + for (unsigned i = 1; i < n && !current.empty(); ++i) { + split_set arg_i, tmp; + if (!compute(ap->get_arg(i), arg_i, threshold, mode, oracle)) + return false; + if (!intersect(current, arg_i, tmp, threshold, oracle)) + return false; + current = std::move(tmp); + } + result.append(current); + return true; + } + + // complement: sigma(~a) = ~sigma(a). + // The body is computed WITHOUT the oracle (the body's pairs are inverted, so + // their N is unrelated to the output N); the oracle is re-applied in complement(). + if (rex.is_complement(r, a)) { + if (mode == split_mode::weak) + return false; + split_set sa; + if (!compute(a, sa, threshold, mode)) + return false; + return complement(seq_sort, sa, result, threshold, oracle); + } + + // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b). + // sigma(b) (used only inside the complement) is computed WITHOUT the oracle. + if (rex.is_diff(r, a, b)) { + if (mode == split_mode::weak) + return false; + split_set sa, sb, sb_compl, tmp; + if (!compute(a, sa, threshold, mode, oracle)) + return false; + if (!compute(b, sb, threshold, mode)) + return false; + if (!complement(seq_sort, sb, sb_compl, threshold, oracle)) + return false; + if (!intersect(sa, sb_compl, tmp, threshold, oracle)) + return false; + result.append(tmp); + return true; + } + + // bounded loop / ite / other: not handled (paper "v1: bail"). + TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";); + return false; +} + +// same-D / same-N merge (paper eqs. 1 & 2): +// { , } -> (by_left = true, group by D) +// { , } -> (by_left = false, group by N) +// Only fires on syntactically-identical (perfectly-shared) key components, so +// it is a conservative instance of the rule. +void seq_split::merge_by(split_set& pairs, const bool by_left) const { + ast_manager& mm = m(); + obj_map idx; // key component -> position in `out` + split_set out; + for (auto const& p : pairs) { + expr* key = by_left ? p.m_d.get() : p.m_n.get(); + expr* other = by_left ? p.m_n.get() : p.m_d.get(); + unsigned pos; + if (idx.find(key, pos)) { + expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get(); + const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), mm); + if (by_left) + out[pos].m_n = u; + else + out[pos].m_d = u; + } + else { + idx.insert(key, out.size()); + out.push_back(p); + } + } + pairs.swap(out); +} + +void seq_split::simplify(split_set& pairs) const { + seq_util::rex& r = re(); + + // 1. drop pairs with a bottom (empty-language) component. + unsigned w = 0; + for (unsigned i = 0; i < pairs.size(); ++i) { + if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n)) + continue; + if (w != i) + pairs[w] = pairs[i]; + ++w; + } + pairs.shrink(w); + if (pairs.size() <= 1) + return; + + // 2. same-D / same-N merge rules. + merge_by(pairs, true); + merge_by(pairs, false); + if (pairs.size() <= 1) + return; + + // 3. subsumption: drop when L(D_i) subseteq L(D_j) and + // L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative + // (returns true only for definite containment), so we never drop a + // needed split. + //if (pairs.size() > 64) + // return; + + struct row { expr* d; expr* n; unsigned idx; }; + vector rows; + for (unsigned i = 0; i < pairs.size(); ++i) + rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i }); + + auto subsumes = [&](row const& a, row const& b) { + return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n); + }; + + vector kept; + for (row const& row_r : rows) { + bool redundant = false; + for (row const& k : kept) + if (subsumes(k, row_r)) { redundant = true; break; } + if (redundant) + continue; + // drop already-kept rows strictly subsumed by row_r + unsigned kw = 0; + for (unsigned t = 0; t < kept.size(); ++t) { + if (subsumes(row_r, kept[t])) + continue; + kept[kw++] = kept[t]; + } + kept.shrink(kw); + kept.push_back(row_r); + } + + split_set result; + for (row const& k : kept) + result.push_back(pairs[k.idx]); + pairs.swap(result); +} + +std::pair seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const { + ast_manager& m = this->m(); + + expr_ref_vector tokens(m); + vector stack; + stack.push_back(str); + + while (!stack.empty()) { + expr* cur = stack.back(); + stack.pop_back(); + expr* l, *r; + if (seq().str.is_concat(cur, l, r)) { + stack.push_back(r); + stack.push_back(l); + } + else + tokens.push_back(expr_ref(cur, m)); + } + + expr* ch; + unsigned i = 0; + + while (i < tokens.size() && (seq().str.is_string(tokens.get(i)) || (seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch)))) { + zstring s; + if (seq().str.is_string(tokens.get(i), s)) { + if (s.empty()) { + i++; + continue; + } + ch = seq().mk_char(s[0]); + tokens[i] = seq().str.mk_string(s.extract(1, s.length() - 1)); + } + else + i++; + regex = m_rw.mk_derivative(ch, regex); + } + + if (i > 0) { + unsigned j = 0; + for (; i < tokens.size(); i++, j++) { + tokens[j] = tokens.get(i); + } + tokens.shrink(j); + } + + // TODO: Do this for the back as well (also, why did no rule before do that?) + + if (tokens.empty()) + return { nullptr, nullptr }; + + // Choose the factorization boundary so the tail starts with the + // longest run of concrete characters c. + // This gives the split-engine lookahead oracle the most pruning information. + // head = u' (tokens before the run), tail = c · u''' (tokens from the run onward). + const unsigned total = tokens.size(); + unsigned run_start = 0, run_len = 0; + for (i = 1; i < total; ) { + if (!(seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch))) { + i++; + continue; + } + unsigned j = i; + while (j < total && seq().str.is_unit(tokens.get(j), ch) && seq().is_const_char(ch)) { + j++; + } + if (j - i > run_len) { + run_len = j - i; + run_start = i; + } + i = j; + } + // No constant run => fall back to splitting off the first token. + const unsigned p = run_len == 0 ? 1 : run_start; + SASSERT(p >= 1); + expr* head = tokens.get(0); + for (i = 1; i < p; i++) { + head = seq().str.mk_concat(head, tokens.get(i)); + } + expr* tail = seq().str.mk_empty(head->get_sort()); + if (tokens.size() > p + run_len) { + tail = tokens.get(p + run_len); + for (i = p + run_len + 1; i < tokens.size(); i++) { + tail = seq().str.mk_concat(tail, tokens.get(i)); + } + } + SASSERT(head && tail); + + // Build the constant lookahead c and (if non-empty) an oracle that + // prunes splits whose postfix cannot match c. + zstring c; + for (i = 0; i < run_len; ++i) { + unsigned cv; + VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch)); + VERIFY(seq().is_const_char(ch, cv)); + c = c + zstring(cv); + } + split_oracle oracle; + if (!c.empty()) + oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); }; + + // Decompose the regex into a split-set via the shared seq_split engine + if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) { + result.clear(); + return { nullptr, nullptr }; + } + + m_rw.simplify_split(result); + + // Eagerly consume the constant run c from the tail by taking the c-derivative + // of each postfix + if (!c.empty()) { + unsigned w = 0; + for (i = 0; i < result.size(); ++i) { + expr* d = result[i].m_n; + for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) { + d = m_rw.mk_derivative(seq().mk_char(c[k]), d); + } + SASSERT(d); + if (re().is_empty(d)) + continue; // postfix can't start with c => infeasible split, drop + result[w++] = split_pair(result[i].m_d, d, m); + } + result.shrink(w); + } + + return { head, tail }; +} + +bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const { + SASSERT(regex); + for (unsigned i = 0; i < c.length(); i++) { + if (m().is_true(m_rw.is_nullable(regex))) + return true; // N accepts the prefix c[0..i) => a suffix completes it + regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex); + SASSERT(regex); + if (re().is_empty(regex)) + return false; // N went (syntactically) dead before reaching c + } + return !re().is_empty(regex); +} \ No newline at end of file diff --git a/src/ast/rewriter/seq_split.h b/src/ast/rewriter/seq_split.h new file mode 100644 index 000000000..5ba4cabde --- /dev/null +++ b/src/ast/rewriter/seq_split.h @@ -0,0 +1,118 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_split.h + +Abstract: + + Regex split decomposition: the split function sigma from the paper + "Solving by Splitting". For a regular expression r, sigma(r) is a finite + "split-set" of pairs { } such that + + u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i). + + The split algebra (intersection, De Morgan complement, left/right + concatenation with a regex) and the cardinality-reducing simplification + heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset) + follow the paper. + +Author: + + Clemens Eisenhofer 2026-6-10 + +--*/ +#pragma once + +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_subset.h" +#include + +class seq_rewriter; + +// An individual split : the left (prefix) regex D and right (suffix) +// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N). +struct split_pair { + expr_ref m_d; + expr_ref m_n; + split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) { + SASSERT(d && n); + } +}; + +// A split-set is a union of individual splits. +typedef vector split_set; + +// Controls how aggressively sigma expands the Boolean-closure cases: +// strong - fully expand complement / intersection via the split algebra +// (De Morgan / cross product). This is the behaviour the nseq +// solver relies on. +// weak - do not perform the (potentially 2^k) Boolean-closure expansion; +// give up (return false) on complement / intersection instead. +enum class split_mode { weak, strong }; + +// Optional lookahead oracle. Called for each candidate split as it is +// generated; returns true to keep it, false to prune it. An empty oracle (the +// default) keeps everything, so sigma is unchanged. See seq_split::compute. +typedef std::function split_oracle; + +class seq_split { + seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access + seq_subset m_subset; // language-subset checks for subsumption + + ast_manager& m() const; + seq_util& seq() const; + seq_util::rex& re() const; + + // Push onto `out`, unless `oracle` rejects it. + void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const; + + // S1 cap S2 = { } dropping any pair with a bottom + // component (and any rejected by `oracle`). Returns false on threshold overrun. + bool intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle) const; + + // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with + // ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. + bool complement(sort* seq_sort, split_set const& sp, split_set& result, + unsigned threshold, split_oracle const& oracle) const; + + // same-D / same-N merge: groups pairs that share a (syntactically identical) + // left (resp. right) component and unions the other component. + void merge_by(split_set& pairs, bool by_left) const; + +public: + explicit seq_split(seq_rewriter& rw); + + // Compute sigma(r), appending to `out` (does not clear it). `threshold` + // bounds the number of produced splits; an overrun, an unsupported regex + // shape (bounded loop / ite), or a Boolean-closure case in weak mode makes + // it return false ("give up"). + // + // `oracle` (optional) prunes non-viable splits *during* generation. It must + // be sound to apply at every generation step: a candidate N can still gain a + // prefix from a factor appended to its right later (concat/star), so the + // oracle must use a "prefix-compatible" test (prune only when N can never + // match the lookahead, even partially), NOT a strict "starts-with" test. + // The complement body is computed WITHOUT the oracle (inverted orientation); + // the oracle is re-applied to the complement's output fold. + bool compute(expr* r, split_set& out, unsigned threshold, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}); + + // In-place simplification of a split-set: drop bottom components, apply the + // same-D / same-N merge rules, and drop splits subsumed by another (using + // seq_subset). Size-capped to keep the O(n^2) subsumption affordable. + void simplify(split_set& s) const; + + // decompose a membership constraint into a set of pairs of regex splits + std::pair split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const; + + // Lookahead oracle for the split engine: is the split's right component + // `n_regex` prefix-compatible with the constant character sequence `c`? + // This is sound to apply during split generation — it never drops a viable split. + // Thus, it might not eliminate all cases in order to stay sound + bool split_lookahead_viable(expr* regex, zstring const& c) const; + + +}; diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg index cd52b989c..d3f164f3b 100644 --- a/src/params/smt_params_helper.pyg +++ b/src/params/smt_params_helper.pyg @@ -138,6 +138,8 @@ def_module_params(module_name='smt', ('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'), ('seq.max_unfolding', UINT, 1000000000, 'maximal unfolding depth for checking string equations and regular expressions'), ('seq.min_unfolding', UINT, 1, 'initial bound for strings whose lengths are bounded by iterative deepening. Set this to a higher value if there are only models with larger string lengths'), + ('seq.regex_factorization_threshold', UINT, 10, 'maximum number of cases to factor a regex into in a single step'), + ('seq.regex_factorization_enabled', BOOL, False, 'apply regex factorization (sigma splitting)'), ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'), ('sls.enable', BOOL, False, 'enable sls co-processor with SMT engine'), ('sls.parallel', BOOL, True, 'use sls co-processor in parallel or sequential with SMT engine'), diff --git a/src/params/theory_seq_params.cpp b/src/params/theory_seq_params.cpp index 54bf69162..960f145a6 100644 --- a/src/params/theory_seq_params.cpp +++ b/src/params/theory_seq_params.cpp @@ -23,4 +23,6 @@ void theory_seq_params::updt_params(params_ref const & _p) { m_seq_validate = p.seq_validate(); m_seq_max_unfolding = p.seq_max_unfolding(); m_seq_min_unfolding = p.seq_min_unfolding(); + m_seq_regex_factorization_enabled = p.seq_regex_factorization_enabled(); + m_seq_regex_factorization_threshold = p.seq_regex_factorization_threshold(); } diff --git a/src/params/theory_seq_params.h b/src/params/theory_seq_params.h index f964088eb..067a65a66 100644 --- a/src/params/theory_seq_params.h +++ b/src/params/theory_seq_params.h @@ -26,6 +26,8 @@ struct theory_seq_params { bool m_seq_validate = false; unsigned m_seq_max_unfolding = UINT_MAX/4; unsigned m_seq_min_unfolding = 1; + bool m_seq_regex_factorization_enabled = false; + unsigned m_seq_regex_factorization_threshold = 1; theory_seq_params(params_ref const & p = params_ref()) { updt_params(p); diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 0e9a03b63..af8b280d8 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -128,6 +128,31 @@ namespace smt { return; } + if (th.get_fparams().m_seq_regex_factorization_enabled) { + unsigned threshold = th.get_fparams().m_seq_regex_factorization_threshold; + if (threshold == 0) + threshold = UINT_MAX; + split_set result; + auto [head, tail] = seq_rw().split_membership(s, r, threshold, result); + if (head) { + SASSERT(tail); + // propagate all cases + expr_ref_vector cases(m); + expr_ref_vector branches(m); + for (auto [pre, post] : result) { + expr_ref mem_head(re().mk_in_re(head, pre), m); + expr_ref mem_tail(re().mk_in_re(tail, post), m); + cases.push_back(m.mk_and(mem_head, mem_tail)); + } + const expr_ref cases_expr(m.mk_or(cases), m); + ctx.internalize(cases_expr, false); + std::cout << mk_pp(s, m) << " in " << mk_pp(r, m) << " =>\n" << mk_pp(cases_expr, m) << std::endl; + th.propagate_lit(nullptr, 1, &lit, ctx.get_literal(cases_expr)); + return; + } + // fallthrough; decomposition failed + } + // Convert a non-ground sequence into an additional regex and // strengthen the original regex constraint into an intersection // for example: