/*++
Copyright (c) 2026 Microsoft Corporation

Module Name:

    seq_split.cpp

Abstract:

    Regex split decomposition (the split function sigma). See seq_split.h.

Author:

    Nikolaj Bjorner (nbjorner) 2026-6-10
    Clemens Eisenhofer 2026-6-10

--*/

#include "ast/rewriter/seq_split.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/ast_pp.h"
#include "util/obj_hashtable.h"

// Binds the splitter to the rewriter `rw`; the subset checker is built over the
// regex utilities of that same rewriter.
seq_split::seq_split(seq_rewriter& rw) : m_rw(rw), m_subset(rw.u().re) {}

// Convenience accessors; all of them forward to the rewriter.
ast_manager& seq_split::m() const { return m_rw.m(); }
seq_util& seq_split::seq() const { return m_rw.u(); }
seq_util::rex& seq_split::re() const { return m_rw.u().re; }

// Append the pair <d, n> to `out` unless the (optional) lookahead oracle prunes it.
//
// Both components are pinned for the duration of the call. Callers hand in raw
// pointers that may denote freshly built terms nothing references yet, and the
// oracle is an opaque callback. NOTE(review): if the oracle wraps d/n (or a
// term built over them) in a temporary reference and releases it, an unpinned
// term would be reclaimed before the pair is constructed. Pinning also lets a
// pruned, otherwise unreferenced term be released on return.
void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
    ast_manager& mm = m();
    const expr_ref pin_d(d, mm);
    const expr_ref pin_n(n, mm);
    if (!oracle || oracle(d, n))
        out.push_back(split_pair(d, n, mm));
}

// Cross-product intersection of two split-sets (split algebra):
//   S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1, N1> in S1, <D2, N2> in S2 }.
// Pairs where any component is bottom (the empty regex) are dropped.
//
// Surviving pairs are appended to `result` (subject to the oracle). Returns
// false as soon as `result` holds more than `threshold` pairs; `result` is then
// partial and must not be used. Returns true otherwise.
bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
                          unsigned threshold, split_oracle const& oracle) {
    const seq_util::rex& r = re();
    for (auto const& p1 : s1) {
        // A bottom component in p1 kills every pairing with it: skip the row.
        if (r.is_empty(p1.m_d) || r.is_empty(p1.m_n))
            continue;
        for (auto const& p2 : s2) {
            if (r.is_empty(p2.m_d) || r.is_empty(p2.m_n))
                continue;
            const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m());
            const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m());
            push(result, oracle, di, ni);
            if (result.size() > threshold)
                return false;
        }
    }
    return true;
}

// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
//   ~<D, N> = { <~D, .*>, <.*, ~N> }   and   ~{} = { <.*, .*> }.
// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold
// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
// split-set, since each ~sp[i] further constrains ~S.
bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold, split_oracle const& oracle) { seq_util::rex& r = re(); sort* re_sort = r.mk_re(seq_sort); const expr_ref full(r.mk_full_seq(re_sort), m()); // .* if (sp.empty()) { // ~{} = <.*, .*> push(result, oracle, full, full); return true; } // The acc/next pairs carry genuine output-orientation N components (the De // Morgan ~ = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and // keeps the 2^|sp| fold from blowing up. split_set acc; push(acc, oracle, r.mk_complement(sp[0].m_d), full); push(acc, oracle, full, r.mk_complement(sp[0].m_n)); for (unsigned i = 1; i < sp.size(); ++i) { split_set next; push(next, oracle, r.mk_complement(sp[i].m_d), full); push(next, oracle, full, r.mk_complement(sp[i].m_n)); split_set tmp; if (!intersect(acc, next, tmp, threshold, oracle)) return false; acc = std::move(tmp); if (acc.empty()) // intersection empty => ~S is empty break; if (acc.size() > threshold) return false; } result.append(acc); return true; } bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode, split_oracle const& oracle) { SASSERT(r); seq_util& sq = seq(); seq_util::rex& rex = re(); ast_manager& mm = m(); sort* seq_sort = nullptr; if (!sq.is_re(r, seq_sort)) return false; // bottom: sigma(empty) = {} (the empty split-set) if (rex.is_empty(r)) return true; // epsilon: sigma(eps) = { } if (rex.is_epsilon(r)) { const expr_ref eps(rex.mk_epsilon(seq_sort), mm); push(result, oracle, eps, eps); return true; } expr* a = nullptr, *b = nullptr; // to_re(s): split the literal word s at every position. 
expr* s = nullptr; if (rex.is_to_re(r, s)) { zstring str; if (sq.str.is_string(s, str)) { for (unsigned i = 0; i <= str.length(); ++i) { const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm); const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm); push(result, oracle, p, q); } return true; } // a single symbolic unit behaves like one token: { , } if (sq.str.is_unit(s)) { const expr_ref ex(r, mm); const expr_ref eps(rex.mk_epsilon(seq_sort), mm); push(result, oracle, eps, ex); push(result, oracle, ex, eps); return true; } // to_re over a non-literal sequence: not handled. return false; } // single-character class alpha (., [lo-hi], of_pred): // sigma(alpha) = { , } if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) { const expr_ref ex(r, mm); const expr_ref eps(rex.mk_epsilon(seq_sort), mm); push(result, oracle, eps, ex); push(result, oracle, ex, eps); return true; } // .* : sigma(.*) = { <.*, .*> } if (rex.is_full_seq(r)) { const expr_ref ex(r, mm); push(result, oracle, ex, ex); return true; } // union: sigma(r0 | ... | r_{n-1}) = U sigma(ri) (re.union may be n-ary) if (rex.is_union(r)) { app* ap = to_app(r); for (unsigned i = 0; i < ap->get_num_args(); ++i) { if (!compute(ap->get_arg(i), result, threshold, mode, oracle)) return false; } return true; } // concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1}) // (re.++ may be n-ary) if (rex.is_concat(r)) { app* ap = to_app(r); const unsigned n = ap->get_num_args(); for (unsigned i = 0; i < n; ++i) { // Sound to pass the oracle into the sub-computation: N_inner.Sigma* // over-approximates the final N_inner.right, so a prune here is a // prune of the final pair too (prefix-compatible test). 
split_set sigma_arg; if (!compute(ap->get_arg(i), sigma_arg, threshold, mode, oracle)) return false; expr_ref left(mm), right(mm); if (i == 0) left = rex.mk_epsilon(seq_sort); else { for (unsigned j = 0; j < i; ++j) { expr* arg = ap->get_arg(j); left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm); } } if (i == n - 1) right = rex.mk_epsilon(seq_sort); else { right = ap->get_arg(i + 1); for (unsigned j = i + 2; j < n; ++j) { expr* arg = ap->get_arg(j); right = rex.mk_concat(right, arg); } } for (auto const& [d, nn] : sigma_arg) { const expr_ref p = m_rw.mk_re_append(left, d); const expr_ref q = m_rw.mk_re_append(nn, right); push(result, oracle, p, q); } } return true; } // star: sigma(a*) = { } cup a*.sigma(a).a* if (rex.is_star(r, a)) { const expr_ref eps(rex.mk_epsilon(seq_sort), mm); push(result, oracle, eps, eps); split_set sa; if (!compute(a, sa, threshold, mode, oracle)) return false; for (auto const& [d, n] : sa) { const expr_ref p = m_rw.mk_re_append(r, d); // a*.D const expr_ref q = m_rw.mk_re_append(n, r); // N.a* push(result, oracle, p, q); } return true; } // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without ) if (rex.is_plus(r, a)) { const expr_ref star(rex.mk_star(a), mm); // a* split_set sa; if (!compute(a, sa, threshold, mode, oracle)) return false; for (auto const& [d, n] : sa) { const expr_ref p = m_rw.mk_re_append(star, d); const expr_ref q = m_rw.mk_re_append(n, star); push(result, oracle, p, q); } return true; } // intersection: sigma(r0 & ... 
& r_{n-1}) = cap sigma(ri) (re.inter may be n-ary) if (rex.is_intersection(r)) { if (mode == split_mode::weak) return false; app* ap = to_app(r); const unsigned n = ap->get_num_args(); split_set current; if (!compute(ap->get_arg(0), current, threshold, mode, oracle)) return false; // A give-up on any conjunct must propagate as a give-up: silently treating // it as the empty split-set would collapse the whole intersection to bottom // and be misreported as an (unsound) conflict. for (unsigned i = 1; i < n && !current.empty(); ++i) { split_set arg_i, tmp; if (!compute(ap->get_arg(i), arg_i, threshold, mode, oracle)) return false; if (!intersect(current, arg_i, tmp, threshold, oracle)) return false; current = std::move(tmp); } result.append(current); return true; } // complement: sigma(~a) = ~sigma(a). // The body is computed WITHOUT the oracle (the body's pairs are inverted, so // their N is unrelated to the output N); the oracle is re-applied in complement(). if (rex.is_complement(r, a)) { if (mode == split_mode::weak) return false; split_set sa; if (!compute(a, sa, threshold, mode)) return false; return complement(seq_sort, sa, result, threshold, oracle); } // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b). // sigma(b) (used only inside the complement) is computed WITHOUT the oracle. if (rex.is_diff(r, a, b)) { if (mode == split_mode::weak) return false; split_set sa, sb, sb_compl, tmp; if (!compute(a, sa, threshold, mode, oracle)) return false; if (!compute(b, sb, threshold, mode)) return false; if (!complement(seq_sort, sb, sb_compl, threshold, oracle)) return false; if (!intersect(sa, sb_compl, tmp, threshold, oracle)) return false; result.append(tmp); return true; } // bounded loop / ite / other: not handled (paper "v1: bail"). TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";); return false; } // same-D / same-N merge (paper eqs. 
1 & 2): // { , } -> (by_left = true, group by D) // { , } -> (by_left = false, group by N) // Only fires on syntactically-identical (perfectly-shared) key components, so // it is a conservative instance of the rule. void seq_split::merge_by(split_set& pairs, const bool by_left) const { ast_manager& mm = m(); obj_map idx; // key component -> position in `out` split_set out; for (auto const& p : pairs) { expr* key = by_left ? p.m_d.get() : p.m_n.get(); expr* other = by_left ? p.m_n.get() : p.m_d.get(); unsigned pos; if (idx.find(key, pos)) { expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get(); seq_rewriter rw(m()); const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), mm); if (by_left) out[pos].m_n = u; else out[pos].m_d = u; } else { idx.insert(key, out.size()); out.push_back(p); } } pairs.swap(out); } void seq_split::simplify(split_set& pairs) { seq_util::rex& r = re(); // 1. drop pairs with a bottom (empty-language) component. unsigned w = 0; for (unsigned i = 0; i < pairs.size(); ++i) { if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n)) continue; if (w != i) pairs[w] = pairs[i]; ++w; } pairs.shrink(w); if (pairs.size() <= 1) return; // 2. same-D / same-N merge rules. merge_by(pairs, true); merge_by(pairs, false); if (pairs.size() <= 1) return; // 3. subsumption: drop when L(D_i) subseteq L(D_j) and // L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative // (returns true only for definite containment), so we never drop a // needed split. 
//if (pairs.size() > 64) // return; struct row { expr* d; expr* n; unsigned idx; }; vector rows; for (unsigned i = 0; i < pairs.size(); ++i) rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i }); auto subsumes = [&](row const& a, row const& b) { return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n); }; vector kept; for (row const& row_r : rows) { bool redundant = false; for (row const& k : kept) if (subsumes(k, row_r)) { redundant = true; break; } if (redundant) continue; // drop already-kept rows strictly subsumed by row_r unsigned kw = 0; for (unsigned t = 0; t < kept.size(); ++t) { if (subsumes(row_r, kept[t])) continue; kept[kw++] = kept[t]; } kept.shrink(kw); kept.push_back(row_r); } split_set result; for (row const& k : kept) result.push_back(pairs[k.idx]); pairs.swap(result); }