3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 07:06:28 +00:00

Porting seq_split to master

This commit is contained in:
CEisenhofer 2026-06-12 15:18:14 +02:00
parent e05ebe8bef
commit 871a2b3bc8
5 changed files with 682 additions and 64 deletions

View file

@ -40,6 +40,7 @@ z3_add_component(rewriter
seq_axioms.cpp
seq_eq_solver.cpp
seq_subset.cpp
seq_split.cpp
seq_rewriter.cpp
seq_regex_bisim.cpp
seq_skolem.cpp

View file

@ -3999,58 +3999,54 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
//do not concatenate [], it is a deade-end
return result;
}
else {
// Classical Brzozowski union: keep the derivative tree free of
// antimirov-union nodes so the bisimulation procedure sees a
// single regex tree whose leaves are XOR pairs.
return mk_der_union(result, mk_der_concat(is_n, dr2));
}
// Classical Brzozowski union: keep the derivative tree free of
// antimirov-union nodes so the bisimulation procedure sees a
// single regex tree whose leaves are XOR pairs.
return mk_der_union(result, mk_der_concat(is_n, dr2));
}
else if (re().is_star(r, r1)) {
if (re().is_star(r, r1)) {
return mk_der_concat(mk_derivative_rec(ele, r1), r);
}
else if (re().is_plus(r, r1)) {
if (re().is_plus(r, r1)) {
expr_ref star(re().mk_star(r1), m());
return mk_derivative_rec(ele, star);
}
else if (re().is_union(r, r1, r2)) {
if (re().is_union(r, r1, r2)) {
return mk_der_union(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
}
else if (re().is_intersection(r, r1, r2)) {
if (re().is_intersection(r, r1, r2)) {
return mk_der_inter(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
}
else if (re().is_diff(r, r1, r2)) {
if (re().is_diff(r, r1, r2)) {
return mk_der_inter(mk_derivative_rec(ele, r1), mk_der_compl(mk_derivative_rec(ele, r2)));
}
else if (re().is_xor(r, r1, r2)) {
if (re().is_xor(r, r1, r2)) {
return mk_der_xor(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
}
else if (m().is_ite(r, p, r1, r2)) {
if (m().is_ite(r, p, r1, r2)) {
// there is no BDD normalization here
result = m().mk_ite(p, mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
return result;
}
else if (re().is_opt(r, r1)) {
if (re().is_opt(r, r1)) {
return mk_derivative_rec(ele, r1);
}
else if (re().is_complement(r, r1)) {
if (re().is_complement(r, r1)) {
return mk_der_compl(mk_derivative_rec(ele, r1));
}
else if (re().is_loop(r, r1, lo)) {
if (re().is_loop(r, r1, lo)) {
if (lo > 0) {
lo--;
}
result = mk_derivative_rec(ele, r1);
//do not concatenate with [] (emptyset)
// do not concatenate with [] (emptyset)
if (re().is_empty(result)) {
return result;
}
else {
//do not create loop r1{0,}, instead create r1*
return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo)));
}
// do not create loop r1{0,}, instead create r1*
return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo)));
}
else if (re().is_loop(r, r1, lo, hi)) {
if (re().is_loop(r, r1, lo, hi)) {
if (hi == 0) {
return mk_empty();
}
@ -4059,19 +4055,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
lo--;
}
result = mk_derivative_rec(ele, r1);
//do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain
// do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain
if (re().is_empty(result) || hi == 0) {
return result;
}
else {
return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi));
}
return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi));
}
else if (re().is_full_seq(r) ||
re().is_empty(r)) {
if (re().is_full_seq(r) || re().is_empty(r)) {
return expr_ref(r, m());
}
else if (re().is_to_re(r, r1)) {
if (re().is_to_re(r, r1)) {
// r1 is a string here (not a regexp)
expr_ref hd(m()), tl(m());
if (get_head_tail(r1, hd, tl)) {
@ -4084,16 +4077,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
result = mk_der_concat(result, r1);
return result;
}
else if (str().is_empty(r1)) {
//observe: str().is_empty(r1) checks that r = () = epsilon
//while mk_empty() = [], because deriv(epsilon) = [] = nothing
if (str().is_empty(r1)) {
// observe: str().is_empty(r1) checks that r = () = epsilon
// while mk_empty() = [], because deriv(epsilon) = [] = nothing
return mk_empty();
}
else if (str().is_itos(r1)) {
if (str().is_itos(r1)) {
//
// here r1 = (str.from_int r2) and r2 is non-ground
// here r1 = (str.from_int r2) and r2 is non-ground
// or else the expression would have been simplified earlier
// so r1 must be nonempty and must consists of decimal digits
// so r1 must be nonempty and must consists of decimal digits
// '0' <= elem <= '9'
// if ((isdigit ele) and (ele = (hd r1))) then (to_re (tl r1)) else []
//
@ -4105,19 +4098,17 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
auto a3 = m().mk_eq(hd, ele);
auto inner = m().mk_and(a2, a3);
m_br.mk_and(a0, a1, inner, result);
tl = re().mk_to_re(mk_seq_rest(r1));
return re_and(result, tl);
}
else {
// recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence
// construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else []))
hd = mk_seq_first(r1);
m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result);
tl = re().mk_to_re(mk_seq_rest(r1));
return re_and(result, tl);
}
// recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence
// construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else []))
hd = mk_seq_first(r1);
m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result);
tl = re().mk_to_re(mk_seq_rest(r1));
return re_and(result, tl);
}
else if (re().is_reverse(r, r1)) {
if (re().is_reverse(r, r1)) {
if (re().is_to_re(r1, r2)) {
// First try to extract hd and tl such that r = hd ++ tl and |tl|=1
expr_ref hd(m()), tl(m());
@ -4129,22 +4120,20 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
result = mk_der_concat(result, re().mk_reverse(re().mk_to_re(hd)));
return result;
}
else if (str().is_empty(r2)) {
if (str().is_empty(r2)) {
return mk_empty();
}
else {
// construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else []))
// hd = first of reverse(r2) i.e. last of r2
// tl = rest of reverse(r2) i.e. butlast of r2
//hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one()));
hd = mk_seq_last(r2);
// factor nested constructor calls to enforce deterministic argument evaluation order
auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort)));
auto a_eq = m().mk_eq(hd, ele);
m_br.mk_and(a_non_empty, a_eq, result);
tl = re().mk_to_re(mk_seq_butlast(r2));
return re_and(result, re().mk_reverse(tl));
}
// construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else []))
// hd = first of reverse(r2) i.e. last of r2
// tl = rest of reverse(r2) i.e. butlast of r2
// hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one()));
hd = mk_seq_last(r2);
// factor nested constructor calls to enforce deterministic argument evaluation order
auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort)));
auto a_eq = m().mk_eq(hd, ele);
m_br.mk_and(a_non_empty, a_eq, result);
tl = re().mk_to_re(mk_seq_butlast(r2));
return re_and(result, re().mk_reverse(tl));
}
}
else if (re().is_range(r, r1, r2)) {
@ -4163,11 +4152,9 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
result = mk_der_inter(p1, p2);
return result;
}
else {
return mk_empty();
}
return mk_empty();
}
expr* e1 = nullptr, * e2 = nullptr;
expr *e1 = nullptr, *e2 = nullptr;
if (str().is_unit(r1, e1) && str().is_unit(r2, e2)) {
SASSERT(u().is_char(e1));
// Use mk_der_cond to normalize
@ -4185,7 +4172,7 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
}
else if (re().is_of_pred(r, p)) {
array_util array(m());
expr* args[2] = { p, ele };
expr *args[2] = {p, ele};
result = array.mk_select(2, args);
// Use mk_der_cond to normalize
STRACE(seq_verbose, tout << "deriv of_pred" << std::endl;);

View file

@ -18,6 +18,7 @@ Notes:
--*/
#pragma once
#include "seq_split.h"
#include "ast/seq_decl_plugin.h"
#include "ast/ast_pp.h"
#include "ast/arith_decl_plugin.h"
@ -130,6 +131,7 @@ class seq_rewriter {
seq_util m_util;
seq_subset m_subset;
seq_split m_split;
arith_util m_autil;
bool_rewriter m_br;
// re2automaton m_re2aut;
@ -346,7 +348,7 @@ class seq_rewriter {
public:
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m),
m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m),
m_op_cache(m), m_es(m),
m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
}
@ -397,6 +399,15 @@ public:
return result;
}
// Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional)
// prunes non-viable splits during generation.
bool split(expr* r, split_set& out, unsigned threshold,
const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
return m_split.compute(r, out, threshold, mode, oracle);
}
void simplify_split(split_set& s) { m_split.simplify(s); }
/**
* check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences
*/

View file

@ -0,0 +1,501 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_split.cpp
Abstract:
Regex split decomposition (the split function sigma). See seq_split.h.
Author:
Nikolaj Bjorner (nbjorner) 2026-6-10
Clemens Eisenhofer 2026-6-10
--*/
#include "ast/rewriter/seq_split.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/ast_pp.h"
#include "util/obj_hashtable.h"
#include "util/stack.h"
seq_split::seq_split(seq_rewriter& rw) :
m_rw(rw), m_subset(rw.u().re) {}
ast_manager& seq_split::m() const { return m_rw.m(); }
seq_util& seq_split::seq() const { return m_rw.u(); }
seq_util::rex& seq_split::re() const { return m_rw.u().re; }
// Add <d, n> unless the (optional) lookahead oracle prunes it.
void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
if (!oracle || oracle(d, n))
out.push_back(split_pair(d, n, m()));
}
// Cross-product intersection of two split-sets (split algebra):
// S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
// Pairs where any component is bottom (the empty regex) are dropped.
bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
unsigned threshold, split_oracle const& oracle) const {
const seq_util::rex& r = re();
for (auto const& p1 : s1) {
for (auto const& p2 : s2) {
if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
continue;
const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m());
const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m());
push(result, oracle, di, ni);
if (result.size() > threshold)
return false;
}
}
return true;
}
// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold
// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
// split-set, since each ~sp[i] further constrains ~S.
bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result,
const unsigned threshold, split_oracle const& oracle) const {
seq_util::rex& r = re();
sort* re_sort = r.mk_re(seq_sort);
const expr_ref full(r.mk_full_seq(re_sort), m()); // .*
if (sp.empty()) { // ~{} = <.*, .*>
push(result, oracle, full, full);
return true;
}
// The acc/next pairs carry genuine output-orientation N components (the De
// Morgan ~<D,N> = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and
// keeps the 2^|sp| fold from blowing up.
split_set acc;
push(acc, oracle, r.mk_complement(sp[0].m_d), full);
push(acc, oracle, full, r.mk_complement(sp[0].m_n));
for (unsigned i = 1; i < sp.size(); ++i) {
split_set next;
push(next, oracle, r.mk_complement(sp[i].m_d), full);
push(next, oracle, full, r.mk_complement(sp[i].m_n));
split_set tmp;
if (!intersect(acc, next, tmp, threshold, oracle))
return false;
acc = std::move(tmp);
if (acc.empty()) // intersection empty => ~S is empty
break;
if (acc.size() > threshold)
return false;
}
result.append(acc);
return true;
}
bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode,
split_oracle const& oracle) {
SASSERT(r);
seq_util& sq = seq();
seq_util::rex& rex = re();
ast_manager& mm = m();
sort* seq_sort = nullptr;
if (!sq.is_re(r, seq_sort))
return false;
// bottom: sigma(empty) = {} (the empty split-set)
if (rex.is_empty(r))
return true;
// epsilon: sigma(eps) = { <eps, eps> }
if (rex.is_epsilon(r)) {
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
push(result, oracle, eps, eps);
return true;
}
expr* a = nullptr, *b = nullptr;
// to_re(s): split the literal word s at every position.
expr* s = nullptr;
if (rex.is_to_re(r, s)) {
zstring str;
if (sq.str.is_string(s, str)) {
for (unsigned i = 0; i <= str.length(); ++i) {
const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm);
const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm);
push(result, oracle, p, q);
}
return true;
}
// a single symbolic unit behaves like one token: { <eps, u>, <u, eps> }
if (sq.str.is_unit(s)) {
const expr_ref ex(r, mm);
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
push(result, oracle, eps, ex);
push(result, oracle, ex, eps);
return true;
}
// to_re over a non-literal sequence: not handled.
return false;
}
// single-character class alpha (., [lo-hi], of_pred):
// sigma(alpha) = { <eps, alpha>, <alpha, eps> }
if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
const expr_ref ex(r, mm);
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
push(result, oracle, eps, ex);
push(result, oracle, ex, eps);
return true;
}
// .* : sigma(.*) = { <.*, .*> }
if (rex.is_full_seq(r)) {
const expr_ref ex(r, mm);
push(result, oracle, ex, ex);
return true;
}
// union: sigma(r0 | ... | r_{n-1}) = U sigma(ri) (re.union may be n-ary)
if (rex.is_union(r)) {
app* ap = to_app(r);
for (unsigned i = 0; i < ap->get_num_args(); ++i) {
if (!compute(ap->get_arg(i), result, threshold, mode, oracle))
return false;
}
return true;
}
// concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
// (re.++ may be n-ary)
if (rex.is_concat(r)) {
app* ap = to_app(r);
const unsigned n = ap->get_num_args();
for (unsigned i = 0; i < n; ++i) {
// Sound to pass the oracle into the sub-computation: N_inner.Sigma*
// over-approximates the final N_inner.right, so a prune here is a
// prune of the final pair too (prefix-compatible test).
split_set sigma_arg;
if (!compute(ap->get_arg(i), sigma_arg, threshold, mode, oracle))
return false;
expr_ref left(mm), right(mm);
if (i == 0)
left = rex.mk_epsilon(seq_sort);
else {
for (unsigned j = 0; j < i; ++j) {
expr* arg = ap->get_arg(j);
left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm);
}
}
if (i == n - 1)
right = rex.mk_epsilon(seq_sort);
else {
right = ap->get_arg(i + 1);
for (unsigned j = i + 2; j < n; ++j) {
expr* arg = ap->get_arg(j);
right = rex.mk_concat(right, arg);
}
}
for (auto const& [d, nn] : sigma_arg) {
const expr_ref p = m_rw.mk_re_append(left, d);
const expr_ref q = m_rw.mk_re_append(nn, right);
push(result, oracle, p, q);
}
}
return true;
}
// star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
if (rex.is_star(r, a)) {
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
push(result, oracle, eps, eps);
split_set sa;
if (!compute(a, sa, threshold, mode, oracle))
return false;
for (auto const& [d, n] : sa) {
const expr_ref p = m_rw.mk_re_append(r, d); // a*.D
const expr_ref q = m_rw.mk_re_append(n, r); // N.a*
push(result, oracle, p, q);
}
return true;
}
// plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without <eps,eps>)
if (rex.is_plus(r, a)) {
const expr_ref star(rex.mk_star(a), mm); // a*
split_set sa;
if (!compute(a, sa, threshold, mode, oracle))
return false;
for (auto const& [d, n] : sa) {
const expr_ref p = m_rw.mk_re_append(star, d);
const expr_ref q = m_rw.mk_re_append(n, star);
push(result, oracle, p, q);
}
return true;
}
// intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri) (re.inter may be n-ary)
if (rex.is_intersection(r)) {
if (mode == split_mode::weak)
return false;
app* ap = to_app(r);
const unsigned n = ap->get_num_args();
split_set current;
if (!compute(ap->get_arg(0), current, threshold, mode, oracle))
return false;
// A give-up on any conjunct must propagate as a give-up: silently treating
// it as the empty split-set would collapse the whole intersection to bottom
// and be misreported as an (unsound) conflict.
for (unsigned i = 1; i < n && !current.empty(); ++i) {
split_set arg_i, tmp;
if (!compute(ap->get_arg(i), arg_i, threshold, mode, oracle))
return false;
if (!intersect(current, arg_i, tmp, threshold, oracle))
return false;
current = std::move(tmp);
}
result.append(current);
return true;
}
// complement: sigma(~a) = ~sigma(a).
// The body is computed WITHOUT the oracle (the body's pairs are inverted, so
// their N is unrelated to the output N); the oracle is re-applied in complement().
if (rex.is_complement(r, a)) {
if (mode == split_mode::weak)
return false;
split_set sa;
if (!compute(a, sa, threshold, mode))
return false;
return complement(seq_sort, sa, result, threshold, oracle);
}
// difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b).
// sigma(b) (used only inside the complement) is computed WITHOUT the oracle.
if (rex.is_diff(r, a, b)) {
if (mode == split_mode::weak)
return false;
split_set sa, sb, sb_compl, tmp;
if (!compute(a, sa, threshold, mode, oracle))
return false;
if (!compute(b, sb, threshold, mode))
return false;
if (!complement(seq_sort, sb, sb_compl, threshold, oracle))
return false;
if (!intersect(sa, sb_compl, tmp, threshold, oracle))
return false;
result.append(tmp);
return true;
}
// bounded loop / ite / other: not handled (paper "v1: bail").
TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";);
return false;
}
// same-D / same-N merge (paper eqs. 1 & 2):
// { <D,N>, <D,N'> } -> <D, N | N'> (by_left = true, group by D)
// { <D,N>, <D',N> } -> <D | D', N> (by_left = false, group by N)
// Only fires on syntactically-identical (perfectly-shared) key components, so
// it is a conservative instance of the rule.
void seq_split::merge_by(split_set& pairs, const bool by_left) const {
ast_manager& mm = m();
obj_map<expr, unsigned> idx; // key component -> position in `out`
split_set out;
for (auto const& p : pairs) {
expr* key = by_left ? p.m_d.get() : p.m_n.get();
expr* other = by_left ? p.m_n.get() : p.m_d.get();
unsigned pos;
if (idx.find(key, pos)) {
expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), mm);
if (by_left)
out[pos].m_n = u;
else
out[pos].m_d = u;
}
else {
idx.insert(key, out.size());
out.push_back(p);
}
}
pairs.swap(out);
}
void seq_split::simplify(split_set& pairs) const {
seq_util::rex& r = re();
// 1. drop pairs with a bottom (empty-language) component.
unsigned w = 0;
for (unsigned i = 0; i < pairs.size(); ++i) {
if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
continue;
if (w != i)
pairs[w] = pairs[i];
++w;
}
pairs.shrink(w);
if (pairs.size() <= 1)
return;
// 2. same-D / same-N merge rules.
merge_by(pairs, true);
merge_by(pairs, false);
if (pairs.size() <= 1)
return;
// 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
// L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative
// (returns true only for definite containment), so we never drop a
// needed split.
//if (pairs.size() > 64)
// return;
struct row { expr* d; expr* n; unsigned idx; };
vector<row> rows;
for (unsigned i = 0; i < pairs.size(); ++i)
rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
auto subsumes = [&](row const& a, row const& b) {
return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
};
vector<row> kept;
for (row const& row_r : rows) {
bool redundant = false;
for (row const& k : kept)
if (subsumes(k, row_r)) { redundant = true; break; }
if (redundant)
continue;
// drop already-kept rows strictly subsumed by row_r
unsigned kw = 0;
for (unsigned t = 0; t < kept.size(); ++t) {
if (subsumes(row_r, kept[t]))
continue;
kept[kw++] = kept[t];
}
kept.shrink(kw);
kept.push_back(row_r);
}
split_set result;
for (row const& k : kept)
result.push_back(pairs[k.idx]);
pairs.swap(result);
}
std::pair<expr*, expr*> seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
ast_manager& m = this->m();
expr_ref_vector tokens(m);
vector<expr*> stack;
stack.push_back(str);
while (!stack.empty()) {
expr* cur = stack.back();
stack.pop_back();
expr* l, *r;
if (seq().str.is_concat(cur, l, r)) {
stack.push_back(r);
stack.push_back(l);
}
else
tokens.push_back(expr_ref(cur, m));
}
SASSERT(!tokens.empty());
expr* first = tokens.get(0);
SASSERT(seq().is_char(first)); // constants are consumed earlier
// Choose the factorization boundary so the tail starts with the
// longest run of concrete characters c.
// This gives the split-engine lookahead oracle the most pruning information.
// head = u' (tokens before the run), tail = c · u''' (tokens from the run onward).
const unsigned total = tokens.size();
unsigned run_start = 0, run_len = 0;
for (unsigned i = 0; i < total; ) {
if (!seq().is_char(tokens.get(i))) {
i++;
continue;
}
unsigned j = i;
while (j < total && seq().is_char(tokens.get(j))) {
j++;
}
if (j - i > run_len) {
run_len = j - i;
run_start = i;
}
i = j;
}
// No constant run => fall back to splitting off the first token.
const unsigned p = run_len == 0 ? 1 : run_start;
SASSERT(p >= 1);
expr* head = first;
for (unsigned i = 1; i < p; i++) {
head = seq().str.mk_concat(head, tokens.get(i));
}
expr* tail = tokens.get(p);
for (unsigned i = p + run_len; i < tokens.size(); i++) {
tail = seq().str.mk_concat(tail, tokens.get(i));
}
SASSERT(head && tail);
// Build the constant lookahead c and (if non-empty) an oracle that
// prunes splits whose postfix cannot match c.
zstring c;
for (unsigned i = 0; i < run_len; ++i) {
expr* ch; unsigned cv;
VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch));
VERIFY(seq().is_const_char(ch, cv));
c = c + zstring(cv);
}
split_oracle oracle;
if (!c.empty())
oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); };
// Decompose the regex into a split-set via the shared seq_split engine
if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) {
result.clear();
return { nullptr, nullptr };
}
m_rw.simplify_split(result);
// Eagerly consume the constant run c from the tail by taking the c-derivative
// of each postfix
if (!c.empty()) {
unsigned w = 0;
for (unsigned i = 0; i < result.size(); ++i) {
expr* d = result[i].m_n;
for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) {
d = m_rw.mk_derivative(seq().mk_char(c[k]), d);
}
SASSERT(d);
if (re().is_empty(d))
continue; // postfix can't start with c => infeasible split, drop
result[w++] = split_pair(result[i].m_d, d, m);
}
result.shrink(w);
}
return { head, tail };
}
bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const {
SASSERT(regex);
for (unsigned i = 0; i < c.length(); i++) {
if (m().is_true(m_rw.is_nullable(regex)))
return true; // N accepts the prefix c[0..i) => a suffix completes it
regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex);
SASSERT(regex);
if (re().is_empty(regex))
return false; // N went (syntactically) dead before reaching c
}
return !re().is_empty(regex);
}

View file

@ -0,0 +1,118 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_split.h
Abstract:
Regex split decomposition: the split function sigma from the paper
"Solving by Splitting". For a regular expression r, sigma(r) is a finite
"split-set" of pairs { <D_i, N_i> } such that
u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i).
The split algebra (intersection, De Morgan complement, left/right
concatenation with a regex) and the cardinality-reducing simplification
heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
follow the paper.
Author:
Clemens Eisenhofer 2026-6-10
--*/
#pragma once
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_subset.h"
#include <functional>
class seq_rewriter;
// An individual split <D, N>: the left (prefix) regex D and right (suffix)
// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N).
struct split_pair {
expr_ref m_d;
expr_ref m_n;
split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
SASSERT(d && n);
}
};
// A split-set is a union of individual splits.
typedef vector<split_pair> split_set;
// Controls how aggressively sigma expands the Boolean-closure cases:
// strong - fully expand complement / intersection via the split algebra
// (De Morgan / cross product). This is the behaviour the nseq
// solver relies on.
// weak - do not perform the (potentially 2^k) Boolean-closure expansion;
// give up (return false) on complement / intersection instead.
enum class split_mode { weak, strong };
// Optional lookahead oracle. Called for each candidate split <D, N> as it is
// generated; returns true to keep it, false to prune it. An empty oracle (the
// default) keeps everything, so sigma is unchanged. See seq_split::compute.
typedef std::function<bool(expr* D, expr* N)> split_oracle;
class seq_split {
seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access
seq_subset m_subset; // language-subset checks for subsumption
ast_manager& m() const;
seq_util& seq() const;
seq_util::rex& re() const;
// Push <d, n> onto `out`, unless `oracle` rejects it.
void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const;
// S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
// component (and any rejected by `oracle`). Returns false on threshold overrun.
bool intersect(split_set const& s1, split_set const& s2, split_set& result,
unsigned threshold, split_oracle const& oracle) const;
// De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
bool complement(sort* seq_sort, split_set const& sp, split_set& result,
unsigned threshold, split_oracle const& oracle) const;
// same-D / same-N merge: groups pairs that share a (syntactically identical)
// left (resp. right) component and unions the other component.
void merge_by(split_set& pairs, bool by_left) const;
public:
explicit seq_split(seq_rewriter& rw);
// Compute sigma(r), appending to `out` (does not clear it). `threshold`
// bounds the number of produced splits; an overrun, an unsupported regex
// shape (bounded loop / ite), or a Boolean-closure case in weak mode makes
// it return false ("give up").
//
// `oracle` (optional) prunes non-viable splits *during* generation. It must
// be sound to apply at every generation step: a candidate N can still gain a
// prefix from a factor appended to its right later (concat/star), so the
// oracle must use a "prefix-compatible" test (prune only when N can never
// match the lookahead, even partially), NOT a strict "starts-with" test.
// The complement body is computed WITHOUT the oracle (inverted orientation);
// the oracle is re-applied to the complement's output fold.
bool compute(expr* r, split_set& out, unsigned threshold,
split_mode mode = split_mode::strong, split_oracle const& oracle = {});
// In-place simplification of a split-set: drop bottom components, apply the
// same-D / same-N merge rules, and drop splits subsumed by another (using
// seq_subset). Size-capped to keep the O(n^2) subsumption affordable.
void simplify(split_set& s) const;
// decompose a membership constraint into a set of pairs of regex splits
std::pair<expr*, expr*> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const;
// Lookahead oracle for the split engine: is the split's right component
// `n_regex` prefix-compatible with the constant character sequence `c`?
// This is sound to apply during split generation — it never drops a viable split.
// Thus, it might not eliminate all cases in order to stay sound
bool split_lookahead_viable(expr* regex, zstring const& c) const;
};