3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-07-01 12:58:54 +00:00

Porting seq_split to master (#9840)

Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
Clemens Eisenhofer 2026-06-30 19:18:28 +02:00 committed by GitHub
parent c22a7bac7c
commit b3143e759b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 1545 additions and 1 deletions

View file

@ -41,6 +41,7 @@ z3_add_component(rewriter
seq_eq_solver.cpp
seq_derive.cpp
seq_subset.cpp
seq_split.cpp
seq_derive.cpp
seq_range_collapse.cpp
seq_range_predicate.cpp

View file

@ -18,6 +18,7 @@ Notes:
--*/
#pragma once
#include "seq_split.h"
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_derive.h"
#include "ast/ast_pp.h"
@ -133,6 +134,7 @@ class seq_rewriter {
seq_util m_util;
seq_subset m_subset;
seq_split m_split;
arith_util m_autil;
bool_rewriter m_br;
seq::derive m_derive;
@ -332,7 +334,7 @@ class seq_rewriter {
public:
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this),
m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), m_derive(m, *this), // m_re2aut(m),
m_op_cache(m), m_es(m),
m_lhs(m), m_rhs(m) {
}
@ -411,6 +413,20 @@ public:
return result;
}
// Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional)
// prunes non-viable splits during generation.
bool split(expr* r, split_set& out, unsigned threshold,
const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
return m_split.compute(r, out, threshold, mode, oracle);
}
void simplify_split(split_set& s) { m_split.simplify(s); }
// decompose a membership constraint into a set of pairs of regex splits
std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
return m_split.split_membership(str, regex, threshold, result);
}
/**
* check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences
*/

View file

@ -0,0 +1,820 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_split.cpp
Abstract:
Regex split decomposition (the split function sigma). See seq_split.h.
Author:
Clemens Eisenhofer 2026-6-10
--*/
#include "ast/rewriter/seq_split.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/ast_pp.h"
#include "util/obj_hashtable.h"
#include "util/stack.h"
seq_split::seq_split(seq_rewriter& rw) :
m(rw.m()), m_rw(rw), m_subset(rw.u().re),
m_set_sort(m),
m_d_empty(m), m_d_single(m), m_d_fromre(m), m_d_union(m),
m_d_inter(m), m_d_compl(m), m_d_lcat(m), m_d_rcat(m),
m_empty_app(m) {}
// ---------------------------------------------------------------------------
// Suspended split-set representation (split algebra over `expr`).
// ---------------------------------------------------------------------------
void seq_split::ensure_decls(sort* seq_sort) {
SASSERT(seq_sort);
if (m_seq_sort == seq_sort)
return;
sort* re_sort = re().mk_re(seq_sort);
m_set_sort = m.mk_uninterpreted_sort(symbol("seq.split.set"));
sort* ss = m_set_sort;
m_d_empty = m.mk_func_decl(symbol("seq.split.empty"), 0u, nullptr, ss);
m_d_single = m.mk_func_decl(symbol("seq.split.single"), re_sort, re_sort, ss);
m_d_fromre = m.mk_func_decl(symbol("seq.split.from_re"), re_sort, ss);
m_d_union = m.mk_func_decl(symbol("seq.split.union"), ss, ss, ss);
m_d_inter = m.mk_func_decl(symbol("seq.split.inter"), ss, ss, ss);
m_d_compl = m.mk_func_decl(symbol("seq.split.compl"), ss, ss);
m_d_lcat = m.mk_func_decl(symbol("seq.split.lcat"), re_sort, ss, ss);
m_d_rcat = m.mk_func_decl(symbol("seq.split.rcat"), ss, re_sort, ss);
m_empty_app = m.mk_const(m_d_empty);
m_seq_sort = seq_sort;
}
// --- smart constructors ----------------------------------------------------
expr_ref seq_split::mk_empty() {
SASSERT(m_empty_app);
return m_empty_app;
}
expr_ref seq_split::mk_single(expr* d, expr* n) {
SASSERT(d && n);
if (re().is_empty(d) || re().is_empty(n))
return mk_empty();
return expr_ref(m.mk_app(m_d_single, d, n), m);
}
expr_ref seq_split::mk_fromre(expr* r) {
SASSERT(r);
sort* seq_sort = nullptr;
VERIFY(seq().is_re(r, seq_sort));
ensure_decls(seq_sort);
if (re().is_empty(r))
return mk_empty();
return expr_ref(m.mk_app(m_d_fromre, r), m);
}
expr_ref seq_split::mk_union(expr* a, expr* b) {
SASSERT(a && b);
if (is_empty_ss(a))
return expr_ref(b, m);
if (is_empty_ss(b))
return expr_ref(a, m);
return expr_ref(m.mk_app(m_d_union, a, b), m);
}
expr_ref seq_split::mk_inter(expr* a, expr* b) {
SASSERT(a && b);
if (is_empty_ss(a) || is_empty_ss(b))
return mk_empty();
return expr_ref(m.mk_app(m_d_inter, a, b), m);
}
expr_ref seq_split::mk_compl(expr* a) {
SASSERT(a);
return expr_ref(m.mk_app(m_d_compl, a), m);
}
expr_ref seq_split::mk_lcat(expr* r, expr* s) {
SASSERT(r && s);
if (is_empty_ss(s))
return mk_empty();
if (re().is_epsilon(r)) // eps . S = S
return expr_ref(s, m);
return expr_ref(m.mk_app(m_d_lcat, r, s), m);
}
expr_ref seq_split::mk_rcat(expr* s, expr* r) {
SASSERT(r && s);
if (is_empty_ss(s))
return mk_empty();
if (re().is_epsilon(r)) // S . eps = S
return expr_ref(s, m);
return expr_ref(m.mk_app(m_d_rcat, s, r), m);
}
// --- recognizers -----------------------------------------------------------
bool seq_split::is_empty_ss(expr* e) const {
return is_app(e) && to_app(e)->get_decl() == m_d_empty;
}
bool seq_split::is_single(expr* e, expr*& d, expr*& n) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_single)
return false;
d = to_app(e)->get_arg(0);
n = to_app(e)->get_arg(1);
return true;
}
bool seq_split::is_fromre(expr* e, expr*& r) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_fromre)
return false;
r = to_app(e)->get_arg(0);
return true;
}
bool seq_split::is_union(expr* e, expr*& a, expr*& b) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_union)
return false;
a = to_app(e)->get_arg(0);
b = to_app(e)->get_arg(1);
return true;
}
bool seq_split::is_inter(expr* e, expr*& a, expr*& b) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_inter)
return false;
a = to_app(e)->get_arg(0);
b = to_app(e)->get_arg(1);
return true;
}
bool seq_split::is_compl(expr* e, expr*& a) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_compl)
return false;
a = to_app(e)->get_arg(0);
return true;
}
bool seq_split::is_lcat(expr* e, expr*& r, expr*& s) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_lcat)
return false;
r = to_app(e)->get_arg(0);
s = to_app(e)->get_arg(1);
return true;
}
bool seq_split::is_rcat(expr* e, expr*& s, expr*& r) const {
if (!is_app(e) || to_app(e)->get_decl() != m_d_rcat)
return false;
s = to_app(e)->get_arg(0);
r = to_app(e)->get_arg(1);
return true;
}
bool seq_split::is_frontier(expr* e) const {
expr *a = nullptr, *b = nullptr;
return is_empty_ss(e) || is_single(e, a, b) || is_union(e, a, b);
}
seq_util& seq_split::seq() const { return m_rw.u(); }
seq_util::rex& seq_split::re() const { return m_rw.u().re; }
// Add <d, n> unless the (optional) lookahead oracle prunes it.
void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
if (!oracle || oracle(d, n))
out.push_back(split_pair(d, n, m));
}
// Cross-product intersection of two split-sets (split algebra):
// S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
// Pairs where any component is bottom (the empty regex) are dropped.
bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
unsigned threshold, split_oracle const& oracle) const {
const seq_util::rex& r = re();
for (auto const& p1 : s1) {
for (auto const& p2 : s2) {
if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
continue;
const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m);
const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m);
push(result, oracle, di, ni);
if (result.size() > threshold)
return false;
}
}
return true;
}
// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold
// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
// split-set, since each ~sp[i] further constrains ~S.
bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result,
const unsigned threshold, split_oracle const& oracle) const {
seq_util::rex& r = re();
sort* re_sort = r.mk_re(seq_sort);
const expr_ref full(r.mk_full_seq(re_sort), m); // .*
if (sp.empty()) { // ~{} = <.*, .*>
push(result, oracle, full, full);
return true;
}
// The acc/next pairs carry genuine output-orientation N components (the De
// Morgan ~<D,N> = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and
// keeps the 2^|sp| fold from blowing up.
split_set acc;
push(acc, oracle, r.mk_complement(sp[0].m_d), full);
push(acc, oracle, full, r.mk_complement(sp[0].m_n));
for (unsigned i = 1; i < sp.size(); ++i) {
split_set next;
push(next, oracle, r.mk_complement(sp[i].m_d), full);
push(next, oracle, full, r.mk_complement(sp[i].m_n));
split_set tmp;
if (!intersect(acc, next, tmp, threshold, oracle))
return false;
acc = std::move(tmp);
if (acc.empty()) // intersection empty => ~S is empty
break;
if (acc.size() > threshold)
return false;
}
result.append(acc);
return true;
}
// One level of the sigma rules. Mirrors the historic eager `compute`, except it
// emits *suspended* split-algebra terms (from_re / lcat / rcat / inter / compl) for
// the subterms instead of recursing. `mode` is irrelevant here: weak vs. strong is
// decided when `head_normalize` reaches an inter / compl node.
expr_ref seq_split::expand_fromre(expr* r, bool& ok) {
ok = true;
seq_util& sq = seq();
seq_util::rex& rex = re();
sort* seq_sort = nullptr;
if (!sq.is_re(r, seq_sort)) {
ok = false;
return expr_ref(m);
}
ensure_decls(seq_sort);
// bottom: sigma(empty) = {}
if (rex.is_empty(r))
return mk_empty();
// epsilon: sigma(eps) = { <eps, eps> }
if (rex.is_epsilon(r)) {
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
return mk_single(eps, eps);
}
expr* a = nullptr, *b = nullptr;
// to_re(s): split the literal word s at every position.
expr* s = nullptr;
if (rex.is_to_re(r, s)) {
zstring str;
vector<expr*> stack;
stack.push_back(s);
while (!stack.empty()) {
expr* cur = stack.back();
stack.pop_back();
if (seq().str.is_concat(cur, a, b)) {
stack.push_back(b);
stack.push_back(a);
}
else {
expr* ch;
unsigned cv;
if (seq().str.is_unit(cur, ch) && seq().is_const_char(ch, cv)) {
str += zstring(cv);
continue;
}
zstring str2;
if (sq.str.is_string(s, str2)) {
str = str2;
continue;
}
// not a constant string; unsupported for now
ok = false;
return expr_ref(m);
}
}
expr_ref acc = mk_empty();
for (unsigned i = 0; i <= str.length(); ++i) {
const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), m);
const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), m);
acc = mk_union(acc, mk_single(p, q));
}
return acc;
}
// single-character class alpha (., [lo-hi], of_pred):
// sigma(alpha) = { <eps, alpha>, <alpha, eps> }
if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
const expr_ref ex(r, m);
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
return mk_union(mk_single(eps, ex), mk_single(ex, eps));
}
// .* : sigma(.*) = { <.*, .*> }
if (rex.is_full_seq(r)) {
const expr_ref ex(r, m);
return mk_single(ex, ex);
}
// union: sigma(r0 | ... | r_{n-1}) = U from_re(ri) (re.union may be n-ary)
if (rex.is_union(r)) {
app* ap = to_app(r);
expr_ref acc = mk_empty();
for (expr* arg : *ap) {
acc = mk_union(acc, mk_fromre(arg));
}
return acc;
}
// concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
// emitted as U_i lcat(left, rcat(from_re(ri), right)) (re.++ may be n-ary)
if (rex.is_concat(r)) {
app* ap = to_app(r);
const unsigned n = ap->get_num_args();
expr_ref acc = mk_empty();
for (unsigned i = 0; i < n; ++i) {
expr_ref left(m), right(m);
if (i == 0)
left = rex.mk_epsilon(seq_sort);
else {
for (unsigned j = 0; j < i; ++j) {
expr* arg = ap->get_arg(j);
left = left ? expr_ref(rex.mk_concat(left, arg), m) : expr_ref(arg, m);
}
}
if (i == n - 1)
right = rex.mk_epsilon(seq_sort);
else {
right = ap->get_arg(i + 1);
for (unsigned j = i + 2; j < n; ++j) {
expr* arg = ap->get_arg(j);
right = rex.mk_concat(right, arg);
}
}
expr_ref term = mk_lcat(left, mk_rcat(mk_fromre(ap->get_arg(i)), right));
acc = mk_union(acc, term);
}
return acc;
}
// star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
if (rex.is_star(r, a)) {
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
expr_ref body = mk_lcat(r, mk_rcat(mk_fromre(a), r)); // a*.from_re(a).a*
return mk_union(mk_single(eps, eps), body);
}
// plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without <eps,eps>)
if (rex.is_plus(r, a)) {
const expr_ref star(rex.mk_star(a), m); // a*
return mk_lcat(star, mk_rcat(mk_fromre(a), star));
}
// intersection: sigma(r0 & ... & r_{n-1}) = cap from_re(ri) (re.inter may be n-ary)
if (rex.is_intersection(r)) {
app* ap = to_app(r);
const unsigned n = ap->get_num_args();
expr_ref acc = mk_fromre(ap->get_arg(0));
for (unsigned i = 1; i < n; ++i)
acc = mk_inter(acc, mk_fromre(ap->get_arg(i)));
return acc;
}
// complement: sigma(~a) = ~sigma(a).
if (rex.is_complement(r, a))
return mk_compl(mk_fromre(a));
// difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b).
if (rex.is_diff(r, a, b))
return mk_inter(mk_fromre(a), mk_compl(mk_fromre(b)));
// bounded loop / ite / other: not handled (paper "v1: bail").
TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, m) << "\n";);
ok = false;
return expr_ref(m);
}
// r . hs : push the left regex onto the D component of a head-normal split-set.
expr_ref seq_split::distribute_lcat(expr* r, expr* hs) {
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
if (is_empty_ss(hs))
return mk_empty();
if (is_single(hs, d, n))
return mk_single(m_rw.mk_re_append(r, d), n); // r.D
if (is_union(hs, a, b))
return mk_union(mk_lcat(r, a), mk_lcat(r, b));
UNREACHABLE();
return expr_ref(hs, m);
}
// hs . r : push the right regex onto the N component of a head-normal split-set.
expr_ref seq_split::distribute_rcat(expr* hs, expr* r) {
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
if (is_empty_ss(hs))
return mk_empty();
if (is_single(hs, d, n))
return mk_single(d, m_rw.mk_re_append(n, r)); // N.r
if (is_union(hs, a, b))
return mk_union(mk_rcat(a, r), mk_rcat(b, r));
UNREACHABLE();
return expr_ref(hs, m);
}
expr_ref seq_split::from_split_set(split_set const& s) {
expr_ref acc = mk_empty();
for (auto const& p : s)
acc = mk_union(acc, mk_single(p.m_d, p.m_n));
return acc;
}
expr_ref seq_split::head_normalize(expr* t, split_mode mode, unsigned threshold,
split_oracle const& oracle, bool& ok) {
ok = true;
expr *a = nullptr, *b = nullptr, *r = nullptr, *s = nullptr;
// already a frontier node
if (is_frontier(t))
return expr_ref(t, m);
// from_re(r): one level of sigma; recurse to settle a non-frontier head
// (plus / inter / compl / diff expand to lcat / inter / compl nodes).
if (is_fromre(t, r)) {
expr_ref e = expand_fromre(r, ok);
if (!ok)
return expr_ref(m);
if (is_frontier(e))
return e;
return head_normalize(e, mode, threshold, oracle, ok);
}
// r.S : head-normalize S, then distribute r over the frontier.
if (is_lcat(t, r, s)) {
expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
if (!ok)
return expr_ref(m);
return distribute_lcat(r, hs);
}
if (is_rcat(t, s, r)) {
expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
if (!ok)
return expr_ref(m);
return distribute_rcat(hs, r);
}
// inter / compl are eager by nature: a single split of S1 cap S2 (or ~S)
// cannot be produced without materializing the operand split-sets.
if (is_inter(t, a, b)) {
if (mode == split_mode::weak) {
ok = false;
return expr_ref(m);
}
split_set sa, sb, tmp;
if (!materialize(a, mode, threshold, oracle, sa) ||
!materialize(b, mode, threshold, oracle, sb) ||
!intersect(sa, sb, tmp, threshold, oracle)) {
ok = false;
return expr_ref(m);
}
return from_split_set(tmp);
}
if (is_compl(t, a)) {
if (mode == split_mode::weak) {
ok = false;
return expr_ref(m);
}
// The body is materialized WITHOUT the oracle (its pairs are inverted, so
// their N is unrelated to the output N); the oracle is re-applied in
// complement().
split_set sa, res;
if (!materialize(a, mode, threshold, split_oracle{}, sa) ||
!complement(m_seq_sort, sa, res, threshold, oracle)) {
ok = false;
return expr_ref(m);
}
return from_split_set(res);
}
UNREACHABLE();
ok = false;
return expr_ref(m);
}
bool seq_split::materialize(expr* node, split_mode mode, unsigned threshold,
split_oracle const& oracle, split_set& out) {
iterator it(*this, node, mode, threshold, oracle);
expr_ref d(m), n(m);
while (it.next(d, n))
out.push_back(split_pair(d, n, m));
return !it.gave_up();
}
expr_ref seq_split::make(expr* r) {
SASSERT(r);
sort* seq_sort = nullptr;
if (!seq().is_re(r, seq_sort))
return expr_ref(m);
return mk_fromre(r);
}
// --- Lazy enumerator --------------------------------------------------------
// The worklist holds suspended split-sets. Each next() pops a node, head-
// normalizes it to a frontier (empty | single | union), and either returns the
// single split, pushes the two union branches back, or skips an empty. All the
// expansion work happens lazily, one split per next() call.
seq_split::iterator::iterator(seq_split& engine, expr* node, split_mode mode,
unsigned threshold, split_oracle oracle) :
m_engine(engine), m(engine.m), m_mode(mode), m_threshold(threshold),
m_oracle(std::move(oracle)), m_work(engine.m) {
SASSERT(node);
m_work.push_back(node);
}
bool seq_split::iterator::next(expr_ref& out_d, expr_ref& out_n) {
if (m_giveup)
return false; // a prior give-up is sticky
while (!m_work.empty()) {
expr_ref t(m_work.back(), m);
m_work.pop_back();
bool ok = true;
expr_ref hn = m_engine.head_normalize(t, m_mode, m_threshold, m_oracle, ok);
if (!ok) {
m_giveup = true; // unsupported / weak Boolean / overrun
return false;
}
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
if (m_engine.is_empty_ss(hn))
continue;
if (m_engine.is_single(hn, d, n)) {
if (m_oracle && !m_oracle(d, n))
continue; // pruned by lookahead
if (++m_count > m_threshold) {
m_giveup = true; // safety cap against space bloat
return false;
}
out_d = d;
out_n = n;
return true;
}
if (m_engine.is_union(hn, a, b)) {
m_work.push_back(a);
m_work.push_back(b);
continue;
}
UNREACHABLE();
}
return false; // exhausted (m_giveup stays false)
}
seq_split::iterator seq_split::iterate(expr* node, split_mode mode, unsigned threshold,
split_oracle const& oracle) {
return iterator(*this, node, mode, threshold, oracle);
}
// Eager wrapper: drain the lazy enumeration into `out`. Semantics (give-up cases,
// oracle discipline) match the historic engine.
bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode,
split_oracle const& oracle) {
SASSERT(r);
sort* seq_sort = nullptr;
if (!seq().is_re(r, seq_sort))
return false;
expr_ref node = mk_fromre(r);
return materialize(node, mode, threshold, oracle, result);
}
// same-D / same-N merge (paper eqs. 1 & 2):
// { <D,N>, <D,N'> } -> <D, N | N'> (by_left = true, group by D)
// { <D,N>, <D',N> } -> <D | D', N> (by_left = false, group by N)
// Only fires on syntactically-identical (perfectly-shared) key components, so
// it is a conservative instance of the rule.
void seq_split::merge_by(split_set& pairs, const bool by_left) const {
obj_map<expr, unsigned> idx; // key component -> position in `out`
split_set out;
for (auto const& p : pairs) {
expr* key = by_left ? p.m_d.get() : p.m_n.get();
expr* other = by_left ? p.m_n.get() : p.m_d.get();
unsigned pos;
if (idx.find(key, pos)) {
expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), m);
if (by_left)
out[pos].m_n = u;
else
out[pos].m_d = u;
}
else {
idx.insert(key, out.size());
out.push_back(p);
}
}
pairs.swap(out);
}
void seq_split::simplify(split_set& pairs) const {
seq_util::rex& r = re();
// 1. drop pairs with a bottom (empty-language) component.
unsigned w = 0;
for (unsigned i = 0; i < pairs.size(); ++i) {
if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
continue;
if (w != i)
pairs[w] = pairs[i];
++w;
}
pairs.shrink(w);
if (pairs.size() <= 1)
return;
// 2. same-D / same-N merge rules.
merge_by(pairs, true);
merge_by(pairs, false);
if (pairs.size() <= 1)
return;
// 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
// L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative
// (returns true only for definite containment), so we never drop a
// needed split.
//if (pairs.size() > 64)
// return;
struct row { expr* d; expr* n; unsigned idx; };
vector<row> rows;
for (unsigned i = 0; i < pairs.size(); ++i)
rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
auto subsumes = [&](row const& a, row const& b) {
return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
};
vector<row> kept;
for (row const& row_r : rows) {
bool redundant = false;
for (row const& k : kept)
if (subsumes(k, row_r)) { redundant = true; break; }
if (redundant)
continue;
// drop already-kept rows strictly subsumed by row_r
unsigned kw = 0;
for (unsigned t = 0; t < kept.size(); ++t) {
if (subsumes(row_r, kept[t]))
continue;
kept[kw++] = kept[t];
}
kept.shrink(kw);
kept.push_back(row_r);
}
split_set result;
for (row const& k : kept)
result.push_back(pairs[k.idx]);
pairs.swap(result);
}
std::pair<expr_ref, expr_ref> seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
expr_ref_vector tokens(m);
vector<expr*> stack;
stack.push_back(str);
while (!stack.empty()) {
expr* cur = stack.back();
stack.pop_back();
expr* l, *r;
if (seq().str.is_concat(cur, l, r)) {
stack.push_back(r);
stack.push_back(l);
}
else
tokens.push_back(expr_ref(cur, m));
}
expr* ch;
unsigned i = 0;
while (i < tokens.size() && (seq().str.is_string(tokens.get(i)) || (seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch)))) {
zstring s;
if (seq().str.is_string(tokens.get(i), s)) {
if (s.empty()) {
i++;
continue;
}
ch = seq().mk_char(s[0]);
tokens[i] = seq().str.mk_string(s.extract(1, s.length() - 1));
}
else
i++;
regex = m_rw.mk_derivative(ch, regex);
}
if (i > 0) {
unsigned j = 0;
for (; i < tokens.size(); i++, j++) {
tokens[j] = tokens.get(i);
}
tokens.shrink(j);
}
// TODO: Do this for the back as well (also, why did no rule before do that?)
if (tokens.empty())
return { expr_ref(m), expr_ref(m) };
// Choose the factorization boundary so the tail starts with the
// longest run of concrete characters c.
// This gives the split-engine lookahead oracle the most pruning information.
// head = u' (tokens before the run), tail = c · u''' (tokens from the run onward).
const unsigned total = tokens.size();
unsigned run_start = 0, run_len = 0;
for (i = 1; i < total; ) {
if (!(seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch))) {
i++;
continue;
}
unsigned j = i;
while (j < total && seq().str.is_unit(tokens.get(j), ch) && seq().is_const_char(ch)) {
j++;
}
if (j - i > run_len) {
run_len = j - i;
run_start = i;
}
i = j;
}
// No constant run => fall back to splitting off the first token.
const unsigned p = run_len == 0 ? 1 : run_start;
SASSERT(p >= 1);
expr* head = tokens.get(0);
for (i = 1; i < p; i++) {
head = seq().str.mk_concat(head, tokens.get(i));
}
expr* tail = seq().str.mk_empty(head->get_sort());
if (tokens.size() > p + run_len) {
tail = tokens.get(p + run_len);
for (i = p + run_len + 1; i < tokens.size(); i++) {
tail = seq().str.mk_concat(tail, tokens.get(i));
}
}
SASSERT(head && tail);
// Build the constant lookahead c and (if non-empty) an oracle that
// prunes splits whose postfix cannot match c.
zstring c;
for (i = 0; i < run_len; ++i) {
unsigned cv;
VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch));
VERIFY(seq().is_const_char(ch, cv));
c = c + zstring(cv);
}
split_oracle oracle;
if (!c.empty())
oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); };
// Decompose the regex into a split-set via the shared seq_split engine
if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) {
result.clear();
return { expr_ref(m), expr_ref(m) };
}
simplify(result);
// Eagerly consume the constant run c from the tail by taking the c-derivative
// of each postfix
if (!c.empty()) {
unsigned w = 0;
for (i = 0; i < result.size(); ++i) {
expr* d = result[i].m_n;
for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) {
d = m_rw.mk_derivative(seq().mk_char(c[k]), d);
}
SASSERT(d);
if (re().is_empty(d))
continue; // postfix can't start with c => infeasible split, drop
result[w++] = split_pair(result[i].m_d, d, m);
}
result.shrink(w);
}
return { expr_ref(head, m), expr_ref(tail, m) };
}
bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const {
SASSERT(regex);
for (unsigned i = 0; i < c.length(); i++) {
if (m.is_true(m_rw.is_nullable(regex)))
return true; // N accepts the prefix c[0..i) => a suffix completes it
regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex);
SASSERT(regex);
if (re().is_empty(regex))
return false; // N went (syntactically) dead before reaching c
}
return !re().is_empty(regex);
}

View file

@ -0,0 +1,225 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_split.h
Abstract:
Regex split decomposition: the split function sigma from the paper
"Solving by Splitting". For a regular expression r, sigma(r) is a finite
"split-set" of pairs { <D_i, N_i> } such that
u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i).
The split algebra (intersection, De Morgan complement, left/right
concatenation with a regex) and the cardinality-reducing simplification
heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
follow the paper.
Author:
Clemens Eisenhofer 2026-6-10
--*/
#pragma once
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_subset.h"
#include <functional>
class seq_rewriter;
// An individual split <D, N>: the left (prefix) regex D and right (suffix)
// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N).
struct split_pair {
expr_ref m_d;
expr_ref m_n;
split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
SASSERT(d && n);
}
};
// A split-set is a union of individual splits.
typedef vector<split_pair> split_set;
// Controls how aggressively sigma expands the Boolean-closure cases:
// strong - fully expand complement / intersection via the split algebra
// (De Morgan / cross product). This is the behaviour the nseq
// solver relies on.
// weak - do not perform the (potentially 2^k) Boolean-closure expansion;
// give up (return false) on complement / intersection instead.
enum class split_mode { weak, strong };
// Optional lookahead oracle. Called for each candidate split <D, N> as it is
// generated; returns true to keep it, false to prune it. An empty oracle (the
// default) keeps everything, so sigma is unchanged. See seq_split::compute.
typedef std::function<bool(expr* D, expr* N)> split_oracle;
class seq_split {
ast_manager& m;
seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access
seq_subset m_subset; // language-subset checks for subsumption
// --- Suspended split-set representation -------------------------------
// A split-set computation is kept as an `expr` term over a small family of
// locally-declared, uninterpreted function symbols (the split algebra of the
// paper / split-algebra.md). Nothing here is ever asserted to the solver;
// the terms are only used as scratch structure to drive lazy expansion.
//
// empty : SplitSet -- {} (bottom)
// single : Re x Re -> SplitSet -- a single split <D, N>
// from_re : Re -> SplitSet -- the *suspended* sigma(r)
// union : SplitSet x SplitSet -> SplitSet
// inter : SplitSet x SplitSet -> SplitSet
// compl : SplitSet -> SplitSet
// lcat : Re x SplitSet -> SplitSet -- r . S (left-concat onto D)
// rcat : SplitSet x Re -> SplitSet -- S . r (right-concat onto N)
sort* m_seq_sort = nullptr; // sequence sort the decls are built for
sort_ref m_set_sort; // the uninterpreted SplitSet sort
func_decl_ref m_d_empty, m_d_single, m_d_fromre, m_d_union,
m_d_inter, m_d_compl, m_d_lcat, m_d_rcat;
expr_ref m_empty_app; // cached nullary `empty` term
seq_util& seq() const;
seq_util::rex& re() const;
// (Re)build the local declarations for `seq_sort` if not already current.
void ensure_decls(sort* seq_sort);
// Smart constructors: apply the cheap normalizations the eager engine relies
// on (drop-bottom, eps cancellation, union absorption of empty).
expr_ref mk_empty();
expr_ref mk_single(expr* d, expr* n);
expr_ref mk_fromre(expr* r);
expr_ref mk_union(expr* a, expr* b);
expr_ref mk_inter(expr* a, expr* b);
expr_ref mk_compl(expr* a);
expr_ref mk_lcat(expr* r, expr* s);
expr_ref mk_rcat(expr* s, expr* r);
// Recognizers over the local decls.
bool is_empty_ss(expr* e) const;
bool is_single(expr* e, expr*& d, expr*& n) const;
bool is_fromre(expr* e, expr*& r) const;
bool is_union (expr* e, expr*& a, expr*& b) const;
bool is_inter (expr* e, expr*& a, expr*& b) const;
bool is_compl (expr* e, expr*& a) const;
bool is_lcat (expr* e, expr*& r, expr*& s) const;
bool is_rcat (expr* e, expr*& s, expr*& r) const;
// A term whose head is empty | single | union (ready for the worklist loop).
bool is_frontier(expr* e) const;
// One level of the sigma rules: from_re(r) -> a SplitSet term built from the
// immediate subterms. `ok` is set false on an unsupported shape.
expr_ref expand_fromre(expr* r, bool& ok);
// Distribute a left/right concatenation over a head-normal split-set.
expr_ref distribute_lcat(expr* r, expr* hs);
expr_ref distribute_rcat(expr* hs, expr* r);
// Materialized split-set -> a `union` of `single`s.
expr_ref from_split_set(split_set const& s);
// Reduce `t` until its head is empty | single | union (one outermost level
// for the lazy nodes; inter/compl are expanded eagerly via `materialize`,
// since the paper's De Morgan / cross-product cannot yield a split lazily).
// `ok` is set false on a give-up (unsupported shape, weak-mode Boolean, or
// threshold overrun).
expr_ref head_normalize(expr* t, split_mode mode, unsigned threshold,
split_oracle const& oracle, bool& ok);
// Fully drain a suspended split-set into `out` (used for inter/compl bodies).
// Runs an `iterator` to exhaustion; returns false on a give-up.
bool materialize(expr* node, split_mode mode, unsigned threshold,
split_oracle const& oracle, split_set& out);
// Push <d, n> onto `out`, unless `oracle` rejects it.
void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const;
// S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
// component (and any rejected by `oracle`). Returns false on threshold overrun.
bool intersect(split_set const& s1, split_set const& s2, split_set& result,
unsigned threshold, split_oracle const& oracle) const;
// De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
bool complement(sort* seq_sort, split_set const& sp, split_set& result,
unsigned threshold, split_oracle const& oracle) const;
// same-D / same-N merge: groups pairs that share a (syntactically identical)
// left (resp. right) component and unions the other component.
void merge_by(split_set& pairs, bool by_left) const;
public:
explicit seq_split(seq_rewriter& rw);
// Lazy split enumerator. Holds the suspended split-set worklist and produces
// the concrete splits <D, N> one at a time, on demand, instead of computing
// them all up front. Obtain one from seq_split::iterate (or construct it
// directly) and pull splits with next() until it returns false; gave_up() then
// tells a normal exhaustion (false) apart from a give-up (true).
//
// The threshold is supplied by the caller and serves only as a safety cap
// against space bloat (lazy expansion still has to materialize the operands of
// intersection / complement). A threshold overrun, an unsupported regex shape,
// or a Boolean-closure case in weak mode aborts the enumeration: next() returns
// false and gave_up() returns true. To stop early, simply stop calling next().
//
// `oracle` (optional) prunes non-viable splits as they are produced. It must
// be sound to apply per split: a candidate N can still gain a prefix from a
// factor appended to its right later (concat/star), so the oracle must use a
// "prefix-compatible" test (prune only when N can never match the lookahead,
// even partially), NOT a strict "starts-with" test. The complement body is
// expanded WITHOUT the oracle (inverted orientation); the oracle is re-applied
// to the complement's output fold.
class iterator {
seq_split& m_engine;
ast_manager& m;
split_mode m_mode;
unsigned m_threshold;
split_oracle m_oracle;
expr_ref_vector m_work; // GC-safe worklist of suspended split-sets
unsigned m_count = 0; // splits produced so far (vs. threshold)
bool m_giveup = false;
public:
iterator(seq_split& engine, expr* node, split_mode mode,
unsigned threshold, split_oracle oracle);
// Compute the next split. On success returns true and sets <d, n>; on
// exhaustion or give-up returns false (see gave_up()). Calling next()
// again after it has returned false keeps returning false.
bool next(expr_ref& d, expr_ref& n);
// Valid after next() has returned false: true iff the enumeration aborted
// (unsupported regex / weak-mode Boolean / threshold overrun) rather than
// running out of splits.
bool gave_up() const { return m_giveup; }
};
// Build the *suspended* sigma(r) as a split-algebra term (no expansion).
// Returns null on a non-regex argument. Drive it with `iterate`.
expr_ref make(expr* r);
// Create a lazy enumerator over a suspended split-set `node` (typically the
// result of make()). See `iterator` for the meaning of the arguments.
iterator iterate(expr* node, split_mode mode, unsigned threshold,
split_oracle const& oracle = {});
// Compute sigma(r), appending to `out` (does not clear it). Thin eager
// wrapper that drains an `iterator` to exhaustion; semantics match the historic
// engine. See `iterator` for the meaning of `threshold`, `mode`, and `oracle`.
bool compute(expr* r, split_set& out, unsigned threshold,
split_mode mode = split_mode::strong, split_oracle const& oracle = {});
// In-place simplification of a split-set: drop bottom components, apply the
// same-D / same-N merge rules, and drop splits subsumed by another (using
// seq_subset). Size-capped to keep the O(n^2) subsumption affordable.
void simplify(split_set& s) const;
// decompose a membership constraint into a set of pairs of regex splits
std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const;
// Lookahead oracle for the split engine: is the split's right component
// `n_regex` prefix-compatible with the constant character sequence `c`?
// This is sound to apply during split generation — it never drops a viable split.
// Thus, it might not eliminate all cases in order to stay sound
bool split_lookahead_viable(expr* regex, zstring const& c) const;
};

View file

@ -138,6 +138,8 @@ def_module_params(module_name='smt',
('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'),
('seq.max_unfolding', UINT, 1000000000, 'maximal unfolding depth for checking string equations and regular expressions'),
('seq.min_unfolding', UINT, 1, 'initial bound for strings whose lengths are bounded by iterative deepening. Set this to a higher value if there are only models with larger string lengths'),
('seq.regex_factorization_threshold', UINT, 10, 'maximum number of cases to factor a regex into in a single step'),
('seq.regex_factorization_enabled', BOOL, False, 'apply regex factorization (sigma splitting)'),
('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'),
('sls.enable', BOOL, False, 'enable sls co-processor with SMT engine'),
('sls.parallel', BOOL, True, 'use sls co-processor in parallel or sequential with SMT engine'),

View file

@ -23,4 +23,6 @@ void theory_seq_params::updt_params(params_ref const & _p) {
m_seq_validate = p.seq_validate();
m_seq_max_unfolding = p.seq_max_unfolding();
m_seq_min_unfolding = p.seq_min_unfolding();
m_seq_regex_factorization_enabled = p.seq_regex_factorization_enabled();
m_seq_regex_factorization_threshold = p.seq_regex_factorization_threshold();
}

View file

@ -26,6 +26,8 @@ struct theory_seq_params {
bool m_seq_validate = false;
unsigned m_seq_max_unfolding = UINT_MAX/4;
unsigned m_seq_min_unfolding = 1;
bool m_seq_regex_factorization_enabled = false;
unsigned m_seq_regex_factorization_threshold = 1;
theory_seq_params(params_ref const & p = params_ref()) {
updt_params(p);

View file

@ -128,6 +128,30 @@ namespace smt {
return;
}
if (th.get_fparams().m_seq_regex_factorization_enabled) {
unsigned threshold = th.get_fparams().m_seq_regex_factorization_threshold;
if (threshold == 0)
threshold = UINT_MAX;
split_set result;
auto [head, tail] = seq_rw().split_membership(s, r, threshold, result);
if (head) {
SASSERT(tail);
// propagate all cases
expr_ref_vector cases(m);
expr_ref_vector branches(m);
for (auto [pre, post] : result) {
expr_ref mem_head(re().mk_in_re(head, pre), m);
expr_ref mem_tail(re().mk_in_re(tail, post), m);
cases.push_back(m.mk_and(mem_head, mem_tail));
}
const expr_ref cases_expr(m.mk_or(cases), m);
ctx.internalize(cases_expr, false);
th.propagate_lit(nullptr, 1, &lit, ctx.get_literal(cases_expr));
return;
}
// fallthrough; decomposition failed
}
// Convert a non-ground sequence into an additional regex and
// strengthen the original regex constraint into an intersection
// for example:

View file

@ -138,6 +138,7 @@ add_executable(test-z3
simplifier.cpp
sls_test.cpp
sls_seq_plugin.cpp
seq_split.cpp
small_object_allocator.cpp
smt2print_parse.cpp
smt_context.cpp

View file

@ -197,6 +197,7 @@
X(ho_matcher) \
X(finite_set) \
X(finite_set_rewriter) \
X(seq_split) \
X(fpa) \
X(seq_regex_bisim) \
X(term_enumeration) \

450
src/test/seq_split.cpp Normal file
View file

@ -0,0 +1,450 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_split.cpp
Abstract:
Unit tests for the regex split engine (the split function sigma) in ast/rewriter/seq_split.cpp.
Author:
Clemens Eisenhofer 2026-6-22
--*/
#include "ast/ast.h"
#include "ast/reg_decl_plugins.h"
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/rewriter/seq_split.h"
#include <set>
#include <utility>
struct plugin_registrar {
plugin_registrar(ast_manager& m) { reg_decl_plugins(m); }
};
class seq_split_test {
ast_manager m;
plugin_registrar m_reg;
seq_rewriter m_rw;
seq_split m_split;
seq_util u;
sort_ref m_str; // the sequence (String) sort
sort_ref m_re; // the RegEx sort over m_str
seq_util::rex& re() { return u.re; }
expr_ref eps() { return expr_ref(re().mk_epsilon(m_str), m); } // mk_epsilon takes the seq sort
expr_ref dot() { return expr_ref(re().mk_full_char(m_re), m); } // mk_full_char takes the RegEx sort
expr_ref dotstar() { return expr_ref(re().mk_full_seq(m_re), m); } // .*
expr_ref empty_re() { return expr_ref(re().mk_empty(m_re), m); } // the bottom regex
expr_ref rappend(expr* a, expr* b) { return m_rw.mk_re_append(a, b); } // the engine's regex concat
expr_ref word(char const* s) { return expr_ref(re().mk_to_re(u.str.mk_string(zstring(s))), m); }
expr_ref rng(char lo, char hi) {
return expr_ref(re().mk_range(u.str.mk_string(zstring(std::string(1, lo).c_str())),
u.str.mk_string(zstring(std::string(1, hi).c_str()))), m);
}
typedef std::set<std::pair<expr*, expr*>> pair_set;
pair_set as_set(split_set const& s) {
pair_set out;
for (auto const& p : s)
out.insert({ p.m_d.get(), p.m_n.get() });
return out;
}
bool eager(expr* r, split_set& out, unsigned threshold = UINT_MAX,
split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
return m_split.compute(r, out, threshold, mode, oracle);
}
bool lazy(expr* r, split_set& out, unsigned threshold = UINT_MAX,
split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
expr_ref node = m_split.make(r);
ENSURE(node);
seq_split::iterator it = m_split.iterate(node, mode, threshold, oracle);
expr_ref d(m), n(m);
while (it.next(d, n))
out.push_back(split_pair(d, n, m));
return !it.gave_up();
}
// assert that the eager and lazy engines agree on sigma(r) as a *set* of
// splits, and report the common cardinality.
unsigned check_agree(expr* r) {
split_set se, sl;
bool oke = eager(r, se);
bool okl = lazy(r, sl);
ENSURE(oke == okl);
if (!oke)
return 0;
ENSURE(as_set(se) == as_set(sl));
return (unsigned)as_set(se).size();
}
public:
seq_split_test() : m_reg(m), m_rw(m), m_split(m_rw), u(m), m_str(m), m_re(m) {
m_str = u.str.mk_string_sort();
m_re = re().mk_re(m_str);
}
void test_eager_epsilon() {
split_set s;
ENSURE(eager(eps(), s));
ENSURE(as_set(s) == pair_set({ { eps().get(), eps().get() } }));
}
void test_eager_char() {
// sigma(.) = { <eps, .>, <., eps> }
expr_ref a = dot();
split_set s;
ENSURE(eager(a, s));
pair_set expected({ { eps().get(), a.get() }, { a.get(), eps().get() } });
ENSURE(as_set(s) == expected);
}
void test_eager_word() {
// sigma("ab") = { <"", "ab">, <"a","b">, <"ab",""> }
split_set s;
ENSURE(eager(word("ab"), s));
pair_set expected({
{ word("").get(), word("ab").get() },
{ word("a").get(), word("b").get() },
{ word("ab").get(), word("").get() },
});
ENSURE(as_set(s) == expected);
}
void test_eager_union() {
// sigma(a | b) = sigma(a) cup sigma(b)
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
expr_ref u_re(re().mk_union(a, b), m);
split_set s;
ENSURE(eager(u_re, s));
pair_set expected({
{ eps().get(), a.get() }, { a.get(), eps().get() },
{ eps().get(), b.get() }, { b.get(), eps().get() },
});
ENSURE(as_set(s) == expected);
}
void test_agree_all() {
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
expr_ref star(re().mk_star(a), m);
expr_ref plus(re().mk_plus(a), m);
expr_ref concat(re().mk_concat(a, b), m);
expr_ref uni(re().mk_union(a, b), m);
expr_ref inter(re().mk_inter(re().mk_star(a), re().mk_star(b)), m);
expr_ref compl_(re().mk_complement(re().mk_star(a)), m);
expr_ref diff(re().mk_diff(re().mk_star(a), re().mk_star(b)), m);
ENSURE(check_agree(eps()) == 1);
ENSURE(check_agree(a) == 2);
ENSURE(check_agree(word("ab")) == 3);
ENSURE(check_agree(uni) == 4);
ENSURE(check_agree(star) == 3); // { <eps,eps>, <a*, a.a*>, <a*.a, a*> }
(void)check_agree(plus);
(void)check_agree(concat);
(void)check_agree(inter); // strong-mode intersection
(void)check_agree(compl_); // strong-mode De Morgan complement
(void)check_agree(diff);
}
void test_lazy_early_stop() {
// a* has 3 splits; pull just the first one and then stop. (Note .* is the
// full_seq special case with a single split, so use a proper char-class body.)
expr_ref star(re().mk_star(rng('a', 'a')), m);
expr_ref node = m_split.make(star);
ENSURE(node);
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
expr_ref d(m), n(m);
unsigned seen = 0;
if (it.next(d, n)) // pull exactly one split, then walk away
++seen;
ENSURE(!it.gave_up()); // stopping early is not a give-up
ENSURE(seen == 1);
}
void test_threshold_giveup() {
expr_ref star(re().mk_star(rng('a', 'a')), m); // 3 splits
split_set s;
ENSURE(!lazy(star, s, /*threshold*/ 1));
// the eager wrapper honours the same cap
split_set s2;
ENSURE(!eager(star, s2, /*threshold*/ 1));
}
void test_weak_vs_strong() {
expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
expr_ref compl_(re().mk_complement(re().mk_star(dot())), m);
split_set s;
ENSURE(!eager(inter, s, UINT_MAX, split_mode::weak));
s.reset();
ENSURE(!lazy(inter, s, UINT_MAX, split_mode::weak));
s.reset();
ENSURE(!eager(compl_, s, UINT_MAX, split_mode::weak));
s.reset();
ENSURE(!lazy(compl_, s, UINT_MAX, split_mode::weak));
// strong mode succeeds for both
s.reset();
ENSURE(eager(inter, s, UINT_MAX, split_mode::strong));
s.reset();
ENSURE(eager(compl_, s, UINT_MAX, split_mode::strong));
}
void test_make_non_regex() {
expr_ref not_a_regex(u.str.mk_string(zstring("a")), m); // String, not RegEx
expr_ref node = m_split.make(not_a_regex);
ENSURE(!node);
}
void test_oracle_prunes() {
// sigma(.) without an oracle = { <eps,.>, <.,eps> }; an oracle that keeps
// only splits whose suffix is epsilon must drop one of the two.
expr_ref a = dot();
expr_ref e = eps();
split_oracle keep_eps_suffix = [&](expr*, expr* n) { return n == e.get(); };
split_set se, sl;
ENSURE(eager(a, se, UINT_MAX, split_mode::strong, keep_eps_suffix));
ENSURE(lazy(a, sl, UINT_MAX, split_mode::strong, keep_eps_suffix));
pair_set expected({ { a.get(), e.get() } });
ENSURE(as_set(se) == expected);
ENSURE(as_set(sl) == expected);
}
void test_eager_full_seq() {
// sigma(.*) = { <.*, .*> }
expr_ref ds = dotstar();
split_set s;
ENSURE(eager(ds, s));
ENSURE(as_set(s) == pair_set({ { ds.get(), ds.get() } }));
}
void test_eager_bottom() {
// sigma(empty) = {}
split_set s;
ENSURE(eager(empty_re(), s));
ENSURE(s.empty());
split_set sl;
ENSURE(lazy(empty_re(), sl));
ENSURE(sl.empty());
}
void test_eager_empty_word() {
// sigma(to_re("")) = { <"", ""> } (a single, trivial split)
split_set s;
ENSURE(eager(word(""), s));
ENSURE(as_set(s) == pair_set({ { word("").get(), word("").get() } }));
}
void test_eager_star_content() {
// sigma(a*) = { <eps,eps>, <a*.eps, a.a*>, <a*.a, eps.a*> }
expr_ref a = rng('a', 'a');
expr_ref as(re().mk_star(a), m);
split_set s;
ENSURE(eager(as, s));
pair_set expected({
{ eps().get(), eps().get() },
{ rappend(as, eps()).get(), rappend(a, as).get() },
{ rappend(as, a).get(), rappend(eps(), as).get() },
});
ENSURE(as_set(s) == expected);
}
void test_eager_plus_content() {
// sigma(a+) = a*.sigma(a).a* (the star rule without <eps,eps>)
expr_ref a = rng('a', 'a');
expr_ref as(re().mk_star(a), m);
expr_ref ap(re().mk_plus(a), m);
split_set s;
ENSURE(eager(ap, s));
pair_set expected({
{ rappend(as, eps()).get(), rappend(a, as).get() },
{ rappend(as, a).get(), rappend(eps(), as).get() },
});
ENSURE(as_set(s) == expected);
}
void test_eager_concat_content() {
// sigma(a.b) = sigma(a).b cup a.sigma(b)
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
expr_ref ab(re().mk_concat(a, b), m);
split_set s;
ENSURE(eager(ab, s));
pair_set expected({
{ eps().get(), rappend(a, b).get() }, // <eps, a.b>
{ a.get(), rappend(eps(), b).get() }, // <a, eps.b>
{ rappend(a, eps()).get(), b.get() }, // <a.eps, b>
{ rappend(a, b).get(), eps().get() }, // <a.b, eps>
});
ENSURE(as_set(s) == expected);
}
void test_nary_union() {
// sigma(a|b|c) has 2 splits per char-class
expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
expr_ref u3(re().mk_union(a, re().mk_union(b, c)), m);
ENSURE(check_agree(u3) == 6);
}
void test_nary_concat() {
// sigma(a.b.c)
expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
expr_ref c3(re().mk_concat(a, re().mk_concat(b, c)), m);
ENSURE(check_agree(c3) >= 4);
}
void test_nested_complement() {
// sigma(~~(a*))
expr_ref cc(re().mk_complement(re().mk_complement(re().mk_star(rng('a', 'a')))), m);
(void)check_agree(cc);
}
void test_determinism() {
expr_ref r(re().mk_concat(rng('a', 'a'), re().mk_star(rng('b', 'b'))), m);
split_set s1, s2;
ENSURE(lazy(r, s1));
ENSURE(lazy(r, s2));
ENSURE(as_set(s1) == as_set(s2));
}
void test_threshold_boundary() {
expr_ref as(re().mk_star(rng('a', 'a')), m); // exactly 3 splits
split_set s;
ENSURE(eager(as, s));
unsigned k = (unsigned)as_set(s).size();
ENSURE(k == 3);
split_set ok_e, ok_l, bad_e, bad_l;
ENSURE(eager(as, ok_e, k));
ENSURE(lazy(as, ok_l, k));
ENSURE(!eager(as, bad_e, k - 1)); // one below threshold; give up
ENSURE(!lazy(as, bad_l, k - 1));
}
void test_early_stop_after_two() {
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits
expr_ref node = m_split.make(as);
ENSURE(node);
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
expr_ref d(m), n(m);
unsigned seen = 0;
while (seen < 2 && it.next(d, n)) // pull two splits on demand, then stop
++seen;
ENSURE(!it.gave_up());
ENSURE(seen == 2);
}
void test_iterator_exhaustion() {
// Pull every split on demand; gave_up() must stay false on a clean
// exhaustion, and next() must keep returning false once drained.
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits
expr_ref node = m_split.make(as);
ENSURE(node);
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
expr_ref d(m), n(m);
unsigned seen = 0;
while (it.next(d, n))
++seen;
ENSURE(seen == 3);
ENSURE(!it.gave_up());
// idempotent past the end
ENSURE(!it.next(d, n));
ENSURE(!it.gave_up());
}
void test_iterator_giveup() {
// A threshold overrun aborts: next() returns false and gave_up() is true.
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits, cap at 1
expr_ref node = m_split.make(as);
ENSURE(node);
seq_split::iterator it = m_split.iterate(node, split_mode::strong, /*threshold*/ 1, {});
expr_ref d(m), n(m);
unsigned seen = 0;
while (it.next(d, n))
++seen;
ENSURE(it.gave_up()); // aborted, not a clean exhaustion
ENSURE(seen <= 1); // produced at most the capped number
// A weak-mode Boolean closure is likewise a give-up.
expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
expr_ref inode = m_split.make(inter);
ENSURE(inode);
seq_split::iterator wit = m_split.iterate(inode, split_mode::weak, UINT_MAX, {});
ENSURE(!wit.next(d, n));
ENSURE(wit.gave_up());
}
void test_simplify() {
expr_ref regs[] = {
expr_ref(re().mk_star(rng('a', 'a')), m),
expr_ref(re().mk_complement(re().mk_star(rng('a', 'a'))), m),
expr_ref(re().mk_concat(rng('a', 'a'), rng('b', 'b')), m),
};
for (auto& r : regs) {
split_set s;
ENSURE(eager(r, s));
unsigned before = (unsigned)s.size();
m_split.simplify(s);
ENSURE(s.size() <= before);
ENSURE(!s.empty());
// idempotent
split_set s2(s);
m_split.simplify(s2);
ENSURE(as_set(s) == as_set(s2));
}
}
void test_trivial_oracle() {
expr_ref r(re().mk_star(rng('a', 'a')), m);
split_oracle keep_all = [](expr*, expr*) { return true; };
split_set s_no, s_yes;
ENSURE(eager(r, s_no));
ENSURE(eager(r, s_yes, UINT_MAX, split_mode::strong, keep_all));
ENSURE(as_set(s_no) == as_set(s_yes));
}
void run() {
test_eager_epsilon();
test_eager_char();
test_eager_word();
test_eager_union();
test_agree_all();
test_lazy_early_stop();
test_threshold_giveup();
test_weak_vs_strong();
test_make_non_regex();
test_oracle_prunes();
test_eager_full_seq();
test_eager_bottom();
test_eager_empty_word();
test_eager_star_content();
test_eager_plus_content();
test_eager_concat_content();
test_nary_union();
test_nary_concat();
test_nested_complement();
test_determinism();
test_threshold_boundary();
test_early_stop_after_two();
test_iterator_exhaustion();
test_iterator_giveup();
test_simplify();
test_trivial_oracle();
}
};
void tst_seq_split() {
seq_split_test t;
t.run();
}