mirror of
https://github.com/Z3Prover/z3
synced 2026-07-01 12:58:54 +00:00
Porting seq_split to master (#9840)
Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
parent
c22a7bac7c
commit
b3143e759b
11 changed files with 1545 additions and 1 deletions
|
|
@ -41,6 +41,7 @@ z3_add_component(rewriter
|
|||
seq_eq_solver.cpp
|
||||
seq_derive.cpp
|
||||
seq_subset.cpp
|
||||
seq_split.cpp
|
||||
seq_derive.cpp
|
||||
seq_range_collapse.cpp
|
||||
seq_range_predicate.cpp
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ Notes:
|
|||
--*/
|
||||
#pragma once
|
||||
|
||||
#include "seq_split.h"
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_derive.h"
|
||||
#include "ast/ast_pp.h"
|
||||
|
|
@ -133,6 +134,7 @@ class seq_rewriter {
|
|||
|
||||
seq_util m_util;
|
||||
seq_subset m_subset;
|
||||
seq_split m_split;
|
||||
arith_util m_autil;
|
||||
bool_rewriter m_br;
|
||||
seq::derive m_derive;
|
||||
|
|
@ -332,7 +334,7 @@ class seq_rewriter {
|
|||
|
||||
public:
|
||||
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
|
||||
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this),
|
||||
m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), m_derive(m, *this), // m_re2aut(m),
|
||||
m_op_cache(m), m_es(m),
|
||||
m_lhs(m), m_rhs(m) {
|
||||
}
|
||||
|
|
@ -411,6 +413,20 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
// Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional)
|
||||
// prunes non-viable splits during generation.
|
||||
bool split(expr* r, split_set& out, unsigned threshold,
|
||||
const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
|
||||
return m_split.compute(r, out, threshold, mode, oracle);
|
||||
}
|
||||
|
||||
void simplify_split(split_set& s) { m_split.simplify(s); }
|
||||
|
||||
// decompose a membership constraint into a set of pairs of regex splits
|
||||
std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
|
||||
return m_split.split_membership(str, regex, threshold, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences
|
||||
*/
|
||||
|
|
|
|||
820
src/ast/rewriter/seq_split.cpp
Normal file
820
src/ast/rewriter/seq_split.cpp
Normal file
|
|
@ -0,0 +1,820 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_split.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Regex split decomposition (the split function sigma). See seq_split.h.
|
||||
|
||||
Author:
|
||||
|
||||
Clemens Eisenhofer 2026-6-10
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_split.h"
|
||||
#include "ast/rewriter/seq_rewriter.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "util/obj_hashtable.h"
|
||||
#include "util/stack.h"
|
||||
|
||||
seq_split::seq_split(seq_rewriter& rw) :
|
||||
m(rw.m()), m_rw(rw), m_subset(rw.u().re),
|
||||
m_set_sort(m),
|
||||
m_d_empty(m), m_d_single(m), m_d_fromre(m), m_d_union(m),
|
||||
m_d_inter(m), m_d_compl(m), m_d_lcat(m), m_d_rcat(m),
|
||||
m_empty_app(m) {}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Suspended split-set representation (split algebra over `expr`).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void seq_split::ensure_decls(sort* seq_sort) {
|
||||
SASSERT(seq_sort);
|
||||
if (m_seq_sort == seq_sort)
|
||||
return;
|
||||
sort* re_sort = re().mk_re(seq_sort);
|
||||
m_set_sort = m.mk_uninterpreted_sort(symbol("seq.split.set"));
|
||||
sort* ss = m_set_sort;
|
||||
m_d_empty = m.mk_func_decl(symbol("seq.split.empty"), 0u, nullptr, ss);
|
||||
m_d_single = m.mk_func_decl(symbol("seq.split.single"), re_sort, re_sort, ss);
|
||||
m_d_fromre = m.mk_func_decl(symbol("seq.split.from_re"), re_sort, ss);
|
||||
m_d_union = m.mk_func_decl(symbol("seq.split.union"), ss, ss, ss);
|
||||
m_d_inter = m.mk_func_decl(symbol("seq.split.inter"), ss, ss, ss);
|
||||
m_d_compl = m.mk_func_decl(symbol("seq.split.compl"), ss, ss);
|
||||
m_d_lcat = m.mk_func_decl(symbol("seq.split.lcat"), re_sort, ss, ss);
|
||||
m_d_rcat = m.mk_func_decl(symbol("seq.split.rcat"), ss, re_sort, ss);
|
||||
m_empty_app = m.mk_const(m_d_empty);
|
||||
m_seq_sort = seq_sort;
|
||||
}
|
||||
|
||||
// --- smart constructors ----------------------------------------------------
|
||||
|
||||
expr_ref seq_split::mk_empty() {
|
||||
SASSERT(m_empty_app);
|
||||
return m_empty_app;
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_single(expr* d, expr* n) {
|
||||
SASSERT(d && n);
|
||||
if (re().is_empty(d) || re().is_empty(n))
|
||||
return mk_empty();
|
||||
return expr_ref(m.mk_app(m_d_single, d, n), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_fromre(expr* r) {
|
||||
SASSERT(r);
|
||||
sort* seq_sort = nullptr;
|
||||
VERIFY(seq().is_re(r, seq_sort));
|
||||
ensure_decls(seq_sort);
|
||||
if (re().is_empty(r))
|
||||
return mk_empty();
|
||||
return expr_ref(m.mk_app(m_d_fromre, r), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_union(expr* a, expr* b) {
|
||||
SASSERT(a && b);
|
||||
if (is_empty_ss(a))
|
||||
return expr_ref(b, m);
|
||||
if (is_empty_ss(b))
|
||||
return expr_ref(a, m);
|
||||
return expr_ref(m.mk_app(m_d_union, a, b), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_inter(expr* a, expr* b) {
|
||||
SASSERT(a && b);
|
||||
if (is_empty_ss(a) || is_empty_ss(b))
|
||||
return mk_empty();
|
||||
return expr_ref(m.mk_app(m_d_inter, a, b), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_compl(expr* a) {
|
||||
SASSERT(a);
|
||||
return expr_ref(m.mk_app(m_d_compl, a), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_lcat(expr* r, expr* s) {
|
||||
SASSERT(r && s);
|
||||
if (is_empty_ss(s))
|
||||
return mk_empty();
|
||||
if (re().is_epsilon(r)) // eps . S = S
|
||||
return expr_ref(s, m);
|
||||
return expr_ref(m.mk_app(m_d_lcat, r, s), m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::mk_rcat(expr* s, expr* r) {
|
||||
SASSERT(r && s);
|
||||
if (is_empty_ss(s))
|
||||
return mk_empty();
|
||||
if (re().is_epsilon(r)) // S . eps = S
|
||||
return expr_ref(s, m);
|
||||
return expr_ref(m.mk_app(m_d_rcat, s, r), m);
|
||||
}
|
||||
|
||||
// --- recognizers -----------------------------------------------------------
|
||||
|
||||
bool seq_split::is_empty_ss(expr* e) const {
|
||||
return is_app(e) && to_app(e)->get_decl() == m_d_empty;
|
||||
}
|
||||
bool seq_split::is_single(expr* e, expr*& d, expr*& n) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_single)
|
||||
return false;
|
||||
d = to_app(e)->get_arg(0);
|
||||
n = to_app(e)->get_arg(1);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_fromre(expr* e, expr*& r) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_fromre)
|
||||
return false;
|
||||
r = to_app(e)->get_arg(0);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_union(expr* e, expr*& a, expr*& b) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_union)
|
||||
return false;
|
||||
a = to_app(e)->get_arg(0);
|
||||
b = to_app(e)->get_arg(1);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_inter(expr* e, expr*& a, expr*& b) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_inter)
|
||||
return false;
|
||||
a = to_app(e)->get_arg(0);
|
||||
b = to_app(e)->get_arg(1);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_compl(expr* e, expr*& a) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_compl)
|
||||
return false;
|
||||
a = to_app(e)->get_arg(0);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_lcat(expr* e, expr*& r, expr*& s) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_lcat)
|
||||
return false;
|
||||
r = to_app(e)->get_arg(0);
|
||||
s = to_app(e)->get_arg(1);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_rcat(expr* e, expr*& s, expr*& r) const {
|
||||
if (!is_app(e) || to_app(e)->get_decl() != m_d_rcat)
|
||||
return false;
|
||||
s = to_app(e)->get_arg(0);
|
||||
r = to_app(e)->get_arg(1);
|
||||
return true;
|
||||
}
|
||||
bool seq_split::is_frontier(expr* e) const {
|
||||
expr *a = nullptr, *b = nullptr;
|
||||
return is_empty_ss(e) || is_single(e, a, b) || is_union(e, a, b);
|
||||
}
|
||||
|
||||
seq_util& seq_split::seq() const { return m_rw.u(); }
|
||||
seq_util::rex& seq_split::re() const { return m_rw.u().re; }
|
||||
|
||||
// Add <d, n> unless the (optional) lookahead oracle prunes it.
|
||||
void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
|
||||
if (!oracle || oracle(d, n))
|
||||
out.push_back(split_pair(d, n, m));
|
||||
}
|
||||
|
||||
// Cross-product intersection of two split-sets (split algebra):
|
||||
// S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
|
||||
// Pairs where any component is bottom (the empty regex) are dropped.
|
||||
bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
|
||||
unsigned threshold, split_oracle const& oracle) const {
|
||||
const seq_util::rex& r = re();
|
||||
for (auto const& p1 : s1) {
|
||||
for (auto const& p2 : s2) {
|
||||
if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
|
||||
r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
|
||||
continue;
|
||||
const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m);
|
||||
const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m);
|
||||
push(result, oracle, di, ni);
|
||||
if (result.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
|
||||
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
|
||||
// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold
|
||||
// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
|
||||
// split-set, since each ~sp[i] further constrains ~S.
|
||||
bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result,
|
||||
const unsigned threshold, split_oracle const& oracle) const {
|
||||
|
||||
seq_util::rex& r = re();
|
||||
sort* re_sort = r.mk_re(seq_sort);
|
||||
const expr_ref full(r.mk_full_seq(re_sort), m); // .*
|
||||
if (sp.empty()) { // ~{} = <.*, .*>
|
||||
push(result, oracle, full, full);
|
||||
return true;
|
||||
}
|
||||
// The acc/next pairs carry genuine output-orientation N components (the De
|
||||
// Morgan ~<D,N> = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and
|
||||
// keeps the 2^|sp| fold from blowing up.
|
||||
split_set acc;
|
||||
push(acc, oracle, r.mk_complement(sp[0].m_d), full);
|
||||
push(acc, oracle, full, r.mk_complement(sp[0].m_n));
|
||||
for (unsigned i = 1; i < sp.size(); ++i) {
|
||||
split_set next;
|
||||
push(next, oracle, r.mk_complement(sp[i].m_d), full);
|
||||
push(next, oracle, full, r.mk_complement(sp[i].m_n));
|
||||
split_set tmp;
|
||||
if (!intersect(acc, next, tmp, threshold, oracle))
|
||||
return false;
|
||||
acc = std::move(tmp);
|
||||
if (acc.empty()) // intersection empty => ~S is empty
|
||||
break;
|
||||
if (acc.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
result.append(acc);
|
||||
return true;
|
||||
}
|
||||
|
||||
// One level of the sigma rules. Mirrors the historic eager `compute`, except it
|
||||
// emits *suspended* split-algebra terms (from_re / lcat / rcat / inter / compl) for
|
||||
// the subterms instead of recursing. `mode` is irrelevant here: weak vs. strong is
|
||||
// decided when `head_normalize` reaches an inter / compl node.
|
||||
expr_ref seq_split::expand_fromre(expr* r, bool& ok) {
|
||||
ok = true;
|
||||
seq_util& sq = seq();
|
||||
seq_util::rex& rex = re();
|
||||
|
||||
sort* seq_sort = nullptr;
|
||||
if (!sq.is_re(r, seq_sort)) {
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
ensure_decls(seq_sort);
|
||||
|
||||
// bottom: sigma(empty) = {}
|
||||
if (rex.is_empty(r))
|
||||
return mk_empty();
|
||||
|
||||
// epsilon: sigma(eps) = { <eps, eps> }
|
||||
if (rex.is_epsilon(r)) {
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
|
||||
return mk_single(eps, eps);
|
||||
}
|
||||
|
||||
expr* a = nullptr, *b = nullptr;
|
||||
|
||||
// to_re(s): split the literal word s at every position.
|
||||
expr* s = nullptr;
|
||||
if (rex.is_to_re(r, s)) {
|
||||
zstring str;
|
||||
vector<expr*> stack;
|
||||
stack.push_back(s);
|
||||
|
||||
while (!stack.empty()) {
|
||||
expr* cur = stack.back();
|
||||
stack.pop_back();
|
||||
if (seq().str.is_concat(cur, a, b)) {
|
||||
stack.push_back(b);
|
||||
stack.push_back(a);
|
||||
}
|
||||
else {
|
||||
expr* ch;
|
||||
unsigned cv;
|
||||
if (seq().str.is_unit(cur, ch) && seq().is_const_char(ch, cv)) {
|
||||
str += zstring(cv);
|
||||
continue;
|
||||
}
|
||||
zstring str2;
|
||||
if (sq.str.is_string(s, str2)) {
|
||||
str = str2;
|
||||
continue;
|
||||
}
|
||||
// not a constant string; unsupported for now
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
}
|
||||
expr_ref acc = mk_empty();
|
||||
for (unsigned i = 0; i <= str.length(); ++i) {
|
||||
const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), m);
|
||||
const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), m);
|
||||
acc = mk_union(acc, mk_single(p, q));
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
// single-character class alpha (., [lo-hi], of_pred):
|
||||
// sigma(alpha) = { <eps, alpha>, <alpha, eps> }
|
||||
if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
|
||||
const expr_ref ex(r, m);
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
|
||||
return mk_union(mk_single(eps, ex), mk_single(ex, eps));
|
||||
}
|
||||
|
||||
// .* : sigma(.*) = { <.*, .*> }
|
||||
if (rex.is_full_seq(r)) {
|
||||
const expr_ref ex(r, m);
|
||||
return mk_single(ex, ex);
|
||||
}
|
||||
|
||||
// union: sigma(r0 | ... | r_{n-1}) = U from_re(ri) (re.union may be n-ary)
|
||||
if (rex.is_union(r)) {
|
||||
app* ap = to_app(r);
|
||||
expr_ref acc = mk_empty();
|
||||
for (expr* arg : *ap) {
|
||||
acc = mk_union(acc, mk_fromre(arg));
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
// concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
|
||||
// emitted as U_i lcat(left, rcat(from_re(ri), right)) (re.++ may be n-ary)
|
||||
if (rex.is_concat(r)) {
|
||||
app* ap = to_app(r);
|
||||
const unsigned n = ap->get_num_args();
|
||||
expr_ref acc = mk_empty();
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
expr_ref left(m), right(m);
|
||||
if (i == 0)
|
||||
left = rex.mk_epsilon(seq_sort);
|
||||
else {
|
||||
for (unsigned j = 0; j < i; ++j) {
|
||||
expr* arg = ap->get_arg(j);
|
||||
left = left ? expr_ref(rex.mk_concat(left, arg), m) : expr_ref(arg, m);
|
||||
}
|
||||
}
|
||||
if (i == n - 1)
|
||||
right = rex.mk_epsilon(seq_sort);
|
||||
else {
|
||||
right = ap->get_arg(i + 1);
|
||||
for (unsigned j = i + 2; j < n; ++j) {
|
||||
expr* arg = ap->get_arg(j);
|
||||
right = rex.mk_concat(right, arg);
|
||||
}
|
||||
}
|
||||
expr_ref term = mk_lcat(left, mk_rcat(mk_fromre(ap->get_arg(i)), right));
|
||||
acc = mk_union(acc, term);
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
// star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
|
||||
if (rex.is_star(r, a)) {
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), m);
|
||||
expr_ref body = mk_lcat(r, mk_rcat(mk_fromre(a), r)); // a*.from_re(a).a*
|
||||
return mk_union(mk_single(eps, eps), body);
|
||||
}
|
||||
|
||||
// plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without <eps,eps>)
|
||||
if (rex.is_plus(r, a)) {
|
||||
const expr_ref star(rex.mk_star(a), m); // a*
|
||||
return mk_lcat(star, mk_rcat(mk_fromre(a), star));
|
||||
}
|
||||
|
||||
// intersection: sigma(r0 & ... & r_{n-1}) = cap from_re(ri) (re.inter may be n-ary)
|
||||
if (rex.is_intersection(r)) {
|
||||
app* ap = to_app(r);
|
||||
const unsigned n = ap->get_num_args();
|
||||
expr_ref acc = mk_fromre(ap->get_arg(0));
|
||||
for (unsigned i = 1; i < n; ++i)
|
||||
acc = mk_inter(acc, mk_fromre(ap->get_arg(i)));
|
||||
return acc;
|
||||
}
|
||||
|
||||
// complement: sigma(~a) = ~sigma(a).
|
||||
if (rex.is_complement(r, a))
|
||||
return mk_compl(mk_fromre(a));
|
||||
|
||||
// difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b).
|
||||
if (rex.is_diff(r, a, b))
|
||||
return mk_inter(mk_fromre(a), mk_compl(mk_fromre(b)));
|
||||
|
||||
// bounded loop / ite / other: not handled (paper "v1: bail").
|
||||
TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, m) << "\n";);
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
|
||||
// r . hs : push the left regex onto the D component of a head-normal split-set.
|
||||
expr_ref seq_split::distribute_lcat(expr* r, expr* hs) {
|
||||
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
|
||||
if (is_empty_ss(hs))
|
||||
return mk_empty();
|
||||
if (is_single(hs, d, n))
|
||||
return mk_single(m_rw.mk_re_append(r, d), n); // r.D
|
||||
if (is_union(hs, a, b))
|
||||
return mk_union(mk_lcat(r, a), mk_lcat(r, b));
|
||||
UNREACHABLE();
|
||||
return expr_ref(hs, m);
|
||||
}
|
||||
|
||||
// hs . r : push the right regex onto the N component of a head-normal split-set.
|
||||
expr_ref seq_split::distribute_rcat(expr* hs, expr* r) {
|
||||
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
|
||||
if (is_empty_ss(hs))
|
||||
return mk_empty();
|
||||
if (is_single(hs, d, n))
|
||||
return mk_single(d, m_rw.mk_re_append(n, r)); // N.r
|
||||
if (is_union(hs, a, b))
|
||||
return mk_union(mk_rcat(a, r), mk_rcat(b, r));
|
||||
UNREACHABLE();
|
||||
return expr_ref(hs, m);
|
||||
}
|
||||
|
||||
expr_ref seq_split::from_split_set(split_set const& s) {
|
||||
expr_ref acc = mk_empty();
|
||||
for (auto const& p : s)
|
||||
acc = mk_union(acc, mk_single(p.m_d, p.m_n));
|
||||
return acc;
|
||||
}
|
||||
|
||||
expr_ref seq_split::head_normalize(expr* t, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle, bool& ok) {
|
||||
ok = true;
|
||||
expr *a = nullptr, *b = nullptr, *r = nullptr, *s = nullptr;
|
||||
|
||||
// already a frontier node
|
||||
if (is_frontier(t))
|
||||
return expr_ref(t, m);
|
||||
|
||||
// from_re(r): one level of sigma; recurse to settle a non-frontier head
|
||||
// (plus / inter / compl / diff expand to lcat / inter / compl nodes).
|
||||
if (is_fromre(t, r)) {
|
||||
expr_ref e = expand_fromre(r, ok);
|
||||
if (!ok)
|
||||
return expr_ref(m);
|
||||
if (is_frontier(e))
|
||||
return e;
|
||||
return head_normalize(e, mode, threshold, oracle, ok);
|
||||
}
|
||||
|
||||
// r.S : head-normalize S, then distribute r over the frontier.
|
||||
if (is_lcat(t, r, s)) {
|
||||
expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
|
||||
if (!ok)
|
||||
return expr_ref(m);
|
||||
return distribute_lcat(r, hs);
|
||||
}
|
||||
if (is_rcat(t, s, r)) {
|
||||
expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
|
||||
if (!ok)
|
||||
return expr_ref(m);
|
||||
return distribute_rcat(hs, r);
|
||||
}
|
||||
|
||||
// inter / compl are eager by nature: a single split of S1 cap S2 (or ~S)
|
||||
// cannot be produced without materializing the operand split-sets.
|
||||
if (is_inter(t, a, b)) {
|
||||
if (mode == split_mode::weak) {
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
split_set sa, sb, tmp;
|
||||
if (!materialize(a, mode, threshold, oracle, sa) ||
|
||||
!materialize(b, mode, threshold, oracle, sb) ||
|
||||
!intersect(sa, sb, tmp, threshold, oracle)) {
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
return from_split_set(tmp);
|
||||
}
|
||||
if (is_compl(t, a)) {
|
||||
if (mode == split_mode::weak) {
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
// The body is materialized WITHOUT the oracle (its pairs are inverted, so
|
||||
// their N is unrelated to the output N); the oracle is re-applied in
|
||||
// complement().
|
||||
split_set sa, res;
|
||||
if (!materialize(a, mode, threshold, split_oracle{}, sa) ||
|
||||
!complement(m_seq_sort, sa, res, threshold, oracle)) {
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
return from_split_set(res);
|
||||
}
|
||||
|
||||
UNREACHABLE();
|
||||
ok = false;
|
||||
return expr_ref(m);
|
||||
}
|
||||
|
||||
bool seq_split::materialize(expr* node, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle, split_set& out) {
|
||||
iterator it(*this, node, mode, threshold, oracle);
|
||||
expr_ref d(m), n(m);
|
||||
while (it.next(d, n))
|
||||
out.push_back(split_pair(d, n, m));
|
||||
return !it.gave_up();
|
||||
}
|
||||
|
||||
expr_ref seq_split::make(expr* r) {
|
||||
SASSERT(r);
|
||||
sort* seq_sort = nullptr;
|
||||
if (!seq().is_re(r, seq_sort))
|
||||
return expr_ref(m);
|
||||
return mk_fromre(r);
|
||||
}
|
||||
|
||||
// --- Lazy enumerator --------------------------------------------------------
|
||||
// The worklist holds suspended split-sets. Each next() pops a node, head-
|
||||
// normalizes it to a frontier (empty | single | union), and either returns the
|
||||
// single split, pushes the two union branches back, or skips an empty. All the
|
||||
// expansion work happens lazily, one split per next() call.
|
||||
|
||||
seq_split::iterator::iterator(seq_split& engine, expr* node, split_mode mode,
|
||||
unsigned threshold, split_oracle oracle) :
|
||||
m_engine(engine), m(engine.m), m_mode(mode), m_threshold(threshold),
|
||||
m_oracle(std::move(oracle)), m_work(engine.m) {
|
||||
SASSERT(node);
|
||||
m_work.push_back(node);
|
||||
}
|
||||
|
||||
bool seq_split::iterator::next(expr_ref& out_d, expr_ref& out_n) {
|
||||
if (m_giveup)
|
||||
return false; // a prior give-up is sticky
|
||||
while (!m_work.empty()) {
|
||||
expr_ref t(m_work.back(), m);
|
||||
m_work.pop_back();
|
||||
|
||||
bool ok = true;
|
||||
expr_ref hn = m_engine.head_normalize(t, m_mode, m_threshold, m_oracle, ok);
|
||||
if (!ok) {
|
||||
m_giveup = true; // unsupported / weak Boolean / overrun
|
||||
return false;
|
||||
}
|
||||
|
||||
expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
|
||||
if (m_engine.is_empty_ss(hn))
|
||||
continue;
|
||||
if (m_engine.is_single(hn, d, n)) {
|
||||
if (m_oracle && !m_oracle(d, n))
|
||||
continue; // pruned by lookahead
|
||||
if (++m_count > m_threshold) {
|
||||
m_giveup = true; // safety cap against space bloat
|
||||
return false;
|
||||
}
|
||||
out_d = d;
|
||||
out_n = n;
|
||||
return true;
|
||||
}
|
||||
if (m_engine.is_union(hn, a, b)) {
|
||||
m_work.push_back(a);
|
||||
m_work.push_back(b);
|
||||
continue;
|
||||
}
|
||||
UNREACHABLE();
|
||||
}
|
||||
return false; // exhausted (m_giveup stays false)
|
||||
}
|
||||
|
||||
seq_split::iterator seq_split::iterate(expr* node, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle) {
|
||||
return iterator(*this, node, mode, threshold, oracle);
|
||||
}
|
||||
|
||||
// Eager wrapper: drain the lazy enumeration into `out`. Semantics (give-up cases,
|
||||
// oracle discipline) match the historic engine.
|
||||
bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode,
|
||||
split_oracle const& oracle) {
|
||||
SASSERT(r);
|
||||
sort* seq_sort = nullptr;
|
||||
if (!seq().is_re(r, seq_sort))
|
||||
return false;
|
||||
expr_ref node = mk_fromre(r);
|
||||
return materialize(node, mode, threshold, oracle, result);
|
||||
}
|
||||
|
||||
// same-D / same-N merge (paper eqs. 1 & 2):
|
||||
// { <D,N>, <D,N'> } -> <D, N | N'> (by_left = true, group by D)
|
||||
// { <D,N>, <D',N> } -> <D | D', N> (by_left = false, group by N)
|
||||
// Only fires on syntactically-identical (perfectly-shared) key components, so
|
||||
// it is a conservative instance of the rule.
|
||||
void seq_split::merge_by(split_set& pairs, const bool by_left) const {
|
||||
obj_map<expr, unsigned> idx; // key component -> position in `out`
|
||||
split_set out;
|
||||
for (auto const& p : pairs) {
|
||||
expr* key = by_left ? p.m_d.get() : p.m_n.get();
|
||||
expr* other = by_left ? p.m_n.get() : p.m_d.get();
|
||||
unsigned pos;
|
||||
if (idx.find(key, pos)) {
|
||||
expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
|
||||
const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), m);
|
||||
if (by_left)
|
||||
out[pos].m_n = u;
|
||||
else
|
||||
out[pos].m_d = u;
|
||||
}
|
||||
else {
|
||||
idx.insert(key, out.size());
|
||||
out.push_back(p);
|
||||
}
|
||||
}
|
||||
pairs.swap(out);
|
||||
}
|
||||
|
||||
void seq_split::simplify(split_set& pairs) const {
|
||||
seq_util::rex& r = re();
|
||||
|
||||
// 1. drop pairs with a bottom (empty-language) component.
|
||||
unsigned w = 0;
|
||||
for (unsigned i = 0; i < pairs.size(); ++i) {
|
||||
if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
|
||||
continue;
|
||||
if (w != i)
|
||||
pairs[w] = pairs[i];
|
||||
++w;
|
||||
}
|
||||
pairs.shrink(w);
|
||||
if (pairs.size() <= 1)
|
||||
return;
|
||||
|
||||
// 2. same-D / same-N merge rules.
|
||||
merge_by(pairs, true);
|
||||
merge_by(pairs, false);
|
||||
if (pairs.size() <= 1)
|
||||
return;
|
||||
|
||||
// 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
|
||||
// L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative
|
||||
// (returns true only for definite containment), so we never drop a
|
||||
// needed split.
|
||||
//if (pairs.size() > 64)
|
||||
// return;
|
||||
|
||||
struct row { expr* d; expr* n; unsigned idx; };
|
||||
vector<row> rows;
|
||||
for (unsigned i = 0; i < pairs.size(); ++i)
|
||||
rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
|
||||
|
||||
auto subsumes = [&](row const& a, row const& b) {
|
||||
return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
|
||||
};
|
||||
|
||||
vector<row> kept;
|
||||
for (row const& row_r : rows) {
|
||||
bool redundant = false;
|
||||
for (row const& k : kept)
|
||||
if (subsumes(k, row_r)) { redundant = true; break; }
|
||||
if (redundant)
|
||||
continue;
|
||||
// drop already-kept rows strictly subsumed by row_r
|
||||
unsigned kw = 0;
|
||||
for (unsigned t = 0; t < kept.size(); ++t) {
|
||||
if (subsumes(row_r, kept[t]))
|
||||
continue;
|
||||
kept[kw++] = kept[t];
|
||||
}
|
||||
kept.shrink(kw);
|
||||
kept.push_back(row_r);
|
||||
}
|
||||
|
||||
split_set result;
|
||||
for (row const& k : kept)
|
||||
result.push_back(pairs[k.idx]);
|
||||
pairs.swap(result);
|
||||
}
|
||||
|
||||
std::pair<expr_ref, expr_ref> seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
|
||||
expr_ref_vector tokens(m);
|
||||
vector<expr*> stack;
|
||||
stack.push_back(str);
|
||||
|
||||
while (!stack.empty()) {
|
||||
expr* cur = stack.back();
|
||||
stack.pop_back();
|
||||
expr* l, *r;
|
||||
if (seq().str.is_concat(cur, l, r)) {
|
||||
stack.push_back(r);
|
||||
stack.push_back(l);
|
||||
}
|
||||
else
|
||||
tokens.push_back(expr_ref(cur, m));
|
||||
}
|
||||
|
||||
expr* ch;
|
||||
unsigned i = 0;
|
||||
|
||||
while (i < tokens.size() && (seq().str.is_string(tokens.get(i)) || (seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch)))) {
|
||||
zstring s;
|
||||
if (seq().str.is_string(tokens.get(i), s)) {
|
||||
if (s.empty()) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
ch = seq().mk_char(s[0]);
|
||||
tokens[i] = seq().str.mk_string(s.extract(1, s.length() - 1));
|
||||
}
|
||||
else
|
||||
i++;
|
||||
regex = m_rw.mk_derivative(ch, regex);
|
||||
}
|
||||
|
||||
if (i > 0) {
|
||||
unsigned j = 0;
|
||||
for (; i < tokens.size(); i++, j++) {
|
||||
tokens[j] = tokens.get(i);
|
||||
}
|
||||
tokens.shrink(j);
|
||||
}
|
||||
|
||||
// TODO: Do this for the back as well (also, why did no rule before do that?)
|
||||
|
||||
if (tokens.empty())
|
||||
return { expr_ref(m), expr_ref(m) };
|
||||
|
||||
// Choose the factorization boundary so the tail starts with the
|
||||
// longest run of concrete characters c.
|
||||
// This gives the split-engine lookahead oracle the most pruning information.
|
||||
// head = u' (tokens before the run), tail = c · u''' (tokens from the run onward).
|
||||
const unsigned total = tokens.size();
|
||||
unsigned run_start = 0, run_len = 0;
|
||||
for (i = 1; i < total; ) {
|
||||
if (!(seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch))) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
unsigned j = i;
|
||||
while (j < total && seq().str.is_unit(tokens.get(j), ch) && seq().is_const_char(ch)) {
|
||||
j++;
|
||||
}
|
||||
if (j - i > run_len) {
|
||||
run_len = j - i;
|
||||
run_start = i;
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
// No constant run => fall back to splitting off the first token.
|
||||
const unsigned p = run_len == 0 ? 1 : run_start;
|
||||
SASSERT(p >= 1);
|
||||
expr* head = tokens.get(0);
|
||||
for (i = 1; i < p; i++) {
|
||||
head = seq().str.mk_concat(head, tokens.get(i));
|
||||
}
|
||||
expr* tail = seq().str.mk_empty(head->get_sort());
|
||||
if (tokens.size() > p + run_len) {
|
||||
tail = tokens.get(p + run_len);
|
||||
for (i = p + run_len + 1; i < tokens.size(); i++) {
|
||||
tail = seq().str.mk_concat(tail, tokens.get(i));
|
||||
}
|
||||
}
|
||||
SASSERT(head && tail);
|
||||
|
||||
// Build the constant lookahead c and (if non-empty) an oracle that
|
||||
// prunes splits whose postfix cannot match c.
|
||||
zstring c;
|
||||
for (i = 0; i < run_len; ++i) {
|
||||
unsigned cv;
|
||||
VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch));
|
||||
VERIFY(seq().is_const_char(ch, cv));
|
||||
c = c + zstring(cv);
|
||||
}
|
||||
split_oracle oracle;
|
||||
if (!c.empty())
|
||||
oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); };
|
||||
|
||||
// Decompose the regex into a split-set via the shared seq_split engine
|
||||
if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) {
|
||||
result.clear();
|
||||
return { expr_ref(m), expr_ref(m) };
|
||||
}
|
||||
|
||||
simplify(result);
|
||||
|
||||
// Eagerly consume the constant run c from the tail by taking the c-derivative
|
||||
// of each postfix
|
||||
if (!c.empty()) {
|
||||
unsigned w = 0;
|
||||
for (i = 0; i < result.size(); ++i) {
|
||||
expr* d = result[i].m_n;
|
||||
for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) {
|
||||
d = m_rw.mk_derivative(seq().mk_char(c[k]), d);
|
||||
}
|
||||
SASSERT(d);
|
||||
if (re().is_empty(d))
|
||||
continue; // postfix can't start with c => infeasible split, drop
|
||||
result[w++] = split_pair(result[i].m_d, d, m);
|
||||
}
|
||||
result.shrink(w);
|
||||
}
|
||||
|
||||
return { expr_ref(head, m), expr_ref(tail, m) };
|
||||
}
|
||||
|
||||
bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const {
|
||||
SASSERT(regex);
|
||||
for (unsigned i = 0; i < c.length(); i++) {
|
||||
if (m.is_true(m_rw.is_nullable(regex)))
|
||||
return true; // N accepts the prefix c[0..i) => a suffix completes it
|
||||
regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex);
|
||||
SASSERT(regex);
|
||||
if (re().is_empty(regex))
|
||||
return false; // N went (syntactically) dead before reaching c
|
||||
}
|
||||
return !re().is_empty(regex);
|
||||
}
|
||||
225
src/ast/rewriter/seq_split.h
Normal file
225
src/ast/rewriter/seq_split.h
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_split.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Regex split decomposition: the split function sigma from the paper
|
||||
"Solving by Splitting". For a regular expression r, sigma(r) is a finite
|
||||
"split-set" of pairs { <D_i, N_i> } such that
|
||||
|
||||
u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i).
|
||||
|
||||
The split algebra (intersection, De Morgan complement, left/right
|
||||
concatenation with a regex) and the cardinality-reducing simplification
|
||||
heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
|
||||
follow the paper.
|
||||
|
||||
Author:
|
||||
|
||||
Clemens Eisenhofer 2026-6-10
|
||||
|
||||
--*/
|
||||
#pragma once
|
||||
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_subset.h"
|
||||
#include <functional>
|
||||
|
||||
class seq_rewriter;
|
||||
|
||||
// An individual split <D, N>: the left (prefix) regex D and right (suffix)
|
||||
// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N).
|
||||
struct split_pair {
|
||||
expr_ref m_d;
|
||||
expr_ref m_n;
|
||||
split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
|
||||
SASSERT(d && n);
|
||||
}
|
||||
};
|
||||
|
||||
// A split-set is a union of individual splits.
|
||||
typedef vector<split_pair> split_set;
|
||||
|
||||
// Controls how aggressively sigma expands the Boolean-closure cases:
|
||||
// strong - fully expand complement / intersection via the split algebra
|
||||
// (De Morgan / cross product). This is the behaviour the nseq
|
||||
// solver relies on.
|
||||
// weak - do not perform the (potentially 2^k) Boolean-closure expansion;
|
||||
// give up (return false) on complement / intersection instead.
|
||||
enum class split_mode { weak, strong };
|
||||
|
||||
// Optional lookahead oracle. Called for each candidate split <D, N> as it is
|
||||
// generated; returns true to keep it, false to prune it. An empty oracle (the
|
||||
// default) keeps everything, so sigma is unchanged. See seq_split::compute.
|
||||
typedef std::function<bool(expr* D, expr* N)> split_oracle;
|
||||
|
||||
class seq_split {
|
||||
ast_manager& m;
|
||||
seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access
|
||||
seq_subset m_subset; // language-subset checks for subsumption
|
||||
|
||||
// --- Suspended split-set representation -------------------------------
|
||||
// A split-set computation is kept as an `expr` term over a small family of
|
||||
// locally-declared, uninterpreted function symbols (the split algebra of the
|
||||
// paper / split-algebra.md). Nothing here is ever asserted to the solver;
|
||||
// the terms are only used as scratch structure to drive lazy expansion.
|
||||
//
|
||||
// empty : SplitSet -- {} (bottom)
|
||||
// single : Re x Re -> SplitSet -- a single split <D, N>
|
||||
// from_re : Re -> SplitSet -- the *suspended* sigma(r)
|
||||
// union : SplitSet x SplitSet -> SplitSet
|
||||
// inter : SplitSet x SplitSet -> SplitSet
|
||||
// compl : SplitSet -> SplitSet
|
||||
// lcat : Re x SplitSet -> SplitSet -- r . S (left-concat onto D)
|
||||
// rcat : SplitSet x Re -> SplitSet -- S . r (right-concat onto N)
|
||||
sort* m_seq_sort = nullptr; // sequence sort the decls are built for
|
||||
sort_ref m_set_sort; // the uninterpreted SplitSet sort
|
||||
func_decl_ref m_d_empty, m_d_single, m_d_fromre, m_d_union,
|
||||
m_d_inter, m_d_compl, m_d_lcat, m_d_rcat;
|
||||
expr_ref m_empty_app; // cached nullary `empty` term
|
||||
|
||||
seq_util& seq() const;
|
||||
seq_util::rex& re() const;
|
||||
|
||||
// (Re)build the local declarations for `seq_sort` if not already current.
|
||||
void ensure_decls(sort* seq_sort);
|
||||
|
||||
// Smart constructors: apply the cheap normalizations the eager engine relies
|
||||
// on (drop-bottom, eps cancellation, union absorption of empty).
|
||||
expr_ref mk_empty();
|
||||
expr_ref mk_single(expr* d, expr* n);
|
||||
expr_ref mk_fromre(expr* r);
|
||||
expr_ref mk_union(expr* a, expr* b);
|
||||
expr_ref mk_inter(expr* a, expr* b);
|
||||
expr_ref mk_compl(expr* a);
|
||||
expr_ref mk_lcat(expr* r, expr* s);
|
||||
expr_ref mk_rcat(expr* s, expr* r);
|
||||
|
||||
// Recognizers over the local decls.
|
||||
bool is_empty_ss(expr* e) const;
|
||||
bool is_single(expr* e, expr*& d, expr*& n) const;
|
||||
bool is_fromre(expr* e, expr*& r) const;
|
||||
bool is_union (expr* e, expr*& a, expr*& b) const;
|
||||
bool is_inter (expr* e, expr*& a, expr*& b) const;
|
||||
bool is_compl (expr* e, expr*& a) const;
|
||||
bool is_lcat (expr* e, expr*& r, expr*& s) const;
|
||||
bool is_rcat (expr* e, expr*& s, expr*& r) const;
|
||||
// A term whose head is empty | single | union (ready for the worklist loop).
|
||||
bool is_frontier(expr* e) const;
|
||||
|
||||
// One level of the sigma rules: from_re(r) -> a SplitSet term built from the
|
||||
// immediate subterms. `ok` is set false on an unsupported shape.
|
||||
expr_ref expand_fromre(expr* r, bool& ok);
|
||||
// Distribute a left/right concatenation over a head-normal split-set.
|
||||
expr_ref distribute_lcat(expr* r, expr* hs);
|
||||
expr_ref distribute_rcat(expr* hs, expr* r);
|
||||
// Materialized split-set -> a `union` of `single`s.
|
||||
expr_ref from_split_set(split_set const& s);
|
||||
// Reduce `t` until its head is empty | single | union (one outermost level
|
||||
// for the lazy nodes; inter/compl are expanded eagerly via `materialize`,
|
||||
// since the paper's De Morgan / cross-product cannot yield a split lazily).
|
||||
// `ok` is set false on a give-up (unsupported shape, weak-mode Boolean, or
|
||||
// threshold overrun).
|
||||
expr_ref head_normalize(expr* t, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle, bool& ok);
|
||||
// Fully drain a suspended split-set into `out` (used for inter/compl bodies).
|
||||
// Runs an `iterator` to exhaustion; returns false on a give-up.
|
||||
bool materialize(expr* node, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle, split_set& out);
|
||||
|
||||
// Push <d, n> onto `out`, unless `oracle` rejects it.
|
||||
void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const;
|
||||
|
||||
// S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
|
||||
// component (and any rejected by `oracle`). Returns false on threshold overrun.
|
||||
bool intersect(split_set const& s1, split_set const& s2, split_set& result,
|
||||
unsigned threshold, split_oracle const& oracle) const;
|
||||
|
||||
// De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
|
||||
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
|
||||
bool complement(sort* seq_sort, split_set const& sp, split_set& result,
|
||||
unsigned threshold, split_oracle const& oracle) const;
|
||||
|
||||
// same-D / same-N merge: groups pairs that share a (syntactically identical)
|
||||
// left (resp. right) component and unions the other component.
|
||||
void merge_by(split_set& pairs, bool by_left) const;
|
||||
|
||||
public:
|
||||
explicit seq_split(seq_rewriter& rw);
|
||||
|
||||
// Lazy split enumerator. Holds the suspended split-set worklist and produces
|
||||
// the concrete splits <D, N> one at a time, on demand, instead of computing
|
||||
// them all up front. Obtain one from seq_split::iterate (or construct it
|
||||
// directly) and pull splits with next() until it returns false; gave_up() then
|
||||
// tells a normal exhaustion (false) apart from a give-up (true).
|
||||
//
|
||||
// The threshold is supplied by the caller and serves only as a safety cap
|
||||
// against space bloat (lazy expansion still has to materialize the operands of
|
||||
// intersection / complement). A threshold overrun, an unsupported regex shape,
|
||||
// or a Boolean-closure case in weak mode aborts the enumeration: next() returns
|
||||
// false and gave_up() returns true. To stop early, simply stop calling next().
|
||||
//
|
||||
// `oracle` (optional) prunes non-viable splits as they are produced. It must
|
||||
// be sound to apply per split: a candidate N can still gain a prefix from a
|
||||
// factor appended to its right later (concat/star), so the oracle must use a
|
||||
// "prefix-compatible" test (prune only when N can never match the lookahead,
|
||||
// even partially), NOT a strict "starts-with" test. The complement body is
|
||||
// expanded WITHOUT the oracle (inverted orientation); the oracle is re-applied
|
||||
// to the complement's output fold.
|
||||
class iterator {
|
||||
seq_split& m_engine;
|
||||
ast_manager& m;
|
||||
split_mode m_mode;
|
||||
unsigned m_threshold;
|
||||
split_oracle m_oracle;
|
||||
expr_ref_vector m_work; // GC-safe worklist of suspended split-sets
|
||||
unsigned m_count = 0; // splits produced so far (vs. threshold)
|
||||
bool m_giveup = false;
|
||||
public:
|
||||
iterator(seq_split& engine, expr* node, split_mode mode,
|
||||
unsigned threshold, split_oracle oracle);
|
||||
// Compute the next split. On success returns true and sets <d, n>; on
|
||||
// exhaustion or give-up returns false (see gave_up()). Calling next()
|
||||
// again after it has returned false keeps returning false.
|
||||
bool next(expr_ref& d, expr_ref& n);
|
||||
// Valid after next() has returned false: true iff the enumeration aborted
|
||||
// (unsupported regex / weak-mode Boolean / threshold overrun) rather than
|
||||
// running out of splits.
|
||||
bool gave_up() const { return m_giveup; }
|
||||
};
|
||||
|
||||
// Build the *suspended* sigma(r) as a split-algebra term (no expansion).
|
||||
// Returns null on a non-regex argument. Drive it with `iterate`.
|
||||
expr_ref make(expr* r);
|
||||
|
||||
// Create a lazy enumerator over a suspended split-set `node` (typically the
|
||||
// result of make()). See `iterator` for the meaning of the arguments.
|
||||
iterator iterate(expr* node, split_mode mode, unsigned threshold,
|
||||
split_oracle const& oracle = {});
|
||||
|
||||
// Compute sigma(r), appending to `out` (does not clear it). Thin eager
|
||||
// wrapper that drains an `iterator` to exhaustion; semantics match the historic
|
||||
// engine. See `iterator` for the meaning of `threshold`, `mode`, and `oracle`.
|
||||
bool compute(expr* r, split_set& out, unsigned threshold,
|
||||
split_mode mode = split_mode::strong, split_oracle const& oracle = {});
|
||||
|
||||
// In-place simplification of a split-set: drop bottom components, apply the
|
||||
// same-D / same-N merge rules, and drop splits subsumed by another (using
|
||||
// seq_subset). Size-capped to keep the O(n^2) subsumption affordable.
|
||||
void simplify(split_set& s) const;
|
||||
|
||||
// decompose a membership constraint into a set of pairs of regex splits
|
||||
std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const;
|
||||
|
||||
// Lookahead oracle for the split engine: is the split's right component
|
||||
// `n_regex` prefix-compatible with the constant character sequence `c`?
|
||||
// This is sound to apply during split generation — it never drops a viable split.
|
||||
// Thus, it might not eliminate all cases in order to stay sound
|
||||
bool split_lookahead_viable(expr* regex, zstring const& c) const;
|
||||
|
||||
|
||||
};
|
||||
|
|
@ -138,6 +138,8 @@ def_module_params(module_name='smt',
|
|||
('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'),
|
||||
('seq.max_unfolding', UINT, 1000000000, 'maximal unfolding depth for checking string equations and regular expressions'),
|
||||
('seq.min_unfolding', UINT, 1, 'initial bound for strings whose lengths are bounded by iterative deepening. Set this to a higher value if there are only models with larger string lengths'),
|
||||
('seq.regex_factorization_threshold', UINT, 10, 'maximum number of cases to factor a regex into in a single step'),
|
||||
('seq.regex_factorization_enabled', BOOL, False, 'apply regex factorization (sigma splitting)'),
|
||||
('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'),
|
||||
('sls.enable', BOOL, False, 'enable sls co-processor with SMT engine'),
|
||||
('sls.parallel', BOOL, True, 'use sls co-processor in parallel or sequential with SMT engine'),
|
||||
|
|
|
|||
|
|
@ -23,4 +23,6 @@ void theory_seq_params::updt_params(params_ref const & _p) {
|
|||
m_seq_validate = p.seq_validate();
|
||||
m_seq_max_unfolding = p.seq_max_unfolding();
|
||||
m_seq_min_unfolding = p.seq_min_unfolding();
|
||||
m_seq_regex_factorization_enabled = p.seq_regex_factorization_enabled();
|
||||
m_seq_regex_factorization_threshold = p.seq_regex_factorization_threshold();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ struct theory_seq_params {
|
|||
bool m_seq_validate = false;
|
||||
unsigned m_seq_max_unfolding = UINT_MAX/4;
|
||||
unsigned m_seq_min_unfolding = 1;
|
||||
bool m_seq_regex_factorization_enabled = false;
|
||||
unsigned m_seq_regex_factorization_threshold = 1;
|
||||
|
||||
theory_seq_params(params_ref const & p = params_ref()) {
|
||||
updt_params(p);
|
||||
|
|
|
|||
|
|
@ -128,6 +128,30 @@ namespace smt {
|
|||
return;
|
||||
}
|
||||
|
||||
if (th.get_fparams().m_seq_regex_factorization_enabled) {
|
||||
unsigned threshold = th.get_fparams().m_seq_regex_factorization_threshold;
|
||||
if (threshold == 0)
|
||||
threshold = UINT_MAX;
|
||||
split_set result;
|
||||
auto [head, tail] = seq_rw().split_membership(s, r, threshold, result);
|
||||
if (head) {
|
||||
SASSERT(tail);
|
||||
// propagate all cases
|
||||
expr_ref_vector cases(m);
|
||||
expr_ref_vector branches(m);
|
||||
for (auto [pre, post] : result) {
|
||||
expr_ref mem_head(re().mk_in_re(head, pre), m);
|
||||
expr_ref mem_tail(re().mk_in_re(tail, post), m);
|
||||
cases.push_back(m.mk_and(mem_head, mem_tail));
|
||||
}
|
||||
const expr_ref cases_expr(m.mk_or(cases), m);
|
||||
ctx.internalize(cases_expr, false);
|
||||
th.propagate_lit(nullptr, 1, &lit, ctx.get_literal(cases_expr));
|
||||
return;
|
||||
}
|
||||
// fallthrough; decomposition failed
|
||||
}
|
||||
|
||||
// Convert a non-ground sequence into an additional regex and
|
||||
// strengthen the original regex constraint into an intersection
|
||||
// for example:
|
||||
|
|
|
|||
|
|
@ -138,6 +138,7 @@ add_executable(test-z3
|
|||
simplifier.cpp
|
||||
sls_test.cpp
|
||||
sls_seq_plugin.cpp
|
||||
seq_split.cpp
|
||||
small_object_allocator.cpp
|
||||
smt2print_parse.cpp
|
||||
smt_context.cpp
|
||||
|
|
|
|||
|
|
@ -197,6 +197,7 @@
|
|||
X(ho_matcher) \
|
||||
X(finite_set) \
|
||||
X(finite_set_rewriter) \
|
||||
X(seq_split) \
|
||||
X(fpa) \
|
||||
X(seq_regex_bisim) \
|
||||
X(term_enumeration) \
|
||||
|
|
|
|||
450
src/test/seq_split.cpp
Normal file
450
src/test/seq_split.cpp
Normal file
|
|
@ -0,0 +1,450 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_split.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Unit tests for the regex split engine (the split function sigma) in ast/rewriter/seq_split.cpp.
|
||||
|
||||
Author:
|
||||
|
||||
Clemens Eisenhofer 2026-6-22
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/ast.h"
|
||||
#include "ast/reg_decl_plugins.h"
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_rewriter.h"
|
||||
#include "ast/rewriter/seq_split.h"
|
||||
#include <set>
|
||||
#include <utility>
|
||||
|
||||
|
||||
struct plugin_registrar {
|
||||
plugin_registrar(ast_manager& m) { reg_decl_plugins(m); }
|
||||
};
|
||||
|
||||
class seq_split_test {
|
||||
ast_manager m;
|
||||
plugin_registrar m_reg;
|
||||
seq_rewriter m_rw;
|
||||
seq_split m_split;
|
||||
seq_util u;
|
||||
sort_ref m_str; // the sequence (String) sort
|
||||
sort_ref m_re; // the RegEx sort over m_str
|
||||
|
||||
seq_util::rex& re() { return u.re; }
|
||||
|
||||
expr_ref eps() { return expr_ref(re().mk_epsilon(m_str), m); } // mk_epsilon takes the seq sort
|
||||
expr_ref dot() { return expr_ref(re().mk_full_char(m_re), m); } // mk_full_char takes the RegEx sort
|
||||
expr_ref dotstar() { return expr_ref(re().mk_full_seq(m_re), m); } // .*
|
||||
expr_ref empty_re() { return expr_ref(re().mk_empty(m_re), m); } // the bottom regex
|
||||
expr_ref rappend(expr* a, expr* b) { return m_rw.mk_re_append(a, b); } // the engine's regex concat
|
||||
expr_ref word(char const* s) { return expr_ref(re().mk_to_re(u.str.mk_string(zstring(s))), m); }
|
||||
expr_ref rng(char lo, char hi) {
|
||||
return expr_ref(re().mk_range(u.str.mk_string(zstring(std::string(1, lo).c_str())),
|
||||
u.str.mk_string(zstring(std::string(1, hi).c_str()))), m);
|
||||
}
|
||||
|
||||
typedef std::set<std::pair<expr*, expr*>> pair_set;
|
||||
|
||||
pair_set as_set(split_set const& s) {
|
||||
pair_set out;
|
||||
for (auto const& p : s)
|
||||
out.insert({ p.m_d.get(), p.m_n.get() });
|
||||
return out;
|
||||
}
|
||||
|
||||
bool eager(expr* r, split_set& out, unsigned threshold = UINT_MAX,
|
||||
split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
|
||||
return m_split.compute(r, out, threshold, mode, oracle);
|
||||
}
|
||||
|
||||
bool lazy(expr* r, split_set& out, unsigned threshold = UINT_MAX,
|
||||
split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
|
||||
expr_ref node = m_split.make(r);
|
||||
ENSURE(node);
|
||||
seq_split::iterator it = m_split.iterate(node, mode, threshold, oracle);
|
||||
expr_ref d(m), n(m);
|
||||
while (it.next(d, n))
|
||||
out.push_back(split_pair(d, n, m));
|
||||
return !it.gave_up();
|
||||
}
|
||||
|
||||
// assert that the eager and lazy engines agree on sigma(r) as a *set* of
|
||||
// splits, and report the common cardinality.
|
||||
unsigned check_agree(expr* r) {
|
||||
split_set se, sl;
|
||||
bool oke = eager(r, se);
|
||||
bool okl = lazy(r, sl);
|
||||
ENSURE(oke == okl);
|
||||
if (!oke)
|
||||
return 0;
|
||||
ENSURE(as_set(se) == as_set(sl));
|
||||
return (unsigned)as_set(se).size();
|
||||
}
|
||||
|
||||
public:
|
||||
seq_split_test() : m_reg(m), m_rw(m), m_split(m_rw), u(m), m_str(m), m_re(m) {
|
||||
m_str = u.str.mk_string_sort();
|
||||
m_re = re().mk_re(m_str);
|
||||
}
|
||||
|
||||
void test_eager_epsilon() {
|
||||
split_set s;
|
||||
ENSURE(eager(eps(), s));
|
||||
ENSURE(as_set(s) == pair_set({ { eps().get(), eps().get() } }));
|
||||
}
|
||||
|
||||
void test_eager_char() {
|
||||
// sigma(.) = { <eps, .>, <., eps> }
|
||||
expr_ref a = dot();
|
||||
split_set s;
|
||||
ENSURE(eager(a, s));
|
||||
pair_set expected({ { eps().get(), a.get() }, { a.get(), eps().get() } });
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_eager_word() {
|
||||
// sigma("ab") = { <"", "ab">, <"a","b">, <"ab",""> }
|
||||
split_set s;
|
||||
ENSURE(eager(word("ab"), s));
|
||||
pair_set expected({
|
||||
{ word("").get(), word("ab").get() },
|
||||
{ word("a").get(), word("b").get() },
|
||||
{ word("ab").get(), word("").get() },
|
||||
});
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_eager_union() {
|
||||
// sigma(a | b) = sigma(a) cup sigma(b)
|
||||
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
|
||||
expr_ref u_re(re().mk_union(a, b), m);
|
||||
split_set s;
|
||||
ENSURE(eager(u_re, s));
|
||||
pair_set expected({
|
||||
{ eps().get(), a.get() }, { a.get(), eps().get() },
|
||||
{ eps().get(), b.get() }, { b.get(), eps().get() },
|
||||
});
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_agree_all() {
|
||||
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
|
||||
expr_ref star(re().mk_star(a), m);
|
||||
expr_ref plus(re().mk_plus(a), m);
|
||||
expr_ref concat(re().mk_concat(a, b), m);
|
||||
expr_ref uni(re().mk_union(a, b), m);
|
||||
expr_ref inter(re().mk_inter(re().mk_star(a), re().mk_star(b)), m);
|
||||
expr_ref compl_(re().mk_complement(re().mk_star(a)), m);
|
||||
expr_ref diff(re().mk_diff(re().mk_star(a), re().mk_star(b)), m);
|
||||
|
||||
ENSURE(check_agree(eps()) == 1);
|
||||
ENSURE(check_agree(a) == 2);
|
||||
ENSURE(check_agree(word("ab")) == 3);
|
||||
ENSURE(check_agree(uni) == 4);
|
||||
ENSURE(check_agree(star) == 3); // { <eps,eps>, <a*, a.a*>, <a*.a, a*> }
|
||||
(void)check_agree(plus);
|
||||
(void)check_agree(concat);
|
||||
(void)check_agree(inter); // strong-mode intersection
|
||||
(void)check_agree(compl_); // strong-mode De Morgan complement
|
||||
(void)check_agree(diff);
|
||||
}
|
||||
|
||||
void test_lazy_early_stop() {
|
||||
// a* has 3 splits; pull just the first one and then stop. (Note .* is the
|
||||
// full_seq special case with a single split, so use a proper char-class body.)
|
||||
expr_ref star(re().mk_star(rng('a', 'a')), m);
|
||||
expr_ref node = m_split.make(star);
|
||||
ENSURE(node);
|
||||
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
|
||||
expr_ref d(m), n(m);
|
||||
unsigned seen = 0;
|
||||
if (it.next(d, n)) // pull exactly one split, then walk away
|
||||
++seen;
|
||||
ENSURE(!it.gave_up()); // stopping early is not a give-up
|
||||
ENSURE(seen == 1);
|
||||
}
|
||||
|
||||
void test_threshold_giveup() {
|
||||
expr_ref star(re().mk_star(rng('a', 'a')), m); // 3 splits
|
||||
split_set s;
|
||||
ENSURE(!lazy(star, s, /*threshold*/ 1));
|
||||
// the eager wrapper honours the same cap
|
||||
split_set s2;
|
||||
ENSURE(!eager(star, s2, /*threshold*/ 1));
|
||||
}
|
||||
|
||||
void test_weak_vs_strong() {
|
||||
expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
|
||||
expr_ref compl_(re().mk_complement(re().mk_star(dot())), m);
|
||||
|
||||
split_set s;
|
||||
ENSURE(!eager(inter, s, UINT_MAX, split_mode::weak));
|
||||
s.reset();
|
||||
ENSURE(!lazy(inter, s, UINT_MAX, split_mode::weak));
|
||||
s.reset();
|
||||
ENSURE(!eager(compl_, s, UINT_MAX, split_mode::weak));
|
||||
s.reset();
|
||||
ENSURE(!lazy(compl_, s, UINT_MAX, split_mode::weak));
|
||||
|
||||
// strong mode succeeds for both
|
||||
s.reset();
|
||||
ENSURE(eager(inter, s, UINT_MAX, split_mode::strong));
|
||||
s.reset();
|
||||
ENSURE(eager(compl_, s, UINT_MAX, split_mode::strong));
|
||||
}
|
||||
|
||||
void test_make_non_regex() {
|
||||
expr_ref not_a_regex(u.str.mk_string(zstring("a")), m); // String, not RegEx
|
||||
expr_ref node = m_split.make(not_a_regex);
|
||||
ENSURE(!node);
|
||||
}
|
||||
|
||||
void test_oracle_prunes() {
|
||||
// sigma(.) without an oracle = { <eps,.>, <.,eps> }; an oracle that keeps
|
||||
// only splits whose suffix is epsilon must drop one of the two.
|
||||
expr_ref a = dot();
|
||||
expr_ref e = eps();
|
||||
split_oracle keep_eps_suffix = [&](expr*, expr* n) { return n == e.get(); };
|
||||
|
||||
split_set se, sl;
|
||||
ENSURE(eager(a, se, UINT_MAX, split_mode::strong, keep_eps_suffix));
|
||||
ENSURE(lazy(a, sl, UINT_MAX, split_mode::strong, keep_eps_suffix));
|
||||
pair_set expected({ { a.get(), e.get() } });
|
||||
ENSURE(as_set(se) == expected);
|
||||
ENSURE(as_set(sl) == expected);
|
||||
}
|
||||
|
||||
void test_eager_full_seq() {
|
||||
// sigma(.*) = { <.*, .*> }
|
||||
expr_ref ds = dotstar();
|
||||
split_set s;
|
||||
ENSURE(eager(ds, s));
|
||||
ENSURE(as_set(s) == pair_set({ { ds.get(), ds.get() } }));
|
||||
}
|
||||
|
||||
void test_eager_bottom() {
|
||||
// sigma(empty) = {}
|
||||
split_set s;
|
||||
ENSURE(eager(empty_re(), s));
|
||||
ENSURE(s.empty());
|
||||
|
||||
split_set sl;
|
||||
ENSURE(lazy(empty_re(), sl));
|
||||
ENSURE(sl.empty());
|
||||
}
|
||||
|
||||
void test_eager_empty_word() {
|
||||
// sigma(to_re("")) = { <"", ""> } (a single, trivial split)
|
||||
split_set s;
|
||||
ENSURE(eager(word(""), s));
|
||||
ENSURE(as_set(s) == pair_set({ { word("").get(), word("").get() } }));
|
||||
}
|
||||
|
||||
void test_eager_star_content() {
|
||||
// sigma(a*) = { <eps,eps>, <a*.eps, a.a*>, <a*.a, eps.a*> }
|
||||
expr_ref a = rng('a', 'a');
|
||||
expr_ref as(re().mk_star(a), m);
|
||||
split_set s;
|
||||
ENSURE(eager(as, s));
|
||||
pair_set expected({
|
||||
{ eps().get(), eps().get() },
|
||||
{ rappend(as, eps()).get(), rappend(a, as).get() },
|
||||
{ rappend(as, a).get(), rappend(eps(), as).get() },
|
||||
});
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_eager_plus_content() {
|
||||
// sigma(a+) = a*.sigma(a).a* (the star rule without <eps,eps>)
|
||||
expr_ref a = rng('a', 'a');
|
||||
expr_ref as(re().mk_star(a), m);
|
||||
expr_ref ap(re().mk_plus(a), m);
|
||||
split_set s;
|
||||
ENSURE(eager(ap, s));
|
||||
pair_set expected({
|
||||
{ rappend(as, eps()).get(), rappend(a, as).get() },
|
||||
{ rappend(as, a).get(), rappend(eps(), as).get() },
|
||||
});
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_eager_concat_content() {
|
||||
// sigma(a.b) = sigma(a).b cup a.sigma(b)
|
||||
expr_ref a = rng('a', 'a'), b = rng('b', 'b');
|
||||
expr_ref ab(re().mk_concat(a, b), m);
|
||||
split_set s;
|
||||
ENSURE(eager(ab, s));
|
||||
pair_set expected({
|
||||
{ eps().get(), rappend(a, b).get() }, // <eps, a.b>
|
||||
{ a.get(), rappend(eps(), b).get() }, // <a, eps.b>
|
||||
{ rappend(a, eps()).get(), b.get() }, // <a.eps, b>
|
||||
{ rappend(a, b).get(), eps().get() }, // <a.b, eps>
|
||||
});
|
||||
ENSURE(as_set(s) == expected);
|
||||
}
|
||||
|
||||
void test_nary_union() {
|
||||
// sigma(a|b|c) has 2 splits per char-class
|
||||
expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
|
||||
expr_ref u3(re().mk_union(a, re().mk_union(b, c)), m);
|
||||
ENSURE(check_agree(u3) == 6);
|
||||
}
|
||||
|
||||
void test_nary_concat() {
|
||||
// sigma(a.b.c)
|
||||
expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
|
||||
expr_ref c3(re().mk_concat(a, re().mk_concat(b, c)), m);
|
||||
ENSURE(check_agree(c3) >= 4);
|
||||
}
|
||||
|
||||
void test_nested_complement() {
|
||||
// sigma(~~(a*))
|
||||
expr_ref cc(re().mk_complement(re().mk_complement(re().mk_star(rng('a', 'a')))), m);
|
||||
(void)check_agree(cc);
|
||||
}
|
||||
|
||||
void test_determinism() {
|
||||
expr_ref r(re().mk_concat(rng('a', 'a'), re().mk_star(rng('b', 'b'))), m);
|
||||
split_set s1, s2;
|
||||
ENSURE(lazy(r, s1));
|
||||
ENSURE(lazy(r, s2));
|
||||
ENSURE(as_set(s1) == as_set(s2));
|
||||
}
|
||||
|
||||
void test_threshold_boundary() {
|
||||
expr_ref as(re().mk_star(rng('a', 'a')), m); // exactly 3 splits
|
||||
split_set s;
|
||||
ENSURE(eager(as, s));
|
||||
unsigned k = (unsigned)as_set(s).size();
|
||||
ENSURE(k == 3);
|
||||
|
||||
split_set ok_e, ok_l, bad_e, bad_l;
|
||||
ENSURE(eager(as, ok_e, k));
|
||||
ENSURE(lazy(as, ok_l, k));
|
||||
ENSURE(!eager(as, bad_e, k - 1)); // one below threshold; give up
|
||||
ENSURE(!lazy(as, bad_l, k - 1));
|
||||
}
|
||||
|
||||
void test_early_stop_after_two() {
|
||||
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits
|
||||
expr_ref node = m_split.make(as);
|
||||
ENSURE(node);
|
||||
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
|
||||
expr_ref d(m), n(m);
|
||||
unsigned seen = 0;
|
||||
while (seen < 2 && it.next(d, n)) // pull two splits on demand, then stop
|
||||
++seen;
|
||||
ENSURE(!it.gave_up());
|
||||
ENSURE(seen == 2);
|
||||
}
|
||||
|
||||
void test_iterator_exhaustion() {
|
||||
// Pull every split on demand; gave_up() must stay false on a clean
|
||||
// exhaustion, and next() must keep returning false once drained.
|
||||
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits
|
||||
expr_ref node = m_split.make(as);
|
||||
ENSURE(node);
|
||||
seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
|
||||
expr_ref d(m), n(m);
|
||||
unsigned seen = 0;
|
||||
while (it.next(d, n))
|
||||
++seen;
|
||||
ENSURE(seen == 3);
|
||||
ENSURE(!it.gave_up());
|
||||
// idempotent past the end
|
||||
ENSURE(!it.next(d, n));
|
||||
ENSURE(!it.gave_up());
|
||||
}
|
||||
|
||||
void test_iterator_giveup() {
|
||||
// A threshold overrun aborts: next() returns false and gave_up() is true.
|
||||
expr_ref as(re().mk_star(rng('a', 'a')), m); // 3 splits, cap at 1
|
||||
expr_ref node = m_split.make(as);
|
||||
ENSURE(node);
|
||||
seq_split::iterator it = m_split.iterate(node, split_mode::strong, /*threshold*/ 1, {});
|
||||
expr_ref d(m), n(m);
|
||||
unsigned seen = 0;
|
||||
while (it.next(d, n))
|
||||
++seen;
|
||||
ENSURE(it.gave_up()); // aborted, not a clean exhaustion
|
||||
ENSURE(seen <= 1); // produced at most the capped number
|
||||
|
||||
// A weak-mode Boolean closure is likewise a give-up.
|
||||
expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
|
||||
expr_ref inode = m_split.make(inter);
|
||||
ENSURE(inode);
|
||||
seq_split::iterator wit = m_split.iterate(inode, split_mode::weak, UINT_MAX, {});
|
||||
ENSURE(!wit.next(d, n));
|
||||
ENSURE(wit.gave_up());
|
||||
}
|
||||
|
||||
void test_simplify() {
|
||||
expr_ref regs[] = {
|
||||
expr_ref(re().mk_star(rng('a', 'a')), m),
|
||||
expr_ref(re().mk_complement(re().mk_star(rng('a', 'a'))), m),
|
||||
expr_ref(re().mk_concat(rng('a', 'a'), rng('b', 'b')), m),
|
||||
};
|
||||
for (auto& r : regs) {
|
||||
split_set s;
|
||||
ENSURE(eager(r, s));
|
||||
unsigned before = (unsigned)s.size();
|
||||
m_split.simplify(s);
|
||||
ENSURE(s.size() <= before);
|
||||
ENSURE(!s.empty());
|
||||
// idempotent
|
||||
split_set s2(s);
|
||||
m_split.simplify(s2);
|
||||
ENSURE(as_set(s) == as_set(s2));
|
||||
}
|
||||
}
|
||||
|
||||
void test_trivial_oracle() {
|
||||
expr_ref r(re().mk_star(rng('a', 'a')), m);
|
||||
split_oracle keep_all = [](expr*, expr*) { return true; };
|
||||
split_set s_no, s_yes;
|
||||
ENSURE(eager(r, s_no));
|
||||
ENSURE(eager(r, s_yes, UINT_MAX, split_mode::strong, keep_all));
|
||||
ENSURE(as_set(s_no) == as_set(s_yes));
|
||||
}
|
||||
|
||||
void run() {
|
||||
test_eager_epsilon();
|
||||
test_eager_char();
|
||||
test_eager_word();
|
||||
test_eager_union();
|
||||
test_agree_all();
|
||||
test_lazy_early_stop();
|
||||
test_threshold_giveup();
|
||||
test_weak_vs_strong();
|
||||
test_make_non_regex();
|
||||
test_oracle_prunes();
|
||||
test_eager_full_seq();
|
||||
test_eager_bottom();
|
||||
test_eager_empty_word();
|
||||
test_eager_star_content();
|
||||
test_eager_plus_content();
|
||||
test_eager_concat_content();
|
||||
test_nary_union();
|
||||
test_nary_concat();
|
||||
test_nested_complement();
|
||||
test_determinism();
|
||||
test_threshold_boundary();
|
||||
test_early_stop_after_two();
|
||||
test_iterator_exhaustion();
|
||||
test_iterator_giveup();
|
||||
test_simplify();
|
||||
test_trivial_oracle();
|
||||
}
|
||||
};
|
||||
|
||||
void tst_seq_split() {
|
||||
seq_split_test t;
|
||||
t.run();
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue