mirror of
https://github.com/Z3Prover/z3
synced 2026-06-19 15:16:29 +00:00
Moved the regex splitting into rewriter
Added some simplifications
This commit is contained in:
parent
03a76c0309
commit
dbb3f70873
7 changed files with 484 additions and 324 deletions
|
|
@ -40,6 +40,7 @@ z3_add_component(rewriter
|
|||
seq_axioms.cpp
|
||||
seq_eq_solver.cpp
|
||||
seq_subset.cpp
|
||||
seq_split.cpp
|
||||
seq_rewriter.cpp
|
||||
seq_skolem.cpp
|
||||
th_rewriter.cpp
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ Notes:
|
|||
#include "ast/rewriter/rewriter_types.h"
|
||||
#include "ast/rewriter/bool_rewriter.h"
|
||||
#include "ast/rewriter/seq_subset.h"
|
||||
#include "ast/rewriter/seq_split.h"
|
||||
#include "util/params.h"
|
||||
#include "util/lbool.h"
|
||||
#include "util/sign.h"
|
||||
|
|
@ -130,6 +131,7 @@ class seq_rewriter {
|
|||
|
||||
seq_util m_util;
|
||||
seq_subset m_subset;
|
||||
seq_split m_split;
|
||||
arith_util m_autil;
|
||||
bool_rewriter m_br;
|
||||
// re2automaton m_re2aut;
|
||||
|
|
@ -342,7 +344,7 @@ class seq_rewriter {
|
|||
|
||||
public:
|
||||
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
|
||||
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m),
|
||||
m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m),
|
||||
m_op_cache(m), m_es(m),
|
||||
m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
|
||||
}
|
||||
|
|
@ -381,6 +383,12 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
// Split decomposition (sigma) of a regex; see seq_split.h.
|
||||
bool split(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong) {
|
||||
return m_split.compute(r, out, threshold, mode);
|
||||
}
|
||||
void simplify_split(split_set& s) { m_split.simplify(s); }
|
||||
|
||||
expr_ref mk_symmetric_diff(expr *r1, expr *r2);
|
||||
|
||||
/**
|
||||
|
|
|
|||
366
src/ast/rewriter/seq_split.cpp
Normal file
366
src/ast/rewriter/seq_split.cpp
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_split.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Regex split decomposition (the split function sigma). See seq_split.h.
|
||||
|
||||
Author:
|
||||
|
||||
Nikolaj Bjorner (nbjorner) 2026-6-10
|
||||
Clemens Eisenhofer 2026-6-10
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_split.h"
|
||||
#include "ast/rewriter/seq_rewriter.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "util/obj_hashtable.h"
|
||||
|
||||
seq_split::seq_split(seq_rewriter& rw) :
|
||||
m_rw(rw), m_subset(rw.u().re) {}
|
||||
|
||||
ast_manager& seq_split::m() const { return m_rw.m(); }
|
||||
seq_util& seq_split::seq() const { return m_rw.u(); }
|
||||
seq_util::rex& seq_split::re() const { return m_rw.u().re; }
|
||||
|
||||
// Cross-product intersection of two split-sets (split algebra):
|
||||
// S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
|
||||
// Pairs where any component is bottom (the empty regex) are dropped.
|
||||
bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold) {
|
||||
seq_util::rex& r = re();
|
||||
for (auto const& p1 : s1) {
|
||||
for (auto const& p2 : s2) {
|
||||
if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
|
||||
r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
|
||||
continue;
|
||||
result.push_back(split_pair(r.mk_inter(p1.m_d, p2.m_d),
|
||||
r.mk_inter(p1.m_n, p2.m_n), m()));
|
||||
if (result.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
|
||||
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
|
||||
// May produce up to 2^|sp| pairs (bounded by the threshold). A threshold
|
||||
// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
|
||||
// split-set, since each ~sp[i] further constrains ~S.
|
||||
bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold) {
|
||||
seq_util::rex& r = re();
|
||||
sort* re_sort = r.mk_re(seq_sort);
|
||||
const expr_ref full(r.mk_full_seq(re_sort), m()); // .*
|
||||
if (sp.empty()) { // ~{} = <.*, .*>
|
||||
result.push_back(split_pair(full, full, m()));
|
||||
return true;
|
||||
}
|
||||
split_set acc;
|
||||
acc.push_back(split_pair(r.mk_complement(sp[0].m_d), full, m()));
|
||||
acc.push_back(split_pair(full, r.mk_complement(sp[0].m_n), m()));
|
||||
for (unsigned i = 1; i < sp.size(); ++i) {
|
||||
split_set next;
|
||||
next.push_back(split_pair(r.mk_complement(sp[i].m_d), full, m()));
|
||||
next.push_back(split_pair(full, r.mk_complement(sp[i].m_n), m()));
|
||||
split_set tmp;
|
||||
if (!intersect(acc, next, tmp, threshold))
|
||||
return false;
|
||||
acc = std::move(tmp);
|
||||
if (acc.empty()) // intersection empty => ~S is empty
|
||||
break;
|
||||
if (acc.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
result.append(acc);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode) {
|
||||
SASSERT(r);
|
||||
seq_util& sq = seq();
|
||||
seq_util::rex& rex = re();
|
||||
ast_manager& mm = m();
|
||||
|
||||
sort* seq_sort = nullptr;
|
||||
if (!sq.is_re(r, seq_sort))
|
||||
return false;
|
||||
|
||||
// bottom: sigma(empty) = {} (the empty split-set)
|
||||
if (rex.is_empty(r))
|
||||
return true;
|
||||
|
||||
// epsilon: sigma(eps) = { <eps, eps> }
|
||||
if (rex.is_epsilon(r)) {
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
|
||||
result.push_back(split_pair(eps, eps, mm));
|
||||
return true;
|
||||
}
|
||||
|
||||
expr* a = nullptr, *b = nullptr;
|
||||
|
||||
// to_re(s): split the literal word s at every position.
|
||||
expr* s = nullptr;
|
||||
if (rex.is_to_re(r, s)) {
|
||||
zstring str;
|
||||
if (sq.str.is_string(s, str)) {
|
||||
for (unsigned i = 0; i <= str.length(); ++i) {
|
||||
const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm);
|
||||
const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm);
|
||||
result.push_back(split_pair(p, q, mm));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// a single symbolic unit behaves like one token: { <eps, u>, <u, eps> }
|
||||
if (sq.str.is_unit(s)) {
|
||||
const expr_ref ex(r, mm);
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
|
||||
result.push_back(split_pair(eps, ex, mm));
|
||||
result.push_back(split_pair(ex, eps, mm));
|
||||
return true;
|
||||
}
|
||||
// to_re over a non-literal sequence: not handled.
|
||||
return false;
|
||||
}
|
||||
|
||||
// single-character class alpha (., [lo-hi], of_pred):
|
||||
// sigma(alpha) = { <eps, alpha>, <alpha, eps> }
|
||||
if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
|
||||
const expr_ref ex(r, mm);
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
|
||||
result.push_back(split_pair(eps, ex, mm));
|
||||
result.push_back(split_pair(ex, eps, mm));
|
||||
return true;
|
||||
}
|
||||
|
||||
// .* : sigma(.*) = { <.*, .*> }
|
||||
if (rex.is_full_seq(r)) {
|
||||
const expr_ref ex(r, mm);
|
||||
result.push_back(split_pair(ex, ex, mm));
|
||||
return true;
|
||||
}
|
||||
|
||||
// union: sigma(r0 | ... | r_{n-1}) = U sigma(ri) (re.union may be n-ary)
|
||||
if (rex.is_union(r)) {
|
||||
app* ap = to_app(r);
|
||||
for (unsigned i = 0; i < ap->get_num_args(); ++i)
|
||||
if (!compute(ap->get_arg(i), result, threshold, mode))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// concat: sigma(r0...r_{n-1}) = U_i (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
|
||||
// (re.++ may be n-ary)
|
||||
if (rex.is_concat(r)) {
|
||||
app* ap = to_app(r);
|
||||
const unsigned n = ap->get_num_args();
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
split_set sigma_arg;
|
||||
if (!compute(ap->get_arg(i), sigma_arg, threshold, mode))
|
||||
return false;
|
||||
|
||||
expr_ref left(mm), right(mm);
|
||||
if (i == 0)
|
||||
left = rex.mk_epsilon(seq_sort);
|
||||
else
|
||||
for (unsigned j = 0; j < i; ++j) {
|
||||
expr* arg = ap->get_arg(j);
|
||||
left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm);
|
||||
}
|
||||
if (i == n - 1)
|
||||
right = rex.mk_epsilon(seq_sort);
|
||||
else
|
||||
for (unsigned j = i + 1; j < n; ++j) {
|
||||
expr* arg = ap->get_arg(j);
|
||||
right = right ? expr_ref(rex.mk_concat(right, arg), mm) : expr_ref(arg, mm);
|
||||
}
|
||||
|
||||
for (auto const& [d, nn] : sigma_arg) {
|
||||
const expr_ref p = m_rw.mk_re_append(left, d);
|
||||
const expr_ref q = m_rw.mk_re_append(nn, right);
|
||||
result.push_back(split_pair(p, q, mm));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
|
||||
if (rex.is_star(r, a)) {
|
||||
const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
|
||||
result.push_back(split_pair(eps, eps, mm));
|
||||
split_set sa;
|
||||
if (!compute(a, sa, threshold, mode))
|
||||
return false;
|
||||
for (auto const& [d, n] : sa) {
|
||||
const expr_ref p = m_rw.mk_re_append(r, d); // a*.D
|
||||
const expr_ref q = m_rw.mk_re_append(n, r); // N.a*
|
||||
result.push_back(split_pair(p, q, mm));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a* (star rule without <eps,eps>)
|
||||
if (rex.is_plus(r, a)) {
|
||||
const expr_ref star(rex.mk_star(a), mm); // a*
|
||||
split_set sa;
|
||||
if (!compute(a, sa, threshold, mode))
|
||||
return false;
|
||||
for (auto const& [d, n] : sa) {
|
||||
const expr_ref p = m_rw.mk_re_append(star, d);
|
||||
const expr_ref q = m_rw.mk_re_append(n, star);
|
||||
result.push_back(split_pair(p, q, mm));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri) (re.inter may be n-ary)
|
||||
if (rex.is_intersection(r)) {
|
||||
if (mode == split_mode::weak)
|
||||
return false;
|
||||
app* ap = to_app(r);
|
||||
const unsigned n = ap->get_num_args();
|
||||
split_set current;
|
||||
if (!compute(ap->get_arg(0), current, threshold, mode))
|
||||
return false;
|
||||
// A give-up on any conjunct must propagate as a give-up: silently treating
|
||||
// it as the empty split-set would collapse the whole intersection to bottom
|
||||
// and be misreported as an (unsound) conflict.
|
||||
for (unsigned i = 1; i < n && !current.empty(); ++i) {
|
||||
split_set arg_i, tmp;
|
||||
if (!compute(ap->get_arg(i), arg_i, threshold, mode))
|
||||
return false;
|
||||
if (!intersect(current, arg_i, tmp, threshold))
|
||||
return false;
|
||||
current = std::move(tmp);
|
||||
}
|
||||
result.append(current);
|
||||
return true;
|
||||
}
|
||||
|
||||
// complement: sigma(~a) = ~sigma(a)
|
||||
if (rex.is_complement(r, a)) {
|
||||
if (mode == split_mode::weak)
|
||||
return false;
|
||||
split_set sa;
|
||||
if (!compute(a, sa, threshold, mode))
|
||||
return false;
|
||||
return complement(seq_sort, sa, result, threshold);
|
||||
}
|
||||
|
||||
// difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b)
|
||||
if (rex.is_diff(r, a, b)) {
|
||||
if (mode == split_mode::weak)
|
||||
return false;
|
||||
split_set sa, sb, sb_compl, tmp;
|
||||
if (!compute(a, sa, threshold, mode))
|
||||
return false;
|
||||
if (!compute(b, sb, threshold, mode))
|
||||
return false;
|
||||
if (!complement(seq_sort, sb, sb_compl, threshold))
|
||||
return false;
|
||||
if (!intersect(sa, sb_compl, tmp, threshold))
|
||||
return false;
|
||||
result.append(tmp);
|
||||
return true;
|
||||
}
|
||||
|
||||
// bounded loop / ite / other: not handled (paper "v1: bail").
|
||||
TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";);
|
||||
return false;
|
||||
}
|
||||
|
||||
// same-D / same-N merge (paper eqs. 1 & 2):
|
||||
// { <D,N>, <D,N'> } -> <D, N | N'> (by_left = true, group by D)
|
||||
// { <D,N>, <D',N> } -> <D | D', N> (by_left = false, group by N)
|
||||
// Only fires on syntactically-identical (perfectly-shared) key components, so
|
||||
// it is a conservative instance of the rule.
|
||||
void seq_split::merge_by(split_set& pairs, bool by_left) {
|
||||
seq_util::rex& r = re();
|
||||
ast_manager& mm = m();
|
||||
obj_map<expr, unsigned> idx; // key component -> position in `out`
|
||||
split_set out;
|
||||
for (auto const& p : pairs) {
|
||||
expr* key = by_left ? p.m_d.get() : p.m_n.get();
|
||||
expr* other = by_left ? p.m_n.get() : p.m_d.get();
|
||||
unsigned pos;
|
||||
if (idx.find(key, pos)) {
|
||||
expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
|
||||
const expr_ref u(r.mk_union(prev, other), mm);
|
||||
if (by_left)
|
||||
out[pos].m_n = u;
|
||||
else
|
||||
out[pos].m_d = u;
|
||||
}
|
||||
else {
|
||||
idx.insert(key, out.size());
|
||||
out.push_back(p);
|
||||
}
|
||||
}
|
||||
pairs.swap(out);
|
||||
}
|
||||
|
||||
void seq_split::simplify(split_set& pairs) {
|
||||
seq_util::rex& r = re();
|
||||
|
||||
// 1. drop pairs with a bottom (empty-language) component.
|
||||
unsigned w = 0;
|
||||
for (unsigned i = 0; i < pairs.size(); ++i) {
|
||||
if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
|
||||
continue;
|
||||
if (w != i)
|
||||
pairs[w] = pairs[i];
|
||||
++w;
|
||||
}
|
||||
pairs.shrink(w);
|
||||
if (pairs.size() <= 1)
|
||||
return;
|
||||
|
||||
// 2. same-D / same-N merge rules.
|
||||
merge_by(pairs, true);
|
||||
merge_by(pairs, false);
|
||||
if (pairs.size() <= 1)
|
||||
return;
|
||||
|
||||
// 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
|
||||
// L(N_i) subseteq L(N_j) for some kept j. seq_subset is conservative
|
||||
// (returns true only for definite containment), so we never drop a
|
||||
// needed split. Size-capped to bound the O(n^2) subset checks.
|
||||
if (pairs.size() > 64)
|
||||
return;
|
||||
|
||||
struct row { expr* d; expr* n; unsigned idx; };
|
||||
vector<row> rows;
|
||||
for (unsigned i = 0; i < pairs.size(); ++i)
|
||||
rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
|
||||
|
||||
auto subsumes = [&](row const& a, row const& b) {
|
||||
return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
|
||||
};
|
||||
|
||||
vector<row> kept;
|
||||
for (row const& row_r : rows) {
|
||||
bool redundant = false;
|
||||
for (row const& k : kept)
|
||||
if (subsumes(k, row_r)) { redundant = true; break; }
|
||||
if (redundant)
|
||||
continue;
|
||||
// drop already-kept rows strictly subsumed by row_r
|
||||
unsigned kw = 0;
|
||||
for (unsigned t = 0; t < kept.size(); ++t) {
|
||||
if (subsumes(row_r, kept[t]))
|
||||
continue;
|
||||
kept[kw++] = kept[t];
|
||||
}
|
||||
kept.shrink(kw);
|
||||
kept.push_back(row_r);
|
||||
}
|
||||
|
||||
split_set result;
|
||||
for (row const& k : kept)
|
||||
result.push_back(pairs[k.idx]);
|
||||
pairs.swap(result);
|
||||
}
|
||||
93
src/ast/rewriter/seq_split.h
Normal file
93
src/ast/rewriter/seq_split.h
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_split.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Regex split decomposition: the split function sigma from the paper
|
||||
"Solving by Splitting". For a regular expression r, sigma(r) is a finite
|
||||
"split-set" of pairs { <D_i, N_i> } such that
|
||||
|
||||
u.v in L(r) iff exists i: u in L(D_i) and v in L(N_i).
|
||||
|
||||
This lifts the decomposition previously buried in the nseq solver
|
||||
(seq::compute_sigma over euf::snode) into a self-contained, expr*-based
|
||||
engine that can be reused anywhere a seq_rewriter is available (the legacy
|
||||
seq solver, nseq, and the regex rewriter itself).
|
||||
|
||||
The split algebra (intersection, De Morgan complement, left/right
|
||||
concatenation with a regex) and the cardinality-reducing simplification
|
||||
heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
|
||||
follow the paper.
|
||||
|
||||
Author:
|
||||
|
||||
Nikolaj Bjorner (nbjorner) 2026-6-10
|
||||
Clemens Eisenhofer 2026-6-10
|
||||
|
||||
--*/
|
||||
#pragma once
|
||||
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_subset.h"
|
||||
|
||||
class seq_rewriter;
|
||||
|
||||
// An individual split <D, N>: the left (prefix) regex D and right (suffix)
|
||||
// regex N. u.v in L(r) for this split iff u in L(D) and v in L(N).
|
||||
struct split_pair {
|
||||
expr_ref m_d;
|
||||
expr_ref m_n;
|
||||
split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
|
||||
SASSERT(d && n);
|
||||
}
|
||||
};
|
||||
|
||||
// A split-set is a union of individual splits.
|
||||
typedef vector<split_pair> split_set;
|
||||
|
||||
// Controls how aggressively sigma expands the Boolean-closure cases:
|
||||
// strong - fully expand complement / intersection via the split algebra
|
||||
// (De Morgan / cross product). This is the behaviour the nseq
|
||||
// solver relies on.
|
||||
// weak - do not perform the (potentially 2^k) Boolean-closure expansion;
|
||||
// give up (return false) on complement / intersection instead.
|
||||
enum class split_mode { weak, strong };
|
||||
|
||||
class seq_split {
|
||||
seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access
|
||||
seq_subset m_subset; // language-subset checks for subsumption
|
||||
|
||||
ast_manager& m() const;
|
||||
seq_util& seq() const;
|
||||
seq_util::rex& re() const;
|
||||
|
||||
// S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
|
||||
// component. Returns false on threshold overrun.
|
||||
bool intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold);
|
||||
|
||||
// De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
|
||||
// ~<D,N> = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }.
|
||||
bool complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold);
|
||||
|
||||
// same-D / same-N merge: groups pairs that share a (syntactically identical)
|
||||
// left (resp. right) component and unions the other component.
|
||||
void merge_by(split_set& pairs, bool by_left);
|
||||
|
||||
public:
|
||||
explicit seq_split(seq_rewriter& rw);
|
||||
|
||||
// Compute sigma(r), appending to `out` (does not clear it). `threshold`
|
||||
// bounds the number of produced splits; an overrun, an unsupported regex
|
||||
// shape (bounded loop / ite), or a Boolean-closure case in weak mode makes
|
||||
// it return false ("give up").
|
||||
bool compute(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong);
|
||||
|
||||
// In-place simplification of a split-set: drop bottom components, apply the
|
||||
// same-D / same-N merge rules, and drop splits subsumed by another (using
|
||||
// seq_subset). Size-capped to keep the O(n^2) subsumption affordable.
|
||||
void simplify(split_set& s);
|
||||
};
|
||||
|
|
@ -3576,291 +3576,6 @@ namespace seq {
|
|||
// Modifier: apply_regex_factorization (Boolean Closure)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// Cross-product intersection of two split-sets (split algebra):
|
||||
// S1 ⊓ S2 = { ⟨Δ1⊓Δ2, ∇1⊓∇2⟩ | ⟨Δ1,∇1⟩∈S1, ⟨Δ2,∇2⟩∈S2 }
|
||||
// Pairs where either component is the empty regex are dropped (∅⊓r ≡ ∅).
|
||||
static bool intersect_sigma_pairs(ast_manager& m, seq_util& seq,
|
||||
sigma_pairs const& s1, sigma_pairs const& s2, sigma_pairs& result, unsigned threshold) {
|
||||
for (auto const& p1 : s1) {
|
||||
for (auto const& p2 : s2) {
|
||||
if (seq.re.is_empty(p1.m_p) || seq.re.is_empty(p2.m_p) ||
|
||||
seq.re.is_empty(p1.m_q) || seq.re.is_empty(p2.m_q))
|
||||
continue;
|
||||
result.push_back(sigma_pair(seq.re.mk_inter(p1.m_p, p2.m_p),
|
||||
seq.re.mk_inter(p1.m_q, p2.m_q), m));
|
||||
if (result.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Complement of a split-set via De Morgan: ~S = ⊓_{s∈S} ~s,
|
||||
// ~⟨Δ,∇⟩ = { ⟨~Δ, .*⟩, ⟨.*, ~∇⟩ } and ~{} = { ⟨.*, .*⟩ }.
|
||||
// str_sort is the sequence-element sort; mk_full_seq needs the regex sort.
|
||||
// May produce up to 2^|sp| pairs (bounded downstream by the factorization threshold).
|
||||
static bool complement_sigma_pairs(ast_manager& m, seq_util& seq, sort* str_sort,
|
||||
sigma_pairs const& sp, sigma_pairs& result, unsigned threshold) {
|
||||
|
||||
sort* re_sort = seq.re.mk_re(str_sort);
|
||||
const expr_ref full(seq.re.mk_full_seq(re_sort), m); // .*
|
||||
if (sp.empty()) { // ~{} = ⟨.*, .*⟩
|
||||
result.push_back(sigma_pair(full, full, m));
|
||||
return true;
|
||||
}
|
||||
sigma_pairs acc;
|
||||
acc.push_back(sigma_pair(seq.re.mk_complement(sp[0].m_p), full, m));
|
||||
acc.push_back(sigma_pair(full, seq.re.mk_complement(sp[0].m_q), m));
|
||||
for (unsigned i = 1; i < sp.size(); ++i) {
|
||||
sigma_pairs next;
|
||||
next.push_back(sigma_pair(seq.re.mk_complement(sp[i].m_p), full, m));
|
||||
next.push_back(sigma_pair(full, seq.re.mk_complement(sp[i].m_q), m));
|
||||
sigma_pairs tmp;
|
||||
// De Morgan fold: acc := acc ⊓ ~sp[i]. intersect_sigma_pairs returns
|
||||
// false ONLY when it overruns the threshold; in that case we must give
|
||||
// up entirely (a partial fold is a strictly weaker — hence unsound —
|
||||
// split set, since each ~sp[i] further constrains ~S).
|
||||
if (!intersect_sigma_pairs(m, seq, acc, next, tmp, threshold))
|
||||
return false;
|
||||
acc = std::move(tmp);
|
||||
if (acc.empty()) // intersection empty ⇒ ~S is empty
|
||||
break;
|
||||
if (acc.size() > threshold)
|
||||
return false;
|
||||
}
|
||||
result.append(acc);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold) {
|
||||
SASSERT(r);
|
||||
sort* str_sort = nullptr;
|
||||
if (!seq.is_re(r->get_expr(), str_sort))
|
||||
return false;
|
||||
// std::cout << "Computing sigma of " << snode_label_html(r, m, false) << std::endl;
|
||||
|
||||
if (r->is_empty()) {
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
result.push_back(sigma_pair(eps, eps, m));
|
||||
return true;
|
||||
}
|
||||
if (r->is_to_re()) {
|
||||
const euf::snode* const c = r->arg0();
|
||||
if (c->is_range()) {
|
||||
const expr_ref ex(c->get_expr(), m);
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
|
||||
result.push_back(sigma_pair(eps, ex, m));
|
||||
result.push_back(sigma_pair(ex, eps, m));
|
||||
return true;
|
||||
}
|
||||
if (c->is_empty()) {
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
result.push_back(sigma_pair(eps, eps, m));
|
||||
return true;
|
||||
}
|
||||
if (c->is_char()) {
|
||||
unsigned val;
|
||||
VERIFY(seq.is_const_char(c->arg0()->get_expr(), val));
|
||||
const expr_ref ex(seq.re.mk_to_re(seq.str.mk_string(zstring(val))), m);
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
|
||||
result.push_back(sigma_pair(eps, ex, m));
|
||||
result.push_back(sigma_pair(ex, eps, m));
|
||||
return true;
|
||||
}
|
||||
zstring s;
|
||||
if (c->is_string(s, seq)) {
|
||||
for (unsigned i = 0; i <= s.length(); ++i) {
|
||||
expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m);
|
||||
expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m);
|
||||
result.push_back(sigma_pair(p, q, m));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
std::cout << mk_pp(c->get_expr(), m) << std::endl;
|
||||
UNREACHABLE();
|
||||
return false;
|
||||
}
|
||||
if (r->is_full_char()) {
|
||||
const expr_ref ex(r->get_expr(), m);
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
|
||||
result.push_back(sigma_pair(eps, ex, m));
|
||||
result.push_back(sigma_pair(ex, eps, m));
|
||||
return true;
|
||||
}
|
||||
if (r->is_full_seq()) {
|
||||
const expr_ref ex(r->get_expr(), m);
|
||||
result.push_back(sigma_pair(ex, ex, m));
|
||||
return true;
|
||||
}
|
||||
if (r->is_union()) {
|
||||
// σ(r₁ ⊔ r₂) = σ(r₁) ∪ σ(r₂)
|
||||
SASSERT(r->num_args() >= 2);
|
||||
for (unsigned i = 0; i < r->num_args(); i++) {
|
||||
if (!compute_sigma(m, seq, rw, r->arg(i), result, threshold))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (r->is_intersect()) {
|
||||
// σ(r₁ ⊓ r₂ ⊓ …) = σ(r₁) ⊓ σ(r₂) ⊓ …; empty intersection (0 args) = {⟨.*,.*⟩}
|
||||
const unsigned n = r->num_args();
|
||||
SASSERT(n >= 2);
|
||||
sigma_pairs current;
|
||||
if (!compute_sigma(m, seq, rw, r->arg0(), current, threshold))
|
||||
return false;
|
||||
for (unsigned i = 1; i < n && !current.empty(); ++i) {
|
||||
sigma_pairs arg_i;
|
||||
// A give-up on any conjunct must propagate as a give-up: silently
|
||||
// treating arg_i as the empty split-set would collapse the whole
|
||||
// intersection to ∅ and be misreported as an (unsound) conflict.
|
||||
if (!compute_sigma(m, seq, rw, r->arg(i), arg_i, threshold))
|
||||
return false;
|
||||
sigma_pairs tmp;
|
||||
if (!intersect_sigma_pairs(m, seq, current, arg_i, tmp, threshold))
|
||||
return false;
|
||||
current = std::move(tmp);
|
||||
}
|
||||
result.append(current);
|
||||
return true;
|
||||
}
|
||||
if (r->is_complement()) {
|
||||
// σ(~r) = ~σ(r)
|
||||
sigma_pairs body_pairs;
|
||||
if (!compute_sigma(m, seq, rw, r->arg0(), body_pairs, threshold))
|
||||
return false;
|
||||
return complement_sigma_pairs(m, seq, str_sort, body_pairs, result, threshold);
|
||||
}
|
||||
if (r->is_concat()) {
|
||||
const unsigned n = r->num_args();
|
||||
SASSERT(n >= 2);
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
sigma_pairs sigma_arg;
|
||||
if (!compute_sigma(m, seq, rw, r->arg(i), sigma_arg, threshold))
|
||||
return false;
|
||||
|
||||
expr_ref left(m);
|
||||
expr_ref right(m);
|
||||
|
||||
if (i == 0)
|
||||
left = seq.re.mk_epsilon(str_sort);
|
||||
else {
|
||||
for (unsigned j = 0; j < i; ++j) {
|
||||
const euf::snode* arg = r->arg(j);
|
||||
left = left ? seq.re.mk_concat(left, arg->get_expr()) : arg->get_expr();
|
||||
}
|
||||
}
|
||||
|
||||
if (i == n - 1)
|
||||
right = seq.re.mk_epsilon(str_sort);
|
||||
else {
|
||||
for (unsigned j = i + 1; j < n; ++j) {
|
||||
const euf::snode* arg = r->arg(j);
|
||||
right = right ? seq.re.mk_concat(right, arg->get_expr()) : arg->get_expr();
|
||||
}
|
||||
}
|
||||
|
||||
for (auto const &[tp, tq] : sigma_arg) {
|
||||
expr_ref p = rw.mk_re_append(left, tp);
|
||||
expr_ref q = rw.mk_re_append(tq, right);
|
||||
result.push_back(sigma_pair(p, q, m));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (r->is_star()) {
|
||||
const euf::snode* body = r->arg0();
|
||||
const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
|
||||
result.push_back(sigma_pair(eps, eps, m));
|
||||
|
||||
sigma_pairs sigma_body;
|
||||
if (!compute_sigma(m, seq, rw, body, sigma_body, threshold))
|
||||
return false;
|
||||
for (auto const &[tp, tq] : sigma_body) {
|
||||
auto p = rw.mk_re_append(r->get_expr(), tp);
|
||||
auto q = rw.mk_re_append(tq, r->get_expr());
|
||||
result.push_back(sigma_pair(p, q, m));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (r->is_plus()) {
|
||||
// r⁺ = r·r* ; by Kleene factorization σ(r⁺) = r*·σ(r)·r*.
|
||||
// Same shape as the star rule but with the surrounding factor being
|
||||
// body* without the {⟨ε,ε⟩} pair
|
||||
const euf::snode* body = r->arg0();
|
||||
const expr_ref star(seq.re.mk_star(body->get_expr()), m); // body*
|
||||
|
||||
sigma_pairs sigma_body;
|
||||
if (!compute_sigma(m, seq, rw, body, sigma_body, threshold))
|
||||
return false;
|
||||
for (auto const &[tp, tq] : sigma_body) {
|
||||
auto p = rw.mk_re_append(star, tp); // body* · tp
|
||||
auto q = rw.mk_re_append(tq, star); // tq · body*
|
||||
result.push_back(sigma_pair(p, q, m));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// the simplifier should have eliminated most of them already
|
||||
// TODO: so far, we are, however, still missing bounded repetitions and ite
|
||||
std::cout << "Unknown element " << mk_pp(r->get_expr(), m) << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg) {
|
||||
return; // For now
|
||||
if (pairs.size() <= 1)
|
||||
return;
|
||||
// Guard against pathological cost: subsumption is O(n^2) language-subset
|
||||
// BFS checks. Large split-sets are left to the factorization threshold.
|
||||
if (pairs.size() > 64)
|
||||
return;
|
||||
|
||||
struct row { euf::snode* p; euf::snode* q; unsigned idx; };
|
||||
|
||||
// Materialise snodes once; drop pairs with a structurally-empty component.
|
||||
vector<row> rows;
|
||||
for (unsigned i = 0; i < pairs.size(); ++i) {
|
||||
euf::snode* p = sg.mk(pairs[i].m_p);
|
||||
euf::snode* q = sg.mk(pairs[i].m_q);
|
||||
if (sr.is_empty_regex(p) || sr.is_empty_regex(q))
|
||||
continue;
|
||||
rows.push_back({ p, q, i });
|
||||
}
|
||||
|
||||
// a subsumes b iff L(b.p) ⊆ L(a.p) and L(b.q) ⊆ L(a.q).
|
||||
// is_language_subset may return l_undef (inconclusive); only treat a
|
||||
// definite l_true as subsumption, so we never drop a needed split.
|
||||
auto subsumes = [&](row const& a, row const& b) {
|
||||
return sr.is_language_subset(b.p, a.p) == l_true &&
|
||||
sr.is_language_subset(b.q, a.q) == l_true;
|
||||
};
|
||||
|
||||
vector<row> kept;
|
||||
for (row const& r : rows) {
|
||||
bool redundant = false;
|
||||
for (row const& k : kept)
|
||||
if (subsumes(k, r)) { redundant = true; break; }
|
||||
if (redundant)
|
||||
continue;
|
||||
// drop already-kept rows strictly subsumed by r
|
||||
unsigned w = 0;
|
||||
for (unsigned t = 0; t < kept.size(); ++t) {
|
||||
if (subsumes(r, kept[t]))
|
||||
continue;
|
||||
kept[w++] = kept[t];
|
||||
}
|
||||
kept.shrink(w);
|
||||
kept.push_back(r);
|
||||
}
|
||||
|
||||
sigma_pairs result;
|
||||
for (row const& k : kept)
|
||||
result.push_back(pairs[k.idx]);
|
||||
pairs.swap(result);
|
||||
}
|
||||
|
||||
bool nielsen_graph::apply_regex_factorization(nielsen_node* node) {
|
||||
if (m_regex_factorization_threshold == 0)
|
||||
return false;
|
||||
|
|
@ -3874,14 +3589,14 @@ namespace seq {
|
|||
for (str_mem const& mem : node->str_mems()) {
|
||||
SASSERT(mem.well_formed());
|
||||
|
||||
// compute_sigma handles all regex forms (incl. complement / intersection),
|
||||
// split() handles all regex forms (incl. complement / intersection),
|
||||
// so the classical restriction is no longer needed.
|
||||
if (mem.m_str->is_empty() || mem.is_primitive())
|
||||
continue;
|
||||
|
||||
// compute_sigma / compute_tau do not understand the projection
|
||||
// operator (re.proj) — they would recurse into it and hit an
|
||||
// UNREACHABLE. Projection-constrained memberships are handled by the
|
||||
// The split engine works on plain regex AST and does not understand the
|
||||
// projection operator (re.proj) — it would give up on it anyway.
|
||||
// Projection-constrained memberships are handled by the
|
||||
// cycle-decomposition path, so skip them here.
|
||||
if (mem.m_regex->has_projection())
|
||||
continue;
|
||||
|
|
@ -3892,13 +3607,14 @@ namespace seq {
|
|||
euf::snode* tail = m_sg.drop_first(mem.m_str);
|
||||
SASSERT(tail);
|
||||
|
||||
sigma_pairs pairs;
|
||||
// Decompose the regex into a split-set via the shared seq_split engine
|
||||
// (sigma from the paper): first ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩.
|
||||
split_set pairs;
|
||||
seq_rewriter rw(m);
|
||||
if (!compute_sigma(m, m_seq, rw, mem.m_regex, pairs, m_regex_factorization_threshold))
|
||||
if (!rw.split(mem.m_regex->get_expr(), pairs, m_regex_factorization_threshold))
|
||||
continue;
|
||||
|
||||
if (m_seq_regex)
|
||||
simplify_sigma_pairs(pairs, *m_seq_regex, m_sg);
|
||||
rw.simplify_split(pairs);
|
||||
|
||||
vector<rf_split> feasible;
|
||||
dep_tracker eliminated_dep = mem.m_dep;
|
||||
|
|
|
|||
|
|
@ -262,30 +262,6 @@ namespace seq {
|
|||
|
||||
std::string snode_label_html(euf::snode const* n, ast_manager& m, bool html_escape);
|
||||
|
||||
// Split-pair produced by compute_sigma: uv |= r iff exists i: u |= m_p[i] and v |= m_q[i]
|
||||
struct sigma_pair {
|
||||
expr_ref m_p;
|
||||
expr_ref m_q;
|
||||
sigma_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) {
|
||||
SASSERT(p && q);
|
||||
}
|
||||
};
|
||||
typedef vector<sigma_pair> sigma_pairs;
|
||||
|
||||
// Compute the split-set sigma(r) per the splitting rules of the paper
|
||||
// "Extended Regular Expression Membership". Generalises the classical compute_tau
|
||||
// with intersection and complement cases via the split-set algebra.
|
||||
// `result` is appended to (not cleared).
|
||||
bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold);
|
||||
|
||||
// Simplify a split-set in place using the split algebra's language-level rules
|
||||
// (paper section "split-set simplification heuristics"): drop pairs with an
|
||||
// empty-language component, and drop any split subsumed by another
|
||||
// (<D_i,N_i> is subsumed by <D_j,N_j> iff L(D_i) subseteq L(D_j) and L(N_i) subseteq L(N_j)).
|
||||
// Subsumption tames the 2^k blow-up of sigma(~r) (e.g. sigma(~a*): 8 -> 2).
|
||||
// Requires the regex emptiness/subset checker and an sgraph to build snodes.
|
||||
void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg);
|
||||
|
||||
// simplification result for constraint processing
|
||||
// mirrors ZIPT's SimplifyResult enum
|
||||
enum class simplify_result {
|
||||
|
|
|
|||
|
|
@ -555,7 +555,7 @@ namespace smt {
|
|||
|
||||
// Eager sigma factorization (token-level): when enabled, split a non-primitive
|
||||
// membership s ∈ r at the boundary between the first concat argument (head) and
|
||||
// the rest (tail), using compute_sigma. This mirrors the lazy Nielsen
|
||||
// the rest (tail), using the shared seq_split engine. This mirrors the lazy Nielsen
|
||||
// apply_regex_factorization and the paper's Reduce rule for x·u'.
|
||||
// (s ∈ r) → ⋁_{⟨Δ,∇⟩∈σ(r)} ( head ∈ Δ ∧ tail ∈ ∇ )
|
||||
// Only fires for a concatenation s (single-variable s is already primitive).
|
||||
|
|
@ -570,12 +570,12 @@ namespace smt {
|
|||
const expr_ref tail(m_seq.str.mk_concat(na - 1, a->get_args() + 1, s->get_sort()), m);
|
||||
|
||||
const unsigned threshold = get_fparams().m_nseq_regex_factorization_threshold;
|
||||
seq::sigma_pairs pairs;
|
||||
if (!seq::compute_sigma(m, m_seq, m_rewriter, mem.m_regex, pairs, threshold))
|
||||
split_set pairs;
|
||||
if (!m_rewriter.split(mem.m_regex->get_expr(), pairs, threshold))
|
||||
// we give up
|
||||
return;
|
||||
|
||||
seq::simplify_sigma_pairs(pairs, m_regex, m_sgraph);
|
||||
m_rewriter.simplify_split(pairs);
|
||||
|
||||
if (pairs.empty()) {
|
||||
// no viable splits
|
||||
|
|
@ -598,8 +598,8 @@ namespace smt {
|
|||
lits.push_back(~mem.lit);
|
||||
//std::cout << "Decomposing into:\n";
|
||||
for (auto const& sp : pairs) {
|
||||
expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_p), m);
|
||||
expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_q), m);
|
||||
expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_d), m);
|
||||
expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_n), m);
|
||||
expr_ref conj(m.mk_and(mem_head, mem_tail), m);
|
||||
lits.push_back(mk_literal(conj));
|
||||
//seq::dep_tracker dep = nullptr;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue