3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-26 10:28:48 +00:00

wip(seq): clean_leaf cofactors, mk_union_core simplifications, re_is_empty antimirov emptiness check

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Margus Veanes 2026-06-26 00:36:14 +03:00
parent 3d42d997c6
commit 149549b946
7 changed files with 216 additions and 22 deletions

View file

@ -722,6 +722,20 @@ namespace seq {
expr_ref derive::mk_union_core(expr* a, expr* b) {
// Identity: none R = R (none is the unit of union)
// Idempotence: R R = R
// Absorption: Σ* R = Σ*
// Without these the derivative of an intersection accumulates
// un-simplified unions such as union(inter, union(none, none)),
// producing many syntactically distinct but semantically equal
// states. That defeats state dedup in the emptiness/bisim closure
// and makes contains-pattern intersections blow up.
if (re().is_empty(a)) return expr_ref(b, m);
if (re().is_empty(b)) return expr_ref(a, m);
if (a == b) return expr_ref(a, m);
if (re().is_full_seq(a) || re().is_full_seq(b))
return expr_ref(re().mk_full_seq(a->get_sort()), m);
// Prefix factoring: a·x a·y = a·(x y)
expr *a1, *a2, *b1, *b2;
if (re().is_concat(a, a1, a2) && re().is_concat(b, b1, b2) && a1 == b1) {
@ -1308,14 +1322,29 @@ namespace seq {
// Cofactor enumeration over a transition regex
// -------------------------------------------------------
expr_ref derive::clean_leaf(expr* r) {
expr* a = nullptr, * b = nullptr;
if (re().is_union(r, a, b))
return mk_union(clean_leaf(a), clean_leaf(b));
if (re().is_intersection(r, a, b))
return mk_inter(clean_leaf(a), clean_leaf(b));
return expr_ref(r, m);
}
void derive::get_cofactors_rec(expr* r, expr_ref_pair_vector& result) {
// Hoist the (first) if-then-else condition to the top of r, splitting it
// into the equivalent ite(c, th, el); when r contains no ite it is a
// leaf of the transition regex.
expr_ref c(m), th(m), el(m);
if (!m_br.decompose_ite(r, c, th, el)) {
if (!re().is_empty(r))
result.push_back(get_path_expr(), r);
// Re-normalize the leaf: decompose_ite substitutes ITE branches
// structurally so the leaf may carry un-simplified union(_, none)
// / inter(_, none) nodes. Cleaning them keeps semantically equal
// states syntactically identical, which is essential for state
// dedup in the emptiness/bisim closure.
expr_ref cr = clean_leaf(r);
if (!re().is_empty(cr))
result.push_back(get_path_expr(), cr);
return;
}
// Positive branch: c holds.

View file

@ -167,6 +167,13 @@ namespace seq {
// Cofactor enumeration over a transition regex (ITE-tree).
void get_cofactors_rec(expr* r, expr_ref_pair_vector& result);
// Re-apply union/intersection simplifications bottom-up to a cofactor
// leaf. decompose_ite substitutes ITE branch values structurally
// (no simplification), so leaves can contain un-normalized nodes such
// as union(R, none) or inter(R, none); this rebuilds them through
// mk_union/mk_inter so equal states share a canonical form.
expr_ref clean_leaf(expr* r);
sort* re_sort(expr* r) { return r->get_sort(); }
sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; }
sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; }

View file

@ -3063,6 +3063,10 @@ bool seq_rewriter::le_char(expr* ch1, expr* ch2) {
Current cases handled:
- a and b are char <= constraints, or negations of char <= constraints
- a and b are equalities (element = constant char), or their negations.
These arise from derivatives of single characters and must be pruned
when combining BDDs so that no unreachable branch such as
ite(x = 'a', ite(x = 'b', ...), ...) (with 'a' != 'b') is created.
*/
bool seq_rewriter::pred_implies(expr* a, expr* b) {
STRACE(seq_verbose, tout << "pred_implies: "
@ -3070,6 +3074,26 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) {
<< "," << mk_pp(b, m()) << std::endl;);
expr *cha1 = nullptr, *cha2 = nullptr, *nota = nullptr,
*chb1 = nullptr, *chb2 = nullptr, *notb = nullptr;
// (element = constant char), returning the element and char code.
auto is_char_eq = [&](expr* e, expr*& x, unsigned& v) {
expr* t1 = nullptr, *t2 = nullptr;
if (!m().is_eq(e, t1, t2))
return false;
if (u().is_const_char(t2, v)) { x = t1; return true; }
if (u().is_const_char(t1, v)) { x = t2; return true; }
return false;
};
expr *xa = nullptr, *xb = nullptr;
unsigned va = 0, vb = 0;
if (is_char_eq(a, xa, va)) {
// a is (xa = va)
if (is_char_eq(b, xb, vb) && xa == xb)
// (x = va) => (x = vb) iff va == vb
return va == vb;
if (m().is_not(b, notb) && is_char_eq(notb, xb, vb) && xa == xb)
// (x = va) => not (x = vb) iff va != vb
return va != vb;
}
if (m().is_not(a, nota) &&
m().is_not(b, notb)) {
return pred_implies(notb, nota);
@ -3267,26 +3291,33 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) {
// transformations hide ite sub-terms,
// Rewriting that changes associativity of
// operators may hide ite sub-terms.
switch (k) {
case OP_RE_INTERSECT:
if (BR_FAILED != mk_re_inter0(a, b, result))
return result;
break;
case OP_RE_UNION:
if (BR_FAILED != mk_re_union0(a, b, result))
return result;
break;
case OP_RE_CONCAT:
if (BR_FAILED != mk_re_concat(a, b, result))
return result;
break;
case OP_RE_XOR:
if (BR_FAILED != mk_re_xor0(a, b, result))
return result;
break;
default:
break;
//
// When either operand is an ite (a derivative BDD), skip the
// pre-simplification: its blind ite-hoisting would bypass the
// pred_implies-based pruning in mk_der_op_rec and create unreachable
// branches such as ite(x = 'a', ite(x = 'b', ...), ...) with 'a' != 'b'.
bool has_ite = m().is_ite(a) || m().is_ite(b);
if (!has_ite) {
switch (k) {
case OP_RE_INTERSECT:
if (BR_FAILED != mk_re_inter0(a, b, result))
return result;
break;
case OP_RE_UNION:
if (BR_FAILED != mk_re_union0(a, b, result))
return result;
break;
case OP_RE_CONCAT:
if (BR_FAILED != mk_re_concat(a, b, result))
return result;
break;
case OP_RE_XOR:
if (BR_FAILED != mk_re_xor0(a, b, result))
return result;
break;
default:
break;
}
}
result = m_op_cache.find(k, a, b, nullptr);
if (!result) {
@ -3914,6 +3945,9 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) {
return BR_REWRITE3;
}
if (re().is_concat(a, a1, a2)) {
// Maintain right-associative normal form: re().mk_concat is a raw
// constructor, so re-simplify the result to recursively reassociate
// any concat nested in a2 (and re-apply concat simplifications).
result = re().mk_concat(a1, re().mk_concat(a2, b));
return BR_DONE;
}

View file

@ -108,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth))
return true;
// prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*).
// Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'.
// Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*.
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2))
return true;
// R ⊆ R'·Σ* if R ⊆ R'
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth))
return true;
@ -140,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
bool seq_subset::is_subset(expr* a, expr* b) const {
return is_subset_rec(a, b, 0);
}
bool seq_subset::ends_with(expr* a, expr* suf) const {
if (a == suf)
return true;
// Flatten both regexes into their sequence of concatenation factors
// (independent of left/right associativity) and test list-suffix equality.
ptr_vector<expr> af, sf;
flatten_concat(a, af);
flatten_concat(suf, sf);
if (sf.size() > af.size())
return false;
unsigned off = af.size() - sf.size();
for (unsigned i = 0; i < sf.size(); ++i)
if (af[off + i] != sf[i])
return false;
return true;
}
void seq_subset::flatten_concat(expr* a, ptr_vector<expr>& out) const {
expr* a1 = nullptr, * a2 = nullptr;
if (m_re.is_concat(a, a1, a2)) {
flatten_concat(a1, out);
flatten_concat(a2, out);
}
else
out.push_back(a);
}

View file

@ -24,6 +24,12 @@ class seq_subset {
bool is_subset_rec(expr* a, expr* b, unsigned depth) const;
// true if regex a, viewed as a flattened concatenation, has suf as a
// structural (concatenation) suffix.
bool ends_with(expr* a, expr* suf) const;
void flatten_concat(expr* a, ptr_vector<expr>& out) const;
public:
explicit seq_subset(seq_util::rex& re) : m_re(re) {}
bool is_subset(expr* a, expr* b) const;

View file

@ -461,6 +461,24 @@ namespace smt {
if (re().is_empty(r))
//trivially true
return;
// When one side is re.none the equation is a pure emptiness check on
// the other regex (symmetric_diff already returned it as r). Decide
// it directly by antimirov NFA reachability instead of running the
// bisimulation/XOR closure, which would build large un-canonicalized
// product states for intersections of contains-patterns.
if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) {
switch (re_is_empty(r)) {
case l_true:
STRACE(seq_regex_brief, tout << "empty:eq ";);
return; // languages equal (both empty): trivially true
case l_false:
STRACE(seq_regex_brief, tout << "empty:neq ";);
th.add_axiom(~th.mk_eq(r1, r2, false), false_literal);
return;
case l_undef:
break;
}
}
// Try the bisimulation procedure on ground regexes first. If it
// returns a definite answer, dispatch the corresponding axiom and
// bypass the symbolic emptiness/derivative closure.
@ -671,6 +689,67 @@ namespace smt {
return result;
}
/*
Decide emptiness of a ground regex r via antimirov-mode NFA
reachability.
The symbolic derivative engine runs in antimirov mode, so the
derivative of an intersection distributes into a *set* of individual
product states inter(A_i, B_j) (each a small, ground regex) rather
than one giant union-of-intersections term. get_derivative_targets
enumerates these NFA successor states.
We short-circuit to l_false (non-empty) as soon as a reachable state
is nullable (accepts the empty word) or classical (a regex built only
from to_re/all/union/concat/star/plus/opt/loop, hence non-empty). An
intersection itself is never classical, but once one operand reduces
to Σ* the intersection collapses (via the derivative's subset
simplification) to the other, classical, operand.
If the worklist is exhausted with no such state, r is empty (l_true).
Returns l_undef if a step bound is hit, so callers can fall back to
the general procedure.
*/
lbool seq_regex::re_is_empty(expr* r) {
if (re().is_empty(r))
return l_true;
expr_ref_vector pinned(m);
obj_hashtable<expr> visited;
ptr_vector<expr> work;
work.push_back(r);
visited.insert(r);
pinned.push_back(r);
unsigned const bound = 100000;
unsigned steps = 0;
while (!work.empty()) {
if (++steps > bound)
return l_undef;
expr* s = work.back();
work.pop_back();
auto info = re().get_info(s);
if (!info.is_known())
return l_undef;
// ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty.
if (info.nullable == l_true || info.classical)
return l_false;
// Dead state: prune (min_length == UINT_MAX means no word is
// accepted from here).
if (info.min_length == UINT_MAX)
continue;
expr_ref_vector targets(m);
get_derivative_targets(s, targets);
for (expr* t : targets) {
if (visited.contains(t))
continue;
visited.insert(t);
pinned.push_back(t);
work.push_back(t);
}
}
return l_true;
}
/*
Return a list of all target regexes in the derivative of a regex r,
ignoring the conditions along each path.

View file

@ -165,6 +165,12 @@ namespace smt {
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
void get_derivative_targets(expr* r, expr_ref_vector& targets);
// Decide emptiness of a ground regex by antimirov-mode NFA
// reachability: explore derivative target states, short-circuiting to
// "non-empty" on the first reachable nullable or classical state.
// Returns l_true (empty), l_false (non-empty), l_undef (gave up).
lbool re_is_empty(expr* r);
/*
Pretty print the regex of the state id to the out stream,
seq_regex_ptr must be a pointer to seq_regex and the