diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp index 2f6666a8fc..7447e09113 100644 --- a/src/ast/rewriter/seq_derive.cpp +++ b/src/ast/rewriter/seq_derive.cpp @@ -722,6 +722,20 @@ namespace seq { expr_ref derive::mk_union_core(expr* a, expr* b) { + // Identity: none ∪ R = R (none is the unit of union) + // Idempotence: R ∪ R = R + // Absorption: Σ* ∪ R = Σ* + // Without these the derivative of an intersection accumulates + // un-simplified unions such as union(inter, union(none, none)), + // producing many syntactically distinct but semantically equal + // states. That defeats state dedup in the emptiness/bisim closure + // and makes contains-pattern intersections blow up. + if (re().is_empty(a)) return expr_ref(b, m); + if (re().is_empty(b)) return expr_ref(a, m); + if (a == b) return expr_ref(a, m); + if (re().is_full_seq(a) || re().is_full_seq(b)) + return expr_ref(re().mk_full_seq(a->get_sort()), m); + // Prefix factoring: a·x ∪ a·y = a·(x ∪ y) expr *a1, *a2, *b1, *b2; if (re().is_concat(a, a1, a2) && re().is_concat(b, b1, b2) && a1 == b1) { @@ -1308,14 +1322,29 @@ namespace seq { // Cofactor enumeration over a transition regex // ------------------------------------------------------- + expr_ref derive::clean_leaf(expr* r) { + expr* a = nullptr, * b = nullptr; + if (re().is_union(r, a, b)) + return mk_union(clean_leaf(a), clean_leaf(b)); + if (re().is_intersection(r, a, b)) + return mk_inter(clean_leaf(a), clean_leaf(b)); + return expr_ref(r, m); + } + void derive::get_cofactors_rec(expr* r, expr_ref_pair_vector& result) { // Hoist the (first) if-then-else condition to the top of r, splitting it // into the equivalent ite(c, th, el); when r contains no ite it is a // leaf of the transition regex. expr_ref c(m), th(m), el(m); if (!m_br.decompose_ite(r, c, th, el)) { - if (!re().is_empty(r)) - result.push_back(get_path_expr(), r); + // Re-normalize the leaf: decompose_ite substitutes ITE branches + // structurally so the leaf may carry un-simplified union(_, none) + // / inter(_, none) nodes. Cleaning them keeps semantically equal + // states syntactically identical, which is essential for state + // dedup in the emptiness/bisim closure. + expr_ref cr = clean_leaf(r); + if (!re().is_empty(cr)) + result.push_back(get_path_expr(), cr); return; } // Positive branch: c holds. diff --git a/src/ast/rewriter/seq_derive.h b/src/ast/rewriter/seq_derive.h index 5af528d15f..4a228b6bca 100644 --- a/src/ast/rewriter/seq_derive.h +++ b/src/ast/rewriter/seq_derive.h @@ -167,6 +167,13 @@ namespace seq { // Cofactor enumeration over a transition regex (ITE-tree). void get_cofactors_rec(expr* r, expr_ref_pair_vector& result); + // Re-apply union/intersection simplifications bottom-up to a cofactor + // leaf. decompose_ite substitutes ITE branch values structurally + // (no simplification), so leaves can contain un-normalized nodes such + // as union(R, none) or inter(R, none); this rebuilds them through + // mk_union/mk_inter so equal states share a canonical form. + expr_ref clean_leaf(expr* r); + sort* re_sort(expr* r) { return r->get_sort(); } sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; } sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; } diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index eeedc8f586..a084a5af20 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -3063,6 +3063,10 @@ bool seq_rewriter::le_char(expr* ch1, expr* ch2) { Current cases handled: - a and b are char <= constraints, or negations of char <= constraints + - a and b are equalities (element = constant char), or their negations. + These arise from derivatives of single characters and must be pruned + when combining BDDs so that no unreachable branch such as + ite(x = 'a', ite(x = 'b', ...), ...) (with 'a' != 'b') is created. */ bool seq_rewriter::pred_implies(expr* a, expr* b) { STRACE(seq_verbose, tout << "pred_implies: " @@ -3070,6 +3074,26 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) { << "," << mk_pp(b, m()) << std::endl;); expr *cha1 = nullptr, *cha2 = nullptr, *nota = nullptr, *chb1 = nullptr, *chb2 = nullptr, *notb = nullptr; + // (element = constant char), returning the element and char code. + auto is_char_eq = [&](expr* e, expr*& x, unsigned& v) { + expr* t1 = nullptr, *t2 = nullptr; + if (!m().is_eq(e, t1, t2)) + return false; + if (u().is_const_char(t2, v)) { x = t1; return true; } + if (u().is_const_char(t1, v)) { x = t2; return true; } + return false; + }; + expr *xa = nullptr, *xb = nullptr; + unsigned va = 0, vb = 0; + if (is_char_eq(a, xa, va)) { + // a is (xa = va) + if (is_char_eq(b, xb, vb) && xa == xb) + // (x = va) => (x = vb) iff va == vb + return va == vb; + if (m().is_not(b, notb) && is_char_eq(notb, xb, vb) && xa == xb) + // (x = va) => not (x = vb) iff va != vb + return va != vb; + } if (m().is_not(a, nota) && m().is_not(b, notb)) { return pred_implies(notb, nota); @@ -3267,26 +3291,33 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) { // transformations hide ite sub-terms, // Rewriting that changes associativity of // operators may hide ite sub-terms. - - switch (k) { - case OP_RE_INTERSECT: - if (BR_FAILED != mk_re_inter0(a, b, result)) - return result; - break; - case OP_RE_UNION: - if (BR_FAILED != mk_re_union0(a, b, result)) - return result; - break; - case OP_RE_CONCAT: - if (BR_FAILED != mk_re_concat(a, b, result)) - return result; - break; - case OP_RE_XOR: - if (BR_FAILED != mk_re_xor0(a, b, result)) - return result; - break; - default: - break; + // + // When either operand is an ite (a derivative BDD), skip the + // pre-simplification: its blind ite-hoisting would bypass the + // pred_implies-based pruning in mk_der_op_rec and create unreachable + // branches such as ite(x = 'a', ite(x = 'b', ...), ...) with 'a' != 'b'. + bool has_ite = m().is_ite(a) || m().is_ite(b); + if (!has_ite) { + switch (k) { + case OP_RE_INTERSECT: + if (BR_FAILED != mk_re_inter0(a, b, result)) + return result; + break; + case OP_RE_UNION: + if (BR_FAILED != mk_re_union0(a, b, result)) + return result; + break; + case OP_RE_CONCAT: + if (BR_FAILED != mk_re_concat(a, b, result)) + return result; + break; + case OP_RE_XOR: + if (BR_FAILED != mk_re_xor0(a, b, result)) + return result; + break; + default: + break; + } } result = m_op_cache.find(k, a, b, nullptr); if (!result) { @@ -3914,6 +3945,9 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { return BR_REWRITE3; } if (re().is_concat(a, a1, a2)) { + // Maintain right-associative normal form: re().mk_concat is a raw + // constructor, so re-simplify the result to recursively reassociate + // any concat nested in a2 (and re-apply concat simplifications). result = re().mk_concat(a1, re().mk_concat(a2, b)); return BR_DONE; } diff --git a/src/ast/rewriter/seq_subset.cpp b/src/ast/rewriter/seq_subset.cpp index f12bc32b1d..1af42d06d8 100644 --- a/src/ast/rewriter/seq_subset.cpp +++ b/src/ast/rewriter/seq_subset.cpp @@ -108,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth)) return true; + // prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*). + // Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'. + // Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*. + if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2)) + return true; + // R ⊆ R'·Σ* if R ⊆ R' if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth)) return true; @@ -140,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const { bool seq_subset::is_subset(expr* a, expr* b) const { return is_subset_rec(a, b, 0); } + +bool seq_subset::ends_with(expr* a, expr* suf) const { + if (a == suf) + return true; + // Flatten both regexes into their sequence of concatenation factors + // (independent of left/right associativity) and test list-suffix equality. + ptr_vector af, sf; + flatten_concat(a, af); + flatten_concat(suf, sf); + if (sf.size() > af.size()) + return false; + unsigned off = af.size() - sf.size(); + for (unsigned i = 0; i < sf.size(); ++i) + if (af[off + i] != sf[i]) + return false; + return true; +} + +void seq_subset::flatten_concat(expr* a, ptr_vector& out) const { + expr* a1 = nullptr, * a2 = nullptr; + if (m_re.is_concat(a, a1, a2)) { + flatten_concat(a1, out); + flatten_concat(a2, out); + } + else + out.push_back(a); +} diff --git a/src/ast/rewriter/seq_subset.h b/src/ast/rewriter/seq_subset.h index 7329c898e1..e62333dea3 100644 --- a/src/ast/rewriter/seq_subset.h +++ b/src/ast/rewriter/seq_subset.h @@ -24,6 +24,12 @@ class seq_subset { bool is_subset_rec(expr* a, expr* b, unsigned depth) const; + // true if regex a, viewed as a flattened concatenation, has suf as a + // structural (concatenation) suffix. + bool ends_with(expr* a, expr* suf) const; + + void flatten_concat(expr* a, ptr_vector& out) const; + public: explicit seq_subset(seq_util::rex& re) : m_re(re) {} bool is_subset(expr* a, expr* b) const; diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 6c4f5cf5b3..5a801550cd 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -461,6 +461,24 @@ namespace smt { if (re().is_empty(r)) //trivially true return; + // When one side is re.none the equation is a pure emptiness check on + // the other regex (symmetric_diff already returned it as r). Decide + // it directly by antimirov NFA reachability instead of running the + // bisimulation/XOR closure, which would build large un-canonicalized + // product states for intersections of contains-patterns. + if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) { + switch (re_is_empty(r)) { + case l_true: + STRACE(seq_regex_brief, tout << "empty:eq ";); + return; // languages equal (both empty): trivially true + case l_false: + STRACE(seq_regex_brief, tout << "empty:neq ";); + th.add_axiom(~th.mk_eq(r1, r2, false), false_literal); + return; + case l_undef: + break; + } + } // Try the bisimulation procedure on ground regexes first. If it // returns a definite answer, dispatch the corresponding axiom and // bypass the symbolic emptiness/derivative closure. @@ -671,6 +689,67 @@ namespace smt { return result; } + /* + Decide emptiness of a ground regex r via antimirov-mode NFA + reachability. + + The symbolic derivative engine runs in antimirov mode, so the + derivative of an intersection distributes into a *set* of individual + product states inter(A_i, B_j) (each a small, ground regex) rather + than one giant union-of-intersections term. get_derivative_targets + enumerates these NFA successor states. + + We short-circuit to l_false (non-empty) as soon as a reachable state + is nullable (accepts the empty word) or classical (a regex built only + from to_re/all/union/concat/star/plus/opt/loop, hence non-empty). An + intersection itself is never classical, but once one operand reduces + to Σ* the intersection collapses (via the derivative's subset + simplification) to the other, classical, operand. + + If the worklist is exhausted with no such state, r is empty (l_true). + Returns l_undef if a step bound is hit, so callers can fall back to + the general procedure. + */ + lbool seq_regex::re_is_empty(expr* r) { + if (re().is_empty(r)) + return l_true; + expr_ref_vector pinned(m); + obj_hashtable visited; + ptr_vector work; + work.push_back(r); + visited.insert(r); + pinned.push_back(r); + unsigned const bound = 100000; + unsigned steps = 0; + while (!work.empty()) { + if (++steps > bound) + return l_undef; + expr* s = work.back(); + work.pop_back(); + auto info = re().get_info(s); + if (!info.is_known()) + return l_undef; + // ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty. + if (info.nullable == l_true || info.classical) + return l_false; + // Dead state: prune (min_length == UINT_MAX means no word is + // accepted from here). + if (info.min_length == UINT_MAX) + continue; + expr_ref_vector targets(m); + get_derivative_targets(s, targets); + for (expr* t : targets) { + if (visited.contains(t)) + continue; + visited.insert(t); + pinned.push_back(t); + work.push_back(t); + } + } + return l_true; + } + + /* Return a list of all target regexes in the derivative of a regex r, ignoring the conditions along each path. diff --git a/src/smt/seq_regex.h b/src/smt/seq_regex.h index f27f23fa3d..dd1c474b31 100644 --- a/src/smt/seq_regex.h +++ b/src/smt/seq_regex.h @@ -165,6 +165,12 @@ namespace smt { expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r); void get_derivative_targets(expr* r, expr_ref_vector& targets); + // Decide emptiness of a ground regex by antimirov-mode NFA + // reachability: explore derivative target states, short-circuiting to + // "non-empty" on the first reachable nullable or classical state. + // Returns l_true (empty), l_false (non-empty), l_undef (gave up). + lbool re_is_empty(expr* r); + /* Pretty print the regex of the state id to the out stream, seq_regex_ptr must be a pointer to seq_regex and the