wip(seq): clean_leaf cofactors, mk_union_core simplifications, re_is_empty antimirov emptiness check

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 10:28:48 +00:00 · 2026-06-26 00:36:14 +03:00 · 2026-06-26 00:36:14 +03:00 · 149549b946
commit 149549b946
parent 3d42d997c6
7 changed files with 216 additions and 22 deletions
--- a/src/ast/rewriter/seq_derive.cpp
+++ b/src/ast/rewriter/seq_derive.cpp
@ -722,6 +722,20 @@ namespace seq {

    expr_ref derive::mk_union_core(expr* a, expr* b) {

+        // Identity:    none ∪ R = R          (none is the unit of union)
+        // Idempotence: R ∪ R = R
+        // Absorption:  Σ* ∪ R = Σ*
+        // Without these the derivative of an intersection accumulates
+        // un-simplified unions such as union(inter, union(none, none)),
+        // producing many syntactically distinct but semantically equal
+        // states.  That defeats state dedup in the emptiness/bisim closure
+        // and makes contains-pattern intersections blow up.
+        if (re().is_empty(a)) return expr_ref(b, m);
+        if (re().is_empty(b)) return expr_ref(a, m);
+        if (a == b) return expr_ref(a, m);
+        if (re().is_full_seq(a) || re().is_full_seq(b))
+            return expr_ref(re().mk_full_seq(a->get_sort()), m);
+
        // Prefix factoring: a·x ∪ a·y = a·(x ∪ y)
        expr *a1, *a2, *b1, *b2;
        if (re().is_concat(a, a1, a2) && re().is_concat(b, b1, b2) && a1 == b1) {
@ -1308,14 +1322,29 @@ namespace seq {
    // Cofactor enumeration over a transition regex
    // -------------------------------------------------------

+    expr_ref derive::clean_leaf(expr* r) {
+        expr* a = nullptr, * b = nullptr;
+        if (re().is_union(r, a, b))
+            return mk_union(clean_leaf(a), clean_leaf(b));
+        if (re().is_intersection(r, a, b))
+            return mk_inter(clean_leaf(a), clean_leaf(b));
+        return expr_ref(r, m);
+    }
+
    void derive::get_cofactors_rec(expr* r, expr_ref_pair_vector& result) {
        // Hoist the (first) if-then-else condition to the top of r, splitting it
        // into the equivalent ite(c, th, el); when r contains no ite it is a
        // leaf of the transition regex.
        expr_ref c(m), th(m), el(m);
        if (!m_br.decompose_ite(r, c, th, el)) {
-            if (!re().is_empty(r))
-                result.push_back(get_path_expr(), r);
+            // Re-normalize the leaf: decompose_ite substitutes ITE branches
+            // structurally so the leaf may carry un-simplified union(_, none)
+            // / inter(_, none) nodes.  Cleaning them keeps semantically equal
+            // states syntactically identical, which is essential for state
+            // dedup in the emptiness/bisim closure.
+            expr_ref cr = clean_leaf(r);
+            if (!re().is_empty(cr))
+                result.push_back(get_path_expr(), cr);
            return;
        }
        // Positive branch: c holds.
--- a/src/ast/rewriter/seq_derive.h
+++ b/src/ast/rewriter/seq_derive.h
@ -167,6 +167,13 @@ namespace seq {
        // Cofactor enumeration over a transition regex (ITE-tree).
        void get_cofactors_rec(expr* r, expr_ref_pair_vector& result);

+        // Re-apply union/intersection simplifications bottom-up to a cofactor
+        // leaf.  decompose_ite substitutes ITE branch values structurally
+        // (no simplification), so leaves can contain un-normalized nodes such
+        // as union(R, none) or inter(R, none); this rebuilds them through
+        // mk_union/mk_inter so equal states share a canonical form.
+        expr_ref clean_leaf(expr* r);
+
        sort* re_sort(expr* r) { return r->get_sort(); }
        sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; }
        sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; }
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -3063,6 +3063,10 @@ bool seq_rewriter::le_char(expr* ch1, expr* ch2) {

    Current cases handled:
    - a and b are char <= constraints, or negations of char <= constraints
+    - a and b are equalities (element = constant char), or their negations.
+      These arise from derivatives of single characters and must be pruned
+      when combining BDDs so that no unreachable branch such as
+      ite(x = 'a', ite(x = 'b', ...), ...) (with 'a' != 'b') is created.
 */
 bool seq_rewriter::pred_implies(expr* a, expr* b) {
    STRACE(seq_verbose, tout << "pred_implies: "
@ -3070,6 +3074,26 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) {
                               << "," << mk_pp(b, m()) << std::endl;);
    expr *cha1 = nullptr, *cha2 = nullptr, *nota = nullptr,
         *chb1 = nullptr, *chb2 = nullptr, *notb = nullptr;
+    // (element = constant char), returning the element and char code.
+    auto is_char_eq = [&](expr* e, expr*& x, unsigned& v) {
+        expr* t1 = nullptr, *t2 = nullptr;
+        if (!m().is_eq(e, t1, t2))
+            return false;
+        if (u().is_const_char(t2, v)) { x = t1; return true; }
+        if (u().is_const_char(t1, v)) { x = t2; return true; }
+        return false;
+    };
+    expr *xa = nullptr, *xb = nullptr;
+    unsigned va = 0, vb = 0;
+    if (is_char_eq(a, xa, va)) {
+        // a is (xa = va)
+        if (is_char_eq(b, xb, vb) && xa == xb)
+            // (x = va) => (x = vb) iff va == vb
+            return va == vb;
+        if (m().is_not(b, notb) && is_char_eq(notb, xb, vb) && xa == xb)
+            // (x = va) => not (x = vb) iff va != vb
+            return va != vb;
+    }
    if (m().is_not(a, nota) &&
        m().is_not(b, notb)) {
        return pred_implies(notb, nota);
@ -3267,26 +3291,33 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) {
    // transformations hide ite sub-terms, 
    // Rewriting that changes associativity of
    // operators may hide ite sub-terms.
-
-    switch (k) {
-    case OP_RE_INTERSECT:
-        if (BR_FAILED != mk_re_inter0(a, b, result))
-            return result;
-        break;
-    case OP_RE_UNION:
-        if (BR_FAILED != mk_re_union0(a, b, result))
-            return result;
-        break;
-    case OP_RE_CONCAT:
-        if (BR_FAILED != mk_re_concat(a, b, result))
-            return result;
-        break;
-    case OP_RE_XOR:
-        if (BR_FAILED != mk_re_xor0(a, b, result))
-            return result;
-        break;
-    default:
-        break;
+    //
+    // When either operand is an ite (a derivative BDD), skip the
+    // pre-simplification: its blind ite-hoisting would bypass the
+    // pred_implies-based pruning in mk_der_op_rec and create unreachable
+    // branches such as ite(x = 'a', ite(x = 'b', ...), ...) with 'a' != 'b'.
+    bool has_ite = m().is_ite(a) || m().is_ite(b);
+    if (!has_ite) {
+        switch (k) {
+        case OP_RE_INTERSECT:
+            if (BR_FAILED != mk_re_inter0(a, b, result))
+                return result;
+            break;
+        case OP_RE_UNION:
+            if (BR_FAILED != mk_re_union0(a, b, result))
+                return result;
+            break;
+        case OP_RE_CONCAT:
+            if (BR_FAILED != mk_re_concat(a, b, result))
+                return result;
+            break;
+        case OP_RE_XOR:
+            if (BR_FAILED != mk_re_xor0(a, b, result))
+                return result;
+            break;
+        default:
+            break;
+        }
    }
    result = m_op_cache.find(k, a, b, nullptr);
    if (!result) {
@ -3914,6 +3945,9 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) {
        return BR_REWRITE3;
    }
    if (re().is_concat(a, a1, a2)) {
+        // Maintain right-associative normal form: re().mk_concat is a raw
+        // constructor, so re-simplify the result to recursively reassociate
+        // any concat nested in a2 (and re-apply concat simplifications).
        result = re().mk_concat(a1, re().mk_concat(a2, b));
        return BR_DONE;
    }
--- a/src/ast/rewriter/seq_subset.cpp
+++ b/src/ast/rewriter/seq_subset.cpp
@ -108,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
        if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth))
            return true;

+        // prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*).
+        // Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'.
+        // Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*.
+        if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2))
+            return true;
+
        // R ⊆ R'·Σ* if R ⊆ R'
        if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth))
            return true;
@ -140,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
 bool seq_subset::is_subset(expr* a, expr* b) const {
    return is_subset_rec(a, b, 0);
 }
+
+bool seq_subset::ends_with(expr* a, expr* suf) const {
+    if (a == suf)
+        return true;
+    // Flatten both regexes into their sequence of concatenation factors
+    // (independent of left/right associativity) and test list-suffix equality.
+    ptr_vector<expr> af, sf;
+    flatten_concat(a, af);
+    flatten_concat(suf, sf);
+    if (sf.size() > af.size())
+        return false;
+    unsigned off = af.size() - sf.size();
+    for (unsigned i = 0; i < sf.size(); ++i)
+        if (af[off + i] != sf[i])
+            return false;
+    return true;
+}
+
+void seq_subset::flatten_concat(expr* a, ptr_vector<expr>& out) const {
+    expr* a1 = nullptr, * a2 = nullptr;
+    if (m_re.is_concat(a, a1, a2)) {
+        flatten_concat(a1, out);
+        flatten_concat(a2, out);
+    }
+    else
+        out.push_back(a);
+}
--- a/src/ast/rewriter/seq_subset.h
+++ b/src/ast/rewriter/seq_subset.h
@ -24,6 +24,12 @@ class seq_subset {

    bool is_subset_rec(expr* a, expr* b, unsigned depth) const;

+    // true if regex a, viewed as a flattened concatenation, has suf as a
+    // structural (concatenation) suffix.
+    bool ends_with(expr* a, expr* suf) const;
+
+    void flatten_concat(expr* a, ptr_vector<expr>& out) const;
+
 public:
    explicit seq_subset(seq_util::rex& re) : m_re(re) {}
    bool is_subset(expr* a, expr* b) const;
--- a/src/smt/seq_regex.cpp
+++ b/src/smt/seq_regex.cpp
@ -461,6 +461,24 @@ namespace smt {
        if (re().is_empty(r))
            //trivially true
            return;
+        // When one side is re.none the equation is a pure emptiness check on
+        // the other regex (symmetric_diff already returned it as r).  Decide
+        // it directly by antimirov NFA reachability instead of running the
+        // bisimulation/XOR closure, which would build large un-canonicalized
+        // product states for intersections of contains-patterns.
+        if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) {
+            switch (re_is_empty(r)) {
+            case l_true:
+                STRACE(seq_regex_brief, tout << "empty:eq ";);
+                return; // languages equal (both empty): trivially true
+            case l_false:
+                STRACE(seq_regex_brief, tout << "empty:neq ";);
+                th.add_axiom(~th.mk_eq(r1, r2, false), false_literal);
+                return;
+            case l_undef:
+                break;
+            }
+        }
        // Try the bisimulation procedure on ground regexes first.  If it
        // returns a definite answer, dispatch the corresponding axiom and
        // bypass the symbolic emptiness/derivative closure.
@ -671,6 +689,67 @@ namespace smt {
        return result;
    }

+    /*
+        Decide emptiness of a ground regex r via antimirov-mode NFA
+        reachability.
+
+        The symbolic derivative engine runs in antimirov mode, so the
+        derivative of an intersection distributes into a *set* of individual
+        product states inter(A_i, B_j) (each a small, ground regex) rather
+        than one giant union-of-intersections term.  get_derivative_targets
+        enumerates these NFA successor states.
+
+        We short-circuit to l_false (non-empty) as soon as a reachable state
+        is nullable (accepts the empty word) or classical (a regex built only
+        from to_re/all/union/concat/star/plus/opt/loop, hence non-empty).  An
+        intersection itself is never classical, but once one operand reduces
+        to Σ* the intersection collapses (via the derivative's subset
+        simplification) to the other, classical, operand.
+
+        If the worklist is exhausted with no such state, r is empty (l_true).
+        Returns l_undef if a step bound is hit, so callers can fall back to
+        the general procedure.
+    */
+    lbool seq_regex::re_is_empty(expr* r) {
+        if (re().is_empty(r))
+            return l_true;
+        expr_ref_vector pinned(m);
+        obj_hashtable<expr> visited;
+        ptr_vector<expr> work;
+        work.push_back(r);
+        visited.insert(r);
+        pinned.push_back(r);
+        unsigned const bound = 100000;
+        unsigned steps = 0;
+        while (!work.empty()) {
+            if (++steps > bound)
+                return l_undef;
+            expr* s = work.back();
+            work.pop_back();
+            auto info = re().get_info(s);
+            if (!info.is_known())
+                return l_undef;
+            // ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty.
+            if (info.nullable == l_true || info.classical)
+                return l_false;
+            // Dead state: prune (min_length == UINT_MAX means no word is
+            // accepted from here).
+            if (info.min_length == UINT_MAX)
+                continue;
+            expr_ref_vector targets(m);
+            get_derivative_targets(s, targets);
+            for (expr* t : targets) {
+                if (visited.contains(t))
+                    continue;
+                visited.insert(t);
+                pinned.push_back(t);
+                work.push_back(t);
+            }
+        }
+        return l_true;
+    }
+
+
    /*
        Return a list of all target regexes in the derivative of a regex r,
        ignoring the conditions along each path.
--- a/src/smt/seq_regex.h
+++ b/src/smt/seq_regex.h
@ -165,6 +165,12 @@ namespace smt {
        expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
        void get_derivative_targets(expr* r, expr_ref_vector& targets);

+        // Decide emptiness of a ground regex by antimirov-mode NFA
+        // reachability: explore derivative target states, short-circuiting to
+        // "non-empty" on the first reachable nullable or classical state.
+        // Returns l_true (empty), l_false (non-empty), l_undef (gave up).
+        lbool re_is_empty(expr* r);
+
        /* 
           Pretty print the regex of the state id to the out stream, 
           seq_regex_ptr must be a pointer to seq_regex and the