add general purpose emptiness/non-emptiness check

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
2026-01-08 12:01:17 +00:00 · 2020-05-26 20:42:21 -07:00 · 2020-05-26 20:42:21 -07:00 · 88e36c6bf3
commit 88e36c6bf3
parent 33cdc06eb4
8 changed files with 298 additions and 65 deletions
--- a/src/ast/expr_abstract.h
+++ b/src/ast/expr_abstract.h
@ -38,6 +38,8 @@ inline expr_ref expr_abstract(expr_ref_vector const& bound, expr* n) { return ex
 inline expr_ref expr_abstract(app_ref_vector const& bound, expr* n) { return expr_abstract(bound.m(), 0, bound.size(), (expr*const*)bound.c_ptr(), n); }
 expr_ref mk_forall(ast_manager& m, unsigned num_bound, app* const* bound, expr* n);
 expr_ref mk_exists(ast_manager& m, unsigned num_bound, app* const* bound, expr* n);
+inline expr_ref mk_forall(ast_manager& m, app* b, expr* n) { return mk_forall(m, 1, &b, n); } // single-bound convenience: forwards b as a one-element bound array
+inline expr_ref mk_forall(ast_manager& m, expr* b, expr* n) { return mk_forall(m, to_app(b), n); } // same for a bound passed as expr*; b is converted with to_app

 #endif

--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -28,6 +28,7 @@ Notes:
 #include "ast/well_sorted.h"
 #include "ast/rewriter/var_subst.h"
 #include "ast/rewriter/bool_rewriter.h"
+#include "ast/rewriter/expr_safe_replace.h"
 #include "ast/rewriter/seq_rewriter_params.hpp"
 #include "math/automata/automaton.h"
 #include "math/automata/symbolic_automata_def.h"
@ -2672,6 +2673,138 @@ br_status seq_rewriter::mk_re_opt(expr* a, expr_ref& result) {
    return BR_REWRITE1;
 }

+/**
+ * Intersect every interval stored in ranges with the closed interval [lo, hi], in place.
+ * Intervals that end below lo are dropped. The scan stops at the first interval that
+ * starts above hi, and that interval and everything after it are dropped as well,
+ * so ranges is expected to be sorted in ascending order.
+ */
+void seq_rewriter::intersect(unsigned lo, unsigned hi, svector<std::pair<unsigned, unsigned>>& ranges) {
+    unsigned kept = 0;
+    for (unsigned idx = 0; idx < ranges.size(); ++idx) {
+        std::pair<unsigned, unsigned> const cur = ranges[idx];
+        if (cur.first > hi)
+            break;
+        if (cur.second < lo)
+            continue;
+        // overlapping interval: clip it to [lo, hi] and compact it towards the front
+        ranges[kept] = std::make_pair(std::max(cur.first, lo), std::min(cur.second, hi));
+        ++kept;
+    }
+    ranges.shrink(kept);
+}
+
+/**
+ * Simplify cond using special case rewriting for character equations.
+ * When elem is an uninterpreted constant, compute the simplification of
+ * (Exists elem . cond) if it is possible to solve for elem.
+ *
+ * cond is first flattened into its conjuncts. If elem is a character, every
+ * conjunct that compares elem with a character constant (=, <=, and the
+ * negations of these) narrows a list of admissible character ranges that
+ * starts out as [0, zstring::max_char()].
+ *  - If all conjuncts examined have this form and no admissible character
+ *    is left, cond is set to false.
+ *  - If all conjuncts have this form, some character is left and elem is an
+ *    uninterpreted constant, cond is set to true.
+ * Otherwise the first conjunct of the form (elem = t) or (t = elem), if any,
+ * is used to replace elem by t in cond; when elem is not an uninterpreted
+ * constant the equality (elem = t) is conjoined back onto the result.
+ */
+void seq_rewriter::elim_condition(expr* elem, expr_ref& cond) {
+    expr_ref_vector conds(m());
+    flatten_and(cond, conds);
+    expr* lhs = nullptr, *rhs = nullptr, *e1 = nullptr; 
+    if (u().is_char(elem)) {
+        unsigned ch = 0;
+        svector<std::pair<unsigned, unsigned>> ranges, ranges1;
+        ranges.push_back(std::make_pair(0, zstring::max_char())); // initially every character is admissible
+        auto exclude_char = [&](unsigned ch) { // remove the single character ch from ranges
+            if (ch == 0) {
+                intersect(1, zstring::max_char(), ranges);
+            }
+            else if (ch == zstring::max_char()) {
+                intersect(0, ch-1, ranges);
+            }
+            else {
+                ranges1.reset();
+                ranges1.append(ranges);
+                intersect(0, ch-1, ranges); // part strictly below ch stays in ranges
+                intersect(ch + 1, zstring::max_char(), ranges1); // part strictly above ch goes to ranges1
+                ranges.append(ranges1); // lower part followed by upper part
+            }
+        };
+        bool all_ranges = true;
+        for (expr* e : conds) {
+            if (m().is_eq(e, lhs, rhs) && elem == lhs && u().is_const_char(rhs, ch)) {
+                intersect(ch, ch, ranges);                
+            }
+            else if (m().is_eq(e, lhs, rhs) && elem == rhs && u().is_const_char(lhs, ch)) {
+                intersect(ch, ch, ranges);
+            }
+            else if (u().is_char_le(e, lhs, rhs) && elem == lhs && u().is_const_char(rhs, ch)) {
+                intersect(0, ch, ranges);
+            }
+            else if (u().is_char_le(e, lhs, rhs) && elem == rhs && u().is_const_char(lhs, ch)) {
+                intersect(ch, zstring::max_char(), ranges);
+            }
+            else if (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && elem == lhs && u().is_const_char(rhs, ch)) {
+                exclude_char(ch);
+            }
+            else if (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && elem == rhs && u().is_const_char(lhs, ch)) {
+                exclude_char(ch);
+            }
+            else if (m().is_not(e, e1) && u().is_char_le(e1, lhs, rhs) && elem == lhs && u().is_const_char(rhs, ch)) {
+                // not (elem <= ch), i.e. elem > ch; nothing lies above max_char
+                if (ch == zstring::max_char()) 
+                    ranges.reset();
+                else 
+                    intersect(ch+1, zstring::max_char(), ranges);
+            }
+            else if (m().is_not(e, e1) && u().is_char_le(e1, lhs, rhs) && elem == rhs && u().is_const_char(lhs, ch)) {
+                // not (ch <= elem), i.e. elem < ch; nothing lies below 0
+                if (ch == 0) 
+                    ranges.reset();
+                else                 
+                    intersect(0, ch-1, ranges);
+            }
+            // TBD: case for negation of range (not (and (<= lo e) (<= e hi)))
+            else {
+                all_ranges = false; // conjunct is not a recognized bound on elem: give up on range reasoning
+                break;
+            }
+            if (ranges.empty()) // no admissible character is left, remaining conjuncts need not be examined
+                break;
+        }
+        if (all_ranges) {
+            if (ranges.empty()) {
+                cond = m().mk_false();
+                return;
+            }
+            if (is_uninterp_const(elem)) {
+                cond = m().mk_true();
+                return;
+            }
+        }
+    }
+            
+    expr* solution = nullptr; // right-hand side of the first conjunct (elem = t) or (t = elem), if any
+    for (expr* e : conds) {
+        if (!m().is_eq(e, lhs, rhs)) 
+            continue;
+        if (rhs == elem)
+            std::swap(lhs, rhs);
+        if (lhs != elem)
+            continue;
+        solution = rhs;
+        break;        
+    }
+    if (solution) {
+        expr_safe_replace rep(m());
+        rep.insert(elem, solution);
+        rep(cond);
+        if (!is_uninterp_const(elem)) { // keep the defining equality unless elem is an uninterpreted constant
+            cond = m().mk_and(m().mk_eq(elem, solution), cond);
+        }
+    }    
+}
+
+/**
+ * Enumerate the leaves reached by repeatedly splitting r with has_cofactor.
+ * conds holds the conditions (or their negations) chosen on the path so far;
+ * for every leaf the pair (conjunction of conds, leaf) is appended to result.
+ * Every push onto conds is matched by a pop, so conds has its incoming
+ * contents again when the call returns.
+ */
+void seq_rewriter::get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result) {
+    expr_ref guard(m()), pos(m()), neg(m());
+    if (!has_cofactor(r, guard, pos, neg)) {
+        // no further split: emit r under the accumulated path condition
+        guard = mk_and(conds);
+        result.push_back(guard, r);
+        return;
+    }
+    // branch in which the condition holds
+    conds.push_back(guard);
+    get_cofactors(pos, conds, result);
+    conds.pop_back();
+    // branch in which the condition does not hold
+    conds.push_back(mk_not(m(), guard));
+    get_cofactors(neg, conds, result);
+    conds.pop_back();
+}
+
 bool seq_rewriter::has_cofactor(expr* r, expr_ref& cond, expr_ref& th, expr_ref& el) {
    if (m().is_ite(r)) {
        cond = to_app(r)->get_arg(0);
@ -2749,8 +2882,8 @@ bool seq_rewriter::has_cofactor(expr* r, expr_ref& cond, expr_ref& th, expr_ref&
        }
        if (args_th.size() == a->get_num_args()) {
            if (has_cof) {
-                th = m().mk_app(a->get_decl(), args_th);
-                el = m().mk_app(a->get_decl(), args_el);
+                th = mk_app(a->get_decl(), args_th);
+                el = mk_app(a->get_decl(), args_el);
                trail.push_back(th);
                trail.push_back(el);
                cache_th.insert(a, th);
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -214,6 +214,9 @@ class seq_rewriter {
    class seq_util::str& str() { return u().str; }
    class seq_util::str const& str() const { return u().str; }

+    void get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result);
+    void intersect(unsigned lo, unsigned hi, svector<std::pair<unsigned, unsigned>>& ranges);
+
 public:
    seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
        m_util(m), m_autil(m), m_re2aut(m), m_es(m), m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
@ -235,6 +238,15 @@ public:
    br_status mk_eq_core(expr * lhs, expr * rhs, expr_ref & result);
    br_status mk_bool_app(func_decl* f, unsigned n, expr* const* args, expr_ref& result);

+    // Build the application f(args); vector form of the overload below.
+    expr_ref mk_app(func_decl* f, expr_ref_vector const& args) {
+        return mk_app(f, args.size(), args.c_ptr());
+    }
+    // Build the application f(args[0..n-1]). When f belongs to the same family as u(),
+    // mk_app_core is tried first; if f is from another family, or mk_app_core
+    // answers BR_FAILED, the plain application is created through the manager.
+    expr_ref mk_app(func_decl* f, unsigned n, expr* const* args) {
+        expr_ref result(m());
+        bool const same_family = f->get_family_id() == u().get_family_id();
+        bool const rewritten = same_family && mk_app_core(f, n, args, result) != BR_FAILED;
+        if (!rewritten)
+            result = m().mk_app(f, n, args);
+        return result;
+    }
+        
    bool reduce_eq(expr* l, expr* r, expr_ref_pair_vector& new_eqs, bool& change);

    bool reduce_eq(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& new_eqs, bool& change);
@ -249,6 +261,15 @@ public:

    bool has_cofactor(expr* r, expr_ref& cond, expr_ref& th, expr_ref& el);

+    // Collect the (path condition, leaf) cofactor pairs of r into result,
+    // starting the enumeration with an empty path condition.
+    void get_cofactors(expr* r, expr_ref_pair_vector& result) {
+        expr_ref_vector path(m());
+        get_cofactors(r, path, result);
+    }
+
+    // heuristic elimination of element from condition that comes from a derivative.
+    // special case optimization for conjunctions of equalities, disequalities and ranges.
+    void elim_condition(expr* elem, expr_ref& cond);
+
 };

 #endif
--- a/src/smt/seq_regex.cpp
+++ b/src/smt/seq_regex.cpp
@ -17,6 +17,7 @@ Author:

 #include "smt/seq_regex.h"
 #include "smt/theory_seq.h"
+#include "ast/expr_abstract.h"

 namespace smt {

@ -267,10 +268,20 @@ namespace smt {
    }

    void seq_regex::propagate_eq(expr* r1, expr* r2) {
-        // the dual version of unroll_non_empty, but
-        // skolem functions have to be eliminated or turned into 
-        // universal quantifiers.
-        throw default_exception("emptiness checking for regex is TBD");
+        // Propagate a regex equality r1 = r2.
+        // The two languages coincide exactly when their symmetric difference r
+        // is empty, so the equality is tied to the emptiness predicate:
+        //     r1 = r2  =>  is_empty(r, {})
+        // (the second argument of is_empty is the set of regexes already visited,
+        //  initially the empty regex).
+        expr_ref r(m);
+        if (re().is_empty(r1)) 
+            std::swap(r1, r2);
+        if (re().is_empty(r2))
+            r = r1;   // symmetric difference with the empty language is r1 itself
+        else 
+            r = re().mk_union(re().mk_diff(r1, r2), re().mk_diff(r2, r1));
+        rewrite(r);
+        sort* seq_sort = nullptr;        
+        VERIFY(u().is_re(r, seq_sort));
+        expr_ref emp(re().mk_empty(seq_sort), m);
+        expr_ref is_empty = sk().mk_is_empty(r, emp);
+        th.add_axiom(~th.mk_eq(r1, r2, false), th.mk_literal(is_empty));
    }
    
    void seq_regex::propagate_ne(expr* r1, expr* r2) {
@ -284,74 +295,112 @@ namespace smt {
        rewrite(r);
        sort* seq_sort = nullptr;        
        VERIFY(u().is_re(r, seq_sort));
-        literal lit = ~th.mk_eq(r, re().mk_empty(seq_sort), false);
-        expr_mark seen;
-        expr_ref non_empty = unroll_non_empty(r, seen, 0);
-        if (non_empty) {
-            rewrite(non_empty);
-            th.add_axiom(~lit, th.mk_literal(non_empty));
-        }
-        else {
-            // generally introduce predicate (re.nonempty r seen)
-            // with inference rules based on unroll_non_empty
-            throw default_exception("unrolling large regexes is TBD");
+        expr_ref emp(re().mk_empty(seq_sort), m);
+        // lit stands for (r != {}); a disequality is witnessed by non-emptiness of r:
+        //     r != {}  =>  is_non_empty(r, {})
+        // (the second argument is the set of regexes already visited, initially the empty regex).
+        literal lit = ~th.mk_eq(r, emp, false);
+        expr_ref is_non_empty = sk().mk_is_non_empty(r, emp);
+        th.add_axiom(~lit, th.mk_literal(is_non_empty));
+    }
+
+    bool seq_regex::is_member(expr* r, expr* u) { // true iff r is pointer-equal to an operand found while unfolding the union u
+        expr* u2 = nullptr;
+        while (re().is_union(u, u, u2)) { // while u is a union, is_union rebinds u and u2 to its two operands
+            if (r == u2)
+                return true;
        }
+        return r == u;        // u is no longer a union: compare r with what is left
    }

    /**
-       nonempty(R union Q, Seen) = R != {} or Q != {}
-       nonempty(R[if(p,R1,R2)], Seen) = if(p, nonempty(R[R1], Seen), nonempty(R[R2], Seen))           (co-factor)
-       nonempty(R, Seen) = nullable(R) or (R not in Seen and nonempty(D(first(R),R), Seen u { R }))  (derivative)
-       
-       TBD: eliminate variables from p when possible to perform quantifier elimination.
-       
-       p := first(R) == 'a'
-       then replace first(R) by 'a' in R[R1]
-       TBD: 
-       empty(R, Seen) = R = {} if R does not contain a subterm in Seen and Seen is non-empty
+     * is_non_empty(r, u) => nullable(r) or \/_i (c_i and is_non_empty(r_i, u union r))
+     *
+     * where (c_i, r_i) ranges over the cofactors of the derivative D(hd, r)
+     * and hd is a fresh constant standing for the first element.
+     *
+     * is_non_empty(r_i, u union r) := false if r_i in u
+     *
+     * All cofactors go into one clause. Each c_i is first passed through
+     * elim_condition, which may replace it by true as soon as some value of hd
+     * satisfies it; emitting a separate implication per cofactor would then
+     * require every such branch to be non-empty instead of at least one.
+     *
+     * Throws default_exception when the derivative of r is not defined.
+     */
+    void seq_regex::propagate_is_non_empty(literal lit) {
+        expr* e = ctx.bool_var2expr(lit.var()), *r, *u;
+        VERIFY(sk().is_is_non_empty(e, r, u));
+        expr_ref is_nullable = seq_rw().is_nullable(r);
+        rewrite(is_nullable);
+        if (m.is_true(is_nullable))
+            return;   // r accepts the empty word, nothing to justify
+        literal null_lit = th.mk_literal(is_nullable);
+        expr_ref hd = mk_first(r);
+        expr_ref d = seq_rw().derivative(hd, r);
+        if (!d)
+            throw default_exception("derivative was not defined");
+        literal_vector lits;
+        lits.push_back(~lit);
+        if (false_literal != null_lit) 
+            lits.push_back(null_lit);
+        expr_ref_pair_vector cofactors(m);
+        seq_rw().get_cofactors(d, cofactors);
+        for (auto const& p : cofactors) {
+            // a successor that was already visited contributes "false"
+            if (is_member(p.second, u))
+                continue;
+            expr_ref cond(p.first, m);
+            seq_rw().elim_condition(hd, cond);
+            rewrite(cond);
+            if (m.is_false(cond))
+                continue;            
+            expr_ref next_non_empty = sk().mk_is_non_empty(p.second, re().mk_union(u, r));
+            if (!m.is_true(cond))
+                next_non_empty = m.mk_and(cond, next_non_empty);
+            lits.push_back(th.mk_literal(next_non_empty));
+        }
+        // with no usable cofactor the clause forces lit to be false (unless r may be nullable)
+        th.add_axiom(lits);
+    }


-       first : RegEx -> Char is a skolem function
-    */
+    /*
+      is_empty(r, u) => ~is_nullable(r)
+      is_empty(r, u) => (forall x . ~cond(x)) or is_empty(r1, u union r)    for (cond, r1) in cofactors(D(x,r))
+
+      is_empty(r1, u union r) is taken to hold when r1 is a member of u,
+      so no clause is produced for such a cofactor.
+
+      Throws default_exception when the derivative of r is not defined.
+     */
+    void seq_regex::propagate_is_empty(literal lit) {
+        expr* e = ctx.bool_var2expr(lit.var()), *r, *u;
+        VERIFY(sk().is_is_empty(e, r, u));
+        expr_ref is_nullable = seq_rw().is_nullable(r);
+        rewrite(is_nullable);
+        if (m.is_true(is_nullable)) {
+            // r accepts the empty word, so it cannot be empty
+            th.add_axiom(~lit);
+            return;
+        }
+        th.add_axiom(~lit, ~th.mk_literal(is_nullable));
+        expr_ref hd = mk_first(r);
+        expr_ref d = seq_rw().derivative(hd, r);
+        if (!d)
+            throw default_exception("derivative was not defined");
+        literal_vector lits;
+        expr_ref_pair_vector cofactors(m);
+        seq_rw().get_cofactors(d, cofactors);
+
+        // is_empty(r, u) => forall hd . cond => is_empty(r1, u union r)
+        
+        for (auto const& p : cofactors) {
+            if (is_member(p.second, u))
+                continue;
+            expr_ref cond(p.first, m);
+            seq_rw().elim_condition(hd, cond);
+            rewrite(cond);
+            if (m.is_false(cond))
+                continue;
+            lits.reset();
+            lits.push_back(~lit);
+            // the successor regex must be empty as well (emptiness, not non-emptiness)
+            expr_ref is_empty1 = sk().mk_is_empty(p.second, re().mk_union(u, r));
+            if (!m.is_true(cond)) {
+                // either the branch condition is unsatisfiable for every hd ...
+                lits.push_back(th.mk_literal(mk_forall(m, hd, m.mk_not(cond))));
+            }
+            // ... or the regex reached through this branch is empty
+            lits.push_back(th.mk_literal(is_empty1)); 
+            th.add_axiom(lits);
+        }        
+    }

    expr_ref seq_regex::mk_first(expr* r) {
        sort* elem_sort = nullptr, *seq_sort = nullptr;
        VERIFY(u().is_re(r, seq_sort));
        VERIFY(u().is_seq(seq_sort, elem_sort));
        return expr_ref(m.mk_fresh_const("re.first", elem_sort), m);
-        //   return sk().mk("re.first", r, elem_sort);  
-        // - for this to be effective, requires internalizer to skip skolem function internalization, 
-        //   because of the regex argument r and we don't handle extensionality of regex well.
-        //   It is probably a good idea to skip internalization of all skolem expressions, 
-        //   but requires some changes to theory_seq.
-        // - it is more useful to eliminate quantifiers in he common case, so never have to
-        //   work with fresh expressions in the fist hand. This is possible for characters and
-        //   ranges (just equalities and inequalities with constant bounds).
-    }
-
-    expr_ref seq_regex::unroll_non_empty(expr* r, expr_mark& seen, unsigned depth) {
-        if (seen.is_marked(r))
-            return expr_ref(m.mk_false(), m);
-        if (depth > 300)
-            return expr_ref(m);
-        expr_ref result(m), cond(m), th(m), el(m);
-        // TBD: try also rewriting
-        if (seq_rw().has_cofactor(r, cond, th, el)) {
-            th = unroll_non_empty(th, seen, depth + 1);
-            el = unroll_non_empty(el, seen, depth + 1);
-            if (th && el) 
-                result = m.mk_ite(cond, th, el);
-            return result;
-        }    
-        expr_ref hd = mk_first(r);
-        result = seq_rw().derivative(hd, r);
-        if (result) {
-            // TBD fast check if r is a subterm of result, if not, then 
-            // loop instead of recurse
-            seen.mark(r, true);
-            result = unroll_non_empty(result, seen, depth + 1);
-            seen.mark(r, false);
-        }
-        return result;
    }
 }
--- a/src/smt/seq_regex.h
+++ b/src/smt/seq_regex.h
@ -65,6 +65,8 @@ namespace smt {

        bool unfold_cofactors(expr_ref& r, literal_vector& conds);

+        bool is_member(expr* r, expr* u);
+
    public:

        seq_regex(theory_seq& th);
@ -84,6 +86,10 @@ namespace smt {
        void propagate_eq(expr* r1, expr* r2);

        void propagate_ne(expr* r1, expr* r2);
+
+        void propagate_is_non_empty(literal lit);
+
+        void propagate_is_empty(literal lit);
        
    };

--- a/src/smt/seq_skolem.cpp
+++ b/src/smt/seq_skolem.cpp
@ -37,6 +37,8 @@ seq_skolem::seq_skolem(ast_manager& m, th_rewriter& rw):
    m_seq_align      = "seq.align";
    m_max_unfolding  = "seq.max_unfolding";
    m_length_limit   = "seq.length_limit";
+    m_is_empty       = "re.is_empty";
+    m_is_non_empty   = "re.is_non_empty";
 }

 expr_ref seq_skolem::mk(symbol const& s, expr* e1, expr* e2, expr* e3, expr* e4, sort* range) {
--- a/src/smt/seq_skolem.h
+++ b/src/smt/seq_skolem.h
@ -36,6 +36,7 @@ namespace smt {
        symbol         m_indexof_left, m_indexof_right;   // inverse of indexof: (indexof_left s t) + s + (indexof_right s t) = t, for s in t. 
        symbol         m_aut_step;                        // regex unfolding state
        symbol         m_accept;                          // regex
+        symbol         m_is_empty, m_is_non_empty;        // regex emptiness check
        symbol         m_pre, m_post;                     // inverse of at: (pre s i) + (at s i) + (post s i) = s if 0 <= i < (len s)
        symbol         m_eq;                              // equality atom
        symbol         m_seq_align;
@ -60,6 +61,9 @@ namespace smt {
        expr_ref mk_align(expr* e1, expr* e2, expr* e3, expr* e4) { return mk(m_seq_align, e1, e2, e3, e4); }
        expr_ref mk_accept(expr_ref_vector const& args) { return expr_ref(seq.mk_skolem(m_accept, args.size(), args.c_ptr(), m.mk_bool_sort()), m); }
        expr_ref mk_accept(expr* s, expr* i, expr* r) { return mk(m_accept, s, i, r, nullptr, m.mk_bool_sort()); }
+        expr_ref mk_is_non_empty(expr* r, expr* u) { return mk(m_is_non_empty, r, u, m.mk_bool_sort()); } // Boolean skolem m_is_non_empty applied to (r, u)
+        expr_ref mk_is_empty(expr* r, expr* u) { return mk(m_is_empty, r, u, m.mk_bool_sort()); } // Boolean skolem m_is_empty applied to (r, u)
+
        expr_ref mk_indexof_left(expr* t, expr* s, expr* offset = nullptr) { return mk(m_indexof_left, t, s, offset); }
        expr_ref mk_indexof_right(expr* t, expr* s, expr* offset = nullptr) { return mk(m_indexof_right, t, s, offset); }
        expr_ref mk_last_indexof_left(expr* t, expr* s, expr* offset = nullptr) { return mk("seq.last_indexof_left", t, s, offset); }
@ -82,6 +86,7 @@ namespace smt {
        expr_ref mk_right(expr* x, expr* y, expr* z = nullptr) { return mk("seq.right", x, y, z); }
        expr_ref mk_max_unfolding_depth(unsigned d);
        expr_ref mk_length_limit(expr* e, unsigned d);
+
        
        bool is_skolem(symbol const& s, expr* e) const;
        bool is_skolem(expr* e) const { return seq.is_skolem(e); }
@ -117,7 +122,14 @@ namespace smt {
        bool is_max_unfolding(expr* e) const { return is_skolem(m_max_unfolding, e); }
        bool is_length_limit(expr* e) const { return is_skolem(m_length_limit, e); }
        bool is_length_limit(expr* p, unsigned& lim, expr*& s) const; 
-
+        // Recognizers for the regex emptiness / non-emptiness skolem predicates.
+        bool is_is_empty(expr* e) const {
+            return is_skolem(m_is_empty, e);
+        }
+        bool is_is_non_empty(expr* e) const {
+            return is_skolem(m_is_non_empty, e);
+        }
+        // The three-argument forms additionally store arguments 0 and 1 of e
+        // in r and u when e is recognized; r and u are left untouched otherwise.
+        bool is_is_empty(expr* e, expr*& r, expr*& u) const {
+            if (!is_skolem(m_is_empty, e))
+                return false;
+            r = to_app(e)->get_arg(0);
+            u = to_app(e)->get_arg(1);
+            return true;
+        }
+        bool is_is_non_empty(expr* e, expr*& r, expr*& u) const {
+            if (!is_skolem(m_is_non_empty, e))
+                return false;
+            r = to_app(e)->get_arg(0);
+            u = to_app(e)->get_arg(1);
+            return true;
+        }

        void decompose(expr* e, expr_ref& head, expr_ref& tail);

--- a/src/smt/theory_seq.cpp
+++ b/src/smt/theory_seq.cpp
@ -1536,7 +1536,7 @@ bool theory_seq::internalize_term(app* term) {
    }

    if (ctx.get_fparams().m_seq_use_derivatives && 
-        (m_util.str.is_in_re(term) || m_sk.is_accept(term))) {
+        (m_util.str.is_in_re(term) || m_sk.is_skolem(term))) {
        bool_var bv = ctx.mk_bool_var(term);
        ctx.set_var_theory(bv, get_id());
        ctx.mark_as_relevant(bv);
@ -3069,6 +3069,14 @@ void theory_seq::assign_eh(bool_var v, bool is_true) {
            }
        }
    }
+    else if (m_sk.is_is_empty(e)) {
+        if (is_true)
+            m_regex.propagate_is_empty(lit);
+    }
+    else if (m_sk.is_is_non_empty(e)) {
+        if (is_true)
+            m_regex.propagate_is_non_empty(lit);
+    }
    else if (m_sk.is_step(e)) {
        if (is_true) {
            propagate_step(lit, e);