in progress (#4386)

* initial work on replacing str in regex check * finish rewriter for empty string in regex * remove unnecessary argument in mk_regexp_contains_emptystr; initial template for eval_regexp_derivative * progress on string in regexp general check using derivatives * added recursive nullable and derivative funcitons, partially working * remove tests from z3test * fix rewriting infinite loop and some failing simplify checks * several fixes addressing comments for z3 main branch PR #4386 * redo derivative to return an expr_ref, and null on failure Co-authored-by: calebstanford-msr <t-casta@microsoft.com>
2025-08-07 11:41:22 +00:00 · 2020-05-19 15:55:19 -07:00 · 2020-05-19 15:55:19 -07:00 · 7756e2c6d5
commit 7756e2c6d5
parent b3366bae5a
2 changed files with 210 additions and 9 deletions
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -631,10 +631,10 @@ br_status seq_rewriter::mk_seq_unit(expr* e, expr_ref& result) {

 /*
   string + string = string
-   a + (b + c) = (a + b) + c
+   (a + b) + c = a + (b + c)
   a + "" = a
   "" + a = a
-   (a + string) + string = a + string
+   string + (string + a) = string + a
 */
 br_status seq_rewriter::mk_seq_concat(expr* a, expr* b, expr_ref& result) {
    zstring s1, s2;
@ -657,11 +657,6 @@ br_status seq_rewriter::mk_seq_concat(expr* a, expr* b, expr_ref& result) {
        result = a;
        return BR_DONE;
    }
-    // TBD concatenation is right-associative
-    if (isc2 && m_util.str.is_concat(a, c, d) && m_util.str.is_string(d, s1)) {
-        result = m_util.str.mk_concat(c, m_util.str.mk_string(s1 + s2));
-        return BR_DONE;
-    }
    if (isc1 && m_util.str.is_concat(b, c, d) && m_util.str.is_string(c, s2)) {
        result = m_util.str.mk_concat(m_util.str.mk_string(s1 + s2), d);
        return BR_DONE;
@ -2034,6 +2029,12 @@ expr_ref seq_rewriter::is_nullable(expr* r) {
    else if (m_util.re.is_complement(r, r1)) {
        result = mk_not(m(), is_nullable(r1));
    }
+    else if (m_util.re.is_to_re(r, r1)) {
+        sort* seq_sort = nullptr;
+        VERIFY(m_util.is_re(r, seq_sort));
+        expr* emptystr = m_util.str.mk_empty(seq_sort);
+        result = m().mk_eq(emptystr, r1);
+    }
    else {
        sort* seq_sort = nullptr;
        VERIFY(m_util.is_re(r, seq_sort));
@ -2042,7 +2043,180 @@ expr_ref seq_rewriter::is_nullable(expr* r) {
    return result;
 }

+/*
+    Symbolic derivative
+    Evaluates recursively.
+    Returns null expression `expr_ref(m())` on failure.
+*/
+expr_ref seq_rewriter::derivative(expr* elem, expr* r) {
+    // Check assumption: elem is a single character string
+    // TODO: I want to check if is_char, not if it's a unit.
+    SASSERT(m_util.str.is_unit(elem));
+    expr* r1 = nullptr;
+    expr* r2 = nullptr;
+    expr_ref dr1(m());
+    expr_ref dr2(m());
+    expr* p = nullptr;
+    expr_ref result(m());
+    unsigned lo = 0, hi = 0;
+    if (m_util.re.is_concat(r, r1, r2)) {
+        expr_ref is_n = is_nullable(r1);
+        dr1 = derivative(elem, r1);
+        if (!dr1) {
+            result = dr1; // failed
+        }
+        else if (m().is_false(is_n)) {
+            result = m_util.re.mk_concat(dr1, r2);
+        }
+        else {
+            dr2 = derivative(elem, r2);
+            if (!dr2) {
+                result = dr2; // failed
+            }
+            else if (m().is_true(is_n)) {
+                result = m_util.re.mk_union(
+                    m_util.re.mk_concat(dr1, r2),
+                    dr2
+                );
+            }
+            else {
+                result = m_util.re.mk_union(
+                    m_util.re.mk_concat(dr1, r2),
+                    kleene_and(is_n, dr2)
+                );
+            }
+        }
+    }
+    else if (m_util.re.is_star(r, r1)) {
+        dr1 = derivative(elem, r1);
+        if (dr1) {
+            result = m_util.re.mk_concat(dr1, r);
+        }
+        else {
+            result = dr1; // failed
+        }
+    }
+    else if (m_util.re.is_plus(r, r1)) {
+        result = derivative(elem, m_util.re.mk_star(r1));
+    }
+    else if (m_util.re.is_union(r, r1, r2)) {
+        dr1 = derivative(elem, r1);
+        dr2 = derivative(elem, r2);
+        if (!dr1 || !dr2) {
+            result = expr_ref(m()); // failed
+        } else {
+            result = m_util.re.mk_union(dr1, dr2);
+        }
+    }
+    else if (m_util.re.is_intersection(r, r1, r2)) {
+        dr1 = derivative(elem, r1);
+        dr2 = derivative(elem, r2);
+        if (!dr1 || !dr2) {
+            result = expr_ref(m()); // failed
+        }
+        else {
+            result = m_util.re.mk_inter(dr1, dr2);
+        }
+    }
+    else if (m_util.re.is_opt(r, r1)) {
+        result = derivative(elem, r1);
+    }
+    else if (m_util.re.is_complement(r, r1)) {
+        dr1 = derivative(elem, r1);
+        if (dr1) {
+            result = m_util.re.mk_complement(dr1);
+        }
+        else {
+            result = dr1; // failed
+        }
+    }
+    else if (m_util.re.is_loop(r, r1, lo)) {
+        dr1 = derivative(elem, r1);
+        if (dr1) {
+            if (lo > 0) {
+                lo--;
+            }
+            result = m_util.re.mk_concat(
+                dr1,
+                m_util.re.mk_loop(r1, lo)
+            );
+        }
+        else {
+            result = dr1; // failed
+        }
+    }
+    else if (m_util.re.is_loop(r, r1, lo, hi)) {
+        if (hi == 0) {
+            result = m_util.re.mk_empty(m().get_sort(r));
+        }
+        else {
+            dr1 = derivative(elem, r1);
+            if (dr1) {
+                hi--;
+                if (lo > 0) {
+                    lo--;
+                }
+                result = m_util.re.mk_concat(
+                    dr1,
+                    m_util.re.mk_loop(r1, lo, hi)
+                );
+            }
+            else {
+                result = dr1; // failed
+            }
+        }
+    }
+    else if (m_util.re.is_full_seq(r) ||
+             m_util.re.is_empty(r)) {
+        result = r;
+    }
+    else if (m_util.re.is_to_re(r, r1)) {
+        // r1 is a string here (not a regexp)
+        expr_ref hd(m());
+        expr_ref tl(m());
+        if (get_head_tail(r1, hd, tl)) {
+            // head must be equal; if so, derivative is tail
+            result = kleene_and(
+                m().mk_eq(elem, hd),
+                m_util.re.mk_to_re(tl)
+            );
+        }
+        else if (m_util.str.is_empty(r1)) {
+            result = m_util.re.mk_empty(m().get_sort(r));
+        }
+        else {
+            result = expr_ref(m()); // failed
+        }
+    }
+    else {
+        // Remaining cases may need epsilon regex. Make it
+        // return empty string
+        sort* seq_sort = nullptr;
+        VERIFY(m_util.is_re(r, seq_sort));
+        expr_ref epsilon_re(m());
+        epsilon_re = m_util.re.mk_to_re(
+            m_util.str.mk_empty(seq_sort)
+        );
+        if (m_util.re.is_full_char(r)) {
+            result = epsilon_re;
+        }
+        else if (m_util.re.is_range(r)) {
+            // TODO: check if character is in range
+            result = expr_ref(m()); // failed
+        }
+        else if (m_util.re.is_of_pred(r, p)) {
+            // TODO: check if character satisfies predicate
+            result = expr_ref(m()); // failed
+        }
+        else {
+            result = expr_ref(m()); // failed
+        }
+    }
+    return result;
+}
+
 br_status seq_rewriter::mk_str_in_regexp(expr* a, expr* b, expr_ref& result) {
+
    if (m_util.re.is_empty(b)) {
        result = m().mk_false();
        return BR_DONE;
@ -2056,6 +2230,30 @@ br_status seq_rewriter::mk_str_in_regexp(expr* a, expr* b, expr_ref& result) {
        result = m().mk_eq(a, b1);
        return BR_REWRITE1;
    }
+    if (m_util.str.is_empty(a)) {
+        result = is_nullable(b);
+        // is_nullable doesn't rewrite. But we also want to avoid the
+        // case where it didn't succeed in changing anything.
+        if (m_util.str.is_in_re(result)) {
+            return BR_FAILED;
+        }
+        else {
+            return BR_REWRITE_FULL;
+        }
+    }
+
+    expr_ref hd(m());
+    expr_ref tl(m());
+    if (get_head_tail(a, hd, tl)) {
+        expr_ref db = derivative(hd, b); // null if failed
+        if (db) {
+            result = m_util.re.mk_in_re(tl, db);
+            return BR_REWRITE_FULL;
+        }
+    }
+
+    return BR_FAILED; // For testing purposes, only depend on new functionality
+
    scoped_ptr<eautomaton> aut;
    expr_ref_vector seq(m());
    if (!(aut = m_re2aut(b))) {
@ -2918,7 +3116,7 @@ bool seq_rewriter::is_epsilon(expr* e) const {
 }

 bool seq_rewriter::reduce_subsequence(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& eqs) {
-    
+
    if (ls.size() > rs.size()) 
        ls.swap(rs);

--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -132,10 +132,13 @@ class seq_rewriter {
    }
    length_comparison compare_lengths(unsigned sza, expr* const* as, unsigned szb, expr* const* bs);

+
+    // Support for regular expression derivatives
    bool get_head_tail(expr* e, expr_ref& head, expr_ref& tail);
    expr_ref is_nullable(expr* r);
    expr_ref kleene_and(expr* cond, expr* r);
    expr_ref kleene_predicate(expr* cond, sort* seq_sort);
+    expr_ref derivative(expr* hd, expr* r);

    br_status mk_seq_unit(expr* e, expr_ref& result);
    br_status mk_seq_concat(expr* a, expr* b, expr_ref& result);
@ -204,7 +207,7 @@ class seq_rewriter {
    void remove_empty_and_concats(expr_ref_vector& es);
    void remove_leading(unsigned n, expr_ref_vector& es);

-public:    
+public:
    seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
        m_util(m), m_autil(m), m_re2aut(m), m_es(m), m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
    }