From 6b799706b5adf865cc1b3e73d00d37a86610c619 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 10 Jan 2018 17:24:47 -0500
Subject: [PATCH] add path constraint generation for regex terms

---
 src/smt/params/smt_params_helper.pyg |   1 +
 src/smt/params/theory_str_params.cpp |   1 +
 src/smt/params/theory_str_params.h   |   9 +-
 src/smt/theory_str.cpp               | 165 +++++++++++++++++++++------
 src/smt/theory_str.h                 |   4 +-
 5 files changed, 145 insertions(+), 35 deletions(-)

diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg
index 122d09f32..7ab0d52f2 100644
--- a/src/smt/params/smt_params_helper.pyg
+++ b/src/smt/params/smt_params_helper.pyg
@@ -80,6 +80,7 @@ def_module_params(module_name='smt',
                           ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'),
                           ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'),
                           ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'),
+                          ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'),
                           ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'),
                           ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'),
                           ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'),
diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp
index b5451b9dc..fa6862161 100644
--- a/src/smt/params/theory_str_params.cpp
+++ b/src/smt/params/theory_str_params.cpp
@@ -32,4 +32,5 @@ void theory_str_params::updt_params(params_ref const & _p) {
     m_BinarySearchInitialUpperBound = p.str_binary_search_start();
     m_OverlapTheoryAwarePriority = p.str_overlap_priority();
     m_RegexAutomata = p.str_regex_automata();
+    m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold();
 }
diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h
index 4135cf0d6..48c0205ee 100644
--- a/src/smt/params/theory_str_params.h
+++ b/src/smt/params/theory_str_params.h
@@ -87,6 +87,12 @@ struct theory_str_params {
      */
     bool m_RegexAutomata;
 
+    /*
+     * RegexAutomata_DifficultyThreshold is the lowest difficulty above which Z3str3
+     * will not eagerly construct an automaton for a regular expression term.
+     */
+    unsigned m_RegexAutomata_DifficultyThreshold;
+
     theory_str_params(params_ref const & p = params_ref()):
         m_StrongArrangements(true),
         m_AggressiveLengthTesting(false),
@@ -99,7 +105,8 @@ struct theory_str_params {
         m_UseBinarySearch(false),
         m_BinarySearchInitialUpperBound(64),
         m_OverlapTheoryAwarePriority(-0.1),
-        m_RegexAutomata(true)
+        m_RegexAutomata(true),
+        m_RegexAutomata_DifficultyThreshold(1000)
     {
         updt_params(p);
     }
diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 54636b8df..5910d9d95 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6918,9 +6918,11 @@ namespace smt {
     /*
      * Create finite path constraints for the string variable `str` with respect to the automaton `aut`.
      * The returned expression is the right-hand side of a constraint of the form
-     * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS
+     * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints).
+     * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...),
+     * are returned in `characterConstraints`.
      */
-    expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) {
+    expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) {
         ENSURE(aut != NULL);
         context & ctx = get_context();
         ast_manager & m = get_manager();
@@ -7060,29 +7062,23 @@ namespace smt {
         expr_ref result(mk_or(ors));
         TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";);
 
-        // if the path constraint is instantly false, then we know that the LHS must be false,
-        // so negate it and instantly assert a conflict clause
-        if (m.is_false(result)) {
-            expr_ref conflict(m.mk_false(), m);
-            return conflict;
+        expr_ref concat_rhs(m);
+        if (pathChars.size() == 1) {
+            concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
         } else {
-            expr_ref concat_rhs(m);
-            if (pathChars.size() == 1) {
-                concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
-            } else {
-                expr_ref acc(pathChars.get(0), m);
-                for (unsigned i = 1; i < pathChars.size(); ++i) {
-                    acc = mk_concat(acc, pathChars.get(i));
-                }
-                concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
+            expr_ref acc(pathChars.get(0), m);
+            for (unsigned i = 1; i < pathChars.size(); ++i) {
+                acc = mk_concat(acc, pathChars.get(i));
             }
-
-            expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
-            //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
-            //regex_automata_assertions.insert(stringTerm, final_axiom);
-            //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
-            return toplevel_rhs;
+            concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
         }
+
+        //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
+        characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs);
+        //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
+        //regex_automata_assertions.insert(stringTerm, final_axiom);
+        //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
+        return result;
     }
 
     /*
@@ -8241,6 +8237,27 @@ namespace smt {
                    );
     }
 
+    // returns true if needle appears as a subterm anywhere under haystack,
+    // or if needle appears in the same EQC as a subterm anywhere under haystack
+    bool theory_str::term_appears_as_subterm(expr * needle, expr * haystack) {
+        if (in_same_eqc(needle, haystack)) {
+            return true;
+        }
+
+        if (is_app(haystack)) {
+            app * a_haystack = to_app(haystack);
+            for (unsigned i = 0; i < a_haystack->get_num_args(); ++i) {
+                expr * subterm = a_haystack->get_arg(i);
+                if (term_appears_as_subterm(needle, subterm)) {
+                    return true;
+                }
+            }
+        }
+
+        // not found
+        return false;
+    }
+
     void theory_str::classify_ast_by_type(expr * node, std::map<expr*, int> & varMap,
                                           std::map<expr*, int> & concatMap, std::map<expr*, int> & unrollMap) {
 
@@ -9512,8 +9529,78 @@ namespace smt {
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {
                     TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
-                    // TODO generate_regex_path_constraints();
-                    NOT_IMPLEMENTED_YET();
+
+                    if (regex_terms_with_path_constraints.contains(str_in_re)) {
+                        TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;);
+                        continue;
+                    }
+
+                    // find a consistent automaton for this term
+                    bool found = false;
+                    regex_automaton_under_assumptions assumption;
+                    if (regex_automaton_assumptions.contains(re) &&
+                                                !regex_automaton_assumptions[re].empty()){
+                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                it != regex_automaton_assumptions[re].end(); ++it) {
+                            regex_automaton_under_assumptions autA = *it;
+                            rational assumed_upper_bound, assumed_lower_bound;
+                            bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
+                            bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
+                            if (!assumes_upper_bound && !assumes_lower_bound) {
+                                // automaton with no assumptions is always usable
+                                assumption = autA;
+                                found = true;
+                                break;
+                            }
+                            // check consistency of bounds assumptions
+                        } // foreach(a in regex_automaton_assumptions)
+                    }
+                    if (found) {
+                        expr_ref pathConstraint(m);
+                        expr_ref characterConstraints(m);
+                        pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
+                        TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
+                        TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
+
+                        expr_ref_vector lhs_terms(m);
+                        lhs_terms.push_back(str_in_re);
+                        lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                        expr_ref lhs(mk_and(lhs_terms), m);
+                        // If the automaton was built with the same polarity as the constraint,
+                        // assert directly. Otherwise, negate the path constraint
+                        if ( (current_assignment == l_true && assumption.get_polarity())
+                                || (current_assignment == l_false && !assumption.get_polarity())) {
+                            expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
+                            assert_implication(lhs, rhs);
+                        } else {
+                            expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
+                            assert_implication(lhs, rhs);
+                        }
+                        regex_terms_with_path_constraints.insert(str_in_re);
+                        m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                        regex_axiom_add = true;
+                        continue;
+                    } else {
+                        // no automata available, or else all bounds assumptions are invalid
+                        unsigned expected_complexity = estimate_regex_complexity(re);
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                            eautomaton * aut = m_mk_aut(re);
+                            aut->compress();
+                            regex_automata.push_back(aut);
+                            regex_automaton_under_assumptions new_aut(re, aut, true);
+                            if (!regex_automaton_assumptions.contains(re)) {
+                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                            }
+                            regex_automaton_assumptions[re].push_back(new_aut);
+                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                            regex_axiom_add = true;
+                            // TODO immediately attempt to learn lower/upper bound info here
+                        } else {
+                            // TODO increment failure count
+                            NOT_IMPLEMENTED_YET();
+                        }
+                        continue;
+                    }
                 }
                 expr_ref str_len(mk_strlen(str), m);
                 rational lower_bound_value;
@@ -9607,8 +9694,7 @@ namespace smt {
                         // no existing automata/assumptions.
                         // if it's easy to construct a full automaton for R, do so
                         unsigned expected_complexity = estimate_regex_complexity(re);
-                        unsigned threshold = 1000; // TODO better metric
-                        if (expected_complexity <= threshold) {
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                             eautomaton * aut = m_mk_aut(re);
                             aut->compress();
                             regex_automata.push_back(aut);
@@ -9622,9 +9708,11 @@ namespace smt {
                             // TODO immediately attempt to learn lower/upper bound info here
                         } else {
                             // TODO check negation?
-                            // construct a partial automaton for R to the given upper bound
+                            // TODO construct a partial automaton for R to the given upper bound?
+                            // TODO increment failure count if we can't
                             NOT_IMPLEMENTED_YET();
                         }
+                        continue;
                     }
                     // if we have *any* automaton for R, and the upper bound is not too large,
                     // finitize the automaton (if we have not already done so) and assert all solutions
@@ -9718,8 +9806,7 @@ namespace smt {
                             // no existing automata/assumptions.
                             // if it's easy to construct a full automaton for R, do so
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            unsigned threshold = 1000; // TODO better metric
-                            if (expected_complexity <= threshold) {
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -9733,9 +9820,11 @@ namespace smt {
                                 // TODO immediately attempt to learn lower/upper bound info here
                             } else {
                                 // TODO check negation?
-                                // construct a partial automaton for R to the given upper bound
+                                // TODO construct a partial automaton for R to the given lower bound?
+                                // TODO increment failure count
                                 NOT_IMPLEMENTED_YET();
                             }
+                            continue;
                         }
                     } else { // !lower_bound_exists
                         // no bounds information
@@ -9745,8 +9834,7 @@ namespace smt {
                         NOT_IMPLEMENTED_YET();
                         if (true) {
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            unsigned threshold = 1000; // TODO better metric
-                            if (expected_complexity <= threshold) {
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -11265,7 +11353,18 @@ namespace smt {
                     // this length tester is in any way constrained by regex terms,
                     // do not perform value testing -- this term is not a free variable.
                     if (m_params.m_RegexAutomata) {
-                        NOT_IMPLEMENTED_YET(); // TODO
+                        std::map<std::pair<expr*, zstring>, expr*>::iterator it = regex_in_bool_map.begin();
+                        for (; it != regex_in_bool_map.end(); ++it) {
+                            expr * re = it->second;
+                            expr * re_str = to_app(re)->get_arg(0);
+                            // recursive descent through all subterms of re_str to see if any of them are
+                            // - the same as freeVar
+                            // - in the same EQC as freeVar
+                            if (term_appears_as_subterm(freeVar, re_str)) {
+                                TRACE("str", tout << "prevent value testing on free var " << mk_pp(freeVar, m) << " as it belongs to one or more regex constraints." << std::endl;);
+                                return NULL;
+                            }
+                        }
                     }
 
                     TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index c6dad045f..c64eec838 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -407,6 +407,7 @@ protected:
     obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]
     obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
     std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
+    obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
 
     svector<char> char_set;
     std::map<char, int>  charSetLookupTable;
@@ -538,7 +539,7 @@ protected:
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
-    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal);
+    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);
     void aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
     expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var);
 
@@ -629,6 +630,7 @@ protected:
             std::map<expr*, std::map<expr*, int> > & concat_eq_concat_map,
             std::map<expr*, std::set<expr*> > & unrollGroupMap);
 
+    bool term_appears_as_subterm(expr * needle, expr * haystack);
     void classify_ast_by_type(expr * node, std::map<expr*, int> & varMap,
             std::map<expr*, int> & concatMap, std::map<expr*, int> & unrollMap);
     void classify_ast_by_type_in_positive_context(std::map<expr*, int> & varMap,