From fbe8d1577ed54671a6d9fab0a59520225b701127 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 4 Dec 2017 18:05:00 -0500
Subject: [PATCH 01/36] new regex automata start; add complexity estimation

---
 src/ast/rewriter/seq_rewriter.cpp    |   3 +
 src/ast/rewriter/seq_rewriter.h      |   5 +-
 src/smt/params/smt_params_helper.pyg |   1 +
 src/smt/params/theory_str_params.cpp |   1 +
 src/smt/params/theory_str_params.h   |  10 ++-
 src/smt/theory_str.cpp               | 105 +++++++++++++++++++++++++++
 src/smt/theory_str.h                 |   5 +-
 7 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp
index fa95278b4..6f501c6e0 100644
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@@ -200,6 +200,9 @@ void re2automaton::set_solver(expr_solver* solver) {
     m_sa = alloc(symbolic_automata_t, sm, *m_ba.get());
 }
 
+eautomaton* re2automaton::mk_product(eautomaton* a1, eautomaton* a2) {
+    return m_sa->mk_product(*a1, *a2);
+}
 
 eautomaton* re2automaton::operator()(expr* e) { 
     eautomaton* r = re2aut(e); 
diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h
index 69f319168..95b043bd7 100644
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@@ -53,7 +53,9 @@ public:
     bool is_range() const { return m_ty == t_range; }
     sort* get_sort() const { return m_sort; }
     expr* get_char() const { SASSERT(is_char()); return m_t; }
-
+    expr* get_pred() const { SASSERT(is_pred()); return m_t; }
+    expr* get_lo() const { SASSERT(is_range()); return m_t; }
+    expr* get_hi() const { SASSERT(is_range()); return m_s; }
 };
 
 class sym_expr_manager {
@@ -87,6 +89,7 @@ public:
     ~re2automaton();
     eautomaton* operator()(expr* e);
     void set_solver(expr_solver* solver);
+    eautomaton* mk_product(eautomaton *a1, eautomaton *a2);
 };
 
 /**
diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg
index 937aa6a2b..7f35feb64 100644
--- a/src/smt/params/smt_params_helper.pyg
+++ b/src/smt/params/smt_params_helper.pyg
@@ -79,6 +79,7 @@ def_module_params(module_name='smt',
                           ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'),
                           ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'),
                           ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'),
+                          ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'),
                           ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'),
                           ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'),
                           ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'),
diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp
index afbfc33fc..b5451b9dc 100644
--- a/src/smt/params/theory_str_params.cpp
+++ b/src/smt/params/theory_str_params.cpp
@@ -31,4 +31,5 @@ void theory_str_params::updt_params(params_ref const & _p) {
     m_UseBinarySearch = p.str_use_binary_search();
     m_BinarySearchInitialUpperBound = p.str_binary_search_start();
     m_OverlapTheoryAwarePriority = p.str_overlap_priority();
+    m_RegexAutomata = p.str_regex_automata();
 }
diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h
index c841609db..4135cf0d6 100644
--- a/src/smt/params/theory_str_params.h
+++ b/src/smt/params/theory_str_params.h
@@ -80,6 +80,13 @@ struct theory_str_params {
 
     double m_OverlapTheoryAwarePriority;
 
+    /*
+     * If RegexAutomata is set to true,
+     * Z3str3 will use automata-based methods to reason about
+     * regular expression constraints.
+     */
+    bool m_RegexAutomata;
+
     theory_str_params(params_ref const & p = params_ref()):
         m_StrongArrangements(true),
         m_AggressiveLengthTesting(false),
@@ -91,7 +98,8 @@ struct theory_str_params {
         m_FiniteOverlapModels(false),
         m_UseBinarySearch(false),
         m_BinarySearchInitialUpperBound(64),
-        m_OverlapTheoryAwarePriority(-0.1)
+        m_OverlapTheoryAwarePriority(-0.1),
+        m_RegexAutomata(true)
     {
         updt_params(p);
     }
diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 7936b581c..a859dae9d 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6446,6 +6446,111 @@ namespace smt {
         }
     }
 
+    // saturating unsigned addition
+    unsigned inline _qadd(unsigned a, unsigned b) {
+        if (a == UINT_MAX || b == UINT_MAX) {
+            return UINT_MAX;
+        }
+        unsigned result = a + b;
+        if (result < a || result < b) {
+            return UINT_MAX;
+        }
+        return result;
+    }
+
+    // saturating unsigned multiply
+    unsigned inline _qmul(unsigned a, unsigned b) {
+        if (a == UINT_MAX || b == UINT_MAX) {
+            return UINT_MAX;
+        }
+        unsigned result = a * b;
+        if (result < a || result < b) {
+            return UINT_MAX;
+        }
+        return result;
+    }
+
+    unsigned theory_str::estimate_regex_complexity(expr * re) {
+        ENSURE(u.is_re(re));
+        expr * sub1;
+        expr * sub2;
+        if (u.re.is_to_re(re, sub1)) {
+            SASSERT(u.str.is_string(sub1));
+            zstring str;
+            u.str.is_string(sub1, str);
+            return str.length();
+        } else if (u.re.is_complement(re, sub1)) {
+            return estimate_regex_complexity_under_complement(sub1);
+        } else if (u.re.is_concat(re, sub1, sub2)) {
+            unsigned cx1 = estimate_regex_complexity(sub1);
+            unsigned cx2 = estimate_regex_complexity(sub2);
+            return _qadd(cx1, cx2);
+        } else if (u.re.is_union(re, sub1, sub2)) {
+            unsigned cx1 = estimate_regex_complexity(sub1);
+            unsigned cx2 = estimate_regex_complexity(sub2);
+            return _qadd(cx1, cx2);
+        } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
+            unsigned cx = estimate_regex_complexity(sub1);
+            return _qmul(2, cx);
+        } else if (u.re.is_range(re, sub1, sub2)) {
+            SASSERT(u.str.is_string(sub1));
+            SASSERT(u.str.is_string(sub2));
+            zstring str1, str2;
+            u.str.is_string(sub1, str1);
+            u.str.is_string(sub2, str2);
+            SASSERT(str1.length() == 1);
+            SASSERT(str2.length() == 1);
+            return 1 + str2[0] - str1[0];
+        } else if (u.re.is_full(re)) {
+            return 1;
+        } else {
+            TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
+            return 1;
+        }
+    }
+
+    unsigned theory_str::estimate_regex_complexity_under_complement(expr * re) {
+        ENSURE(u.is_re(re));
+        expr * sub1;
+        expr * sub2;
+        if (u.re.is_to_re(re, sub1)) {
+            SASSERT(u.str.is_string(sub1));
+            zstring str;
+            u.str.is_string(sub1, str);
+            return str.length();
+        } else if (u.re.is_complement(re, sub1)) {
+            // Why don't we return the regular complexity here?
+            // We could, but this might be called from under another complemented subexpression.
+            // It's better to give a worst-case complexity.
+            return estimate_regex_complexity_under_complement(sub1);
+        } else if (u.re.is_concat(re, sub1, sub2)) {
+            unsigned cx1 = estimate_regex_complexity_under_complement(sub1);
+            unsigned cx2 = estimate_regex_complexity_under_complement(sub2);
+            return _qadd(_qmul(2, cx1), cx2);
+        } else if (u.re.is_union(re, sub1, sub2)) {
+            unsigned cx1 = estimate_regex_complexity_under_complement(sub1);
+            unsigned cx2 = estimate_regex_complexity_under_complement(sub2);
+            return _qmul(cx1, cx2);
+        } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
+            unsigned cx = estimate_regex_complexity_under_complement(sub1);
+            return _qmul(2, cx);
+        } else if (u.re.is_range(re, sub1, sub2)) {
+            SASSERT(u.str.is_string(sub1));
+            SASSERT(u.str.is_string(sub2));
+            zstring str1, str2;
+            u.str.is_string(sub1, str1);
+            u.str.is_string(sub2, str2);
+            SASSERT(str1.length() == 1);
+            SASSERT(str2.length() == 1);
+            return 1 + str2[0] - str1[0];
+        } else if (u.re.is_full(re)) {
+            return 1;
+        } else {
+            TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
+            return 1;
+        }
+    }
+
     /*
      * strArgmt::solve_concat_eq_str()
      * Solve concatenations of the form:
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 03fd31162..72643c4be 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -457,11 +457,14 @@ protected:
     expr * mk_RegexIn(expr * str, expr * regexp);
     void instantiate_axiom_RegexIn(enode * e);
     app * mk_unroll(expr * n, expr * bound);
-
     void process_unroll_eq_const_str(expr * unrollFunc, expr * constStr);
     void unroll_str2reg_constStr(expr * unrollFunc, expr * eqConstStr);
     void process_concat_eq_unroll(expr * concat, expr * unroll);
 
+    // regex automata
+    unsigned estimate_regex_complexity(expr * re);
+    unsigned estimate_regex_complexity_under_complement(expr * re);
+
     void set_up_axioms(expr * ex);
     void handle_equality(expr * lhs, expr * rhs);
 

From a5c828f6f2ea967b4b17efc8ba22070a517696d2 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 6 Dec 2017 18:32:11 -0500
Subject: [PATCH 02/36] length estimation

---
 src/smt/theory_str.cpp | 116 ++++++++++++++++++++++++++++++++++++++++-
 src/smt/theory_str.h   |   7 ++-
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index a859dae9d..88180352c 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -25,6 +25,8 @@
 #include "smt/theory_seq_empty.h"
 #include "smt/theory_arith.h"
 #include "ast/ast_util.h"
+#include "seq_rewriter.h"
+#include "smt_kernel.h"
 
 namespace smt {
 
@@ -48,6 +50,7 @@ namespace smt {
         finalCheckProgressIndicator(false),
         m_trail(m),
         m_factory(nullptr),
+        m_mk_aut(m),
         m_unused_id(0),
         m_delayed_axiom_setup_terms(m),
         tmpStringVarCount(0),
@@ -95,6 +98,26 @@ namespace smt {
         return u.str.mk_string(sym);
     }
 
+    class seq_expr_solver : public expr_solver {
+        kernel m_kernel;
+    public:
+        seq_expr_solver(ast_manager& m, smt_params& fp):
+            m_kernel(m, fp) {}
+        virtual lbool check_sat(expr* e) {
+            m_kernel.push();
+            m_kernel.assert_expr(e);
+            lbool r = m_kernel.check();
+            m_kernel.pop(1);
+            return r;
+        }
+    };
+
+    void theory_str::init(context * ctx) {
+        theory::init(ctx);
+        m_mk_aut.set_solver(alloc(seq_expr_solver, get_manager(),
+                                  get_context().get_fparams()));
+    }
+
     void theory_str::initialize_charset() {
         bool defaultCharset = true;
         if (defaultCharset) {
@@ -1718,6 +1741,11 @@ namespace smt {
             regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr);
         }
 
+        if (m_params.m_RegexAutomata) {
+            // stop setting up axioms here, we do this differently
+            return;
+        }
+
         expr_ref str(ex->get_arg(0), m);
         app * regex = to_app(ex->get_arg(1));
 
@@ -1887,7 +1915,7 @@ namespace smt {
             check_contain_in_new_eq(lhs, rhs);
         }
 
-        if (!regex_in_bool_map.empty()) {
+        if (!regex_in_bool_map.empty() && !m_params.m_RegexAutomata) {
             TRACE("str", tout << "checking regex consistency" << std::endl;);
             check_regex_in(lhs, rhs);
         }
@@ -6550,6 +6578,92 @@ namespace smt {
             return 1;
         }
     }
+    /*
+     * Infer all length constraints implied by the given regular expression `re`
+     * in order to constrain `lenVar` (which must be of sort Int).
+     * This assumes that `re` appears in a positive context.
+     * Returns a Boolean formula expressing the appropriate constraints over `lenVar`.
+     * In some cases, the returned formula requires one or more free integer variables to be created.
+     * These variables are returned in the reference parameter `freeVariables`.
+     * Extra assertions should be made for these free variables constraining them to be non-negative.
+     */
+    expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
+        ENSURE(u.is_re(re));
+        context & ctx = get_context();
+        ast_manager & m = get_manager();
+        expr * sub1;
+        expr * sub2;
+        if (u.re.is_to_re(re, sub1)) {
+            SASSERT(u.str.is_string(sub1));
+            zstring str;
+            u.str.is_string(sub1, str);
+            rational strlen(str.length());
+            expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(strlen, true)), m);
+            return retval;
+        } else if (u.re.is_union(re, sub1, sub2)) {
+            expr_ref r1 = infer_all_regex_lengths(lenVar, sub1, freeVariables);
+            expr_ref r2 = infer_all_regex_lengths(lenVar, sub2, freeVariables);
+            expr_ref retval(m.mk_or(r1, r2), m);
+            return retval;
+        } else if (u.re.is_concat(re, sub1, sub2)) {
+            expr * v1 = mk_int_var("rlen1");
+            expr * v2 = mk_int_var("rlen2");
+            freeVariables.push_back(v1);
+            freeVariables.push_back(v2);
+            expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables);
+            expr_ref r2 = infer_all_regex_lengths(v2, sub2, freeVariables);
+            expr_ref_vector finalResult(m);
+            finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_add(v1, v2)));
+            finalResult.push_back(r1);
+            finalResult.push_back(r2);
+            expr_ref retval(mk_and(finalResult), m);
+            return retval;
+        } else if (u.re.is_star(re, sub1)) {
+            expr * v = mk_int_var("rlen");
+            expr * n = mk_int_var("rstar");
+            freeVariables.push_back(v);
+            freeVariables.push_back(n);
+            expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables);
+            expr_ref_vector finalResult(m);
+            finalResult.push_back(rsub);
+            finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n)));
+            expr_ref retval(mk_and(finalResult), m);
+            return retval;
+        } else if (u.re.is_plus(re, sub1)) {
+            expr * v = mk_int_var("rlen");
+            expr * n = mk_int_var("rstar");
+            freeVariables.push_back(v);
+            freeVariables.push_back(n);
+            expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables);
+            expr_ref_vector finalResult(m);
+            finalResult.push_back(rsub);
+            finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n)));
+            finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true)));
+            expr_ref retval(mk_and(finalResult), m);
+            return retval;
+        } else if (u.re.is_range(re, sub1, sub2)) {
+            SASSERT(u.str.is_string(sub1));
+            SASSERT(u.str.is_string(sub2));
+            zstring str1, str2;
+            u.str.is_string(sub1, str1);
+            u.str.is_string(sub2, str2);
+            SASSERT(str1.length() == 1);
+            SASSERT(str2.length() == 1);
+            expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
+            return retval;
+        } else if (u.re.is_full(re)) {
+            expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
+            return retval;
+        } else if (u.re.is_complement(re)) {
+            // skip complement for now, in general this is difficult to predict
+            expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
+            return retval;
+        } else {
+            TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, m) << std::endl;);
+            expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
+            return retval;
+        }
+    }
 
     /*
      * strArgmt::solve_concat_eq_str()
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 72643c4be..602f9feab 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -23,6 +23,7 @@
 #include "ast/ast_pp.h"
 #include "ast/arith_decl_plugin.h"
 #include "ast/rewriter/th_rewriter.h"
+#include "ast/rewriter/seq_rewriter.h"
 #include "ast/seq_decl_plugin.h"
 #include "smt/smt_theory.h"
 #include "smt/params/theory_str_params.h"
@@ -267,6 +268,8 @@ protected:
 
     str_value_factory * m_factory;
 
+    re2automaton m_mk_aut;
+
     // Unique identifier appended to unused variables to ensure that model construction
     // does not introduce equalities when they weren't enforced.
     unsigned m_unused_id;
@@ -461,9 +464,10 @@ protected:
     void unroll_str2reg_constStr(expr * unrollFunc, expr * eqConstStr);
     void process_concat_eq_unroll(expr * concat, expr * unroll);
 
-    // regex automata
+    // regex automata and length-aware regex
     unsigned estimate_regex_complexity(expr * re);
     unsigned estimate_regex_complexity_under_complement(expr * re);
+    expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
 
     void set_up_axioms(expr * ex);
     void handle_equality(expr * lhs, expr * rhs);
@@ -640,6 +644,7 @@ protected:
     virtual void new_diseq_eh(theory_var, theory_var);
 
     virtual theory* mk_fresh(context*) { return alloc(theory_str, get_manager(), m_params); }
+    virtual void init(context * ctx);
     virtual void init_search_eh();
     virtual void add_theory_assumptions(expr_ref_vector & assumptions);
     virtual lbool validate_unsat_core(expr_ref_vector & unsat_core);

From 0ac7082c8092c25efb3b9f893b2982a9e0ab556e Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 21 Dec 2017 17:13:39 -0500
Subject: [PATCH 03/36] add upper bound refinement (WIP)

---
 src/smt/theory_str.cpp | 239 ++++++++++++++++++++++++++++++++++++++++-
 src/smt/theory_str.h   |  69 ++++++++++++
 2 files changed, 305 insertions(+), 3 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 88180352c..2f41a6617 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -74,6 +74,7 @@ namespace smt {
 
     theory_str::~theory_str() {
         m_trail_stack.reset();
+        regex_automata.clear();
     }
 
     expr * theory_str::mk_string(zstring const& str) {
@@ -1741,14 +1742,19 @@ namespace smt {
             regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr);
         }
 
+        expr_ref str(ex->get_arg(0), m);
+        app * regex = to_app(ex->get_arg(1));
+
         if (m_params.m_RegexAutomata) {
+            regex_terms.insert(ex);
+            if (!regex_terms_by_string.contains(str)) {
+                regex_terms_by_string.insert(str, ptr_vector<expr>());
+            }
+            regex_terms_by_string[str].push_back(ex);
             // stop setting up axioms here, we do this differently
             return;
         }
 
-        expr_ref str(ex->get_arg(0), m);
-        app * regex = to_app(ex->get_arg(1));
-
         // quick reference for the following code:
         //  - ex: top-level regex membership term
         //  - str: string term appearing in ex
@@ -6665,6 +6671,77 @@ namespace smt {
         }
     }
 
+    /*
+     * Refine the upper bound on the length of a solution to a given automaton.
+     * The method returns TRUE if a solution of length `current_upper_bound` exists,
+     * and FALSE otherwise. In addition, the reference parameter `refined_upper_bound`
+     * is assigned the length of the longest solution shorter than `current_upper_bound`,
+     * if a shorter solution exists, or -1 otherwise.
+     */
+    bool theory_str::refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound) {
+        ENSURE(aut != NULL);
+
+        if (aut->final_states().empty()) {
+            // no solutions at all!
+            refined_upper_bound = rational::minus_one();
+            return false;
+        }
+
+        // from here we assume there is a final state reachable from the initial state
+        unsigned_vector search_queue;
+        // populate search queue with all states reachable from the epsilon-closure of the start state
+        aut->get_epsilon_closure(aut->init(), search_queue);
+
+        rational last_solution_depth = rational::minus_one();
+        bool found_solution_at_upper_bound = false;
+
+        unsigned search_depth = 0;
+        hashtable<unsigned, unsigned_hash, default_eq<unsigned> > next_states;
+        unsigned_vector next_search_queue;
+
+        while(!search_queue.empty()) {
+            // see if any of the current states are final
+            for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                unsigned src = *it;
+                if (aut->is_final_state(src)) {
+                    if (search_depth == current_upper_bound.get_unsigned()) {
+                        found_solution_at_upper_bound = true;
+                    } else {
+                        last_solution_depth = rational(search_depth);
+                    }
+                    break;
+                }
+            }
+
+            if (search_depth == current_upper_bound.get_unsigned()) {
+                break;
+            }
+
+            next_states.reset();
+            next_search_queue.clear();
+            // move one step along all states
+            for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                unsigned src = *it;
+                eautomaton::moves next_moves;
+                aut->get_moves_from(src, next_moves, true);
+                for (eautomaton::moves::iterator moves_it = next_moves.begin();
+                        moves_it != next_moves.end(); ++moves_it) {
+                    unsigned dst = moves_it->dst();
+                    if (!next_states.contains(dst)) {
+                        next_states.insert(dst);
+                        next_search_queue.push_back(dst);
+                    }
+                }
+            }
+            search_queue.clear();
+            search_queue.append(next_search_queue);
+            search_depth += 1;
+        } //!search_queue.empty()
+
+        refined_upper_bound = last_solution_depth;
+        return found_solution_at_upper_bound;
+    }
+
     /*
      * strArgmt::solve_concat_eq_str()
      * Solve concatenations of the form:
@@ -9076,6 +9153,155 @@ namespace smt {
             }
         }
 
+        // regex automata
+        if (m_params.m_RegexAutomata) {
+            bool regex_axiom_add = false;
+            for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
+                expr * str_in_re = *it;
+                expr * str;
+                expr * re;
+                u.str.is_in_re(str_in_re, str, re);
+                lbool current_assignment = ctx.get_assignment(str_in_re);
+                TRACE("str", tout << "regex term: " << mk_pp(str, m) << " in " << mk_pp(re, m) << " : " << current_assignment << std::endl;);
+                if (current_assignment == l_undef) {
+                    continue;
+                }
+                rational exact_length_value;
+                if (get_len_value(str, exact_length_value)) {
+                    TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
+                    NOT_IMPLEMENTED_YET();
+                }
+                expr_ref str_len(mk_strlen(str), m);
+                rational lower_bound_value;
+                rational upper_bound_value;
+                bool lower_bound_exists = lower_bound(str_len, lower_bound_value);
+                bool upper_bound_exists = upper_bound(str_len, upper_bound_value);
+                CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;);
+                CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;);
+
+                if (upper_bound_exists) {
+                    // check current assumptions
+                    if (regex_automaton_assumptions.contains(re) &&
+                            !regex_automaton_assumptions[re].empty()){
+                        // one or more existing assumptions.
+                        // see if the (current best) upper bound can be refined
+                        // (note that if we have an automaton with no assumption,
+                        // this automatically counts as best)
+                        bool need_assumption = true;
+                        regex_automaton_under_assumptions last_assumption;
+                        rational last_ub = rational::minus_one();
+                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                it != regex_automaton_assumptions[re].end(); ++it) {
+                            regex_automaton_under_assumptions autA = *it;
+                            if ((current_assignment == l_true && autA.get_polarity() == false)
+                                    || (current_assignment == l_false && autA.get_polarity() == true)) {
+                                // automaton uses incorrect polarity
+                                continue;
+                            }
+                            rational this_ub;
+                            if (autA.get_upper_bound(this_ub)) {
+                                if (last_ub == rational::minus_one() || this_ub < last_ub) {
+                                    last_ub = this_ub;
+                                    last_assumption = autA;
+                                }
+                            } else {
+                                need_assumption = false;
+                                last_assumption = autA;
+                                break;
+                            }
+                        }
+                        if (!last_ub.is_minus_one() || !need_assumption) {
+                            CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;);
+                            CTRACE("str", need_assumption, tout << "using automaton with assumed upper bound of " << last_ub << std::endl;);
+
+                            rational refined_upper_bound;
+                            bool solution_at_upper_bound = refine_automaton_upper_bound(last_assumption.get_automaton(),
+                                    upper_bound_value, refined_upper_bound);
+                            TRACE("str", tout << "refined upper bound is " << refined_upper_bound <<
+                                    (solution_at_upper_bound?", solution at upper bound":", no solution at upper bound") << std::endl;);
+
+                            expr_ref_vector lhs(m);
+                            if (current_assignment == l_false) {
+                                lhs.push_back(m.mk_not(str_in_re));
+                            } else {
+                                lhs.push_back(str_in_re);
+                            }
+                            if (need_assumption) {
+                                lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(last_ub, true)));
+                            }
+                            lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)));
+
+                            expr_ref_vector rhs(m);
+
+                            if (solution_at_upper_bound) {
+                                if (refined_upper_bound.is_minus_one()) {
+                                    // If there are solutions at the upper bound but not below it, make the bound exact.
+                                    NOT_IMPLEMENTED_YET();
+                                } else {
+                                    // If there are solutions at and below the upper bound, add an additional bound.
+                                    NOT_IMPLEMENTED_YET();
+                                }
+                            } else {
+                                if (refined_upper_bound.is_minus_one()) {
+                                    // If there are no solutions at or below the upper bound, assert a conflict clause.
+                                    rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))));
+                                } else {
+                                    // If there are solutions below the upper bound but not at it, refine the bound.
+                                    NOT_IMPLEMENTED_YET();
+                                }
+                            }
+
+                            expr_ref lhs_terms(mk_and(lhs), m);
+                            expr_ref rhs_terms(mk_and(rhs), m);
+                            assert_implication(lhs_terms, rhs_terms);
+                            regex_axiom_add = true;
+                        }
+                    } else {
+                        // no existing automata/assumptions.
+                        // if it's easy to construct a full automaton for R, do so
+                        unsigned expected_complexity = estimate_regex_complexity(re);
+                        unsigned threshold = 1000; // TODO better metric
+                        if (expected_complexity <= threshold) {
+                            eautomaton * aut = m_mk_aut(re);
+                            aut->compress();
+                            regex_automata.push_back(aut);
+                            regex_automaton_under_assumptions new_aut(re, aut, true);
+                            if (!regex_automaton_assumptions.contains(re)) {
+                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                            }
+                            regex_automaton_assumptions[re].push_back(new_aut);
+                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                            regex_axiom_add = true;
+                            // TODO immediately attempt to learn lower/upper bound info here
+                        } else {
+                            // TODO check negation?
+                            // construct a partial automaton for R to the given upper bound
+                            NOT_IMPLEMENTED_YET();
+                        }
+                    }
+                    // if we have *any* automaton for R, and the upper bound is not too large,
+                    // finitize the automaton (if we have not already done so) and assert all solutions
+                    if (upper_bound_value < 50) { // TODO better metric for threshold
+                        // NOT_IMPLEMENTED_YET(); // TODO(mtrberzi)
+                    }
+                } else { // !upper_bound_exists
+                    // no upper bound information
+                    if (lower_bound_exists) {
+                        // lower bound, no upper bound
+                        NOT_IMPLEMENTED_YET();
+                    } else { // !lower_bound_exists
+                        // no bounds information
+                        NOT_IMPLEMENTED_YET();
+                    }
+                }
+
+                // NOT_IMPLEMENTED_YET();
+            }
+            if (regex_axiom_add) {
+                return FC_CONTINUE;
+            }
+        } // RegexAutomata
+
         bool needToAssignFreeVars = false;
         std::set<expr*> free_variables;
         std::set<expr*> unused_internal_variables;
@@ -10567,6 +10793,13 @@ namespace smt {
                     SASSERT(lenTestAssert != NULL);
                     return lenTestAssert;
                 } else {
+                    // if we are performing automata-based reasoning and the term associated with
+                    // this length tester is in any way constrained by regex terms,
+                    // do not perform value testing -- this term is not a free variable.
+                    if (m_params.m_RegexAutomata) {
+                        NOT_IMPLEMENTED_YET(); // TODO
+                    }
+
                     TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;);
                     // length is fixed
                     expr * valueAssert = gen_free_var_options(freeVar, effectiveLenInd, effectiveLenIndiStr, NULL, zstring(""));
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 602f9feab..57fa48142 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -166,6 +166,69 @@ public:
     bool matches(zstring input);
 };
 
+class regex_automaton_under_assumptions {
+protected:
+    expr * str_in_re;
+    eautomaton * aut;
+    bool polarity;
+
+    bool assume_lower_bound;
+    rational lower_bound;
+
+    bool assume_upper_bound;
+    rational upper_bound;
+public:
+    regex_automaton_under_assumptions() :
+        str_in_re(NULL), aut(NULL), polarity(false),
+        assume_lower_bound(false), assume_upper_bound(false) {}
+
+    regex_automaton_under_assumptions(expr * str_in_re, eautomaton * aut, bool polarity) :
+        str_in_re(str_in_re), aut(aut), polarity(polarity),
+        assume_lower_bound(false), assume_upper_bound(false) {}
+
+    void set_lower_bound(rational & lb) {
+        lower_bound = lb;
+        assume_lower_bound = true;
+    }
+    void unset_lower_bound() {
+        assume_lower_bound = false;
+    }
+
+    void set_upper_bound(rational & ub) {
+        upper_bound = ub;
+        assume_upper_bound = true;
+    }
+    void unset_upper_bound() {
+        assume_upper_bound = false;
+    }
+
+    bool get_lower_bound(rational & lb) const {
+        if (assume_lower_bound) {
+            lb = lower_bound;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    bool get_upper_bound(rational & ub) const {
+        if (assume_upper_bound) {
+            ub = upper_bound;
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    eautomaton * get_automaton() const { return aut; }
+    bool get_polarity() const { return polarity; }
+
+    virtual ~regex_automaton_under_assumptions() {
+        // don't free str_in_re or aut;
+        // they are managed separately
+    }
+};
+
 class theory_str : public theory {
     struct T_cut
     {
@@ -338,6 +401,11 @@ protected:
     std::map<std::pair<expr*, zstring>, expr*> regex_in_bool_map;
     std::map<expr*, std::set<zstring> > regex_in_var_reg_str_map;
 
+    // regex automata
+    ptr_vector<eautomaton> regex_automata;
+    obj_hashtable<expr> regex_terms;
+    obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]
+    obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
     std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
 
     svector<char> char_set;
@@ -468,6 +536,7 @@ protected:
     unsigned estimate_regex_complexity(expr * re);
     unsigned estimate_regex_complexity_under_complement(expr * re);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
 
     void set_up_axioms(expr * ex);
     void handle_equality(expr * lhs, expr * rhs);

From 0917af7c569eb7380f04ff0165861912ba9f3109 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 3 Jan 2018 12:02:11 -0500
Subject: [PATCH 04/36] full upper bound refinement

---
 src/smt/theory_str.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 2f41a6617..b0b712511 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9236,10 +9236,13 @@ namespace smt {
                             if (solution_at_upper_bound) {
                                 if (refined_upper_bound.is_minus_one()) {
                                     // If there are solutions at the upper bound but not below it, make the bound exact.
-                                    NOT_IMPLEMENTED_YET();
+                                    rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)));
                                 } else {
                                     // If there are solutions at and below the upper bound, add an additional bound.
-                                    NOT_IMPLEMENTED_YET();
+                                    rhs.push_back(m.mk_or(
+                                            ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)),
+                                            m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true))
+                                            ));
                                 }
                             } else {
                                 if (refined_upper_bound.is_minus_one()) {
@@ -9247,7 +9250,7 @@ namespace smt {
                                     rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))));
                                 } else {
                                     // If there are solutions below the upper bound but not at it, refine the bound.
-                                    NOT_IMPLEMENTED_YET();
+                                    rhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true)));
                                 }
                             }
 

From 0f20944aebf2f13e624b36d7b66dbba5a8e1233a Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 3 Jan 2018 13:54:18 -0500
Subject: [PATCH 05/36] regex lower bound (WIP)

---
 src/smt/theory_str.cpp | 158 ++++++++++++++++++++++++++++++++++++++++-
 src/smt/theory_str.h   |   1 +
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index b0b712511..04eb86fe4 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6671,6 +6671,118 @@ namespace smt {
         }
     }
 
+    /*
+     * Refine the lower bound on the length of a solution to a given automaton.
+     * The method returns TRUE if a solution of length `current_lower_bound` exists,
+     * and FALSE otherwise. In addition, the reference parameter `refined_lower_bound`
+     * is assigned the length of the shortest solution longer than `current_lower_bound`
+     * if it exists, or -1 otherwise.
+     */
+    bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) {
+        ENSURE(aut != NULL);
+
+        ast_manager & m = get_manager();
+
+        if (aut->final_states().size() < 1) {
+            // no solutions at all
+            refined_lower_bound = rational::minus_one();
+            return false;
+        }
+
+        // from here we assume that there is a final state reachable from the initial state
+
+        unsigned_vector search_queue;
+        // populate search_queue with all states reachable from the epsilon-closure of start state
+        aut->get_epsilon_closure(aut->init(), search_queue);
+
+        unsigned search_depth = 0;
+        hashtable<unsigned, unsigned_hash, default_eq<unsigned>> next_states;
+        unsigned_vector next_search_queue;
+
+        bool found_solution_at_lower_bound = false;
+
+        while (!search_queue.empty()) {
+            // if we are at the lower bound, check for final states
+            if (search_depth == current_lower_bound.get_unsigned()) {
+                for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                    unsigned state = *it;
+                    if (aut->is_final_state(state)) {
+                        found_solution_at_lower_bound = true;
+                        break;
+                    }
+                }
+                // end phase 1
+                break;
+            }
+            next_states.reset();
+            next_search_queue.clear();
+            // move one step along all states
+            for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                unsigned src = *it;
+                eautomaton::moves next_moves;
+                aut->get_moves_from(src, next_moves, true);
+                for (eautomaton::moves::iterator move_it = next_moves.begin();
+                        move_it != next_moves.end(); ++move_it) {
+                    unsigned dst = move_it->dst();
+                    if (!next_states.contains(dst)) {
+                        next_states.insert(dst);
+                        next_search_queue.push_back(dst);
+                    }
+                }
+            }
+            search_queue.clear();
+            search_queue.append(next_search_queue);
+            search_depth += 1;
+        } // !search_queue.empty()
+
+        // if we got here before reaching the lower bound,
+        // there aren't any solutions at or above it, so stop
+        if (search_depth < current_lower_bound.get_unsigned()) {
+            refined_lower_bound = rational::minus_one();
+            return false;
+        }
+
+        // phase 2: continue exploring the automaton above the lower bound
+        SASSERT(search_depth == current_lower_bound.get_unsigned());
+
+        while (!search_queue.empty()) {
+            if (search_depth > current_lower_bound.get_unsigned()) {
+                // check if we have found a solution above the lower bound
+                for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                    unsigned state = *it;
+                    if (aut->is_final_state(state)) {
+                        // this is a solution at a depth higher than the lower bound
+                        refined_lower_bound = rational(search_depth);
+                        return found_solution_at_lower_bound;
+                    }
+                }
+            }
+            next_states.reset();
+            next_search_queue.clear();
+            // move one step along all states
+            for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) {
+                unsigned src = *it;
+                eautomaton::moves next_moves;
+                aut->get_moves_from(src, next_moves, true);
+                for (eautomaton::moves::iterator move_it = next_moves.begin();
+                        move_it != next_moves.end(); ++move_it) {
+                    unsigned dst = move_it->dst();
+                    if (!next_states.contains(dst)) {
+                        next_states.insert(dst);
+                        next_search_queue.push_back(dst);
+                    }
+                }
+            }
+            search_queue.clear();
+            search_queue.append(next_search_queue);
+            search_depth += 1;
+        }
+        // if we reached this point, we explored the whole automaton and didn't find any
+        // solutions above the lower bound
+        refined_lower_bound = rational::minus_one();
+        return found_solution_at_lower_bound;
+    }
+
     /*
      * Refine the upper bound on the length of a solution to a given automaton.
      * The method returns TRUE if a solution of length `current_upper_bound` exists,
@@ -9291,7 +9403,51 @@ namespace smt {
                     // no upper bound information
                     if (lower_bound_exists) {
                         // lower bound, no upper bound
-                        NOT_IMPLEMENTED_YET();
+
+                        // check current assumptions
+                        if (regex_automaton_assumptions.contains(re) &&
+                                !regex_automaton_assumptions[re].empty()){
+                            // one or more existing assumptions.
+                            // see if the (current best) lower bound can be refined
+                            // (note that if we have an automaton with no assumption,
+                            // this automatically counts as best)
+                            bool need_assumption = true;
+                            regex_automaton_under_assumptions last_assumption;
+                            rational last_lb = rational::zero(); // the default
+                            for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                    it != regex_automaton_assumptions[re].end(); ++it) {
+                                regex_automaton_under_assumptions autA = *it;
+                                if ((current_assignment == l_true && autA.get_polarity() == false)
+                                        || (current_assignment == l_false && autA.get_polarity() == true)) {
+                                    // automaton uses incorrect polarity
+                                    continue;
+                                }
+                                rational this_lb;
+                                if (autA.get_lower_bound(this_lb)) {
+                                    if (this_lb > last_lb) {
+                                        last_lb = this_lb;
+                                        last_assumption = autA;
+                                    }
+                                } else {
+                                    need_assumption = false;
+                                    last_assumption = autA;
+                                    break;
+                                }
+                            }
+                            if (!last_lb.is_zero() || !need_assumption) {
+                                CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;);
+                                CTRACE("str", need_assumption, tout << "using automaton with assumed lower bound of " << last_lb << std::endl;);
+                                rational refined_lower_bound;
+                                bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(),
+                                        lower_bound_value, refined_lower_bound);
+                                TRACE("str", tout << "refined lower bound is " << refined_lower_bound <<
+                                        (solution_at_lower_bound?", solution at upper bound":", no solution at upper bound") << std::endl;);
+                                NOT_IMPLEMENTED_YET();
+                            }
+                        } else {
+                            // no existing automata/assumptions.
+                            NOT_IMPLEMENTED_YET();
+                        }
                     } else { // !lower_bound_exists
                         // no bounds information
                         NOT_IMPLEMENTED_YET();
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 57fa48142..999fc7dc0 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -536,6 +536,7 @@ protected:
     unsigned estimate_regex_complexity(expr * re);
     unsigned estimate_regex_complexity_under_complement(expr * re);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
 
     void set_up_axioms(expr * ex);

From 98691a2c498198f2e71cc2cc01c3fb5c17210b65 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 8 Jan 2018 15:56:21 -0500
Subject: [PATCH 06/36] lower bound refinement

---
 src/smt/theory_str.cpp | 83 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 9ced6d186..eb1da0c4e 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9441,16 +9441,93 @@ namespace smt {
                                 bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(),
                                         lower_bound_value, refined_lower_bound);
                                 TRACE("str", tout << "refined lower bound is " << refined_lower_bound <<
-                                        (solution_at_lower_bound?", solution at upper bound":", no solution at upper bound") << std::endl;);
-                                NOT_IMPLEMENTED_YET();
+                                        (solution_at_lower_bound?", solution at lower bound":", no solution at lower bound") << std::endl;);
+
+                                expr_ref_vector lhs(m);
+                                if (current_assignment == l_false) {
+                                    lhs.push_back(m.mk_not(str_in_re));
+                                } else {
+                                    lhs.push_back(str_in_re);
+                                }
+                                if (need_assumption) {
+                                    lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(last_lb, true)));
+                                }
+                                lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true)));
+
+                                expr_ref_vector rhs(m);
+
+                                if (solution_at_lower_bound) {
+                                    if (refined_lower_bound.is_minus_one()) {
+                                        // If there are solutions at the lower bound but not above it, make the bound exact.
+                                        rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)));
+                                    } else {
+                                        // If there are solutions at and above the lower bound, add an additional bound.
+                                        rhs.push_back(m.mk_or(
+                                                ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)),
+                                                m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))
+                                        ));
+                                    }
+                                } else {
+                                    if (refined_lower_bound.is_minus_one()) {
+                                        // If there are no solutions at or above the lower bound, assert a conflict clause.
+                                        rhs.push_back(m.mk_not(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true))));
+                                    } else {
+                                        // If there are solutions above the lower bound but not at it, refine the bound.
+                                        rhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)));
+                                    }
+                                }
+
+                                expr_ref lhs_terms(mk_and(lhs), m);
+                                expr_ref rhs_terms(mk_and(rhs), m);
+                                assert_implication(lhs_terms, rhs_terms);
+                                regex_axiom_add = true;
                             }
                         } else {
                             // no existing automata/assumptions.
-                            NOT_IMPLEMENTED_YET();
+                            // if it's easy to construct a full automaton for R, do so
+                            unsigned expected_complexity = estimate_regex_complexity(re);
+                            unsigned threshold = 1000; // TODO better metric
+                            if (expected_complexity <= threshold) {
+                                eautomaton * aut = m_mk_aut(re);
+                                aut->compress();
+                                regex_automata.push_back(aut);
+                                regex_automaton_under_assumptions new_aut(re, aut, true);
+                                if (!regex_automaton_assumptions.contains(re)) {
+                                    regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                                }
+                                regex_automaton_assumptions[re].push_back(new_aut);
+                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                                regex_axiom_add = true;
+                                // TODO immediately attempt to learn lower/upper bound info here
+                            } else {
+                                // TODO check negation?
+                                // construct a partial automaton for R to the given upper bound
+                                NOT_IMPLEMENTED_YET();
+                            }
                         }
                     } else { // !lower_bound_exists
                         // no bounds information
+                        // check for existing automata;
+                        // try to construct an automaton if we don't have one yet
+                        // and doing so without bounds is not difficult
                         NOT_IMPLEMENTED_YET();
+                        if (true) {
+                            unsigned expected_complexity = estimate_regex_complexity(re);
+                            unsigned threshold = 1000; // TODO better metric
+                            if (expected_complexity <= threshold) {
+                                eautomaton * aut = m_mk_aut(re);
+                                aut->compress();
+                                regex_automata.push_back(aut);
+                                regex_automaton_under_assumptions new_aut(re, aut, true);
+                                if (!regex_automaton_assumptions.contains(re)) {
+                                    regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                                }
+                                regex_automaton_assumptions[re].push_back(new_aut);
+                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                                regex_axiom_add = true;
+                                // TODO immediately attempt to learn lower/upper bound info here
+                            }
+                        }
                     }
                 }
 

From bac5a648d960a45295f2a05e433b3f85cd4df0f1 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Tue, 9 Jan 2018 20:20:04 -0500
Subject: [PATCH 07/36] regex path constraint generation (WIP)

---
 src/smt/theory_str.cpp | 232 +++++++++++++++++++++++++++++++++++++++++
 src/smt/theory_str.h   |   3 +
 2 files changed, 235 insertions(+)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index eb1da0c4e..54636b8df 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6854,6 +6854,237 @@ namespace smt {
         return found_solution_at_upper_bound;
     }
 
+    void theory_str::aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond) {
+        expr* acc;
+        if (!get_manager().is_true(cond) && next.find(idx, acc)) {
+            expr* args[2] = { cond, acc };
+            cond = mk_or(get_manager(), 2, args);
+        }
+        trail.push_back(cond);
+        next.insert(idx, cond);
+    }
+
+    expr_ref theory_str::aut_path_rewrite_constraint(expr * cond, expr * ch_var) {
+        context & ctx = get_context();
+        ast_manager & m = get_manager();
+        bv_util bvu(m);
+
+        expr_ref retval(m);
+
+        rational char_val;
+        unsigned int bv_width;
+
+        expr * lhs;
+        expr * rhs;
+
+        if (bvu.is_numeral(cond, char_val, bv_width)) {
+            SASSERT(char_val.is_nonneg() && char_val.get_unsigned() < 256);
+            TRACE("str", tout << "rewrite character constant " << char_val << std::endl;);
+            zstring str_const(char_val.get_unsigned());
+            retval = u.str.mk_string(str_const);
+            return retval;
+        } else if (is_var(cond)) {
+            TRACE("str", tout << "substitute var" << std::endl;);
+            retval = ch_var;
+            return retval;
+        } else if (m.is_eq(cond, lhs, rhs)) {
+            // handle this specially because the sort of the equality will change
+            expr_ref new_lhs(aut_path_rewrite_constraint(lhs, ch_var), m);
+            SASSERT(new_lhs);
+            expr_ref new_rhs(aut_path_rewrite_constraint(rhs, ch_var), m);
+            SASSERT(new_rhs);
+            retval = ctx.mk_eq_atom(new_lhs, new_rhs);
+            return retval;
+        } else if (m.is_bool(cond)) {
+            TRACE("str", tout << "rewrite boolean term " << mk_pp(cond, m) << std::endl;);
+            app * a_cond = to_app(cond);
+            expr_ref_vector rewritten_args(m);
+            for (unsigned i = 0; i < a_cond->get_num_args(); ++i) {
+                expr * argI = a_cond->get_arg(i);
+                expr_ref new_arg(aut_path_rewrite_constraint(argI, ch_var), m);
+                SASSERT(new_arg);
+                rewritten_args.push_back(new_arg);
+            }
+            retval = m.mk_app(a_cond->get_decl(), rewritten_args.c_ptr());
+            TRACE("str", tout << "final rewritten term is " << mk_pp(retval, m) << std::endl;);
+            return retval;
+        } else {
+            TRACE("str", tout << "ERROR: unrecognized automaton path constraint " << mk_pp(cond, m) << ", cannot translate" << std::endl;);
+            retval = NULL;
+            return retval;
+        }
+    }
+
+    /*
+     * Create finite path constraints for the string variable `str` with respect to the automaton `aut`.
+     * The returned expression is the right-hand side of a constraint of the form
+     * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS
+     */
+    expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) {
+        ENSURE(aut != NULL);
+        context & ctx = get_context();
+        ast_manager & m = get_manager();
+
+        if (lenVal.is_zero()) {
+            // if any state in the epsilon-closure of the start state is accepting,
+            // then the empty string is in this language
+            unsigned_vector states;
+            bool has_final = false;
+            aut->get_epsilon_closure(aut->init(), states);
+            for (unsigned i = 0; i < states.size() && !has_final; ++i) {
+                has_final = aut->is_final_state(states[i]);
+            }
+            if (has_final) {
+                // empty string is OK, assert axiom
+                expr_ref rhs(ctx.mk_eq_atom(stringTerm, mk_string("")), m);
+                SASSERT(rhs);
+                //regex_automata_assertions.insert(stringTerm, final_axiom);
+                //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
+                return rhs;
+            } else {
+                // negate -- the empty string isn't in the language
+                //expr_ref conflict(m.mk_not(mk_and(toplevel_lhs)), m);
+                //assert_axiom(conflict);
+                expr_ref conflict(m.mk_false(), m);
+                return conflict;
+            }
+        } // lenVal.is_zero()
+
+        expr_ref_vector pathChars(m);
+        expr_ref_vector pathChars_len_constraints(m);
+        for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
+            std::stringstream ss;
+            ss << "ch" << i;
+            expr_ref ch(mk_str_var(ss.str()), m);
+            pathChars.push_back(ch);
+            pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
+        }
+
+        // modification of code in seq_rewriter::mk_str_in_regexp()
+        expr_ref_vector trail(m);
+        u_map<expr*> maps[2];
+        bool select_map = false;
+        expr_ref ch(m), cond(m);
+        eautomaton::moves mvs;
+        maps[0].insert(aut->init(), m.mk_true());
+        // is_accepted(a, aut) & some state in frontier is final.
+        for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
+            u_map<expr*>& frontier = maps[select_map];
+            u_map<expr*>& next = maps[!select_map];
+            select_map = !select_map;
+            ch = pathChars.get(i);
+            next.reset();
+            u_map<expr*>::iterator it = frontier.begin(), end = frontier.end();
+            for (; it != end; ++it) {
+                mvs.reset();
+                unsigned state = it->m_key;
+                expr*    acc  = it->m_value;
+                aut->get_moves_from(state, mvs, false);
+                for (unsigned j = 0; j < mvs.size(); ++j) {
+                    eautomaton::move const& mv = mvs[j];
+                    SASSERT(mv.t());
+                    if (mv.t()->is_char() && m.is_value(mv.t()->get_char())) {
+                        // change this to a string constraint
+                        expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_char(), ch);
+                        SASSERT(cond_rhs);
+                        cond = ctx.mk_eq_atom(ch, cond_rhs);
+                        SASSERT(cond);
+                        expr * args[2] = {cond, acc};
+                        cond = mk_and(m, 2, args);
+                        aut_path_add_next(next, trail, mv.dst(), cond);
+                    } else if (mv.t()->is_range()) {
+                        expr_ref range_lo(mv.t()->get_lo(), m);
+                        expr_ref range_hi(mv.t()->get_hi(), m);
+                        bv_util bvu(m);
+
+                        rational lo_val, hi_val;
+                        unsigned int bv_width;
+
+                        if (bvu.is_numeral(range_lo, lo_val, bv_width) && bvu.is_numeral(range_hi, hi_val, bv_width)) {
+                            TRACE("str", tout << "make range predicate from " << lo_val << " to " << hi_val << std::endl;);
+                            expr_ref cond_rhs(m);
+
+                            if (hi_val < lo_val) {
+                                rational tmp = lo_val;
+                                lo_val = hi_val;
+                                hi_val = tmp;
+                            }
+
+                            expr_ref_vector cond_rhs_terms(m);
+                            for (unsigned i = lo_val.get_unsigned(); i <= hi_val.get_unsigned(); ++i) {
+                                zstring str_const(i);
+                                expr_ref str_expr(u.str.mk_string(str_const), m);
+                                cond_rhs_terms.push_back(ctx.mk_eq_atom(ch, str_expr));
+                            }
+                            cond_rhs = mk_or(cond_rhs_terms);
+                            SASSERT(cond_rhs);
+                            expr * args[2] = {cond_rhs, acc};
+                            cond = mk_and(m, 2, args);
+                            aut_path_add_next(next, trail, mv.dst(), cond);
+                        } else {
+                            TRACE("str", tout << "warning: non-bitvectors in automaton range predicate" << std::endl;);
+                            UNREACHABLE();
+                        }
+                    } else if (mv.t()->is_pred()) {
+                        // rewrite this constraint over string terms
+                        expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_pred(), ch);
+                        SASSERT(cond_rhs);
+
+                        if (m.is_false(cond_rhs)) {
+                            continue;
+                        } else if (m.is_true(cond_rhs)) {
+                            aut_path_add_next(next, trail, mv.dst(), acc);
+                            continue;
+                        }
+                        expr * args[2] = {cond_rhs, acc};
+                        cond = mk_and(m, 2, args);
+                        aut_path_add_next(next, trail, mv.dst(), cond);
+                    }
+                }
+            }
+        }
+        u_map<expr*> const& frontier = maps[select_map];
+        u_map<expr*>::iterator it = frontier.begin(), end = frontier.end();
+        expr_ref_vector ors(m);
+        for (; it != end; ++it) {
+            unsigned_vector states;
+            bool has_final = false;
+            aut->get_epsilon_closure(it->m_key, states);
+            for (unsigned i = 0; i < states.size() && !has_final; ++i) {
+                has_final = aut->is_final_state(states[i]);
+            }
+            if (has_final) {
+                ors.push_back(it->m_value);
+            }
+        }
+        expr_ref result(mk_or(ors));
+        TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";);
+
+        // if the path constraint is instantly false, then we know that the LHS must be false,
+        // so negate it and instantly assert a conflict clause
+        if (m.is_false(result)) {
+            expr_ref conflict(m.mk_false(), m);
+            return conflict;
+        } else {
+            expr_ref concat_rhs(m);
+            if (pathChars.size() == 1) {
+                concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
+            } else {
+                expr_ref acc(pathChars.get(0), m);
+                for (unsigned i = 1; i < pathChars.size(); ++i) {
+                    acc = mk_concat(acc, pathChars.get(i));
+                }
+                concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
+            }
+
+            expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
+            //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
+            //regex_automata_assertions.insert(stringTerm, final_axiom);
+            //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
+            return toplevel_rhs;
+        }
+    }
+
     /*
      * strArgmt::solve_concat_eq_str()
      * Solve concatenations of the form:
@@ -9281,6 +9512,7 @@ namespace smt {
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {
                     TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
+                    // TODO generate_regex_path_constraints();
                     NOT_IMPLEMENTED_YET();
                 }
                 expr_ref str_len(mk_strlen(str), m);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 999fc7dc0..c6dad045f 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -538,6 +538,9 @@ protected:
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
+    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal);
+    void aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
+    expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var);
 
     void set_up_axioms(expr * ex);
     void handle_equality(expr * lhs, expr * rhs);

From 6b799706b5adf865cc1b3e73d00d37a86610c619 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 10 Jan 2018 17:24:47 -0500
Subject: [PATCH 08/36] add path constraint generation for regex terms

---
 src/smt/params/smt_params_helper.pyg |   1 +
 src/smt/params/theory_str_params.cpp |   1 +
 src/smt/params/theory_str_params.h   |   9 +-
 src/smt/theory_str.cpp               | 165 +++++++++++++++++++++------
 src/smt/theory_str.h                 |   4 +-
 5 files changed, 145 insertions(+), 35 deletions(-)

diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg
index 122d09f32..7ab0d52f2 100644
--- a/src/smt/params/smt_params_helper.pyg
+++ b/src/smt/params/smt_params_helper.pyg
@@ -80,6 +80,7 @@ def_module_params(module_name='smt',
                           ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'),
                           ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'),
                           ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'),
+                          ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'),
                           ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'),
                           ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'),
                           ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'),
diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp
index b5451b9dc..fa6862161 100644
--- a/src/smt/params/theory_str_params.cpp
+++ b/src/smt/params/theory_str_params.cpp
@@ -32,4 +32,5 @@ void theory_str_params::updt_params(params_ref const & _p) {
     m_BinarySearchInitialUpperBound = p.str_binary_search_start();
     m_OverlapTheoryAwarePriority = p.str_overlap_priority();
     m_RegexAutomata = p.str_regex_automata();
+    m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold();
 }
diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h
index 4135cf0d6..48c0205ee 100644
--- a/src/smt/params/theory_str_params.h
+++ b/src/smt/params/theory_str_params.h
@@ -87,6 +87,12 @@ struct theory_str_params {
      */
     bool m_RegexAutomata;
 
+    /*
+     * RegexAutomata_DifficultyThreshold is the lowest difficulty above which Z3str3
+     * will not eagerly construct an automaton for a regular expression term.
+     */
+    unsigned m_RegexAutomata_DifficultyThreshold;
+
     theory_str_params(params_ref const & p = params_ref()):
         m_StrongArrangements(true),
         m_AggressiveLengthTesting(false),
@@ -99,7 +105,8 @@ struct theory_str_params {
         m_UseBinarySearch(false),
         m_BinarySearchInitialUpperBound(64),
         m_OverlapTheoryAwarePriority(-0.1),
-        m_RegexAutomata(true)
+        m_RegexAutomata(true),
+        m_RegexAutomata_DifficultyThreshold(1000)
     {
         updt_params(p);
     }
diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 54636b8df..5910d9d95 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6918,9 +6918,11 @@ namespace smt {
     /*
      * Create finite path constraints for the string variable `str` with respect to the automaton `aut`.
      * The returned expression is the right-hand side of a constraint of the form
-     * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS
+     * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints).
+     * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...),
+     * are returned in `characterConstraints`.
      */
-    expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) {
+    expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) {
         ENSURE(aut != NULL);
         context & ctx = get_context();
         ast_manager & m = get_manager();
@@ -7060,29 +7062,23 @@ namespace smt {
         expr_ref result(mk_or(ors));
         TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";);
 
-        // if the path constraint is instantly false, then we know that the LHS must be false,
-        // so negate it and instantly assert a conflict clause
-        if (m.is_false(result)) {
-            expr_ref conflict(m.mk_false(), m);
-            return conflict;
+        expr_ref concat_rhs(m);
+        if (pathChars.size() == 1) {
+            concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
         } else {
-            expr_ref concat_rhs(m);
-            if (pathChars.size() == 1) {
-                concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0));
-            } else {
-                expr_ref acc(pathChars.get(0), m);
-                for (unsigned i = 1; i < pathChars.size(); ++i) {
-                    acc = mk_concat(acc, pathChars.get(i));
-                }
-                concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
+            expr_ref acc(pathChars.get(0), m);
+            for (unsigned i = 1; i < pathChars.size(); ++i) {
+                acc = mk_concat(acc, pathChars.get(i));
             }
-
-            expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
-            //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
-            //regex_automata_assertions.insert(stringTerm, final_axiom);
-            //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
-            return toplevel_rhs;
+            concat_rhs = ctx.mk_eq_atom(stringTerm, acc);
         }
+
+        //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m);
+        characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs);
+        //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m);
+        //regex_automata_assertions.insert(stringTerm, final_axiom);
+        //m_trail_stack.push(insert_obj_map<theory_str, expr, expr* >(regex_automata_assertions, stringTerm) );
+        return result;
     }
 
     /*
@@ -8241,6 +8237,27 @@ namespace smt {
                    );
     }
 
+    // returns true if needle appears as a subterm anywhere under haystack,
+    // or if needle appears in the same EQC as a subterm anywhere under haystack
+    bool theory_str::term_appears_as_subterm(expr * needle, expr * haystack) {
+        if (in_same_eqc(needle, haystack)) {
+            return true;
+        }
+
+        if (is_app(haystack)) {
+            app * a_haystack = to_app(haystack);
+            for (unsigned i = 0; i < a_haystack->get_num_args(); ++i) {
+                expr * subterm = a_haystack->get_arg(i);
+                if (term_appears_as_subterm(needle, subterm)) {
+                    return true;
+                }
+            }
+        }
+
+        // not found
+        return false;
+    }
+
     void theory_str::classify_ast_by_type(expr * node, std::map<expr*, int> & varMap,
                                           std::map<expr*, int> & concatMap, std::map<expr*, int> & unrollMap) {
 
@@ -9512,8 +9529,78 @@ namespace smt {
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {
                     TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
-                    // TODO generate_regex_path_constraints();
-                    NOT_IMPLEMENTED_YET();
+
+                    if (regex_terms_with_path_constraints.contains(str_in_re)) {
+                        TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;);
+                        continue;
+                    }
+
+                    // find a consistent automaton for this term
+                    bool found = false;
+                    regex_automaton_under_assumptions assumption;
+                    if (regex_automaton_assumptions.contains(re) &&
+                                                !regex_automaton_assumptions[re].empty()){
+                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                it != regex_automaton_assumptions[re].end(); ++it) {
+                            regex_automaton_under_assumptions autA = *it;
+                            rational assumed_upper_bound, assumed_lower_bound;
+                            bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
+                            bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
+                            if (!assumes_upper_bound && !assumes_lower_bound) {
+                                // automaton with no assumptions is always usable
+                                assumption = autA;
+                                found = true;
+                                break;
+                            }
+                            // check consistency of bounds assumptions
+                        } // foreach(a in regex_automaton_assumptions)
+                    }
+                    if (found) {
+                        expr_ref pathConstraint(m);
+                        expr_ref characterConstraints(m);
+                        pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
+                        TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
+                        TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
+
+                        expr_ref_vector lhs_terms(m);
+                        lhs_terms.push_back(str_in_re);
+                        lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                        expr_ref lhs(mk_and(lhs_terms), m);
+                        // If the automaton was built with the same polarity as the constraint,
+                        // assert directly. Otherwise, negate the path constraint
+                        if ( (current_assignment == l_true && assumption.get_polarity())
+                                || (current_assignment == l_false && !assumption.get_polarity())) {
+                            expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
+                            assert_implication(lhs, rhs);
+                        } else {
+                            expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
+                            assert_implication(lhs, rhs);
+                        }
+                        regex_terms_with_path_constraints.insert(str_in_re);
+                        m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                        regex_axiom_add = true;
+                        continue;
+                    } else {
+                        // no automata available, or else all bounds assumptions are invalid
+                        unsigned expected_complexity = estimate_regex_complexity(re);
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                            eautomaton * aut = m_mk_aut(re);
+                            aut->compress();
+                            regex_automata.push_back(aut);
+                            regex_automaton_under_assumptions new_aut(re, aut, true);
+                            if (!regex_automaton_assumptions.contains(re)) {
+                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                            }
+                            regex_automaton_assumptions[re].push_back(new_aut);
+                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                            regex_axiom_add = true;
+                            // TODO immediately attempt to learn lower/upper bound info here
+                        } else {
+                            // TODO increment failure count
+                            NOT_IMPLEMENTED_YET();
+                        }
+                        continue;
+                    }
                 }
                 expr_ref str_len(mk_strlen(str), m);
                 rational lower_bound_value;
@@ -9607,8 +9694,7 @@ namespace smt {
                         // no existing automata/assumptions.
                         // if it's easy to construct a full automaton for R, do so
                         unsigned expected_complexity = estimate_regex_complexity(re);
-                        unsigned threshold = 1000; // TODO better metric
-                        if (expected_complexity <= threshold) {
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                             eautomaton * aut = m_mk_aut(re);
                             aut->compress();
                             regex_automata.push_back(aut);
@@ -9622,9 +9708,11 @@ namespace smt {
                             // TODO immediately attempt to learn lower/upper bound info here
                         } else {
                             // TODO check negation?
-                            // construct a partial automaton for R to the given upper bound
+                            // TODO construct a partial automaton for R to the given upper bound?
+                            // TODO increment failure count if we can't
                             NOT_IMPLEMENTED_YET();
                         }
+                        continue;
                     }
                     // if we have *any* automaton for R, and the upper bound is not too large,
                     // finitize the automaton (if we have not already done so) and assert all solutions
@@ -9718,8 +9806,7 @@ namespace smt {
                             // no existing automata/assumptions.
                             // if it's easy to construct a full automaton for R, do so
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            unsigned threshold = 1000; // TODO better metric
-                            if (expected_complexity <= threshold) {
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -9733,9 +9820,11 @@ namespace smt {
                                 // TODO immediately attempt to learn lower/upper bound info here
                             } else {
                                 // TODO check negation?
-                                // construct a partial automaton for R to the given upper bound
+                                // TODO construct a partial automaton for R to the given lower bound?
+                                // TODO increment failure count
                                 NOT_IMPLEMENTED_YET();
                             }
+                            continue;
                         }
                     } else { // !lower_bound_exists
                         // no bounds information
@@ -9745,8 +9834,7 @@ namespace smt {
                         NOT_IMPLEMENTED_YET();
                         if (true) {
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            unsigned threshold = 1000; // TODO better metric
-                            if (expected_complexity <= threshold) {
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -11265,7 +11353,18 @@ namespace smt {
                     // this length tester is in any way constrained by regex terms,
                     // do not perform value testing -- this term is not a free variable.
                     if (m_params.m_RegexAutomata) {
-                        NOT_IMPLEMENTED_YET(); // TODO
+                        std::map<std::pair<expr*, zstring>, expr*>::iterator it = regex_in_bool_map.begin();
+                        for (; it != regex_in_bool_map.end(); ++it) {
+                            expr * re = it->second;
+                            expr * re_str = to_app(re)->get_arg(0);
+                            // recursive descent through all subterms of re_str to see if any of them are
+                            // - the same as freeVar
+                            // - in the same EQC as freeVar
+                            if (term_appears_as_subterm(freeVar, re_str)) {
+                                TRACE("str", tout << "prevent value testing on free var " << mk_pp(freeVar, m) << " as it belongs to one or more regex constraints." << std::endl;);
+                                return NULL;
+                            }
+                        }
                     }
 
                     TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index c6dad045f..c64eec838 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -407,6 +407,7 @@ protected:
     obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]
     obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
     std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
+    obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
 
     svector<char> char_set;
     std::map<char, int>  charSetLookupTable;
@@ -538,7 +539,7 @@ protected:
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
-    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal);
+    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);
     void aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
     expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var);
 
@@ -629,6 +630,7 @@ protected:
             std::map<expr*, std::map<expr*, int> > & concat_eq_concat_map,
             std::map<expr*, std::set<expr*> > & unrollGroupMap);
 
+    bool term_appears_as_subterm(expr * needle, expr * haystack);
     void classify_ast_by_type(expr * node, std::map<expr*, int> & varMap,
             std::map<expr*, int> & concatMap, std::map<expr*, int> & unrollMap);
     void classify_ast_by_type_in_positive_context(std::map<expr*, int> & varMap,

From ca3784449f02ce860736a741befb62ff33d9549b Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 12 Jan 2018 13:53:02 -0500
Subject: [PATCH 09/36] regex failsafe and intersect WIP

---
 src/smt/params/smt_params_helper.pyg |  4 ++
 src/smt/params/theory_str_params.cpp |  4 ++
 src/smt/params/theory_str_params.h   | 30 ++++++++-
 src/smt/theory_str.cpp               | 94 ++++++++++++++++++++++++++--
 src/smt/theory_str.h                 |  9 +++
 5 files changed, 136 insertions(+), 5 deletions(-)

diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg
index 7ab0d52f2..1f691d587 100644
--- a/src/smt/params/smt_params_helper.pyg
+++ b/src/smt/params/smt_params_helper.pyg
@@ -81,6 +81,10 @@ def_module_params(module_name='smt',
                           ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'),
                           ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'),
                           ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'),
+                          ('str.regex_automata_intersection_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex intersection heuristics'),
+                          ('str.regex_automata_failed_automaton_threshold', UINT, 10, 'number of failed automaton construction attempts after which a full automaton is automatically built'),
+                          ('str.regex_automata_failed_intersection_threshold', UINT, 10, 'number of failed automaton intersection attempts after which intersection is always computed'),
+                          ('str.regex_automata_length_attempt_threshold', UINT, 10, 'number of length/path constraint attempts before checking unsatisfiability of regex terms'),
                           ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'),
                           ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'),
                           ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'),
diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp
index fa6862161..92478bcd9 100644
--- a/src/smt/params/theory_str_params.cpp
+++ b/src/smt/params/theory_str_params.cpp
@@ -33,4 +33,8 @@ void theory_str_params::updt_params(params_ref const & _p) {
     m_OverlapTheoryAwarePriority = p.str_overlap_priority();
     m_RegexAutomata = p.str_regex_automata();
     m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold();
+    m_RegexAutomata_IntersectionDifficultyThreshold = p.str_regex_automata_intersection_difficulty_threshold();
+    m_RegexAutomata_FailedAutomatonThreshold = p.str_regex_automata_failed_automaton_threshold();
+    m_RegexAutomata_FailedIntersectionThreshold = p.str_regex_automata_failed_intersection_threshold();
+    m_RegexAutomata_LengthAttemptThreshold = p.str_regex_automata_length_attempt_threshold();
 }
diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h
index 48c0205ee..8c7816839 100644
--- a/src/smt/params/theory_str_params.h
+++ b/src/smt/params/theory_str_params.h
@@ -93,6 +93,30 @@ struct theory_str_params {
      */
     unsigned m_RegexAutomata_DifficultyThreshold;
 
+    /*
+     * RegexAutomata_IntersectionDifficultyThreshold is the lowest difficulty above which Z3str3
+     * will not eagerly intersect automata to check unsatisfiability.
+     */
+    unsigned m_RegexAutomata_IntersectionDifficultyThreshold;
+
+    /*
+     * RegexAutomata_FailedAutomatonThreshold is the number of failed attempts to build an automaton
+     * after which a full automaton (i.e. with no length information) will be built regardless of difficulty.
+     */
+    unsigned m_RegexAutomata_FailedAutomatonThreshold;
+
+    /*
+     * RegexAutomaton_FailedIntersectionThreshold is the number of failed attempts to perform automaton
+     * intersection after which intersection will always be performed regardless of difficulty.
+     */
+    unsigned m_RegexAutomata_FailedIntersectionThreshold;
+
+    /*
+     * RegexAutomaton_LengthAttemptThreshold is the number of attempts to satisfy length/path constraints
+     * before which we begin checking unsatisfiability of a regex term.
+     */
+    unsigned m_RegexAutomata_LengthAttemptThreshold;
+
     theory_str_params(params_ref const & p = params_ref()):
         m_StrongArrangements(true),
         m_AggressiveLengthTesting(false),
@@ -106,7 +130,11 @@ struct theory_str_params {
         m_BinarySearchInitialUpperBound(64),
         m_OverlapTheoryAwarePriority(-0.1),
         m_RegexAutomata(true),
-        m_RegexAutomata_DifficultyThreshold(1000)
+        m_RegexAutomata_DifficultyThreshold(1000),
+        m_RegexAutomata_IntersectionDifficultyThreshold(1000),
+        m_RegexAutomata_FailedAutomatonThreshold(10),
+        m_RegexAutomata_FailedIntersectionThreshold(10),
+        m_RegexAutomata_LengthAttemptThreshold(10)
     {
         updt_params(p);
     }
diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 5910d9d95..6e4211f7b 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6480,6 +6480,26 @@ namespace smt {
         }
     }
 
+    void theory_str::regex_inc_counter(obj_map<expr, unsigned> & counter_map, expr * key) {
+        unsigned old_v;
+        if (counter_map.find(key, old_v)) {
+            unsigned new_v = old_v += 1;
+            counter_map.insert(key, new_v);
+        } else {
+            counter_map.insert(key, 1);
+        }
+    }
+
+    unsigned theory_str::regex_get_counter(obj_map<expr, unsigned> & counter_map, expr * key) {
+        unsigned v;
+        if (counter_map.find(key, v)) {
+            return v;
+        } else {
+            counter_map.insert(key, 0);
+            return 0;
+        }
+    }
+
     // saturating unsigned addition
     unsigned inline _qadd(unsigned a, unsigned b) {
         if (a == UINT_MAX || b == UINT_MAX) {
@@ -9515,6 +9535,7 @@ namespace smt {
 
         // regex automata
         if (m_params.m_RegexAutomata) {
+            // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here
             bool regex_axiom_add = false;
             for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
                 expr * str_in_re = *it;
@@ -9579,11 +9600,16 @@ namespace smt {
                         regex_terms_with_path_constraints.insert(str_in_re);
                         m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
                         regex_axiom_add = true;
+
+                        // increment LengthAttemptCount
+                        regex_inc_counter(regex_length_attempt_count, str_in_re);
                         continue;
                     } else {
                         // no automata available, or else all bounds assumptions are invalid
                         unsigned expected_complexity = estimate_regex_complexity(re);
-                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
+                            CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
+                                    tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
                             eautomaton * aut = m_mk_aut(re);
                             aut->compress();
                             regex_automata.push_back(aut);
@@ -9596,8 +9622,7 @@ namespace smt {
                             regex_axiom_add = true;
                             // TODO immediately attempt to learn lower/upper bound info here
                         } else {
-                            // TODO increment failure count
-                            NOT_IMPLEMENTED_YET();
+                            regex_inc_counter(regex_fail_count, str_in_re);
                         }
                         continue;
                     }
@@ -9852,7 +9877,68 @@ namespace smt {
                 }
 
                 // NOT_IMPLEMENTED_YET();
-            }
+            } // foreach (entry in regex_terms)
+
+            for (obj_map<expr, ptr_vector<expr> >::iterator it = regex_terms_by_string.begin();
+                    it != regex_terms_by_string.end(); ++it) {
+                // TODO do we need to check equivalence classes of strings here?
+
+                expr * str = it->m_key;
+                ptr_vector<expr> str_in_re_terms = it->m_value;
+
+                svector<regex_automaton_under_assumptions> intersect_constraints;
+                // we may find empty intersection before checking every constraint;
+                // this vector keeps track of which ones actually take part in intersection
+                svector<regex_automaton_under_assumptions> used_intersect_constraints;
+
+                // choose an automaton/assumption for each assigned (str.in.re)
+                // that's consistent with the current length information
+                NOT_IMPLEMENTED_YET();
+
+                eautomaton * aut_inter = NULL;
+                CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;);
+                for (svector<regex_automaton_under_assumptions>::iterator aut_it = intersect_constraints.begin();
+                        aut_it != intersect_constraints.end(); ++aut_it) {
+                    regex_automaton_under_assumptions aut = *aut_it;
+                    if (aut_inter == NULL) {
+                        // start somewhere
+                        aut_inter = aut.get_automaton();
+                        used_intersect_constraints.push_back(aut);
+                        continue;
+                    }
+
+                    if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) {
+                        unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty
+                        if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold
+                                || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) {
+                            lbool current_assignment = ctx.get_assignment(aut.get_regex_term());
+                            // if the assignment is consistent with our assumption, use the automaton directly;
+                            // otherwise, complement it (and save that automaton for next time)
+                            // TODO we should cache these intermediate results
+                            // TODO do we need to push the intermediates into a vector for deletion anyway?
+                            if ( (current_assignment == l_true && aut.get_polarity())
+                                    || (current_assignment == l_false && !aut.get_polarity())) {
+                                aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton());
+                            } else {
+                                // need to complement first
+                                NOT_IMPLEMENTED_YET();
+                            }
+                            used_intersect_constraints.push_back(aut);
+                            if (aut_inter->is_empty()) {
+                                break;
+                            }
+                        } else {
+                            // failed intersection
+                            regex_inc_counter(regex_intersection_fail_count, aut.get_regex_term());
+                        }
+                    }
+                } // foreach(entry in intersect_constraints)
+                if (aut_inter->is_empty()) {
+                    TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
+                    NOT_IMPLEMENTED_YET();
+                }
+            } // foreach (entry in regex_terms_by_string)
+
             if (regex_axiom_add) {
                 return FC_CONTINUE;
             }
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index c64eec838..45df675fb 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -221,6 +221,7 @@ public:
     }
 
     eautomaton * get_automaton() const { return aut; }
+    expr * get_regex_term() const { return str_in_re; }
     bool get_polarity() const { return polarity; }
 
     virtual ~regex_automaton_under_assumptions() {
@@ -409,6 +410,12 @@ protected:
     std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
     obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
 
+    // each counter maps a (str.in.re) expression to an integer.
+    // use helper functions regex_inc_counter() and regex_get_counter() to access
+    obj_map<expr, unsigned> regex_length_attempt_count;
+    obj_map<expr, unsigned> regex_fail_count;
+    obj_map<expr, unsigned> regex_intersection_fail_count;
+
     svector<char> char_set;
     std::map<char, int>  charSetLookupTable;
     int           charSetSize;
@@ -542,6 +549,8 @@ protected:
     expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);
     void aut_path_add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
     expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var);
+    void regex_inc_counter(obj_map<expr, unsigned> & counter_map, expr * key);
+    unsigned regex_get_counter(obj_map<expr, unsigned> & counter_map, expr * key);
 
     void set_up_axioms(expr * ex);
     void handle_equality(expr * lhs, expr * rhs);

From 6f889ab699b4115f92530040840d2fd01489717e Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 15 Jan 2018 14:08:15 -0500
Subject: [PATCH 10/36] intersection WIP; fix polarity of generated path
 constraint LHS

---
 src/smt/theory_str.cpp | 74 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 6e4211f7b..27757a53e 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6982,6 +6982,9 @@ namespace smt {
             pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
         }
 
+        // TODO(mtrberzi) possibly modify this to reuse character terms over the same string,
+        // i.e. across different constraints over S the same variables S_0, S_1, ... are always used and refreshed
+
         // modification of code in seq_rewriter::mk_str_in_regexp()
         expr_ref_vector trail(m);
         u_map<expr*> maps[2];
@@ -9536,6 +9539,7 @@ namespace smt {
         // regex automata
         if (m_params.m_RegexAutomata) {
             // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here
+            // TODO learning of linear length constraints in the style of length automata, if possible?
             bool regex_axiom_add = false;
             for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
                 expr * str_in_re = *it;
@@ -9584,16 +9588,22 @@ namespace smt {
                         TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
 
                         expr_ref_vector lhs_terms(m);
-                        lhs_terms.push_back(str_in_re);
+                        if (current_assignment == l_true) {
+                            lhs_terms.push_back(str_in_re);
+                        } else {
+                            lhs_terms.push_back(m.mk_not(str_in_re));
+                        }
                         lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
                         expr_ref lhs(mk_and(lhs_terms), m);
                         // If the automaton was built with the same polarity as the constraint,
                         // assert directly. Otherwise, negate the path constraint
                         if ( (current_assignment == l_true && assumption.get_polarity())
                                 || (current_assignment == l_false && !assumption.get_polarity())) {
+                            TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
                             expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
                             assert_implication(lhs, rhs);
                         } else {
+                            TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
                             expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
                             assert_implication(lhs, rhs);
                         }
@@ -9893,7 +9903,64 @@ namespace smt {
 
                 // choose an automaton/assumption for each assigned (str.in.re)
                 // that's consistent with the current length information
-                NOT_IMPLEMENTED_YET();
+                for (ptr_vector<expr>::iterator term_it = str_in_re_terms.begin();
+                        term_it != str_in_re_terms.end(); ++term_it) {
+                    expr * _unused;
+                    expr * re;
+                    SASSERT(u.str.is_in_re(*term_it));
+                    u.str.is_in_re(*term_it, _unused, re);
+
+                    rational exact_len;
+                    bool has_exact_len = get_len_value(str, exact_len);
+
+                    rational lb, ub;
+                    bool has_lower_bound = lower_bound(mk_strlen(str), lb);
+                    bool has_upper_bound = upper_bound(mk_strlen(str), ub);
+
+                    if (regex_automaton_assumptions.contains(re) &&
+                                                    !regex_automaton_assumptions[re].empty()){
+                        for (svector<regex_automaton_under_assumptions>::iterator aut_it = regex_automaton_assumptions[re].begin();
+                                                           aut_it != regex_automaton_assumptions[re].end(); ++aut_it) {
+                            regex_automaton_under_assumptions aut = *aut_it;
+                            rational aut_ub;
+                            bool assume_ub = aut.get_upper_bound(aut_ub);
+                            rational aut_lb;
+                            bool assume_lb = aut.get_lower_bound(aut_lb);
+                            bool consistent = true;
+
+                            if (assume_ub) {
+                                // check consistency of assumed upper bound
+                                if (has_exact_len) {
+                                    if (exact_len > aut_ub) {
+                                        consistent = false;
+                                    }
+                                } else {
+                                    if (has_upper_bound && ub > aut_ub) {
+                                        consistent = false;
+                                    }
+                                }
+                            }
+
+                            if (assume_lb) {
+                                // check consistency of assumed lower bound
+                                if (has_exact_len) {
+                                    if (exact_len < aut_lb) {
+                                        consistent = false;
+                                    }
+                                } else {
+                                    if (has_lower_bound && lb < aut_lb) {
+                                        consistent = false;
+                                    }
+                                }
+                            }
+
+                            if (consistent) {
+                                intersect_constraints.push_back(aut);
+                                break;
+                            }
+                        }
+                    }
+                } // foreach(term in str_in_re_terms)
 
                 eautomaton * aut_inter = NULL;
                 CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;);
@@ -9933,7 +10000,8 @@ namespace smt {
                         }
                     }
                 } // foreach(entry in intersect_constraints)
-                if (aut_inter->is_empty()) {
+                TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
+                if (aut_inter != NULL && aut_inter->is_empty()) {
                     TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
                     NOT_IMPLEMENTED_YET();
                 }

From 058d24fd096ec30b0f55fe84afa56c9213b045e7 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 15 Jan 2018 14:30:12 -0500
Subject: [PATCH 11/36] reuse regex character constraints for the same string

---
 src/smt/theory_str.cpp | 43 +++++++++++++++++++++++++++++++++---------
 src/smt/theory_str.h   |  1 +
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 27757a53e..1fc04c77f 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6974,16 +6974,41 @@ namespace smt {
 
         expr_ref_vector pathChars(m);
         expr_ref_vector pathChars_len_constraints(m);
-        for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
-            std::stringstream ss;
-            ss << "ch" << i;
-            expr_ref ch(mk_str_var(ss.str()), m);
-            pathChars.push_back(ch);
-            pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
-        }
 
-        // TODO(mtrberzi) possibly modify this to reuse character terms over the same string,
-        // i.e. across different constraints over S the same variables S_0, S_1, ... are always used and refreshed
+        // reuse character terms over the same string
+        if (string_chars.contains(stringTerm)) {
+            // find out whether we have enough characters already
+            ptr_vector<expr> old_chars;
+            string_chars.find(stringTerm, old_chars);
+            if (old_chars.size() < lenVal.get_unsigned()) {
+                for (unsigned i = old_chars.size(); i < lenVal.get_unsigned(); ++i) {
+                    std::stringstream ss;
+                    ss << "ch" << i;
+                    expr_ref ch(mk_str_var(ss.str()), m);
+                    m_trail.push_back(ch);
+                    old_chars.push_back(ch);
+                }
+            }
+            string_chars.insert(stringTerm, old_chars);
+            // now we're guaranteed to have at least the right number of characters in old_chars
+            for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
+                expr_ref ch(old_chars.get(i), m);
+                refresh_theory_var(ch);
+                pathChars.push_back(ch);
+                pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
+            }
+        } else {
+            ptr_vector<expr> new_chars;
+            for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) {
+                std::stringstream ss;
+                ss << "ch" << i;
+                expr_ref ch(mk_str_var(ss.str()), m);
+                pathChars.push_back(ch);
+                pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true)));
+                new_chars.push_back(ch);
+            }
+            string_chars.insert(stringTerm, new_chars);
+        }
 
         // modification of code in seq_rewriter::mk_str_in_regexp()
         expr_ref_vector trail(m);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 45df675fb..86929d4d3 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -415,6 +415,7 @@ protected:
     obj_map<expr, unsigned> regex_length_attempt_count;
     obj_map<expr, unsigned> regex_fail_count;
     obj_map<expr, unsigned> regex_intersection_fail_count;
+    obj_map<expr, ptr_vector<expr> > string_chars; // S --> [S_0, S_1, ...] for character terms S_i
 
     svector<char> char_set;
     std::map<char, int>  charSetLookupTable;

From 191cc30e2a6116b1f3c03c05495bb52da762275a Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 15 Jan 2018 15:30:12 -0500
Subject: [PATCH 12/36] intersection of regex constraints produces a conflict
 clause

---
 src/smt/theory_str.cpp | 56 +++++++++++++++++++++++++++++++++++++-----
 src/smt/theory_str.h   | 10 ++++----
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 1fc04c77f..4c36b0f45 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9637,7 +9637,13 @@ namespace smt {
                         regex_axiom_add = true;
 
                         // increment LengthAttemptCount
-                        regex_inc_counter(regex_length_attempt_count, str_in_re);
+                        regex_inc_counter(regex_length_attempt_count, re);
+
+                        {
+                            unsigned v = regex_get_counter(regex_length_attempt_count, re);
+                            TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
+                        }
+
                         continue;
                     } else {
                         // no automata available, or else all bounds assumptions are invalid
@@ -9910,8 +9916,6 @@ namespace smt {
                         }
                     }
                 }
-
-                // NOT_IMPLEMENTED_YET();
             } // foreach (entry in regex_terms)
 
             for (obj_map<expr, ptr_vector<expr> >::iterator it = regex_terms_by_string.begin();
@@ -9999,11 +10003,19 @@ namespace smt {
                         continue;
                     }
 
+                    {
+                        unsigned v = regex_get_counter(regex_length_attempt_count, aut.get_regex_term());
+                        TRACE("str", tout << "length attempt count of " << mk_pp(aut.get_regex_term(), m) << " is " << v
+                                << ", threshold is " << m_params.m_RegexAutomata_LengthAttemptThreshold << std::endl;);
+                    }
+
                     if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) {
                         unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty
                         if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold
                                 || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) {
-                            lbool current_assignment = ctx.get_assignment(aut.get_regex_term());
+
+                            expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
+                            lbool current_assignment = ctx.get_assignment(str_in_re_term);
                             // if the assignment is consistent with our assumption, use the automaton directly;
                             // otherwise, complement it (and save that automaton for next time)
                             // TODO we should cache these intermediate results
@@ -10013,7 +10025,12 @@ namespace smt {
                                 aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton());
                             } else {
                                 // need to complement first
-                                NOT_IMPLEMENTED_YET();
+                                expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m);
+                                eautomaton * aut_c = m_mk_aut(rc);
+                                regex_automata.push_back(aut_c);
+                                // TODO is there any way to build a complement automaton from an existing one?
+                                // this discards length information
+                                aut_inter = m_mk_aut.mk_product(aut_inter, aut_c);
                             }
                             used_intersect_constraints.push_back(aut);
                             if (aut_inter->is_empty()) {
@@ -10028,7 +10045,34 @@ namespace smt {
                 TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
                 if (aut_inter != NULL && aut_inter->is_empty()) {
                     TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
-                    NOT_IMPLEMENTED_YET();
+                    expr_ref_vector conflict_terms(m);
+
+                    for (svector<regex_automaton_under_assumptions>::iterator aut_it = used_intersect_constraints.begin();
+                            aut_it != used_intersect_constraints.end(); ++aut_it) {
+                        regex_automaton_under_assumptions aut = *aut_it;
+                        expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
+                        lbool current_assignment = ctx.get_assignment(str_in_re_term);
+                        if (current_assignment == l_true) {
+                            conflict_terms.push_back(str_in_re_term);
+                        } else if (current_assignment == l_false) {
+                            conflict_terms.push_back(m.mk_not(str_in_re_term));
+                        }
+                        // add length assumptions, if any
+                        rational ub;
+                        if (aut.get_upper_bound(ub)) {
+                            expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
+                            conflict_terms.push_back(ub_term);
+                        }
+                        rational lb;
+                        if (aut.get_lower_bound(lb)) {
+                            expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
+                            conflict_terms.push_back(lb_term);
+                        }
+                    }
+
+                    expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m);
+                    assert_axiom(conflict_clause);
+                    regex_axiom_add = true;
                 }
             } // foreach (entry in regex_terms_by_string)
 
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 86929d4d3..acab8e019 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -168,7 +168,7 @@ public:
 
 class regex_automaton_under_assumptions {
 protected:
-    expr * str_in_re;
+    expr * re_term;
     eautomaton * aut;
     bool polarity;
 
@@ -179,11 +179,11 @@ protected:
     rational upper_bound;
 public:
     regex_automaton_under_assumptions() :
-        str_in_re(NULL), aut(NULL), polarity(false),
+        re_term(NULL), aut(NULL), polarity(false),
         assume_lower_bound(false), assume_upper_bound(false) {}
 
-    regex_automaton_under_assumptions(expr * str_in_re, eautomaton * aut, bool polarity) :
-        str_in_re(str_in_re), aut(aut), polarity(polarity),
+    regex_automaton_under_assumptions(expr * re_term, eautomaton * aut, bool polarity) :
+        re_term(re_term), aut(aut), polarity(polarity),
         assume_lower_bound(false), assume_upper_bound(false) {}
 
     void set_lower_bound(rational & lb) {
@@ -221,7 +221,7 @@ public:
     }
 
     eautomaton * get_automaton() const { return aut; }
-    expr * get_regex_term() const { return str_in_re; }
+    expr * get_regex_term() const { return re_term; }
     bool get_polarity() const { return polarity; }
 
     virtual ~regex_automaton_under_assumptions() {

From 6c4c9a10e4be2ae1baf646580f635a831f319b02 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Tue, 16 Jan 2018 13:16:31 -0500
Subject: [PATCH 13/36] regex length linearity check WIP

---
 src/smt/theory_str.cpp | 34 ++++++++++++++++++++++++++++++++++
 src/smt/theory_str.h   |  2 ++
 2 files changed, 36 insertions(+)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 4c36b0f45..c19b17aec 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6604,6 +6604,40 @@ namespace smt {
             return 1;
         }
     }
+
+    // Check whether a regex translates well to a linear set of length constraints.
+    bool theory_str::check_regex_length_linearity(expr * re) {
+        return check_regex_length_linearity_helper(re, false);
+    }
+
+    bool theory_str::check_regex_length_linearity_helper(expr * re, bool already_star) {
+        expr * sub1;
+        expr * sub2;
+        if (u.re.is_to_re(re)) {
+            return true;
+        } else if (u.re.is_concat(re, sub1, sub2)) {
+            return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star);
+        } else if (u.re.is_union(re, sub1, sub2)) {
+            return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star);
+        } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
+            if (already_star) {
+                return false;
+            } else {
+                return check_regex_length_linearity_helper(sub1, true);
+            }
+        } else if (u.re.is_range(re)) {
+            return true;
+        } else if (u.re.is_full(re)) {
+            return true;
+        } else if (u.re.is_complement(re)) {
+            // TODO can we do better?
+            return false;
+        } else {
+            TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
+            UNREACHABLE(); return false;
+        }
+    }
+
     /*
      * Infer all length constraints implied by the given regular expression `re`
      * in order to constrain `lenVar` (which must be of sort Int).
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index acab8e019..26a6acf8d 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -544,6 +544,8 @@ protected:
     // regex automata and length-aware regex
     unsigned estimate_regex_complexity(expr * re);
     unsigned estimate_regex_complexity_under_complement(expr * re);
+    bool check_regex_length_linearity(expr * re);
+    bool check_regex_length_linearity_helper(expr * re, bool already_star);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);

From 153701eabe3562fd169d4bc863ebb1b5827fe842 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Tue, 16 Jan 2018 13:56:01 -0500
Subject: [PATCH 14/36] regex length term assertion WIP

---
 src/smt/theory_str.cpp | 49 ++++++++++++++++++++++++++++++++++++++++--
 src/smt/theory_str.h   |  3 +++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index c19b17aec..59b3b7753 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6735,8 +6735,6 @@ namespace smt {
     bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) {
         ENSURE(aut != NULL);
 
-        ast_manager & m = get_manager();
-
         if (aut->final_states().size() < 1) {
             // no solutions at all
             refined_lower_bound = rational::minus_one();
@@ -9610,6 +9608,53 @@ namespace smt {
                 if (current_assignment == l_undef) {
                     continue;
                 }
+
+                if (!regex_terms_with_length_constraints.contains(str_in_re)) {
+                    if (current_assignment == l_true && check_regex_length_linearity(re)) {
+                        TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;);
+
+                        if (regex_term_to_length_constraint.contains(str_in_re)) {
+                            // use existing length constraint
+                            expr * top_level_length_constraint;
+                            regex_term_to_length_constraint.find(str_in_re, top_level_length_constraint);
+
+                            ptr_vector<expr> extra_length_vars;
+                            regex_term_to_extra_length_vars.find(str_in_re, extra_length_vars);
+
+                            assert_axiom(top_level_length_constraint);
+                            for(ptr_vector<expr>::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) {
+                                expr * v = *it;
+                                refresh_theory_var(v);
+                                expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m);
+                                assert_axiom(len_constraint);
+                            }
+                        } else {
+                            // generate new length constraint
+                            expr_ref_vector extra_length_vars(m);
+                            expr_ref top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars);
+                            TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;);
+                            // assert and track length constraint
+                            assert_axiom(top_level_length_constraint);
+                            for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) {
+                                expr * v = *it;
+                                expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m);
+                                assert_axiom(len_constraint);
+                            }
+
+                            regex_term_to_length_constraint.insert(str_in_re, top_level_length_constraint);
+                            ptr_vector<expr> vtmp;
+                            for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) {
+                                vtmp.push_back(*it);
+                            }
+                            regex_term_to_extra_length_vars.insert(str_in_re, vtmp);
+                        }
+
+                        regex_terms_with_length_constraints.insert(str_in_re);
+                        m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_length_constraints, str_in_re));
+                        regex_axiom_add = true;
+                    }
+                } // re not in regex_terms_with_length_constraints
+
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {
                     TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 26a6acf8d..0d56211cd 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -409,6 +409,9 @@ protected:
     obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
     std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
     obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
+    obj_hashtable<expr> regex_terms_with_length_constraints; // set of regex terms which had had length constraints asserted in the current scope
+    obj_map<expr, expr*> regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R)
+    obj_map<expr, ptr_vector<expr> > regex_term_to_extra_length_vars; // extra length vars used in regex_term_to_length_constraint entries
 
     // each counter maps a (str.in.re) expression to an integer.
     // use helper functions regex_inc_counter() and regex_get_counter() to access

From e5585ecf4c65e1c7427339a6a86998afad450220 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Tue, 16 Jan 2018 18:15:29 -0500
Subject: [PATCH 15/36] regex fail count and automaton fallback

---
 src/smt/theory_str.cpp | 29 +++++++++++++++++++++--------
 src/smt/theory_str.h   |  1 +
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 59b3b7753..474dd9c39 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9854,8 +9854,11 @@ namespace smt {
                         } else {
                             // TODO check negation?
                             // TODO construct a partial automaton for R to the given upper bound?
-                            // TODO increment failure count if we can't
-                            NOT_IMPLEMENTED_YET();
+                            if (false) {
+
+                            } else {
+                                regex_inc_counter(regex_fail_count, str_in_re);
+                            }
                         }
                         continue;
                     }
@@ -9866,8 +9869,8 @@ namespace smt {
                     }
                 } else { // !upper_bound_exists
                     // no upper bound information
-                    if (lower_bound_exists) {
-                        // lower bound, no upper bound
+                    if (lower_bound_exists && !lower_bound_value.is_zero()) {
+                        // nonzero lower bound, no upper bound
 
                         // check current assumptions
                         if (regex_automaton_assumptions.contains(re) &&
@@ -9967,7 +9970,11 @@ namespace smt {
                                 // TODO check negation?
                                 // TODO construct a partial automaton for R to the given lower bound?
                                 // TODO increment failure count
-                                NOT_IMPLEMENTED_YET();
+                                if (false) {
+
+                                } else {
+                                    regex_inc_counter(regex_fail_count, str_in_re);
+                                }
                             }
                             continue;
                         }
@@ -9976,10 +9983,12 @@ namespace smt {
                         // check for existing automata;
                         // try to construct an automaton if we don't have one yet
                         // and doing so without bounds is not difficult
-                        NOT_IMPLEMENTED_YET();
-                        if (true) {
+                        bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty());
+                        bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
+                        if (!existingAutomata || failureThresholdExceeded) {
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold
+                                    || failureThresholdExceeded) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -9991,7 +10000,11 @@ namespace smt {
                                 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                                 regex_axiom_add = true;
                                 // TODO immediately attempt to learn lower/upper bound info here
+                            } else {
+                                regex_inc_counter(regex_fail_count, str_in_re);
                             }
+                        } else {
+                            regex_inc_counter(regex_fail_count, str_in_re);
                         }
                     }
                 }
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 0d56211cd..7398f599b 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -418,6 +418,7 @@ protected:
     obj_map<expr, unsigned> regex_length_attempt_count;
     obj_map<expr, unsigned> regex_fail_count;
     obj_map<expr, unsigned> regex_intersection_fail_count;
+
     obj_map<expr, ptr_vector<expr> > string_chars; // S --> [S_0, S_1, ...] for character terms S_i
 
     svector<char> char_set;

From 26ab91a4480f60a0c8ecd7a6d5cac8bc502eecc9 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 17 Jan 2018 13:02:32 -0500
Subject: [PATCH 16/36] check duplicate bounds info for regex terms

---
 src/smt/theory_str.cpp | 32 ++++++++++++++++++++++++++++++--
 src/smt/theory_str.h   |  5 +++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 474dd9c39..3dccc5681 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9755,7 +9755,32 @@ namespace smt {
                 CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;);
                 CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;);
 
-                if (upper_bound_exists) {
+                bool new_lower_bound_info = true;
+                bool new_upper_bound_info = true;
+                // check last seen lower/upper bound to avoid performing duplicate work
+                if (regex_last_lower_bound.contains(str)) {
+                    rational last_lb_value;
+                    regex_last_lower_bound.find(str, last_lb_value);
+                    if (last_lb_value == lower_bound_value) {
+                        new_lower_bound_info = false;
+                    }
+                }
+                if (regex_last_upper_bound.contains(str)) {
+                    rational last_ub_value;
+                    regex_last_upper_bound.find(str, last_ub_value);
+                    if (last_ub_value == upper_bound_value) {
+                        new_upper_bound_info = false;
+                    }
+                }
+
+                if (new_lower_bound_info) {
+                    regex_last_lower_bound.insert(str, lower_bound_value);
+                }
+                if (new_upper_bound_info) {
+                    regex_last_upper_bound.insert(str, upper_bound_value);
+                }
+
+                if (upper_bound_exists && new_upper_bound_info) {
                     // check current assumptions
                     if (regex_automaton_assumptions.contains(re) &&
                             !regex_automaton_assumptions[re].empty()){
@@ -9869,7 +9894,7 @@ namespace smt {
                     }
                 } else { // !upper_bound_exists
                     // no upper bound information
-                    if (lower_bound_exists && !lower_bound_value.is_zero()) {
+                    if (lower_bound_exists && !lower_bound_value.is_zero() && new_lower_bound_info) {
                         // nonzero lower bound, no upper bound
 
                         // check current assumptions
@@ -9930,10 +9955,13 @@ namespace smt {
                                         rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)));
                                     } else {
                                         // If there are solutions at and above the lower bound, add an additional bound.
+                                        // DISABLED as this is causing non-termination in the integer solver. --mtrberzi
+                                        /*
                                         rhs.push_back(m.mk_or(
                                                 ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)),
                                                 m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))
                                         ));
+                                        */
                                     }
                                 } else {
                                     if (refined_lower_bound.is_minus_one()) {
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 7398f599b..cf3958ceb 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -413,6 +413,11 @@ protected:
     obj_map<expr, expr*> regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R)
     obj_map<expr, ptr_vector<expr> > regex_term_to_extra_length_vars; // extra length vars used in regex_term_to_length_constraint entries
 
+    // keep track of the last lower/upper bound we saw for each string term
+    // so we don't perform duplicate work
+    obj_map<expr, rational> regex_last_lower_bound;
+    obj_map<expr, rational> regex_last_upper_bound;
+
     // each counter maps a (str.in.re) expression to an integer.
     // use helper functions regex_inc_counter() and regex_get_counter() to access
     obj_map<expr, unsigned> regex_length_attempt_count;

From c0ed683882c77143539050149cb1e6ef27cbd735 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 17 Jan 2018 13:32:44 -0500
Subject: [PATCH 17/36] disable regex length constraint generation as it
 currently makes unsound axioms

---
 src/smt/theory_str.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 3dccc5681..9aec6fab7 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9609,6 +9609,8 @@ namespace smt {
                     continue;
                 }
 
+                // DISABLED due to bug -- star-over-union (and possibly others) results in unsound constraints
+                /*
                 if (!regex_terms_with_length_constraints.contains(str_in_re)) {
                     if (current_assignment == l_true && check_regex_length_linearity(re)) {
                         TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;);
@@ -9654,6 +9656,7 @@ namespace smt {
                         regex_axiom_add = true;
                     }
                 } // re not in regex_terms_with_length_constraints
+                */
 
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {

From c2b268c64555199e9134948474899e87eb7b9707 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 17 Jan 2018 18:26:31 -0500
Subject: [PATCH 18/36] short path for length-0 regex terms

---
 src/smt/theory_str.cpp | 324 ++++++++++++++++++++++++++---------------
 src/smt/theory_str.h   |   1 +
 2 files changed, 206 insertions(+), 119 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 9aec6fab7..e25dc84f3 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6646,6 +6646,9 @@ namespace smt {
      * In some cases, the returned formula requires one or more free integer variables to be created.
      * These variables are returned in the reference parameter `freeVariables`.
      * Extra assertions should be made for these free variables constraining them to be non-negative.
+     *
+     * TODO: star unrolling?
+     * TODO: generate stars "correctly" as a linear combination of all possible subterm lengths
      */
     expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
         ENSURE(u.is_re(re));
@@ -6725,6 +6728,59 @@ namespace smt {
         }
     }
 
+    /*
+     * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding
+     * to the automaton `aut`.
+     * This asserts a constraint of the form:
+     *   str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub
+     * where the upper bound clause is omitted if the upper bound doesn't exist
+     * and the equality with 0 is based on whether solutions of length 0 are allowed.
+     */
+    void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) {
+        ENSURE(aut != NULL);
+        context & ctx = get_context();
+        ast_manager & m = get_manager();
+
+        expr_ref_vector rhs(m);
+        expr * str;
+        expr * re;
+        u.str.is_in_re(str_in_re, str, re);
+        expr_ref strlen(mk_strlen(str), m);
+
+        // lower bound first
+        rational nonzero_lower_bound;
+        bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound);
+        if (zero_sol_exists) {
+            regex_last_lower_bound.insert(str, rational::zero());
+            // solution at 0
+            if (!nonzero_lower_bound.is_minus_one()) {
+                expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m);
+                expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
+                rhs.push_back(m.mk_or(rhs1, rhs2));
+            } else {
+                // shouldn't happen
+                UNREACHABLE();
+            }
+        } else {
+            // no solution at 0
+            if (!nonzero_lower_bound.is_minus_one()) {
+                regex_last_lower_bound.insert(str, nonzero_lower_bound);
+                expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
+                rhs.push_back(rhs2);
+            } else {
+                // shouldn't happen
+                UNREACHABLE();
+            }
+        }
+        // TODO upper bound check
+
+        if (!rhs.empty()) {
+            expr_ref lhs(str_in_re, m);
+            expr_ref _rhs(mk_and(rhs), m);
+            assert_implication(lhs, _rhs);
+        }
+    }
+
     /*
      * Refine the lower bound on the length of a solution to a given automaton.
      * The method returns TRUE if a solution of length `current_lower_bound` exists,
@@ -9596,7 +9652,6 @@ namespace smt {
         // regex automata
         if (m_params.m_RegexAutomata) {
             // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here
-            // TODO learning of linear length constraints in the style of length automata, if possible?
             bool regex_axiom_add = false;
             for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
                 expr * str_in_re = *it;
@@ -9667,89 +9722,96 @@ namespace smt {
                         continue;
                     }
 
-                    // find a consistent automaton for this term
-                    bool found = false;
-                    regex_automaton_under_assumptions assumption;
-                    if (regex_automaton_assumptions.contains(re) &&
-                                                !regex_automaton_assumptions[re].empty()){
-                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
-                                it != regex_automaton_assumptions[re].end(); ++it) {
-                            regex_automaton_under_assumptions autA = *it;
-                            rational assumed_upper_bound, assumed_lower_bound;
-                            bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
-                            bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
-                            if (!assumes_upper_bound && !assumes_lower_bound) {
-                                // automaton with no assumptions is always usable
-                                assumption = autA;
-                                found = true;
-                                break;
-                            }
-                            // check consistency of bounds assumptions
-                        } // foreach(a in regex_automaton_assumptions)
-                    }
-                    if (found) {
-                        expr_ref pathConstraint(m);
-                        expr_ref characterConstraints(m);
-                        pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
-                        TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
-                        TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
-
-                        expr_ref_vector lhs_terms(m);
-                        if (current_assignment == l_true) {
-                            lhs_terms.push_back(str_in_re);
-                        } else {
-                            lhs_terms.push_back(m.mk_not(str_in_re));
-                        }
-                        lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
-                        expr_ref lhs(mk_and(lhs_terms), m);
-                        // If the automaton was built with the same polarity as the constraint,
-                        // assert directly. Otherwise, negate the path constraint
-                        if ( (current_assignment == l_true && assumption.get_polarity())
-                                || (current_assignment == l_false && !assumption.get_polarity())) {
-                            TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
-                            expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
-                            assert_implication(lhs, rhs);
-                        } else {
-                            TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
-                            expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
-                            assert_implication(lhs, rhs);
-                        }
-                        regex_terms_with_path_constraints.insert(str_in_re);
-                        m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
-                        regex_axiom_add = true;
-
-                        // increment LengthAttemptCount
-                        regex_inc_counter(regex_length_attempt_count, re);
-
-                        {
-                            unsigned v = regex_get_counter(regex_length_attempt_count, re);
-                            TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
-                        }
-
-                        continue;
+                    if (exact_length_value.is_zero()) {
+                        // shortcut
+                        expr_ref lhs(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
+                        expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m);
+                        assert_implication(lhs, rhs);
                     } else {
-                        // no automata available, or else all bounds assumptions are invalid
-                        unsigned expected_complexity = estimate_regex_complexity(re);
-                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
-                            CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
-                                    tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
-                            eautomaton * aut = m_mk_aut(re);
-                            aut->compress();
-                            regex_automata.push_back(aut);
-                            regex_automaton_under_assumptions new_aut(re, aut, true);
-                            if (!regex_automaton_assumptions.contains(re)) {
-                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
-                            }
-                            regex_automaton_assumptions[re].push_back(new_aut);
-                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
-                            regex_axiom_add = true;
-                            // TODO immediately attempt to learn lower/upper bound info here
-                        } else {
-                            regex_inc_counter(regex_fail_count, str_in_re);
+                        // find a consistent automaton for this term
+                        bool found = false;
+                        regex_automaton_under_assumptions assumption;
+                        if (regex_automaton_assumptions.contains(re) &&
+                                !regex_automaton_assumptions[re].empty()){
+                            for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                    it != regex_automaton_assumptions[re].end(); ++it) {
+                                regex_automaton_under_assumptions autA = *it;
+                                rational assumed_upper_bound, assumed_lower_bound;
+                                bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
+                                bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
+                                if (!assumes_upper_bound && !assumes_lower_bound) {
+                                    // automaton with no assumptions is always usable
+                                    assumption = autA;
+                                    found = true;
+                                    break;
+                                }
+                                // check consistency of bounds assumptions
+                            } // foreach(a in regex_automaton_assumptions)
                         }
-                        continue;
-                    }
-                }
+                        if (found) {
+                            expr_ref pathConstraint(m);
+                            expr_ref characterConstraints(m);
+                            pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
+                            TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
+                            TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
+
+                            expr_ref_vector lhs_terms(m);
+                            if (current_assignment == l_true) {
+                                lhs_terms.push_back(str_in_re);
+                            } else {
+                                lhs_terms.push_back(m.mk_not(str_in_re));
+                            }
+                            lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                            expr_ref lhs(mk_and(lhs_terms), m);
+                            // If the automaton was built with the same polarity as the constraint,
+                            // assert directly. Otherwise, negate the path constraint
+                            if ( (current_assignment == l_true && assumption.get_polarity())
+                                    || (current_assignment == l_false && !assumption.get_polarity())) {
+                                TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
+                                expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
+                                assert_implication(lhs, rhs);
+                            } else {
+                                TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
+                                expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
+                                assert_implication(lhs, rhs);
+                            }
+                            regex_terms_with_path_constraints.insert(str_in_re);
+                            m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                            regex_axiom_add = true;
+
+                            // increment LengthAttemptCount
+                            regex_inc_counter(regex_length_attempt_count, re);
+
+                            {
+                                unsigned v = regex_get_counter(regex_length_attempt_count, re);
+                                TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
+                            }
+
+                            continue;
+                        } else {
+                            // no automata available, or else all bounds assumptions are invalid
+                            unsigned expected_complexity = estimate_regex_complexity(re);
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
+                                CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
+                                        tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
+                                eautomaton * aut = m_mk_aut(re);
+                                aut->compress();
+                                regex_automata.push_back(aut);
+                                regex_automaton_under_assumptions new_aut(re, aut, true);
+                                if (!regex_automaton_assumptions.contains(re)) {
+                                    regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                                }
+                                regex_automaton_assumptions[re].push_back(new_aut);
+                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                                regex_axiom_add = true;
+                                find_automaton_initial_bounds(str_in_re, aut);
+                            } else {
+                                regex_inc_counter(regex_fail_count, str_in_re);
+                            }
+                            continue;
+                        }
+                    } // !length is zero
+                } // get_len_value()
                 expr_ref str_len(mk_strlen(str), m);
                 rational lower_bound_value;
                 rational upper_bound_value;
@@ -9858,10 +9920,12 @@ namespace smt {
                                 }
                             }
 
-                            expr_ref lhs_terms(mk_and(lhs), m);
-                            expr_ref rhs_terms(mk_and(rhs), m);
-                            assert_implication(lhs_terms, rhs_terms);
-                            regex_axiom_add = true;
+                            if (!rhs.empty()) {
+                                expr_ref lhs_terms(mk_and(lhs), m);
+                                expr_ref rhs_terms(mk_and(rhs), m);
+                                assert_implication(lhs_terms, rhs_terms);
+                                regex_axiom_add = true;
+                            }
                         }
                     } else {
                         // no existing automata/assumptions.
@@ -9878,7 +9942,7 @@ namespace smt {
                             regex_automaton_assumptions[re].push_back(new_aut);
                             TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                             regex_axiom_add = true;
-                            // TODO immediately attempt to learn lower/upper bound info here
+                            find_automaton_initial_bounds(str_in_re, aut);
                         } else {
                             // TODO check negation?
                             // TODO construct a partial automaton for R to the given upper bound?
@@ -9976,10 +10040,12 @@ namespace smt {
                                     }
                                 }
 
-                                expr_ref lhs_terms(mk_and(lhs), m);
-                                expr_ref rhs_terms(mk_and(rhs), m);
-                                assert_implication(lhs_terms, rhs_terms);
-                                regex_axiom_add = true;
+                                if (!rhs.empty()) {
+                                    expr_ref lhs_terms(mk_and(lhs), m);
+                                    expr_ref rhs_terms(mk_and(rhs), m);
+                                    assert_implication(lhs_terms, rhs_terms);
+                                    regex_axiom_add = true;
+                                }
                             }
                         } else {
                             // no existing automata/assumptions.
@@ -9996,7 +10062,7 @@ namespace smt {
                                 regex_automaton_assumptions[re].push_back(new_aut);
                                 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                                 regex_axiom_add = true;
-                                // TODO immediately attempt to learn lower/upper bound info here
+                                find_automaton_initial_bounds(str_in_re, aut);
                             } else {
                                 // TODO check negation?
                                 // TODO construct a partial automaton for R to the given lower bound?
@@ -10016,7 +10082,7 @@ namespace smt {
                         // and doing so without bounds is not difficult
                         bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty());
                         bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
-                        if (!existingAutomata || failureThresholdExceeded) {
+                        if (!existingAutomata) {
                             unsigned expected_complexity = estimate_regex_complexity(re);
                             if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold
                                     || failureThresholdExceeded) {
@@ -10030,7 +10096,7 @@ namespace smt {
                                 regex_automaton_assumptions[re].push_back(new_aut);
                                 TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                                 regex_axiom_add = true;
-                                // TODO immediately attempt to learn lower/upper bound info here
+                                find_automaton_initial_bounds(str_in_re, aut);
                             } else {
                                 regex_inc_counter(regex_fail_count, str_in_re);
                             }
@@ -10165,42 +10231,62 @@ namespace smt {
                         }
                     }
                 } // foreach(entry in intersect_constraints)
+                aut_inter->compress();
                 TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
-                if (aut_inter != NULL && aut_inter->is_empty()) {
-                    TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
-                    expr_ref_vector conflict_terms(m);
 
-                    for (svector<regex_automaton_under_assumptions>::iterator aut_it = used_intersect_constraints.begin();
-                            aut_it != used_intersect_constraints.end(); ++aut_it) {
-                        regex_automaton_under_assumptions aut = *aut_it;
-                        expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
-                        lbool current_assignment = ctx.get_assignment(str_in_re_term);
-                        if (current_assignment == l_true) {
-                            conflict_terms.push_back(str_in_re_term);
-                        } else if (current_assignment == l_false) {
-                            conflict_terms.push_back(m.mk_not(str_in_re_term));
-                        }
-                        // add length assumptions, if any
-                        rational ub;
-                        if (aut.get_upper_bound(ub)) {
-                            expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
-                            conflict_terms.push_back(ub_term);
-                        }
-                        rational lb;
-                        if (aut.get_lower_bound(lb)) {
-                            expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
-                            conflict_terms.push_back(lb_term);
+                expr_ref_vector conflict_terms(m);
+                expr_ref conflict_lhs(m);
+                for (svector<regex_automaton_under_assumptions>::iterator aut_it = used_intersect_constraints.begin();
+                        aut_it != used_intersect_constraints.end(); ++aut_it) {
+                    regex_automaton_under_assumptions aut = *aut_it;
+                    expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
+                    lbool current_assignment = ctx.get_assignment(str_in_re_term);
+                    if (current_assignment == l_true) {
+                        conflict_terms.push_back(str_in_re_term);
+                    } else if (current_assignment == l_false) {
+                        conflict_terms.push_back(m.mk_not(str_in_re_term));
+                    }
+                    // add length assumptions, if any
+                    rational ub;
+                    if (aut.get_upper_bound(ub)) {
+                        expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
+                        conflict_terms.push_back(ub_term);
+                    }
+                    rational lb;
+                    if (aut.get_lower_bound(lb)) {
+                        expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
+                        conflict_terms.push_back(lb_term);
+                    }
+                }
+                conflict_lhs = mk_and(conflict_terms);
+
+                if (used_intersect_constraints.size() > 1 && aut_inter != NULL) {
+                    // check whether the intersection is only the empty string
+                    unsigned initial_state = aut_inter->init();
+                    if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) {
+                        // initial state is final and it is the only final state
+                        // if there are no moves from the initial state,
+                        // the only solution is the empty string
+                        if (aut_inter->get_moves_from(initial_state).empty()) {
+                            TRACE("str", tout << "product automaton only accepts empty string" << std::endl;);
+                            expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m);
+                            expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
+                            expr_ref rhs(m.mk_and(rhs1, rhs2), m);
+                            assert_implication(conflict_lhs, rhs);
+                            regex_axiom_add = true;
                         }
                     }
+                }
 
+                if (aut_inter != NULL && aut_inter->is_empty()) {
+                    TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
                     expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m);
                     assert_axiom(conflict_clause);
                     regex_axiom_add = true;
                 }
             } // foreach (entry in regex_terms_by_string)
-
             if (regex_axiom_add) {
-                return FC_CONTINUE;
+                //return FC_CONTINUE;
             }
         } // RegexAutomata
 
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index cf3958ceb..4e790e838 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -556,6 +556,7 @@ protected:
     bool check_regex_length_linearity(expr * re);
     bool check_regex_length_linearity_helper(expr * re, bool already_star);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
     expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);

From dbb15f65b5d6deba9cefbd4f9298a3e8fe4c6fe5 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 17 Jan 2018 19:26:42 -0500
Subject: [PATCH 19/36] correct generation of linear length constraints for
 regex star terms

---
 src/smt/theory_str.cpp | 97 ++++++++++++++++++++++++++++++++++++++++--
 src/smt/theory_str.h   |  3 ++
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index e25dc84f3..c4e6a9471 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6638,6 +6638,68 @@ namespace smt {
         }
     }
 
+    // note: returns an empty set `lens` if something went wrong
+    void theory_str::check_subterm_lengths(expr * re, integer_set & lens) {
+        expr * sub1;
+        expr * sub2;
+        if (u.re.is_to_re(re, sub1)) {
+            SASSERT(u.str.is_string(sub1));
+            zstring(str);
+            u.str.is_string(sub1, str);
+            lens.insert(str.length());
+        } else if (u.re.is_concat(re, sub1, sub2)) {
+            integer_set lens_1, lens_2;
+            check_subterm_lengths(sub1, lens_1);
+            check_subterm_lengths(sub2, lens_2);
+            if (lens_1.empty() || lens_2.empty()) {
+                lens.reset();
+            } else {
+                // take all pairwise lengths
+                for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) {
+                    for(integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) {
+                        int l1 = *it1;
+                        int l2 = *it2;
+                        lens.insert(l1 + l2);
+                    }
+                }
+            }
+        } else if (u.re.is_union(re, sub1, sub2)) {
+            integer_set lens_1, lens_2;
+            check_subterm_lengths(sub1, lens_1);
+            check_subterm_lengths(sub2, lens_2);
+            if (lens_1.empty() || lens_2.empty()) {
+                lens.reset();
+            } else {
+                // take all possibilities from either side
+                for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) {
+                    lens.insert(*it1);
+                }
+                for (integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) {
+                    lens.insert(*it2);
+                }
+            }
+        } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
+            // this is bad -- term generation requires this not to appear
+            lens.reset();
+        } else if (u.re.is_range(re, sub1, sub2)) {
+            SASSERT(u.str.is_string(sub1));
+            SASSERT(u.str.is_string(sub2));
+            zstring str1, str2;
+            u.str.is_string(sub1, str1);
+            u.str.is_string(sub2, str2);
+            SASSERT(str1.length() == 1);
+            SASSERT(str2.length() == 1);
+            lens.insert(1);
+        } else if (u.re.is_full(re)) {
+            lens.insert(1);
+        } else if (u.re.is_complement(re)) {
+            lens.reset();
+        } else {
+            TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
+            lens.reset();
+        }
+    }
+
     /*
      * Infer all length constraints implied by the given regular expression `re`
      * in order to constrain `lenVar` (which must be of sort Int).
@@ -6648,7 +6710,6 @@ namespace smt {
      * Extra assertions should be made for these free variables constraining them to be non-negative.
      *
      * TODO: star unrolling?
-     * TODO: generate stars "correctly" as a linear combination of all possible subterm lengths
      */
     expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
         ENSURE(u.is_re(re));
@@ -6682,6 +6743,9 @@ namespace smt {
             expr_ref retval(mk_and(finalResult), m);
             return retval;
         } else if (u.re.is_star(re, sub1)) {
+            // stars are generated as a linear combination of all possible subterm lengths;
+            // this requires that there are no stars under this one
+            /*
             expr * v = mk_int_var("rlen");
             expr * n = mk_int_var("rstar");
             freeVariables.push_back(v);
@@ -6692,7 +6756,32 @@ namespace smt {
             finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n)));
             expr_ref retval(mk_and(finalResult), m);
             return retval;
+            */
+            integer_set subterm_lens;
+            check_subterm_lengths(sub1, subterm_lens);
+            if (subterm_lens.empty()) {
+                // somehow generation was impossible
+                expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
+                return retval;
+            } else {
+                TRACE("str", tout << "subterm lengths:";
+                for(integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) {
+                    tout << " " << *it;
+                }
+                tout << std::endl;);
+                expr_ref_vector sum_terms(m);
+                for (integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) {
+                    rational lenOption(*it);
+                    expr * n = mk_int_var("rstar");
+                    freeVariables.push_back(n);
+                    expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m);
+                    sum_terms.push_back(term);
+                }
+                expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m);
+                return retval;
+            }
         } else if (u.re.is_plus(re, sub1)) {
+            /*
             expr * v = mk_int_var("rlen");
             expr * n = mk_int_var("rstar");
             freeVariables.push_back(v);
@@ -6704,6 +6793,9 @@ namespace smt {
             finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true)));
             expr_ref retval(mk_and(finalResult), m);
             return retval;
+            */
+            // TODO this should really be done as a sub-case of star
+            NOT_IMPLEMENTED_YET();
         } else if (u.re.is_range(re, sub1, sub2)) {
             SASSERT(u.str.is_string(sub1));
             SASSERT(u.str.is_string(sub2));
@@ -9664,8 +9756,6 @@ namespace smt {
                     continue;
                 }
 
-                // DISABLED due to bug -- star-over-union (and possibly others) results in unsound constraints
-                /*
                 if (!regex_terms_with_length_constraints.contains(str_in_re)) {
                     if (current_assignment == l_true && check_regex_length_linearity(re)) {
                         TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;);
@@ -9711,7 +9801,6 @@ namespace smt {
                         regex_axiom_add = true;
                     }
                 } // re not in regex_terms_with_length_constraints
-                */
 
                 rational exact_length_value;
                 if (get_len_value(str, exact_length_value)) {
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index 4e790e838..ea4449b13 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -20,6 +20,7 @@
 #include "util/trail.h"
 #include "util/union_find.h"
 #include "util/scoped_ptr_vector.h"
+#include "util/hashtable.h"
 #include "ast/ast_pp.h"
 #include "ast/arith_decl_plugin.h"
 #include "ast/rewriter/th_rewriter.h"
@@ -37,6 +38,7 @@
 namespace smt {
 
 typedef hashtable<symbol, symbol_hash_proc, symbol_eq_proc> symbol_set;
+typedef int_hashtable<int_hash, default_eq<int> > integer_set;
 
 class str_value_factory : public value_factory {
     seq_util u;
@@ -556,6 +558,7 @@ protected:
     bool check_regex_length_linearity(expr * re);
     bool check_regex_length_linearity_helper(expr * re, bool already_star);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    void check_subterm_lengths(expr * re, integer_set & lens);
     void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut);
     bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
     bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);

From 5727950a3ced8c32a3a7d19680cd0dc5717fee7e Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 18 Jan 2018 17:52:55 -0500
Subject: [PATCH 20/36] zero-length automaton solution fix

---
 src/smt/theory_str.cpp | 148 ++++++++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 48 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index c4e6a9471..1f84b5faf 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9811,33 +9811,85 @@ namespace smt {
                         continue;
                     }
 
-                    if (exact_length_value.is_zero()) {
-                        // shortcut
-                        expr_ref lhs(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
-                        expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m);
-                        assert_implication(lhs, rhs);
-                    } else {
-                        // find a consistent automaton for this term
-                        bool found = false;
-                        regex_automaton_under_assumptions assumption;
-                        if (regex_automaton_assumptions.contains(re) &&
-                                !regex_automaton_assumptions[re].empty()){
-                            for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
-                                    it != regex_automaton_assumptions[re].end(); ++it) {
-                                regex_automaton_under_assumptions autA = *it;
-                                rational assumed_upper_bound, assumed_lower_bound;
-                                bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
-                                bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
-                                if (!assumes_upper_bound && !assumes_lower_bound) {
-                                    // automaton with no assumptions is always usable
-                                    assumption = autA;
-                                    found = true;
-                                    break;
+
+                    // find a consistent automaton for this term
+                    bool found = false;
+                    regex_automaton_under_assumptions assumption;
+                    if (regex_automaton_assumptions.contains(re) &&
+                            !regex_automaton_assumptions[re].empty()){
+                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                it != regex_automaton_assumptions[re].end(); ++it) {
+                            regex_automaton_under_assumptions autA = *it;
+                            rational assumed_upper_bound, assumed_lower_bound;
+                            bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
+                            bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
+                            if (!assumes_upper_bound && !assumes_lower_bound) {
+                                // automaton with no assumptions is always usable
+                                assumption = autA;
+                                found = true;
+                                break;
+                            }
+                            // check consistency of bounds assumptions
+                        } // foreach(a in regex_automaton_assumptions)
+                    }
+                    if (found) {
+                        if (exact_length_value.is_zero()) {
+                            // check consistency of 0-length solution with automaton
+                            eautomaton * aut = assumption.get_automaton();
+                            bool zero_solution = false;
+                            unsigned initial_state = aut->init();
+                            if (aut->is_final_state(initial_state)) {
+                                zero_solution = true;
+                            } else {
+                                unsigned_vector eps_states;
+                                aut->get_epsilon_closure(initial_state, eps_states);
+                                for (unsigned_vector::iterator it = eps_states.begin(); it != eps_states.end(); ++it) {
+                                    unsigned state = *it;
+                                    if (aut->is_final_state(state)) {
+                                        zero_solution = true;
+                                        break;
+                                    }
                                 }
-                                // check consistency of bounds assumptions
-                            } // foreach(a in regex_automaton_assumptions)
-                        }
-                        if (found) {
+                            }
+
+                            // now check polarity of automaton wrt. original term
+                            if ( (current_assignment == l_true && !assumption.get_polarity())
+                                    || (current_assignment == l_false && assumption.get_polarity())) {
+                                // invert sense
+                                NOT_IMPLEMENTED_YET();
+                            }
+
+                            if (zero_solution) {
+                                TRACE("str", tout << "zero-length solution OK -- asserting empty path constraint" << std::endl;);
+                                expr_ref_vector lhs_terms(m);
+                                if (current_assignment == l_true) {
+                                    lhs_terms.push_back(str_in_re);
+                                } else {
+                                    lhs_terms.push_back(m.mk_not(str_in_re));
+                                }
+                                lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                                expr_ref lhs(mk_and(lhs_terms), m);
+                                expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m);
+                                assert_implication(lhs, rhs);
+                                regex_terms_with_path_constraints.insert(str_in_re);
+                                m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                            } else {
+                                TRACE("str", tout << "zero-length solution not admitted by this automaton -- asserting conflict clause" << std::endl;);
+                                expr_ref_vector lhs_terms(m);
+                                if (current_assignment == l_true) {
+                                    lhs_terms.push_back(str_in_re);
+                                } else {
+                                    lhs_terms.push_back(m.mk_not(str_in_re));
+                                }
+                                lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                                expr_ref lhs(mk_and(lhs_terms), m);
+                                expr_ref conflict(m.mk_not(lhs), m);
+                                assert_axiom(conflict);
+                            }
+                            regex_axiom_add = true;
+                            regex_inc_counter(regex_length_attempt_count, re);
+                            continue;
+                        } else {
                             expr_ref pathConstraint(m);
                             expr_ref characterConstraints(m);
                             pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
@@ -9876,30 +9928,30 @@ namespace smt {
                                 TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
                             }
 
-                            continue;
-                        } else {
-                            // no automata available, or else all bounds assumptions are invalid
-                            unsigned expected_complexity = estimate_regex_complexity(re);
-                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
-                                CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
-                                        tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
-                                eautomaton * aut = m_mk_aut(re);
-                                aut->compress();
-                                regex_automata.push_back(aut);
-                                regex_automaton_under_assumptions new_aut(re, aut, true);
-                                if (!regex_automaton_assumptions.contains(re)) {
-                                    regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
-                                }
-                                regex_automaton_assumptions[re].push_back(new_aut);
-                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
-                                regex_axiom_add = true;
-                                find_automaton_initial_bounds(str_in_re, aut);
-                            } else {
-                                regex_inc_counter(regex_fail_count, str_in_re);
-                            }
                             continue;
                         }
-                    } // !length is zero
+                    } else {
+                        // no automata available, or else all bounds assumptions are invalid
+                        unsigned expected_complexity = estimate_regex_complexity(re);
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
+                            CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
+                                    tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
+                            eautomaton * aut = m_mk_aut(re);
+                            aut->compress();
+                            regex_automata.push_back(aut);
+                            regex_automaton_under_assumptions new_aut(re, aut, true);
+                            if (!regex_automaton_assumptions.contains(re)) {
+                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                            }
+                            regex_automaton_assumptions[re].push_back(new_aut);
+                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                            regex_axiom_add = true;
+                            find_automaton_initial_bounds(str_in_re, aut);
+                        } else {
+                            regex_inc_counter(regex_fail_count, str_in_re);
+                        }
+                        continue;
+                    }
                 } // get_len_value()
                 expr_ref str_len(mk_strlen(str), m);
                 rational lower_bound_value;

From a9fda81d0350fa586f9ef12c10309adab65847b0 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 18 Jan 2018 17:53:42 -0500
Subject: [PATCH 21/36] check polarity

---
 src/smt/theory_str.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 1f84b5faf..dceb17a06 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9856,7 +9856,7 @@ namespace smt {
                             if ( (current_assignment == l_true && !assumption.get_polarity())
                                     || (current_assignment == l_false && assumption.get_polarity())) {
                                 // invert sense
-                                NOT_IMPLEMENTED_YET();
+                                zero_solution = !zero_solution;
                             }
 
                             if (zero_solution) {

From 2065ea88ee3000c24d3d3fc2fade98c899f261d6 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 19 Jan 2018 14:56:06 -0500
Subject: [PATCH 22/36] fix null pointer dereference

---
 src/smt/theory_str.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index dceb17a06..5d66a4ec3 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -10372,7 +10372,9 @@ namespace smt {
                         }
                     }
                 } // foreach(entry in intersect_constraints)
-                aut_inter->compress();
+                if (aut_inter != NULL) {
+                    aut_inter->compress();
+                }
                 TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
 
                 expr_ref_vector conflict_terms(m);

From d9d3ef78d2a4c8e643e33cdfaf5ae3b30c1c2a4d Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 19 Jan 2018 16:14:56 -0500
Subject: [PATCH 23/36] temporarily disable final check progress checking it is
 interfering with regex automata solving

---
 src/smt/theory_str.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 5d66a4ec3..933796d9c 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -35,7 +35,7 @@ namespace smt {
         m_params(params),
         /* Options */
         opt_EagerStringConstantLengthAssertions(true),
-        opt_VerifyFinalCheckProgress(true),
+        opt_VerifyFinalCheckProgress(false),
         opt_LCMUnrollStep(2),
         opt_NoQuickReturn_IntegerTheory(false),
         opt_DisableIntegerTheoryIntegration(false),

From d648f95f633626ac66b29c9ff1322ca995065e5d Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 24 Jan 2018 21:25:45 -0500
Subject: [PATCH 24/36] fix setup of path constraints when the path constraint
 is False

---
 src/smt/theory_str.cpp | 49 +++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 933796d9c..680500229 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9904,21 +9904,44 @@ namespace smt {
                             }
                             lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
                             expr_ref lhs(mk_and(lhs_terms), m);
-                            // If the automaton was built with the same polarity as the constraint,
-                            // assert directly. Otherwise, negate the path constraint
-                            if ( (current_assignment == l_true && assumption.get_polarity())
-                                    || (current_assignment == l_false && !assumption.get_polarity())) {
-                                TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
-                                expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
-                                assert_implication(lhs, rhs);
+
+                            // If the path constraint comes out as "false", this means there are no paths of that length
+                            // in the automaton. If the polarity is the same, we can assert a conflict clause.
+                            // If the polarity is opposite, we ignore the path constraint.
+
+                            if (m.is_false(pathConstraint)) {
+                                if ( (current_assignment == l_true && assumption.get_polarity())
+                                        || (current_assignment == l_false && !assumption.get_polarity())) {
+                                    // automaton and constraint have same polarity -- assert conflict clause
+                                    TRACE("str", tout << "path constraint is false with matching polarity; asserting conflict clause" << std::endl;);
+                                    expr_ref conflict(m.mk_not(mk_and(lhs_terms)), m);
+                                    assert_axiom(conflict);
+                                    // don't set up "regex_terms_with_path_constraints" as a conflict clause is not a path constraint
+                                } else {
+                                    // automaton and constraint have opposite polarity -- ignore path constraint
+                                    TRACE("str", tout << "path constraint is false with opposite polarity; ignoring path constraint" << std::endl;);
+                                    assert_implication(lhs, characterConstraints);
+                                    regex_terms_with_path_constraints.insert(str_in_re);
+                                    m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                                }
+                                regex_axiom_add = true;
                             } else {
-                                TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
-                                expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
-                                assert_implication(lhs, rhs);
+                                // If the automaton was built with the same polarity as the constraint,
+                                // assert directly. Otherwise, negate the path constraint
+                                if ( (current_assignment == l_true && assumption.get_polarity())
+                                        || (current_assignment == l_false && !assumption.get_polarity())) {
+                                    TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
+                                    expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
+                                    assert_implication(lhs, rhs);
+                                } else {
+                                    TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
+                                    expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
+                                    assert_implication(lhs, rhs);
+                                }
+                                regex_terms_with_path_constraints.insert(str_in_re);
+                                m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                                regex_axiom_add = true;
                             }
-                            regex_terms_with_path_constraints.insert(str_in_re);
-                            m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
-                            regex_axiom_add = true;
 
                             // increment LengthAttemptCount
                             regex_inc_counter(regex_length_attempt_count, re);

From 8d5065d35d310b0b2e1d1950797c983df4f4f8a4 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Wed, 24 Jan 2018 22:02:00 -0500
Subject: [PATCH 25/36] fix constant eqc bug in mk_concat

---
 src/smt/theory_str.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 680500229..a9eac7dd4 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -709,6 +709,12 @@ namespace smt {
         bool n2HasEqcValue = false;
         expr * v1 = get_eqc_value(n1, n1HasEqcValue);
         expr * v2 = get_eqc_value(n2, n2HasEqcValue);
+        if (u.str.is_string(v1)) {
+            n1HasEqcValue = true;
+        }
+        if (u.str.is_string(v2)) {
+            n2HasEqcValue = true;
+        }
         if (n1HasEqcValue && n2HasEqcValue) {
             zstring n1_str;
             u.str.is_string(v1, n1_str);

From 852e0e0892718dbbf5b9faf7544f93a6a0e210f4 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 25 Jan 2018 15:25:36 -0500
Subject: [PATCH 26/36] fix regex difficulty overflow bug; fix final check on
 regex terms that don't get path constraints

---
 src/smt/theory_str.cpp | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index a9eac7dd4..d30d406b2 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6523,6 +6523,9 @@ namespace smt {
         if (a == UINT_MAX || b == UINT_MAX) {
             return UINT_MAX;
         }
+        if (a == 0 || b == 0) {
+            return 0;
+        }
         unsigned result = a * b;
         if (result < a || result < b) {
             return UINT_MAX;
@@ -9817,7 +9820,6 @@ namespace smt {
                         continue;
                     }
 
-
                     // find a consistent automaton for this term
                     bool found = false;
                     regex_automaton_under_assumptions assumption;
@@ -9835,7 +9837,7 @@ namespace smt {
                                 found = true;
                                 break;
                             }
-                            // check consistency of bounds assumptions
+                            // TODO check consistency of bounds assumptions
                         } // foreach(a in regex_automaton_assumptions)
                     }
                     if (found) {
@@ -10101,7 +10103,8 @@ namespace smt {
                         // no existing automata/assumptions.
                         // if it's easy to construct a full automaton for R, do so
                         unsigned expected_complexity = estimate_regex_complexity(re);
-                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                        bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
+                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) {
                             eautomaton * aut = m_mk_aut(re);
                             aut->compress();
                             regex_automata.push_back(aut);
@@ -10221,7 +10224,8 @@ namespace smt {
                             // no existing automata/assumptions.
                             // if it's easy to construct a full automaton for R, do so
                             unsigned expected_complexity = estimate_regex_complexity(re);
-                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) {
+                            bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) {
                                 eautomaton * aut = m_mk_aut(re);
                                 aut->compress();
                                 regex_automata.push_back(aut);
@@ -10518,6 +10522,27 @@ namespace smt {
                 return FC_CONTINUE;
             }
 
+            // We must be be 100% certain that if there are any regex constraints,
+            // the string assignment for each variable is consistent with the automaton.
+            // The (probably) easiest way to do this is to ensure
+            // that we have path constraints set up for every assigned regex term.
+            if (m_params.m_RegexAutomata && !regex_terms.empty()) {
+                for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
+                    expr * str_in_re = *it;
+                    expr * str;
+                    expr * re;
+                    u.str.is_in_re(str_in_re, str, re);
+                    lbool current_assignment = ctx.get_assignment(str_in_re);
+                    if (current_assignment == l_undef) {
+                        continue;
+                    }
+                    if (!regex_terms_with_path_constraints.contains(str_in_re)) {
+                        TRACE("str", tout << "assigned regex term " << mk_pp(str_in_re, m) << " has no path constraints -- continuing search" << std::endl;);
+                        return FC_CONTINUE;
+                    }
+                } // foreach (str.in.re in regex_terms)
+            }
+
             if (unused_internal_variables.empty()) {
                 TRACE("str", tout << "All variables are assigned. Done!" << std::endl;);
                 return FC_DONE;

From 5c3f35dc4407fb0d4d86fdf9247b7d4401de26ed Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 25 Jan 2018 15:52:57 -0500
Subject: [PATCH 27/36] always rewrite regex length constraints as they are
 sometimes malformed

---
 src/smt/theory_str.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index d30d406b2..9de0375b9 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9787,7 +9787,10 @@ namespace smt {
                         } else {
                             // generate new length constraint
                             expr_ref_vector extra_length_vars(m);
-                            expr_ref top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars);
+                            expr_ref _top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars);
+                            expr_ref top_level_length_constraint(_top_level_length_constraint, m);
+                            th_rewriter rw(m);
+                            rw(top_level_length_constraint);
                             TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;);
                             // assert and track length constraint
                             assert_axiom(top_level_length_constraint);

From c01dda4e313f3fd72a5fa8453e112be6a79ad3ef Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 25 Jan 2018 16:11:31 -0500
Subject: [PATCH 28/36] experimental str.to.int fix

---
 src/smt/theory_str.cpp | 50 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 9de0375b9..2cbade1d5 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -9371,7 +9371,7 @@ namespace smt {
         bool Ival_exists = get_arith_value(a, Ival);
         if (Ival_exists) {
             TRACE("str", tout << "integer theory assigns " << mk_pp(a, m) << " = " << Ival.to_string() << std::endl;);
-            // if that value is not -1, we can assert (str.to-int S) = Ival --> S = "Ival"
+            // if that value is not -1, we can assert (str.to.int S) = Ival --> S = "Ival"
             if (!Ival.is_minus_one()) {
                 zstring Ival_str(Ival.to_string().c_str());
                 expr_ref premise(ctx.mk_eq_atom(a, m_autil.mk_numeral(Ival, true)), m);
@@ -9393,6 +9393,54 @@ namespace smt {
             // NOT_IMPLEMENTED_YET();
         }
 
+        bool S_hasEqcValue;
+        expr * S_str = get_eqc_value(S, S_hasEqcValue);
+        if (S_hasEqcValue) {
+            zstring str;
+            u.str.is_string(S_str, str);
+            bool valid = true;
+            rational convertedRepresentation(0);
+            rational ten(10);
+            if (str.length() == 0) {
+                valid = false;
+            } else {
+                for (unsigned i = 0; i < str.length(); ++i) {
+                    if (!('0' <= str[i] && str[i] <= '9')) {
+                        valid = false;
+                        break;
+                    } else {
+                        // accumulate
+                        char digit = (int)str[i];
+                        std::string sDigit(1, digit);
+                        int val = atoi(sDigit.c_str());
+                        convertedRepresentation = (ten * convertedRepresentation) + rational(val);
+                    }
+                }
+            }
+            // TODO this duplicates code a bit, we can simplify the branch on "conclusion" only
+            if (valid) {
+                expr_ref premise(ctx.mk_eq_atom(S, mk_string(str)), m);
+                expr_ref conclusion(ctx.mk_eq_atom(a, m_autil.mk_numeral(convertedRepresentation, true)), m);
+                expr_ref axiom(rewrite_implication(premise, conclusion), m);
+                if (!string_int_axioms.contains(axiom)) {
+                    string_int_axioms.insert(axiom);
+                    assert_axiom(axiom);
+                    m_trail_stack.push(insert_obj_trail<theory_str, expr>(string_int_axioms, axiom));
+                    axiomAdd = true;
+                }
+            } else {
+                expr_ref premise(ctx.mk_eq_atom(S, mk_string(str)), m);
+                expr_ref conclusion(ctx.mk_eq_atom(a, m_autil.mk_numeral(rational::minus_one(), true)), m);
+                expr_ref axiom(rewrite_implication(premise, conclusion), m);
+                if (!string_int_axioms.contains(axiom)) {
+                    string_int_axioms.insert(axiom);
+                    assert_axiom(axiom);
+                    m_trail_stack.push(insert_obj_trail<theory_str, expr>(string_int_axioms, axiom));
+                    axiomAdd = true;
+                }
+            }
+        }
+
         return axiomAdd;
     }
 

From 1ee5ce96b8745ec9fa1ea37e2a0dbb2482a373dc Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 26 Jan 2018 14:52:18 -0500
Subject: [PATCH 29/36] use regex instead of head/tail split for string-integer
 conversion; check sort of refreshed vars; add intersection difficulty
 estimation

---
 src/smt/theory_str.cpp | 26 +++++++++++++++++---------
 src/smt/theory_str.h   |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 2cbade1d5..5bc0590e6 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -287,10 +287,13 @@ namespace smt {
     }
 
     void theory_str::refresh_theory_var(expr * e) {
+        ast_manager & m = get_manager();
         enode * en = ensure_enode(e);
         theory_var v = mk_var(en); (void)v;
         TRACE("str", tout << "refresh " << mk_pp(e, get_manager()) << ": v#" << v << std::endl;);
-        m_basicstr_axiom_todo.push_back(en);
+        if (m.get_sort(e) == u.str.mk_string_sort()) {
+            m_basicstr_axiom_todo.push_back(en);
+        }
     }
 
     theory_var theory_str::mk_var(enode* n) {
@@ -1628,12 +1631,11 @@ namespace smt {
 
         {
             expr_ref premise(m_autil.mk_ge(ex, m_autil.mk_numeral(rational::one(), true)), m);
-            expr_ref hd(mk_str_var("hd"), m);
-            expr_ref tl(mk_str_var("tl"), m);
-            expr_ref conclusion1(ctx.mk_eq_atom(S, mk_concat(hd, tl)), m);
-            expr_ref conclusion2(ctx.mk_eq_atom(mk_strlen(hd), m_autil.mk_numeral(rational::one(), true)), m);
-            expr_ref conclusion3(mk_not(m, ctx.mk_eq_atom(hd, mk_string("0"))), m);
-            expr_ref conclusion(m.mk_and(conclusion1, conclusion2, conclusion3), m);
+            // S >= 1 --> S in [1-9][0-9]*
+            expr_ref re_positiveInteger(u.re.mk_concat(
+                    u.re.mk_range(mk_string("1"), mk_string("9")),
+                    u.re.mk_star(u.re.mk_range(mk_string("0"), mk_string("9")))), m);
+            expr_ref conclusion(mk_RegexIn(S, re_positiveInteger), m);
             SASSERT(premise);
             SASSERT(conclusion);
             assert_implication(premise, conclusion);
@@ -6614,6 +6616,12 @@ namespace smt {
         }
     }
 
+    unsigned theory_str::estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2) {
+        ENSURE(aut1 != NULL);
+        ENSURE(aut2 != NULL);
+        return _qmul(aut1->num_states(), aut2->num_states());
+    }
+
     // Check whether a regex translates well to a linear set of length constraints.
     bool theory_str::check_regex_length_linearity(expr * re) {
         return check_regex_length_linearity_helper(re, false);
@@ -10291,7 +10299,6 @@ namespace smt {
                             } else {
                                 // TODO check negation?
                                 // TODO construct a partial automaton for R to the given lower bound?
-                                // TODO increment failure count
                                 if (false) {
 
                                 } else {
@@ -10424,7 +10431,8 @@ namespace smt {
                     }
 
                     if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) {
-                        unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty
+                        unsigned intersectionDifficulty = estimate_automata_intersection_difficulty(aut_inter, aut.get_automaton());
+                        TRACE("str", tout << "intersection difficulty is " << intersectionDifficulty << std::endl;);
                         if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold
                                 || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) {
 
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index ea4449b13..1330f9904 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -555,6 +555,7 @@ protected:
     // regex automata and length-aware regex
     unsigned estimate_regex_complexity(expr * re);
     unsigned estimate_regex_complexity_under_complement(expr * re);
+    unsigned estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2);
     bool check_regex_length_linearity(expr * re);
     bool check_regex_length_linearity_helper(expr * re, bool already_star);
     expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);

From 762129d4c7df6563cca451f3d33210cc094bae3d Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Mon, 12 Feb 2018 17:45:07 -0500
Subject: [PATCH 30/36] fixups to theory_str for regex

---
 src/smt/theory_str.cpp | 18 +++++++++++++-----
 src/smt/theory_str.h   |  1 +
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 8157ceea9..e68a7fa8a 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6571,7 +6571,7 @@ namespace smt {
             SASSERT(str1.length() == 1);
             SASSERT(str2.length() == 1);
             return 1 + str2[0] - str1[0];
-        } else if (u.re.is_full(re)) {
+        } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) {
             return 1;
         } else {
             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
@@ -6613,7 +6613,7 @@ namespace smt {
             SASSERT(str1.length() == 1);
             SASSERT(str2.length() == 1);
             return 1 + str2[0] - str1[0];
-        } else if (u.re.is_full(re)) {
+        } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) {
             return 1;
         } else {
             TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;);
@@ -6649,8 +6649,10 @@ namespace smt {
             }
         } else if (u.re.is_range(re)) {
             return true;
-        } else if (u.re.is_full(re)) {
+        } else if (u.re.is_full_char(re)) {
             return true;
+        } else if (u.re.is_full_seq(re)) {
+            return false; // generally unbounded
         } else if (u.re.is_complement(re)) {
             // TODO can we do better?
             return false;
@@ -6712,8 +6714,10 @@ namespace smt {
             SASSERT(str1.length() == 1);
             SASSERT(str2.length() == 1);
             lens.insert(1);
-        } else if (u.re.is_full(re)) {
+        } else if (u.re.is_full_char(re)) {
             lens.insert(1);
+        } else if (u.re.is_full_seq(re)) {
+            lens.reset();
         } else if (u.re.is_complement(re)) {
             lens.reset();
         } else {
@@ -6828,9 +6832,13 @@ namespace smt {
             SASSERT(str2.length() == 1);
             expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
             return retval;
-        } else if (u.re.is_full(re)) {
+        } else if (u.re.is_full_char(re)) {
             expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m);
             return retval;
+        } else if (u.re.is_full_seq(re)) {
+            // match any unbounded string
+            expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
+            return retval;
         } else if (u.re.is_complement(re)) {
             // skip complement for now, in general this is difficult to predict
             expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m);
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index ef0bbf0ee..506726bc6 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -745,6 +745,7 @@ protected:
     void new_diseq_eh(theory_var, theory_var) override;
 
     theory* mk_fresh(context*) override { return alloc(theory_str, get_manager(), m_params); }
+    void init(context * ctx) override;
     void init_search_eh() override;
     void add_theory_assumptions(expr_ref_vector & assumptions) override;
     lbool validate_unsat_core(expr_ref_vector & unsat_core) override;

From 11a339c490e06dc4a8342c11004e7909423c7e1b Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Sun, 11 Mar 2018 23:26:30 -0400
Subject: [PATCH 31/36] fix include path

---
 src/smt/theory_str.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 305f83df7..4ea96bd1f 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -25,7 +25,7 @@
 #include "smt/theory_seq_empty.h"
 #include "smt/theory_arith.h"
 #include "ast/ast_util.h"
-#include "seq_rewriter.h"
+#include "ast/rewriter/seq_rewriter.h"
 #include "smt_kernel.h"
 
 namespace smt {

From 6bb9a82425e7d2b9a17dfe77a5ae250b11c17ab4 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 15 Mar 2018 13:56:44 -0400
Subject: [PATCH 32/36] experimental axiom-persist for regex conflict clauses

---
 src/smt/theory_str.cpp | 23 ++++++++++++++++++++---
 src/smt/theory_str.h   |  6 ++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 4ea96bd1f..c9e484b02 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -57,6 +57,8 @@ namespace smt {
         tmpXorVarCount(0),
         tmpLenTestVarCount(0),
         tmpValTestVarCount(0),
+        m_persisted_axioms(m),
+        m_persisted_axiom_todo(m),
         avoidLoopCut(true),
         loopDetected(false),
         m_theoryStrOverlapAssumption_term(m),
@@ -827,7 +829,8 @@ namespace smt {
         return !m_basicstr_axiom_todo.empty() || !m_str_eq_todo.empty()
             || !m_concat_axiom_todo.empty() || !m_concat_eval_todo.empty()
             || !m_library_aware_axiom_todo.empty()
-            || !m_delayed_axiom_setup_terms.empty();
+            || !m_delayed_axiom_setup_terms.empty()
+            || !m_persisted_axiom_todo.empty()
         ;
     }
 
@@ -895,7 +898,11 @@ namespace smt {
                 set_up_axioms(m_delayed_axiom_setup_terms[i].get());
             }
             m_delayed_axiom_setup_terms.reset();
-        }
+            for (expr * a : m_persisted_axiom_todo) {
+                assert_axiom(a);
+            }
+            m_persisted_axiom_todo.reset();
+        } // can_propagate
     }
 
     /*
@@ -6696,7 +6703,7 @@ namespace smt {
         } else if (u.re.is_full_char(re)) {
             return true;
         } else if (u.re.is_full_seq(re)) {
-            return false; // generally unbounded
+            return true;
         } else if (u.re.is_complement(re)) {
             // TODO can we do better?
             return false;
@@ -8474,6 +8481,10 @@ namespace smt {
         }
     }
 
+    void theory_str::add_persisted_axiom(expr * a) {
+        m_persisted_axioms.push_back(a);
+    }
+
     void theory_str::pop_scope_eh(unsigned num_scopes) {
         sLevel -= num_scopes;
         TRACE("str", tout << "pop " << num_scopes << " to " << sLevel << std::endl;);
@@ -8519,6 +8530,11 @@ namespace smt {
         m_basicstr_axiom_todo.reset();
         m_basicstr_axiom_todo = new_m_basicstr;
 
+        for (expr * e : m_persisted_axioms) {
+            TRACE("str", tout << "persist axiom: " << mk_pp(e, get_manager()) << std::endl;);
+            m_persisted_axiom_todo.push_back(e);
+        }
+
         m_trail_stack.pop_scope(num_scopes);
         theory::pop_scope_eh(num_scopes);
 
@@ -10574,6 +10590,7 @@ namespace smt {
                     TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
                     expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m);
                     assert_axiom(conflict_clause);
+                    add_persisted_axiom(conflict_clause);
                     regex_axiom_add = true;
                 }
             } // foreach (entry in regex_terms_by_string)
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index da9ed663d..7188da25c 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -352,6 +352,10 @@ protected:
     // enode lists for library-aware/high-level string terms (e.g. substr, contains)
     ptr_vector<enode> m_library_aware_axiom_todo;
 
+    // list of axioms that are re-asserted every time the scope is popped
+    expr_ref_vector m_persisted_axioms;
+    expr_ref_vector m_persisted_axiom_todo;
+
     // hashtable of all exprs for which we've already set up term-specific axioms --
     // this prevents infinite recursive descent with respect to axioms that
     // include an occurrence of the term for which axioms are being generated
@@ -545,6 +549,8 @@ protected:
     void instantiate_axiom_str_to_int(enode * e);
     void instantiate_axiom_int_to_str(enode * e);
 
+    void add_persisted_axiom(expr * a);
+
     expr * mk_RegexIn(expr * str, expr * regexp);
     void instantiate_axiom_RegexIn(enode * e);
     app * mk_unroll(expr * n, expr * bound);

From 6a3ce301b748280f6bb8c1eef959382848b83bf8 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Tue, 3 Apr 2018 12:51:03 -0400
Subject: [PATCH 33/36] fix collection error

---
 src/smt/theory_str.cpp | 3 +++
 src/smt/theory_str.h   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 21b6e7b26..811b66798 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -1854,6 +1854,9 @@ namespace smt {
             std::pair<expr*, zstring> key1(ex->get_arg(0), regexStr);
             // skip Z3str's map check, because we already check if we set up axioms on this term
             regex_in_bool_map[key1] = ex;
+            if (!regex_in_var_reg_str_map.contains(ex->get_arg(0))) {
+                regex_in_var_reg_str_map.insert(ex->get_arg(0), std::set<zstring>());
+            }
             regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr);
         }
 
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index d68d99abc..e9607585f 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -399,7 +399,7 @@ protected:
     obj_hashtable<expr> regex_terms;
     obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]
     obj_map<expr, svector<regex_automaton_under_assumptions> > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ]
-    std::map<expr*, nfa> regex_nfa_cache; // Regex term --> NFA
+    obj_map<expr, nfa> regex_nfa_cache; // Regex term --> NFA
     obj_hashtable<expr> regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope
     obj_hashtable<expr> regex_terms_with_length_constraints; // set of regex terms which had had length constraints asserted in the current scope
     obj_map<expr, expr*> regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R)

From 45f48123e76b6e5717927b24d54db51d467b276d Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 6 Apr 2018 11:39:08 -0400
Subject: [PATCH 34/36] add re.plus length enumeration; fix reordering warning

---
 src/smt/theory_str.cpp | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 811b66798..64cba43fb 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -54,12 +54,12 @@ namespace smt {
         m_unused_id(0),
         m_delayed_axiom_setup_terms(m),
         m_delayed_assertions_todo(m),
+        m_persisted_axioms(m),
+        m_persisted_axiom_todo(m),
         tmpStringVarCount(0),
         tmpXorVarCount(0),
         tmpLenTestVarCount(0),
         tmpValTestVarCount(0),
-        m_persisted_axioms(m),
-        m_persisted_axiom_todo(m),
         avoidLoopCut(true),
         loopDetected(false),
         m_theoryStrOverlapAssumption_term(m),
@@ -6865,7 +6865,7 @@ namespace smt {
             finalResult.push_back(r2);
             expr_ref retval(mk_and(finalResult), m);
             return retval;
-        } else if (u.re.is_star(re, sub1)) {
+        } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) {
             // stars are generated as a linear combination of all possible subterm lengths;
             // this requires that there are no stars under this one
             /*
@@ -6898,27 +6898,16 @@ namespace smt {
                     expr * n = mk_int_var("rstar");
                     freeVariables.push_back(n);
                     expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m);
-                    sum_terms.push_back(term);
+                    expr_ref term2(term, m);
+                    if (u.re.is_plus(re)) {
+                        // n effectively starts at 1
+                        term2 = m_autil.mk_add(m_autil.mk_numeral(lenOption, true), term);
+                    }
+                    sum_terms.push_back(term2);
                 }
                 expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m);
                 return retval;
             }
-        } else if (u.re.is_plus(re, sub1)) {
-            /*
-            expr * v = mk_int_var("rlen");
-            expr * n = mk_int_var("rstar");
-            freeVariables.push_back(v);
-            freeVariables.push_back(n);
-            expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables);
-            expr_ref_vector finalResult(m);
-            finalResult.push_back(rsub);
-            finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n)));
-            finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true)));
-            expr_ref retval(mk_and(finalResult), m);
-            return retval;
-            */
-            // TODO this should really be done as a sub-case of star
-            NOT_IMPLEMENTED_YET();
         } else if (u.re.is_range(re, sub1, sub2)) {
             SASSERT(u.str.is_string(sub1));
             SASSERT(u.str.is_string(sub2));

From 27f2b542df088f060fff7540703896e32e284cb8 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Fri, 6 Apr 2018 12:13:53 -0400
Subject: [PATCH 35/36] remove comment

---
 src/smt/theory_str.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 64cba43fb..0a3089c62 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -6831,8 +6831,6 @@ namespace smt {
      * In some cases, the returned formula requires one or more free integer variables to be created.
      * These variables are returned in the reference parameter `freeVariables`.
      * Extra assertions should be made for these free variables constraining them to be non-negative.
-     *
-     * TODO: star unrolling?
      */
     expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
         ENSURE(u.is_re(re));

From 3cfb32cd2d18acaca3b5cf5449c8c2c7c090aad9 Mon Sep 17 00:00:00 2001
From: Murphy Berzish <murphy.berzish@gmail.com>
Date: Thu, 12 Apr 2018 14:35:29 -0400
Subject: [PATCH 36/36] fix regex automata leaked memory

---
 src/smt/theory_str.cpp | 5 +++++
 src/smt/theory_str.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp
index 0a3089c62..60b365d20 100644
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@@ -77,6 +77,9 @@ namespace smt {
 
     theory_str::~theory_str() {
         m_trail_stack.reset();
+        for (eautomaton * aut : regex_automata) {
+            dealloc(aut);
+        }
         regex_automata.clear();
     }
 
@@ -10559,6 +10562,7 @@ namespace smt {
                             if ( (current_assignment == l_true && aut.get_polarity())
                                     || (current_assignment == l_false && !aut.get_polarity())) {
                                 aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton());
+                                m_automata.push_back(aut_inter);
                             } else {
                                 // need to complement first
                                 expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m);
@@ -10567,6 +10571,7 @@ namespace smt {
                                 // TODO is there any way to build a complement automaton from an existing one?
                                 // this discards length information
                                 aut_inter = m_mk_aut.mk_product(aut_inter, aut_c);
+                                m_automata.push_back(aut_inter);
                             }
                             used_intersect_constraints.push_back(aut);
                             if (aut_inter->is_empty()) {
diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h
index e9607585f..419084091 100644
--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@@ -395,6 +395,7 @@ protected:
     obj_map<expr, std::set<zstring> > regex_in_var_reg_str_map;
 
     // regex automata
+    scoped_ptr_vector<eautomaton> m_automata;
     ptr_vector<eautomaton> regex_automata;
     obj_hashtable<expr> regex_terms;
     obj_map<expr, ptr_vector<expr> > regex_terms_by_string; // S --> [ (str.in.re S *) ]