From fbe8d1577ed54671a6d9fab0a59520225b701127 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 4 Dec 2017 18:05:00 -0500 Subject: [PATCH 01/36] new regex automata start; add complexity estimation --- src/ast/rewriter/seq_rewriter.cpp | 3 + src/ast/rewriter/seq_rewriter.h | 5 +- src/smt/params/smt_params_helper.pyg | 1 + src/smt/params/theory_str_params.cpp | 1 + src/smt/params/theory_str_params.h | 10 ++- src/smt/theory_str.cpp | 105 +++++++++++++++++++++++++++ src/smt/theory_str.h | 5 +- 7 files changed, 127 insertions(+), 3 deletions(-) diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index fa95278b4..6f501c6e0 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -200,6 +200,9 @@ void re2automaton::set_solver(expr_solver* solver) { m_sa = alloc(symbolic_automata_t, sm, *m_ba.get()); } +eautomaton* re2automaton::mk_product(eautomaton* a1, eautomaton* a2) { + return m_sa->mk_product(*a1, *a2); +} eautomaton* re2automaton::operator()(expr* e) { eautomaton* r = re2aut(e); diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 69f319168..95b043bd7 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -53,7 +53,9 @@ public: bool is_range() const { return m_ty == t_range; } sort* get_sort() const { return m_sort; } expr* get_char() const { SASSERT(is_char()); return m_t; } - + expr* get_pred() const { SASSERT(is_pred()); return m_t; } + expr* get_lo() const { SASSERT(is_range()); return m_t; } + expr* get_hi() const { SASSERT(is_range()); return m_s; } }; class sym_expr_manager { @@ -87,6 +89,7 @@ public: ~re2automaton(); eautomaton* operator()(expr* e); void set_solver(expr_solver* solver); + eautomaton* mk_product(eautomaton *a1, eautomaton *a2); }; /** diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg index 937aa6a2b..7f35feb64 100644 --- a/src/smt/params/smt_params_helper.pyg +++ b/src/smt/params/smt_params_helper.pyg @@ -79,6 +79,7 @@ def_module_params(module_name='smt', ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'), ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'), ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'), + ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'), ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'), ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'), ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'), diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp index afbfc33fc..b5451b9dc 100644 --- a/src/smt/params/theory_str_params.cpp +++ b/src/smt/params/theory_str_params.cpp @@ -31,4 +31,5 @@ void theory_str_params::updt_params(params_ref const & _p) { m_UseBinarySearch = p.str_use_binary_search(); m_BinarySearchInitialUpperBound = p.str_binary_search_start(); m_OverlapTheoryAwarePriority = p.str_overlap_priority(); + m_RegexAutomata = p.str_regex_automata(); } diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h index c841609db..4135cf0d6 100644 --- a/src/smt/params/theory_str_params.h +++ b/src/smt/params/theory_str_params.h @@ -80,6 +80,13 @@ struct theory_str_params { double m_OverlapTheoryAwarePriority; + /* + * If RegexAutomata is set to true, + * Z3str3 will use automata-based methods to reason about + * regular expression constraints. + */ + bool m_RegexAutomata; + theory_str_params(params_ref const & p = params_ref()): m_StrongArrangements(true), m_AggressiveLengthTesting(false), @@ -91,7 +98,8 @@ struct theory_str_params { m_FiniteOverlapModels(false), m_UseBinarySearch(false), m_BinarySearchInitialUpperBound(64), - m_OverlapTheoryAwarePriority(-0.1) + m_OverlapTheoryAwarePriority(-0.1), + m_RegexAutomata(true) { updt_params(p); } diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 7936b581c..a859dae9d 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6446,6 +6446,111 @@ namespace smt { } } + // saturating unsigned addition + unsigned inline _qadd(unsigned a, unsigned b) { + if (a == UINT_MAX || b == UINT_MAX) { + return UINT_MAX; + } + unsigned result = a + b; + if (result < a || result < b) { + return UINT_MAX; + } + return result; + } + + // saturating unsigned multiply + unsigned inline _qmul(unsigned a, unsigned b) { + if (a == UINT_MAX || b == UINT_MAX) { + return UINT_MAX; + } + unsigned result = a * b; + if (result < a || result < b) { + return UINT_MAX; + } + return result; + } + + unsigned theory_str::estimate_regex_complexity(expr * re) { + ENSURE(u.is_re(re)); + expr * sub1; + expr * sub2; + if (u.re.is_to_re(re, sub1)) { + SASSERT(u.str.is_string(sub1)); + zstring str; + u.str.is_string(sub1, str); + return str.length(); + } else if (u.re.is_complement(re, sub1)) { + return estimate_regex_complexity_under_complement(sub1); + } else if (u.re.is_concat(re, sub1, sub2)) { + unsigned cx1 = estimate_regex_complexity(sub1); + unsigned cx2 = estimate_regex_complexity(sub2); + return _qadd(cx1, cx2); + } else if (u.re.is_union(re, sub1, sub2)) { + unsigned cx1 = estimate_regex_complexity(sub1); + unsigned cx2 = estimate_regex_complexity(sub2); + return _qadd(cx1, cx2); + } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { + unsigned cx = estimate_regex_complexity(sub1); + return _qmul(2, cx); + } else if (u.re.is_range(re, sub1, sub2)) { + SASSERT(u.str.is_string(sub1)); + SASSERT(u.str.is_string(sub2)); + zstring str1, str2; + u.str.is_string(sub1, str1); + u.str.is_string(sub2, str2); + SASSERT(str1.length() == 1); + SASSERT(str2.length() == 1); + return 1 + str2[0] - str1[0]; + } else if (u.re.is_full(re)) { + return 1; + } else { + TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); + return 1; + } + } + + unsigned theory_str::estimate_regex_complexity_under_complement(expr * re) { + ENSURE(u.is_re(re)); + expr * sub1; + expr * sub2; + if (u.re.is_to_re(re, sub1)) { + SASSERT(u.str.is_string(sub1)); + zstring str; + u.str.is_string(sub1, str); + return str.length(); + } else if (u.re.is_complement(re, sub1)) { + // Why don't we return the regular complexity here? + // We could, but this might be called from under another complemented subexpression. + // It's better to give a worst-case complexity. + return estimate_regex_complexity_under_complement(sub1); + } else if (u.re.is_concat(re, sub1, sub2)) { + unsigned cx1 = estimate_regex_complexity_under_complement(sub1); + unsigned cx2 = estimate_regex_complexity_under_complement(sub2); + return _qadd(_qmul(2, cx1), cx2); + } else if (u.re.is_union(re, sub1, sub2)) { + unsigned cx1 = estimate_regex_complexity_under_complement(sub1); + unsigned cx2 = estimate_regex_complexity_under_complement(sub2); + return _qmul(cx1, cx2); + } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { + unsigned cx = estimate_regex_complexity_under_complement(sub1); + return _qmul(2, cx); + } else if (u.re.is_range(re, sub1, sub2)) { + SASSERT(u.str.is_string(sub1)); + SASSERT(u.str.is_string(sub2)); + zstring str1, str2; + u.str.is_string(sub1, str1); + u.str.is_string(sub2, str2); + SASSERT(str1.length() == 1); + SASSERT(str2.length() == 1); + return 1 + str2[0] - str1[0]; + } else if (u.re.is_full(re)) { + return 1; + } else { + TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); + return 1; + } + } + /* * strArgmt::solve_concat_eq_str() * Solve concatenations of the form: diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 03fd31162..72643c4be 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -457,11 +457,14 @@ protected: expr * mk_RegexIn(expr * str, expr * regexp); void instantiate_axiom_RegexIn(enode * e); app * mk_unroll(expr * n, expr * bound); - void process_unroll_eq_const_str(expr * unrollFunc, expr * constStr); void unroll_str2reg_constStr(expr * unrollFunc, expr * eqConstStr); void process_concat_eq_unroll(expr * concat, expr * unroll); + // regex automata + unsigned estimate_regex_complexity(expr * re); + unsigned estimate_regex_complexity_under_complement(expr * re); + void set_up_axioms(expr * ex); void handle_equality(expr * lhs, expr * rhs); From a5c828f6f2ea967b4b17efc8ba22070a517696d2 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 6 Dec 2017 18:32:11 -0500 Subject: [PATCH 02/36] length estimation --- src/smt/theory_str.cpp | 116 ++++++++++++++++++++++++++++++++++++++++- src/smt/theory_str.h | 7 ++- 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index a859dae9d..88180352c 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -25,6 +25,8 @@ #include "smt/theory_seq_empty.h" #include "smt/theory_arith.h" #include "ast/ast_util.h" +#include "seq_rewriter.h" +#include "smt_kernel.h" namespace smt { @@ -48,6 +50,7 @@ namespace smt { finalCheckProgressIndicator(false), m_trail(m), m_factory(nullptr), + m_mk_aut(m), m_unused_id(0), m_delayed_axiom_setup_terms(m), tmpStringVarCount(0), @@ -95,6 +98,26 @@ namespace smt { return u.str.mk_string(sym); } + class seq_expr_solver : public expr_solver { + kernel m_kernel; + public: + seq_expr_solver(ast_manager& m, smt_params& fp): + m_kernel(m, fp) {} + virtual lbool check_sat(expr* e) { + m_kernel.push(); + m_kernel.assert_expr(e); + lbool r = m_kernel.check(); + m_kernel.pop(1); + return r; + } + }; + + void theory_str::init(context * ctx) { + theory::init(ctx); + m_mk_aut.set_solver(alloc(seq_expr_solver, get_manager(), + get_context().get_fparams())); + } + void theory_str::initialize_charset() { bool defaultCharset = true; if (defaultCharset) { @@ -1718,6 +1741,11 @@ namespace smt { regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr); } + if (m_params.m_RegexAutomata) { + // stop setting up axioms here, we do this differently + return; + } + expr_ref str(ex->get_arg(0), m); app * regex = to_app(ex->get_arg(1)); @@ -1887,7 +1915,7 @@ namespace smt { check_contain_in_new_eq(lhs, rhs); } - if (!regex_in_bool_map.empty()) { + if (!regex_in_bool_map.empty() && !m_params.m_RegexAutomata) { TRACE("str", tout << "checking regex consistency" << std::endl;); check_regex_in(lhs, rhs); } @@ -6550,6 +6578,92 @@ namespace smt { return 1; } } + /* + * Infer all length constraints implied by the given regular expression `re` + * in order to constrain `lenVar` (which must be of sort Int). + * This assumes that `re` appears in a positive context. + * Returns a Boolean formula expressing the appropriate constraints over `lenVar`. + * In some cases, the returned formula requires one or more free integer variables to be created. + * These variables are returned in the reference parameter `freeVariables`. + * Extra assertions should be made for these free variables constraining them to be non-negative. + */ + expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { + ENSURE(u.is_re(re)); + context & ctx = get_context(); + ast_manager & m = get_manager(); + expr * sub1; + expr * sub2; + if (u.re.is_to_re(re, sub1)) { + SASSERT(u.str.is_string(sub1)); + zstring str; + u.str.is_string(sub1, str); + rational strlen(str.length()); + expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(strlen, true)), m); + return retval; + } else if (u.re.is_union(re, sub1, sub2)) { + expr_ref r1 = infer_all_regex_lengths(lenVar, sub1, freeVariables); + expr_ref r2 = infer_all_regex_lengths(lenVar, sub2, freeVariables); + expr_ref retval(m.mk_or(r1, r2), m); + return retval; + } else if (u.re.is_concat(re, sub1, sub2)) { + expr * v1 = mk_int_var("rlen1"); + expr * v2 = mk_int_var("rlen2"); + freeVariables.push_back(v1); + freeVariables.push_back(v2); + expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables); + expr_ref r2 = infer_all_regex_lengths(v2, sub2, freeVariables); + expr_ref_vector finalResult(m); + finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_add(v1, v2))); + finalResult.push_back(r1); + finalResult.push_back(r2); + expr_ref retval(mk_and(finalResult), m); + return retval; + } else if (u.re.is_star(re, sub1)) { + expr * v = mk_int_var("rlen"); + expr * n = mk_int_var("rstar"); + freeVariables.push_back(v); + freeVariables.push_back(n); + expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables); + expr_ref_vector finalResult(m); + finalResult.push_back(rsub); + finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); + expr_ref retval(mk_and(finalResult), m); + return retval; + } else if (u.re.is_plus(re, sub1)) { + expr * v = mk_int_var("rlen"); + expr * n = mk_int_var("rstar"); + freeVariables.push_back(v); + freeVariables.push_back(n); + expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables); + expr_ref_vector finalResult(m); + finalResult.push_back(rsub); + finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); + finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true))); + expr_ref retval(mk_and(finalResult), m); + return retval; + } else if (u.re.is_range(re, sub1, sub2)) { + SASSERT(u.str.is_string(sub1)); + SASSERT(u.str.is_string(sub2)); + zstring str1, str2; + u.str.is_string(sub1, str1); + u.str.is_string(sub2, str2); + SASSERT(str1.length() == 1); + SASSERT(str2.length() == 1); + expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); + return retval; + } else if (u.re.is_full(re)) { + expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); + return retval; + } else if (u.re.is_complement(re)) { + // skip complement for now, in general this is difficult to predict + expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); + return retval; + } else { + TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, m) << std::endl;); + expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); + return retval; + } + } /* * strArgmt::solve_concat_eq_str() diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 72643c4be..602f9feab 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -23,6 +23,7 @@ #include "ast/ast_pp.h" #include "ast/arith_decl_plugin.h" #include "ast/rewriter/th_rewriter.h" +#include "ast/rewriter/seq_rewriter.h" #include "ast/seq_decl_plugin.h" #include "smt/smt_theory.h" #include "smt/params/theory_str_params.h" @@ -267,6 +268,8 @@ protected: str_value_factory * m_factory; + re2automaton m_mk_aut; + // Unique identifier appended to unused variables to ensure that model construction // does not introduce equalities when they weren't enforced. unsigned m_unused_id; @@ -461,9 +464,10 @@ protected: void unroll_str2reg_constStr(expr * unrollFunc, expr * eqConstStr); void process_concat_eq_unroll(expr * concat, expr * unroll); - // regex automata + // regex automata and length-aware regex unsigned estimate_regex_complexity(expr * re); unsigned estimate_regex_complexity_under_complement(expr * re); + expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); void set_up_axioms(expr * ex); void handle_equality(expr * lhs, expr * rhs); @@ -640,6 +644,7 @@ protected: virtual void new_diseq_eh(theory_var, theory_var); virtual theory* mk_fresh(context*) { return alloc(theory_str, get_manager(), m_params); } + virtual void init(context * ctx); virtual void init_search_eh(); virtual void add_theory_assumptions(expr_ref_vector & assumptions); virtual lbool validate_unsat_core(expr_ref_vector & unsat_core); From 0ac7082c8092c25efb3b9f893b2982a9e0ab556e Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 21 Dec 2017 17:13:39 -0500 Subject: [PATCH 03/36] add upper bound refinement (WIP) --- src/smt/theory_str.cpp | 239 ++++++++++++++++++++++++++++++++++++++++- src/smt/theory_str.h | 69 ++++++++++++ 2 files changed, 305 insertions(+), 3 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 88180352c..2f41a6617 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -74,6 +74,7 @@ namespace smt { theory_str::~theory_str() { m_trail_stack.reset(); + regex_automata.clear(); } expr * theory_str::mk_string(zstring const& str) { @@ -1741,14 +1742,19 @@ namespace smt { regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr); } + expr_ref str(ex->get_arg(0), m); + app * regex = to_app(ex->get_arg(1)); + if (m_params.m_RegexAutomata) { + regex_terms.insert(ex); + if (!regex_terms_by_string.contains(str)) { + regex_terms_by_string.insert(str, ptr_vector()); + } + regex_terms_by_string[str].push_back(ex); // stop setting up axioms here, we do this differently return; } - expr_ref str(ex->get_arg(0), m); - app * regex = to_app(ex->get_arg(1)); - // quick reference for the following code: // - ex: top-level regex membership term // - str: string term appearing in ex @@ -6665,6 +6671,77 @@ namespace smt { } } + /* + * Refine the upper bound on the length of a solution to a given automaton. + * The method returns TRUE if a solution of length `current_upper_bound` exists, + * and FALSE otherwise. In addition, the reference parameter `refined_upper_bound` + * is assigned the length of the longest solution shorter than `current_upper_bound`, + * if a shorter solution exists, or -1 otherwise. + */ + bool theory_str::refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound) { + ENSURE(aut != NULL); + + if (aut->final_states().empty()) { + // no solutions at all! + refined_upper_bound = rational::minus_one(); + return false; + } + + // from here we assume there is a final state reachable from the initial state + unsigned_vector search_queue; + // populate search queue with all states reachable from the epsilon-closure of the start state + aut->get_epsilon_closure(aut->init(), search_queue); + + rational last_solution_depth = rational::minus_one(); + bool found_solution_at_upper_bound = false; + + unsigned search_depth = 0; + hashtable > next_states; + unsigned_vector next_search_queue; + + while(!search_queue.empty()) { + // see if any of the current states are final + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned src = *it; + if (aut->is_final_state(src)) { + if (search_depth == current_upper_bound.get_unsigned()) { + found_solution_at_upper_bound = true; + } else { + last_solution_depth = rational(search_depth); + } + break; + } + } + + if (search_depth == current_upper_bound.get_unsigned()) { + break; + } + + next_states.reset(); + next_search_queue.clear(); + // move one step along all states + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned src = *it; + eautomaton::moves next_moves; + aut->get_moves_from(src, next_moves, true); + for (eautomaton::moves::iterator moves_it = next_moves.begin(); + moves_it != next_moves.end(); ++moves_it) { + unsigned dst = moves_it->dst(); + if (!next_states.contains(dst)) { + next_states.insert(dst); + next_search_queue.push_back(dst); + } + } + } + search_queue.clear(); + search_queue.append(next_search_queue); + search_depth += 1; + } //!search_queue.empty() + + refined_upper_bound = last_solution_depth; + return found_solution_at_upper_bound; + } + /* * strArgmt::solve_concat_eq_str() * Solve concatenations of the form: @@ -9076,6 +9153,155 @@ namespace smt { } } + // regex automata + if (m_params.m_RegexAutomata) { + bool regex_axiom_add = false; + for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { + expr * str_in_re = *it; + expr * str; + expr * re; + u.str.is_in_re(str_in_re, str, re); + lbool current_assignment = ctx.get_assignment(str_in_re); + TRACE("str", tout << "regex term: " << mk_pp(str, m) << " in " << mk_pp(re, m) << " : " << current_assignment << std::endl;); + if (current_assignment == l_undef) { + continue; + } + rational exact_length_value; + if (get_len_value(str, exact_length_value)) { + TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); + NOT_IMPLEMENTED_YET(); + } + expr_ref str_len(mk_strlen(str), m); + rational lower_bound_value; + rational upper_bound_value; + bool lower_bound_exists = lower_bound(str_len, lower_bound_value); + bool upper_bound_exists = upper_bound(str_len, upper_bound_value); + CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;); + CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;); + + if (upper_bound_exists) { + // check current assumptions + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + // one or more existing assumptions. + // see if the (current best) upper bound can be refined + // (note that if we have an automaton with no assumption, + // this automatically counts as best) + bool need_assumption = true; + regex_automaton_under_assumptions last_assumption; + rational last_ub = rational::minus_one(); + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + if ((current_assignment == l_true && autA.get_polarity() == false) + || (current_assignment == l_false && autA.get_polarity() == true)) { + // automaton uses incorrect polarity + continue; + } + rational this_ub; + if (autA.get_upper_bound(this_ub)) { + if (last_ub == rational::minus_one() || this_ub < last_ub) { + last_ub = this_ub; + last_assumption = autA; + } + } else { + need_assumption = false; + last_assumption = autA; + break; + } + } + if (!last_ub.is_minus_one() || !need_assumption) { + CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); + CTRACE("str", need_assumption, tout << "using automaton with assumed upper bound of " << last_ub << std::endl;); + + rational refined_upper_bound; + bool solution_at_upper_bound = refine_automaton_upper_bound(last_assumption.get_automaton(), + upper_bound_value, refined_upper_bound); + TRACE("str", tout << "refined upper bound is " << refined_upper_bound << + (solution_at_upper_bound?", solution at upper bound":", no solution at upper bound") << std::endl;); + + expr_ref_vector lhs(m); + if (current_assignment == l_false) { + lhs.push_back(m.mk_not(str_in_re)); + } else { + lhs.push_back(str_in_re); + } + if (need_assumption) { + lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(last_ub, true))); + } + lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))); + + expr_ref_vector rhs(m); + + if (solution_at_upper_bound) { + if (refined_upper_bound.is_minus_one()) { + // If there are solutions at the upper bound but not below it, make the bound exact. + NOT_IMPLEMENTED_YET(); + } else { + // If there are solutions at and below the upper bound, add an additional bound. + NOT_IMPLEMENTED_YET(); + } + } else { + if (refined_upper_bound.is_minus_one()) { + // If there are no solutions at or below the upper bound, assert a conflict clause. + rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)))); + } else { + // If there are solutions below the upper bound but not at it, refine the bound. + NOT_IMPLEMENTED_YET(); + } + } + + expr_ref lhs_terms(mk_and(lhs), m); + expr_ref rhs_terms(mk_and(rhs), m); + assert_implication(lhs_terms, rhs_terms); + regex_axiom_add = true; + } + } else { + // no existing automata/assumptions. + // if it's easy to construct a full automaton for R, do so + unsigned expected_complexity = estimate_regex_complexity(re); + unsigned threshold = 1000; // TODO better metric + if (expected_complexity <= threshold) { + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + // TODO immediately attempt to learn lower/upper bound info here + } else { + // TODO check negation? + // construct a partial automaton for R to the given upper bound + NOT_IMPLEMENTED_YET(); + } + } + // if we have *any* automaton for R, and the upper bound is not too large, + // finitize the automaton (if we have not already done so) and assert all solutions + if (upper_bound_value < 50) { // TODO better metric for threshold + // NOT_IMPLEMENTED_YET(); // TODO(mtrberzi) + } + } else { // !upper_bound_exists + // no upper bound information + if (lower_bound_exists) { + // lower bound, no upper bound + NOT_IMPLEMENTED_YET(); + } else { // !lower_bound_exists + // no bounds information + NOT_IMPLEMENTED_YET(); + } + } + + // NOT_IMPLEMENTED_YET(); + } + if (regex_axiom_add) { + return FC_CONTINUE; + } + } // RegexAutomata + bool needToAssignFreeVars = false; std::set free_variables; std::set unused_internal_variables; @@ -10567,6 +10793,13 @@ namespace smt { SASSERT(lenTestAssert != NULL); return lenTestAssert; } else { + // if we are performing automata-based reasoning and the term associated with + // this length tester is in any way constrained by regex terms, + // do not perform value testing -- this term is not a free variable. + if (m_params.m_RegexAutomata) { + NOT_IMPLEMENTED_YET(); // TODO + } + TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;); // length is fixed expr * valueAssert = gen_free_var_options(freeVar, effectiveLenInd, effectiveLenIndiStr, NULL, zstring("")); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 602f9feab..57fa48142 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -166,6 +166,69 @@ public: bool matches(zstring input); }; +class regex_automaton_under_assumptions { +protected: + expr * str_in_re; + eautomaton * aut; + bool polarity; + + bool assume_lower_bound; + rational lower_bound; + + bool assume_upper_bound; + rational upper_bound; +public: + regex_automaton_under_assumptions() : + str_in_re(NULL), aut(NULL), polarity(false), + assume_lower_bound(false), assume_upper_bound(false) {} + + regex_automaton_under_assumptions(expr * str_in_re, eautomaton * aut, bool polarity) : + str_in_re(str_in_re), aut(aut), polarity(polarity), + assume_lower_bound(false), assume_upper_bound(false) {} + + void set_lower_bound(rational & lb) { + lower_bound = lb; + assume_lower_bound = true; + } + void unset_lower_bound() { + assume_lower_bound = false; + } + + void set_upper_bound(rational & ub) { + upper_bound = ub; + assume_upper_bound = true; + } + void unset_upper_bound() { + assume_upper_bound = false; + } + + bool get_lower_bound(rational & lb) const { + if (assume_lower_bound) { + lb = lower_bound; + return true; + } else { + return false; + } + } + + bool get_upper_bound(rational & ub) const { + if (assume_upper_bound) { + ub = upper_bound; + return true; + } else { + return false; + } + } + + eautomaton * get_automaton() const { return aut; } + bool get_polarity() const { return polarity; } + + virtual ~regex_automaton_under_assumptions() { + // don't free str_in_re or aut; + // they are managed separately + } +}; + class theory_str : public theory { struct T_cut { @@ -338,6 +401,11 @@ protected: std::map, expr*> regex_in_bool_map; std::map > regex_in_var_reg_str_map; + // regex automata + ptr_vector regex_automata; + obj_hashtable regex_terms; + obj_map > regex_terms_by_string; // S --> [ (str.in.re S *) ] + obj_map > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ] std::map regex_nfa_cache; // Regex term --> NFA svector char_set; @@ -468,6 +536,7 @@ protected: unsigned estimate_regex_complexity(expr * re); unsigned estimate_regex_complexity_under_complement(expr * re); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); + bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); void set_up_axioms(expr * ex); void handle_equality(expr * lhs, expr * rhs); From 0917af7c569eb7380f04ff0165861912ba9f3109 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 3 Jan 2018 12:02:11 -0500 Subject: [PATCH 04/36] full upper bound refinement --- src/smt/theory_str.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 2f41a6617..b0b712511 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9236,10 +9236,13 @@ namespace smt { if (solution_at_upper_bound) { if (refined_upper_bound.is_minus_one()) { // If there are solutions at the upper bound but not below it, make the bound exact. - NOT_IMPLEMENTED_YET(); + rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true))); } else { // If there are solutions at and below the upper bound, add an additional bound. - NOT_IMPLEMENTED_YET(); + rhs.push_back(m.mk_or( + ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)), + m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true)) + )); } } else { if (refined_upper_bound.is_minus_one()) { @@ -9247,7 +9250,7 @@ namespace smt { rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)))); } else { // If there are solutions below the upper bound but not at it, refine the bound. - NOT_IMPLEMENTED_YET(); + rhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true))); } } From 0f20944aebf2f13e624b36d7b66dbba5a8e1233a Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 3 Jan 2018 13:54:18 -0500 Subject: [PATCH 05/36] regex lower bound (WIP) --- src/smt/theory_str.cpp | 158 ++++++++++++++++++++++++++++++++++++++++- src/smt/theory_str.h | 1 + 2 files changed, 158 insertions(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index b0b712511..04eb86fe4 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6671,6 +6671,118 @@ namespace smt { } } + /* + * Refine the lower bound on the length of a solution to a given automaton. + * The method returns TRUE if a solution of length `current_lower_bound` exists, + * and FALSE otherwise. In addition, the reference parameter `refined_lower_bound` + * is assigned the length of the shortest solution longer than `current_lower_bound` + * if it exists, or -1 otherwise. + */ + bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) { + ENSURE(aut != NULL); + + ast_manager & m = get_manager(); + + if (aut->final_states().size() < 1) { + // no solutions at all + refined_lower_bound = rational::minus_one(); + return false; + } + + // from here we assume that there is a final state reachable from the initial state + + unsigned_vector search_queue; + // populate search_queue with all states reachable from the epsilon-closure of start state + aut->get_epsilon_closure(aut->init(), search_queue); + + unsigned search_depth = 0; + hashtable> next_states; + unsigned_vector next_search_queue; + + bool found_solution_at_lower_bound = false; + + while (!search_queue.empty()) { + // if we are at the lower bound, check for final states + if (search_depth == current_lower_bound.get_unsigned()) { + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned state = *it; + if (aut->is_final_state(state)) { + found_solution_at_lower_bound = true; + break; + } + } + // end phase 1 + break; + } + next_states.reset(); + next_search_queue.clear(); + // move one step along all states + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned src = *it; + eautomaton::moves next_moves; + aut->get_moves_from(src, next_moves, true); + for (eautomaton::moves::iterator move_it = next_moves.begin(); + move_it != next_moves.end(); ++move_it) { + unsigned dst = move_it->dst(); + if (!next_states.contains(dst)) { + next_states.insert(dst); + next_search_queue.push_back(dst); + } + } + } + search_queue.clear(); + search_queue.append(next_search_queue); + search_depth += 1; + } // !search_queue.empty() + + // if we got here before reaching the lower bound, + // there aren't any solutions at or above it, so stop + if (search_depth < current_lower_bound.get_unsigned()) { + refined_lower_bound = rational::minus_one(); + return false; + } + + // phase 2: continue exploring the automaton above the lower bound + SASSERT(search_depth == current_lower_bound.get_unsigned()); + + while (!search_queue.empty()) { + if (search_depth > current_lower_bound.get_unsigned()) { + // check if we have found a solution above the lower bound + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned state = *it; + if (aut->is_final_state(state)) { + // this is a solution at a depth higher than the lower bound + refined_lower_bound = rational(search_depth); + return found_solution_at_lower_bound; + } + } + } + next_states.reset(); + next_search_queue.clear(); + // move one step along all states + for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { + unsigned src = *it; + eautomaton::moves next_moves; + aut->get_moves_from(src, next_moves, true); + for (eautomaton::moves::iterator move_it = next_moves.begin(); + move_it != next_moves.end(); ++move_it) { + unsigned dst = move_it->dst(); + if (!next_states.contains(dst)) { + next_states.insert(dst); + next_search_queue.push_back(dst); + } + } + } + search_queue.clear(); + search_queue.append(next_search_queue); + search_depth += 1; + } + // if we reached this point, we explored the whole automaton and didn't find any + // solutions above the lower bound + refined_lower_bound = rational::minus_one(); + return found_solution_at_lower_bound; + } + /* * Refine the upper bound on the length of a solution to a given automaton. * The method returns TRUE if a solution of length `current_upper_bound` exists, @@ -9291,7 +9403,51 @@ namespace smt { // no upper bound information if (lower_bound_exists) { // lower bound, no upper bound - NOT_IMPLEMENTED_YET(); + + // check current assumptions + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + // one or more existing assumptions. + // see if the (current best) lower bound can be refined + // (note that if we have an automaton with no assumption, + // this automatically counts as best) + bool need_assumption = true; + regex_automaton_under_assumptions last_assumption; + rational last_lb = rational::zero(); // the default + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + if ((current_assignment == l_true && autA.get_polarity() == false) + || (current_assignment == l_false && autA.get_polarity() == true)) { + // automaton uses incorrect polarity + continue; + } + rational this_lb; + if (autA.get_lower_bound(this_lb)) { + if (this_lb > last_lb) { + last_lb = this_lb; + last_assumption = autA; + } + } else { + need_assumption = false; + last_assumption = autA; + break; + } + } + if (!last_lb.is_zero() || !need_assumption) { + CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); + CTRACE("str", need_assumption, tout << "using automaton with assumed lower bound of " << last_lb << std::endl;); + rational refined_lower_bound; + bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(), + lower_bound_value, refined_lower_bound); + TRACE("str", tout << "refined lower bound is " << refined_lower_bound << + (solution_at_lower_bound?", solution at upper bound":", no solution at upper bound") << std::endl;); + NOT_IMPLEMENTED_YET(); + } + } else { + // no existing automata/assumptions. + NOT_IMPLEMENTED_YET(); + } } else { // !lower_bound_exists // no bounds information NOT_IMPLEMENTED_YET(); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 57fa48142..999fc7dc0 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -536,6 +536,7 @@ protected: unsigned estimate_regex_complexity(expr * re); unsigned estimate_regex_complexity_under_complement(expr * re); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); + bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); void set_up_axioms(expr * ex); From 98691a2c498198f2e71cc2cc01c3fb5c17210b65 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 8 Jan 2018 15:56:21 -0500 Subject: [PATCH 06/36] lower bound refinement --- src/smt/theory_str.cpp | 83 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 9ced6d186..eb1da0c4e 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9441,16 +9441,93 @@ namespace smt { bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(), lower_bound_value, refined_lower_bound); TRACE("str", tout << "refined lower bound is " << refined_lower_bound << - (solution_at_lower_bound?", solution at upper bound":", no solution at upper bound") << std::endl;); - NOT_IMPLEMENTED_YET(); + (solution_at_lower_bound?", solution at lower bound":", no solution at lower bound") << std::endl;); + + expr_ref_vector lhs(m); + if (current_assignment == l_false) { + lhs.push_back(m.mk_not(str_in_re)); + } else { + lhs.push_back(str_in_re); + } + if (need_assumption) { + lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(last_lb, true))); + } + lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true))); + + expr_ref_vector rhs(m); + + if (solution_at_lower_bound) { + if (refined_lower_bound.is_minus_one()) { + // If there are solutions at the lower bound but not above it, make the bound exact. + rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true))); + } else { + // If there are solutions at and above the lower bound, add an additional bound. + rhs.push_back(m.mk_or( + ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)), + m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)) + )); + } + } else { + if (refined_lower_bound.is_minus_one()) { + // If there are no solutions at or above the lower bound, assert a conflict clause. + rhs.push_back(m.mk_not(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true)))); + } else { + // If there are solutions above the lower bound but not at it, refine the bound. + rhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))); + } + } + + expr_ref lhs_terms(mk_and(lhs), m); + expr_ref rhs_terms(mk_and(rhs), m); + assert_implication(lhs_terms, rhs_terms); + regex_axiom_add = true; } } else { // no existing automata/assumptions. - NOT_IMPLEMENTED_YET(); + // if it's easy to construct a full automaton for R, do so + unsigned expected_complexity = estimate_regex_complexity(re); + unsigned threshold = 1000; // TODO better metric + if (expected_complexity <= threshold) { + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + // TODO immediately attempt to learn lower/upper bound info here + } else { + // TODO check negation? + // construct a partial automaton for R to the given upper bound + NOT_IMPLEMENTED_YET(); + } } } else { // !lower_bound_exists // no bounds information + // check for existing automata; + // try to construct an automaton if we don't have one yet + // and doing so without bounds is not difficult NOT_IMPLEMENTED_YET(); + if (true) { + unsigned expected_complexity = estimate_regex_complexity(re); + unsigned threshold = 1000; // TODO better metric + if (expected_complexity <= threshold) { + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + // TODO immediately attempt to learn lower/upper bound info here + } + } } } From bac5a648d960a45295f2a05e433b3f85cd4df0f1 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 9 Jan 2018 20:20:04 -0500 Subject: [PATCH 07/36] regex path constraint generation (WIP) --- src/smt/theory_str.cpp | 232 +++++++++++++++++++++++++++++++++++++++++ src/smt/theory_str.h | 3 + 2 files changed, 235 insertions(+) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index eb1da0c4e..54636b8df 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6854,6 +6854,237 @@ namespace smt { return found_solution_at_upper_bound; } + void theory_str::aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond) { + expr* acc; + if (!get_manager().is_true(cond) && next.find(idx, acc)) { + expr* args[2] = { cond, acc }; + cond = mk_or(get_manager(), 2, args); + } + trail.push_back(cond); + next.insert(idx, cond); + } + + expr_ref theory_str::aut_path_rewrite_constraint(expr * cond, expr * ch_var) { + context & ctx = get_context(); + ast_manager & m = get_manager(); + bv_util bvu(m); + + expr_ref retval(m); + + rational char_val; + unsigned int bv_width; + + expr * lhs; + expr * rhs; + + if (bvu.is_numeral(cond, char_val, bv_width)) { + SASSERT(char_val.is_nonneg() && char_val.get_unsigned() < 256); + TRACE("str", tout << "rewrite character constant " << char_val << std::endl;); + zstring str_const(char_val.get_unsigned()); + retval = u.str.mk_string(str_const); + return retval; + } else if (is_var(cond)) { + TRACE("str", tout << "substitute var" << std::endl;); + retval = ch_var; + return retval; + } else if (m.is_eq(cond, lhs, rhs)) { + // handle this specially because the sort of the equality will change + expr_ref new_lhs(aut_path_rewrite_constraint(lhs, ch_var), m); + SASSERT(new_lhs); + expr_ref new_rhs(aut_path_rewrite_constraint(rhs, ch_var), m); + SASSERT(new_rhs); + retval = ctx.mk_eq_atom(new_lhs, new_rhs); + return retval; + } else if (m.is_bool(cond)) { + TRACE("str", tout << "rewrite boolean term " << mk_pp(cond, m) << std::endl;); + app * a_cond = to_app(cond); + expr_ref_vector rewritten_args(m); + for (unsigned i = 0; i < a_cond->get_num_args(); ++i) { + expr * argI = a_cond->get_arg(i); + expr_ref new_arg(aut_path_rewrite_constraint(argI, ch_var), m); + SASSERT(new_arg); + rewritten_args.push_back(new_arg); + } + retval = m.mk_app(a_cond->get_decl(), rewritten_args.c_ptr()); + TRACE("str", tout << "final rewritten term is " << mk_pp(retval, m) << std::endl;); + return retval; + } else { + TRACE("str", tout << "ERROR: unrecognized automaton path constraint " << mk_pp(cond, m) << ", cannot translate" << std::endl;); + retval = NULL; + return retval; + } + } + + /* + * Create finite path constraints for the string variable `str` with respect to the automaton `aut`. + * The returned expression is the right-hand side of a constraint of the form + * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS + */ + expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) { + ENSURE(aut != NULL); + context & ctx = get_context(); + ast_manager & m = get_manager(); + + if (lenVal.is_zero()) { + // if any state in the epsilon-closure of the start state is accepting, + // then the empty string is in this language + unsigned_vector states; + bool has_final = false; + aut->get_epsilon_closure(aut->init(), states); + for (unsigned i = 0; i < states.size() && !has_final; ++i) { + has_final = aut->is_final_state(states[i]); + } + if (has_final) { + // empty string is OK, assert axiom + expr_ref rhs(ctx.mk_eq_atom(stringTerm, mk_string("")), m); + SASSERT(rhs); + //regex_automata_assertions.insert(stringTerm, final_axiom); + //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); + return rhs; + } else { + // negate -- the empty string isn't in the language + //expr_ref conflict(m.mk_not(mk_and(toplevel_lhs)), m); + //assert_axiom(conflict); + expr_ref conflict(m.mk_false(), m); + return conflict; + } + } // lenVal.is_zero() + + expr_ref_vector pathChars(m); + expr_ref_vector pathChars_len_constraints(m); + for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { + std::stringstream ss; + ss << "ch" << i; + expr_ref ch(mk_str_var(ss.str()), m); + pathChars.push_back(ch); + pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); + } + + // modification of code in seq_rewriter::mk_str_in_regexp() + expr_ref_vector trail(m); + u_map maps[2]; + bool select_map = false; + expr_ref ch(m), cond(m); + eautomaton::moves mvs; + maps[0].insert(aut->init(), m.mk_true()); + // is_accepted(a, aut) & some state in frontier is final. + for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { + u_map& frontier = maps[select_map]; + u_map& next = maps[!select_map]; + select_map = !select_map; + ch = pathChars.get(i); + next.reset(); + u_map::iterator it = frontier.begin(), end = frontier.end(); + for (; it != end; ++it) { + mvs.reset(); + unsigned state = it->m_key; + expr* acc = it->m_value; + aut->get_moves_from(state, mvs, false); + for (unsigned j = 0; j < mvs.size(); ++j) { + eautomaton::move const& mv = mvs[j]; + SASSERT(mv.t()); + if (mv.t()->is_char() && m.is_value(mv.t()->get_char())) { + // change this to a string constraint + expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_char(), ch); + SASSERT(cond_rhs); + cond = ctx.mk_eq_atom(ch, cond_rhs); + SASSERT(cond); + expr * args[2] = {cond, acc}; + cond = mk_and(m, 2, args); + aut_path_add_next(next, trail, mv.dst(), cond); + } else if (mv.t()->is_range()) { + expr_ref range_lo(mv.t()->get_lo(), m); + expr_ref range_hi(mv.t()->get_hi(), m); + bv_util bvu(m); + + rational lo_val, hi_val; + unsigned int bv_width; + + if (bvu.is_numeral(range_lo, lo_val, bv_width) && bvu.is_numeral(range_hi, hi_val, bv_width)) { + TRACE("str", tout << "make range predicate from " << lo_val << " to " << hi_val << std::endl;); + expr_ref cond_rhs(m); + + if (hi_val < lo_val) { + rational tmp = lo_val; + lo_val = hi_val; + hi_val = tmp; + } + + expr_ref_vector cond_rhs_terms(m); + for (unsigned i = lo_val.get_unsigned(); i <= hi_val.get_unsigned(); ++i) { + zstring str_const(i); + expr_ref str_expr(u.str.mk_string(str_const), m); + cond_rhs_terms.push_back(ctx.mk_eq_atom(ch, str_expr)); + } + cond_rhs = mk_or(cond_rhs_terms); + SASSERT(cond_rhs); + expr * args[2] = {cond_rhs, acc}; + cond = mk_and(m, 2, args); + aut_path_add_next(next, trail, mv.dst(), cond); + } else { + TRACE("str", tout << "warning: non-bitvectors in automaton range predicate" << std::endl;); + UNREACHABLE(); + } + } else if (mv.t()->is_pred()) { + // rewrite this constraint over string terms + expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_pred(), ch); + SASSERT(cond_rhs); + + if (m.is_false(cond_rhs)) { + continue; + } else if (m.is_true(cond_rhs)) { + aut_path_add_next(next, trail, mv.dst(), acc); + continue; + } + expr * args[2] = {cond_rhs, acc}; + cond = mk_and(m, 2, args); + aut_path_add_next(next, trail, mv.dst(), cond); + } + } + } + } + u_map const& frontier = maps[select_map]; + u_map::iterator it = frontier.begin(), end = frontier.end(); + expr_ref_vector ors(m); + for (; it != end; ++it) { + unsigned_vector states; + bool has_final = false; + aut->get_epsilon_closure(it->m_key, states); + for (unsigned i = 0; i < states.size() && !has_final; ++i) { + has_final = aut->is_final_state(states[i]); + } + if (has_final) { + ors.push_back(it->m_value); + } + } + expr_ref result(mk_or(ors)); + TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";); + + // if the path constraint is instantly false, then we know that the LHS must be false, + // so negate it and instantly assert a conflict clause + if (m.is_false(result)) { + expr_ref conflict(m.mk_false(), m); + return conflict; + } else { + expr_ref concat_rhs(m); + if (pathChars.size() == 1) { + concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); + } else { + expr_ref acc(pathChars.get(0), m); + for (unsigned i = 1; i < pathChars.size(); ++i) { + acc = mk_concat(acc, pathChars.get(i)); + } + concat_rhs = ctx.mk_eq_atom(stringTerm, acc); + } + + expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); + //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); + //regex_automata_assertions.insert(stringTerm, final_axiom); + //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); + return toplevel_rhs; + } + } + /* * strArgmt::solve_concat_eq_str() * Solve concatenations of the form: @@ -9281,6 +9512,7 @@ namespace smt { rational exact_length_value; if (get_len_value(str, exact_length_value)) { TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); + // TODO generate_regex_path_constraints(); NOT_IMPLEMENTED_YET(); } expr_ref str_len(mk_strlen(str), m); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 999fc7dc0..c6dad045f 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -538,6 +538,9 @@ protected: expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); + expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal); + void aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond); + expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var); void set_up_axioms(expr * ex); void handle_equality(expr * lhs, expr * rhs); From 6b799706b5adf865cc1b3e73d00d37a86610c619 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 10 Jan 2018 17:24:47 -0500 Subject: [PATCH 08/36] add path constraint generation for regex terms --- src/smt/params/smt_params_helper.pyg | 1 + src/smt/params/theory_str_params.cpp | 1 + src/smt/params/theory_str_params.h | 9 +- src/smt/theory_str.cpp | 165 +++++++++++++++++++++------ src/smt/theory_str.h | 4 +- 5 files changed, 145 insertions(+), 35 deletions(-) diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg index 122d09f32..7ab0d52f2 100644 --- a/src/smt/params/smt_params_helper.pyg +++ b/src/smt/params/smt_params_helper.pyg @@ -80,6 +80,7 @@ def_module_params(module_name='smt', ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'), ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'), ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'), + ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'), ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'), ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'), ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'), diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp index b5451b9dc..fa6862161 100644 --- a/src/smt/params/theory_str_params.cpp +++ b/src/smt/params/theory_str_params.cpp @@ -32,4 +32,5 @@ void theory_str_params::updt_params(params_ref const & _p) { m_BinarySearchInitialUpperBound = p.str_binary_search_start(); m_OverlapTheoryAwarePriority = p.str_overlap_priority(); m_RegexAutomata = p.str_regex_automata(); + m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold(); } diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h index 4135cf0d6..48c0205ee 100644 --- a/src/smt/params/theory_str_params.h +++ b/src/smt/params/theory_str_params.h @@ -87,6 +87,12 @@ struct theory_str_params { */ bool m_RegexAutomata; + /* + * RegexAutomata_DifficultyThreshold is the lowest difficulty above which Z3str3 + * will not eagerly construct an automaton for a regular expression term. + */ + unsigned m_RegexAutomata_DifficultyThreshold; + theory_str_params(params_ref const & p = params_ref()): m_StrongArrangements(true), m_AggressiveLengthTesting(false), @@ -99,7 +105,8 @@ struct theory_str_params { m_UseBinarySearch(false), m_BinarySearchInitialUpperBound(64), m_OverlapTheoryAwarePriority(-0.1), - m_RegexAutomata(true) + m_RegexAutomata(true), + m_RegexAutomata_DifficultyThreshold(1000) { updt_params(p); } diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 54636b8df..5910d9d95 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6918,9 +6918,11 @@ namespace smt { /* * Create finite path constraints for the string variable `str` with respect to the automaton `aut`. * The returned expression is the right-hand side of a constraint of the form - * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS + * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints). + * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...), + * are returned in `characterConstraints`. */ - expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) { + expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) { ENSURE(aut != NULL); context & ctx = get_context(); ast_manager & m = get_manager(); @@ -7060,29 +7062,23 @@ namespace smt { expr_ref result(mk_or(ors)); TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";); - // if the path constraint is instantly false, then we know that the LHS must be false, - // so negate it and instantly assert a conflict clause - if (m.is_false(result)) { - expr_ref conflict(m.mk_false(), m); - return conflict; + expr_ref concat_rhs(m); + if (pathChars.size() == 1) { + concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); } else { - expr_ref concat_rhs(m); - if (pathChars.size() == 1) { - concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); - } else { - expr_ref acc(pathChars.get(0), m); - for (unsigned i = 1; i < pathChars.size(); ++i) { - acc = mk_concat(acc, pathChars.get(i)); - } - concat_rhs = ctx.mk_eq_atom(stringTerm, acc); + expr_ref acc(pathChars.get(0), m); + for (unsigned i = 1; i < pathChars.size(); ++i) { + acc = mk_concat(acc, pathChars.get(i)); } - - expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); - //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); - //regex_automata_assertions.insert(stringTerm, final_axiom); - //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); - return toplevel_rhs; + concat_rhs = ctx.mk_eq_atom(stringTerm, acc); } + + //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); + characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs); + //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); + //regex_automata_assertions.insert(stringTerm, final_axiom); + //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); + return result; } /* @@ -8241,6 +8237,27 @@ namespace smt { ); } + // returns true if needle appears as a subterm anywhere under haystack, + // or if needle appears in the same EQC as a subterm anywhere under haystack + bool theory_str::term_appears_as_subterm(expr * needle, expr * haystack) { + if (in_same_eqc(needle, haystack)) { + return true; + } + + if (is_app(haystack)) { + app * a_haystack = to_app(haystack); + for (unsigned i = 0; i < a_haystack->get_num_args(); ++i) { + expr * subterm = a_haystack->get_arg(i); + if (term_appears_as_subterm(needle, subterm)) { + return true; + } + } + } + + // not found + return false; + } + void theory_str::classify_ast_by_type(expr * node, std::map & varMap, std::map & concatMap, std::map & unrollMap) { @@ -9512,8 +9529,78 @@ namespace smt { rational exact_length_value; if (get_len_value(str, exact_length_value)) { TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); - // TODO generate_regex_path_constraints(); - NOT_IMPLEMENTED_YET(); + + if (regex_terms_with_path_constraints.contains(str_in_re)) { + TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;); + continue; + } + + // find a consistent automaton for this term + bool found = false; + regex_automaton_under_assumptions assumption; + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + rational assumed_upper_bound, assumed_lower_bound; + bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); + bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); + if (!assumes_upper_bound && !assumes_lower_bound) { + // automaton with no assumptions is always usable + assumption = autA; + found = true; + break; + } + // check consistency of bounds assumptions + } // foreach(a in regex_automaton_assumptions) + } + if (found) { + expr_ref pathConstraint(m); + expr_ref characterConstraints(m); + pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); + TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;); + TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); + + expr_ref_vector lhs_terms(m); + lhs_terms.push_back(str_in_re); + lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); + expr_ref lhs(mk_and(lhs_terms), m); + // If the automaton was built with the same polarity as the constraint, + // assert directly. Otherwise, negate the path constraint + if ( (current_assignment == l_true && assumption.get_polarity()) + || (current_assignment == l_false && !assumption.get_polarity())) { + expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); + assert_implication(lhs, rhs); + } else { + expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); + assert_implication(lhs, rhs); + } + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + regex_axiom_add = true; + continue; + } else { + // no automata available, or else all bounds assumptions are invalid + unsigned expected_complexity = estimate_regex_complexity(re); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + // TODO immediately attempt to learn lower/upper bound info here + } else { + // TODO increment failure count + NOT_IMPLEMENTED_YET(); + } + continue; + } } expr_ref str_len(mk_strlen(str), m); rational lower_bound_value; @@ -9607,8 +9694,7 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9622,9 +9708,11 @@ namespace smt { // TODO immediately attempt to learn lower/upper bound info here } else { // TODO check negation? - // construct a partial automaton for R to the given upper bound + // TODO construct a partial automaton for R to the given upper bound? + // TODO increment failure count if we can't NOT_IMPLEMENTED_YET(); } + continue; } // if we have *any* automaton for R, and the upper bound is not too large, // finitize the automaton (if we have not already done so) and assert all solutions @@ -9718,8 +9806,7 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9733,9 +9820,11 @@ namespace smt { // TODO immediately attempt to learn lower/upper bound info here } else { // TODO check negation? - // construct a partial automaton for R to the given upper bound + // TODO construct a partial automaton for R to the given lower bound? + // TODO increment failure count NOT_IMPLEMENTED_YET(); } + continue; } } else { // !lower_bound_exists // no bounds information @@ -9745,8 +9834,7 @@ namespace smt { NOT_IMPLEMENTED_YET(); if (true) { unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -11265,7 +11353,18 @@ namespace smt { // this length tester is in any way constrained by regex terms, // do not perform value testing -- this term is not a free variable. if (m_params.m_RegexAutomata) { - NOT_IMPLEMENTED_YET(); // TODO + std::map, expr*>::iterator it = regex_in_bool_map.begin(); + for (; it != regex_in_bool_map.end(); ++it) { + expr * re = it->second; + expr * re_str = to_app(re)->get_arg(0); + // recursive descent through all subterms of re_str to see if any of them are + // - the same as freeVar + // - in the same EQC as freeVar + if (term_appears_as_subterm(freeVar, re_str)) { + TRACE("str", tout << "prevent value testing on free var " << mk_pp(freeVar, m) << " as it belongs to one or more regex constraints." << std::endl;); + return NULL; + } + } } TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index c6dad045f..c64eec838 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -407,6 +407,7 @@ protected: obj_map > regex_terms_by_string; // S --> [ (str.in.re S *) ] obj_map > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ] std::map regex_nfa_cache; // Regex term --> NFA + obj_hashtable regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope svector char_set; std::map charSetLookupTable; @@ -538,7 +539,7 @@ protected: expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); - expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal); + expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints); void aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond); expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var); @@ -629,6 +630,7 @@ protected: std::map > & concat_eq_concat_map, std::map > & unrollGroupMap); + bool term_appears_as_subterm(expr * needle, expr * haystack); void classify_ast_by_type(expr * node, std::map & varMap, std::map & concatMap, std::map & unrollMap); void classify_ast_by_type_in_positive_context(std::map & varMap, From ca3784449f02ce860736a741befb62ff33d9549b Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 12 Jan 2018 13:53:02 -0500 Subject: [PATCH 09/36] regex failsafe and intersect WIP --- src/smt/params/smt_params_helper.pyg | 4 ++ src/smt/params/theory_str_params.cpp | 4 ++ src/smt/params/theory_str_params.h | 30 ++++++++- src/smt/theory_str.cpp | 94 ++++++++++++++++++++++++++-- src/smt/theory_str.h | 9 +++ 5 files changed, 136 insertions(+), 5 deletions(-) diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg index 7ab0d52f2..1f691d587 100644 --- a/src/smt/params/smt_params_helper.pyg +++ b/src/smt/params/smt_params_helper.pyg @@ -81,6 +81,10 @@ def_module_params(module_name='smt', ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'), ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'), ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'), + ('str.regex_automata_intersection_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex intersection heuristics'), + ('str.regex_automata_failed_automaton_threshold', UINT, 10, 'number of failed automaton construction attempts after which a full automaton is automatically built'), + ('str.regex_automata_failed_intersection_threshold', UINT, 10, 'number of failed automaton intersection attempts after which intersection is always computed'), + ('str.regex_automata_length_attempt_threshold', UINT, 10, 'number of length/path constraint attempts before checking unsatisfiability of regex terms'), ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'), ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'), ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'), diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp index fa6862161..92478bcd9 100644 --- a/src/smt/params/theory_str_params.cpp +++ b/src/smt/params/theory_str_params.cpp @@ -33,4 +33,8 @@ void theory_str_params::updt_params(params_ref const & _p) { m_OverlapTheoryAwarePriority = p.str_overlap_priority(); m_RegexAutomata = p.str_regex_automata(); m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold(); + m_RegexAutomata_IntersectionDifficultyThreshold = p.str_regex_automata_intersection_difficulty_threshold(); + m_RegexAutomata_FailedAutomatonThreshold = p.str_regex_automata_failed_automaton_threshold(); + m_RegexAutomata_FailedIntersectionThreshold = p.str_regex_automata_failed_intersection_threshold(); + m_RegexAutomata_LengthAttemptThreshold = p.str_regex_automata_length_attempt_threshold(); } diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h index 48c0205ee..8c7816839 100644 --- a/src/smt/params/theory_str_params.h +++ b/src/smt/params/theory_str_params.h @@ -93,6 +93,30 @@ struct theory_str_params { */ unsigned m_RegexAutomata_DifficultyThreshold; + /* + * RegexAutomata_IntersectionDifficultyThreshold is the lowest difficulty above which Z3str3 + * will not eagerly intersect automata to check unsatisfiability. + */ + unsigned m_RegexAutomata_IntersectionDifficultyThreshold; + + /* + * RegexAutomata_FailedAutomatonThreshold is the number of failed attempts to build an automaton + * after which a full automaton (i.e. with no length information) will be built regardless of difficulty. + */ + unsigned m_RegexAutomata_FailedAutomatonThreshold; + + /* + * RegexAutomaton_FailedIntersectionThreshold is the number of failed attempts to perform automaton + * intersection after which intersection will always be performed regardless of difficulty. + */ + unsigned m_RegexAutomata_FailedIntersectionThreshold; + + /* + * RegexAutomaton_LengthAttemptThreshold is the number of attempts to satisfy length/path constraints + * before which we begin checking unsatisfiability of a regex term. + */ + unsigned m_RegexAutomata_LengthAttemptThreshold; + theory_str_params(params_ref const & p = params_ref()): m_StrongArrangements(true), m_AggressiveLengthTesting(false), @@ -106,7 +130,11 @@ struct theory_str_params { m_BinarySearchInitialUpperBound(64), m_OverlapTheoryAwarePriority(-0.1), m_RegexAutomata(true), - m_RegexAutomata_DifficultyThreshold(1000) + m_RegexAutomata_DifficultyThreshold(1000), + m_RegexAutomata_IntersectionDifficultyThreshold(1000), + m_RegexAutomata_FailedAutomatonThreshold(10), + m_RegexAutomata_FailedIntersectionThreshold(10), + m_RegexAutomata_LengthAttemptThreshold(10) { updt_params(p); } diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 5910d9d95..6e4211f7b 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6480,6 +6480,26 @@ namespace smt { } } + void theory_str::regex_inc_counter(obj_map & counter_map, expr * key) { + unsigned old_v; + if (counter_map.find(key, old_v)) { + unsigned new_v = old_v += 1; + counter_map.insert(key, new_v); + } else { + counter_map.insert(key, 1); + } + } + + unsigned theory_str::regex_get_counter(obj_map & counter_map, expr * key) { + unsigned v; + if (counter_map.find(key, v)) { + return v; + } else { + counter_map.insert(key, 0); + return 0; + } + } + // saturating unsigned addition unsigned inline _qadd(unsigned a, unsigned b) { if (a == UINT_MAX || b == UINT_MAX) { @@ -9515,6 +9535,7 @@ namespace smt { // regex automata if (m_params.m_RegexAutomata) { + // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here bool regex_axiom_add = false; for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { expr * str_in_re = *it; @@ -9579,11 +9600,16 @@ namespace smt { regex_terms_with_path_constraints.insert(str_in_re); m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); regex_axiom_add = true; + + // increment LengthAttemptCount + regex_inc_counter(regex_length_attempt_count, str_in_re); continue; } else { // no automata available, or else all bounds assumptions are invalid unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { + CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, + tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9596,8 +9622,7 @@ namespace smt { regex_axiom_add = true; // TODO immediately attempt to learn lower/upper bound info here } else { - // TODO increment failure count - NOT_IMPLEMENTED_YET(); + regex_inc_counter(regex_fail_count, str_in_re); } continue; } @@ -9852,7 +9877,68 @@ namespace smt { } // NOT_IMPLEMENTED_YET(); - } + } // foreach (entry in regex_terms) + + for (obj_map >::iterator it = regex_terms_by_string.begin(); + it != regex_terms_by_string.end(); ++it) { + // TODO do we need to check equivalence classes of strings here? + + expr * str = it->m_key; + ptr_vector str_in_re_terms = it->m_value; + + svector intersect_constraints; + // we may find empty intersection before checking every constraint; + // this vector keeps track of which ones actually take part in intersection + svector used_intersect_constraints; + + // choose an automaton/assumption for each assigned (str.in.re) + // that's consistent with the current length information + NOT_IMPLEMENTED_YET(); + + eautomaton * aut_inter = NULL; + CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;); + for (svector::iterator aut_it = intersect_constraints.begin(); + aut_it != intersect_constraints.end(); ++aut_it) { + regex_automaton_under_assumptions aut = *aut_it; + if (aut_inter == NULL) { + // start somewhere + aut_inter = aut.get_automaton(); + used_intersect_constraints.push_back(aut); + continue; + } + + if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) { + unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty + if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold + || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) { + lbool current_assignment = ctx.get_assignment(aut.get_regex_term()); + // if the assignment is consistent with our assumption, use the automaton directly; + // otherwise, complement it (and save that automaton for next time) + // TODO we should cache these intermediate results + // TODO do we need to push the intermediates into a vector for deletion anyway? + if ( (current_assignment == l_true && aut.get_polarity()) + || (current_assignment == l_false && !aut.get_polarity())) { + aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton()); + } else { + // need to complement first + NOT_IMPLEMENTED_YET(); + } + used_intersect_constraints.push_back(aut); + if (aut_inter->is_empty()) { + break; + } + } else { + // failed intersection + regex_inc_counter(regex_intersection_fail_count, aut.get_regex_term()); + } + } + } // foreach(entry in intersect_constraints) + if (aut_inter->is_empty()) { + TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); + NOT_IMPLEMENTED_YET(); + } + } // foreach (entry in regex_terms_by_string) + if (regex_axiom_add) { return FC_CONTINUE; } diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index c64eec838..45df675fb 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -221,6 +221,7 @@ public: } eautomaton * get_automaton() const { return aut; } + expr * get_regex_term() const { return str_in_re; } bool get_polarity() const { return polarity; } virtual ~regex_automaton_under_assumptions() { @@ -409,6 +410,12 @@ protected: std::map regex_nfa_cache; // Regex term --> NFA obj_hashtable regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope + // each counter maps a (str.in.re) expression to an integer. + // use helper functions regex_inc_counter() and regex_get_counter() to access + obj_map regex_length_attempt_count; + obj_map regex_fail_count; + obj_map regex_intersection_fail_count; + svector char_set; std::map charSetLookupTable; int charSetSize; @@ -542,6 +549,8 @@ protected: expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints); void aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond); expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var); + void regex_inc_counter(obj_map & counter_map, expr * key); + unsigned regex_get_counter(obj_map & counter_map, expr * key); void set_up_axioms(expr * ex); void handle_equality(expr * lhs, expr * rhs); From 6f889ab699b4115f92530040840d2fd01489717e Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 15 Jan 2018 14:08:15 -0500 Subject: [PATCH 10/36] intersection WIP; fix polarity of generated path constraint LHS --- src/smt/theory_str.cpp | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 6e4211f7b..27757a53e 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6982,6 +6982,9 @@ namespace smt { pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); } + // TODO(mtrberzi) possibly modify this to reuse character terms over the same string, + // i.e. across different constraints over S the same variables S_0, S_1, ... are always used and refreshed + // modification of code in seq_rewriter::mk_str_in_regexp() expr_ref_vector trail(m); u_map maps[2]; @@ -9536,6 +9539,7 @@ namespace smt { // regex automata if (m_params.m_RegexAutomata) { // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here + // TODO learning of linear length constraints in the style of length automata, if possible? bool regex_axiom_add = false; for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { expr * str_in_re = *it; @@ -9584,16 +9588,22 @@ namespace smt { TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); expr_ref_vector lhs_terms(m); - lhs_terms.push_back(str_in_re); + if (current_assignment == l_true) { + lhs_terms.push_back(str_in_re); + } else { + lhs_terms.push_back(m.mk_not(str_in_re)); + } lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); expr_ref lhs(mk_and(lhs_terms), m); // If the automaton was built with the same polarity as the constraint, // assert directly. Otherwise, negate the path constraint if ( (current_assignment == l_true && assumption.get_polarity()) || (current_assignment == l_false && !assumption.get_polarity())) { + TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); assert_implication(lhs, rhs); } else { + TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); assert_implication(lhs, rhs); } @@ -9893,7 +9903,64 @@ namespace smt { // choose an automaton/assumption for each assigned (str.in.re) // that's consistent with the current length information - NOT_IMPLEMENTED_YET(); + for (ptr_vector::iterator term_it = str_in_re_terms.begin(); + term_it != str_in_re_terms.end(); ++term_it) { + expr * _unused; + expr * re; + SASSERT(u.str.is_in_re(*term_it)); + u.str.is_in_re(*term_it, _unused, re); + + rational exact_len; + bool has_exact_len = get_len_value(str, exact_len); + + rational lb, ub; + bool has_lower_bound = lower_bound(mk_strlen(str), lb); + bool has_upper_bound = upper_bound(mk_strlen(str), ub); + + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + for (svector::iterator aut_it = regex_automaton_assumptions[re].begin(); + aut_it != regex_automaton_assumptions[re].end(); ++aut_it) { + regex_automaton_under_assumptions aut = *aut_it; + rational aut_ub; + bool assume_ub = aut.get_upper_bound(aut_ub); + rational aut_lb; + bool assume_lb = aut.get_lower_bound(aut_lb); + bool consistent = true; + + if (assume_ub) { + // check consistency of assumed upper bound + if (has_exact_len) { + if (exact_len > aut_ub) { + consistent = false; + } + } else { + if (has_upper_bound && ub > aut_ub) { + consistent = false; + } + } + } + + if (assume_lb) { + // check consistency of assumed lower bound + if (has_exact_len) { + if (exact_len < aut_lb) { + consistent = false; + } + } else { + if (has_lower_bound && lb < aut_lb) { + consistent = false; + } + } + } + + if (consistent) { + intersect_constraints.push_back(aut); + break; + } + } + } + } // foreach(term in str_in_re_terms) eautomaton * aut_inter = NULL; CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;); @@ -9933,7 +10000,8 @@ namespace smt { } } } // foreach(entry in intersect_constraints) - if (aut_inter->is_empty()) { + TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); + if (aut_inter != NULL && aut_inter->is_empty()) { TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); NOT_IMPLEMENTED_YET(); } From 058d24fd096ec30b0f55fe84afa56c9213b045e7 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 15 Jan 2018 14:30:12 -0500 Subject: [PATCH 11/36] reuse regex character constraints for the same string --- src/smt/theory_str.cpp | 43 +++++++++++++++++++++++++++++++++--------- src/smt/theory_str.h | 1 + 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 27757a53e..1fc04c77f 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6974,16 +6974,41 @@ namespace smt { expr_ref_vector pathChars(m); expr_ref_vector pathChars_len_constraints(m); - for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { - std::stringstream ss; - ss << "ch" << i; - expr_ref ch(mk_str_var(ss.str()), m); - pathChars.push_back(ch); - pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); - } - // TODO(mtrberzi) possibly modify this to reuse character terms over the same string, - // i.e. across different constraints over S the same variables S_0, S_1, ... are always used and refreshed + // reuse character terms over the same string + if (string_chars.contains(stringTerm)) { + // find out whether we have enough characters already + ptr_vector old_chars; + string_chars.find(stringTerm, old_chars); + if (old_chars.size() < lenVal.get_unsigned()) { + for (unsigned i = old_chars.size(); i < lenVal.get_unsigned(); ++i) { + std::stringstream ss; + ss << "ch" << i; + expr_ref ch(mk_str_var(ss.str()), m); + m_trail.push_back(ch); + old_chars.push_back(ch); + } + } + string_chars.insert(stringTerm, old_chars); + // now we're guaranteed to have at least the right number of characters in old_chars + for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { + expr_ref ch(old_chars.get(i), m); + refresh_theory_var(ch); + pathChars.push_back(ch); + pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); + } + } else { + ptr_vector new_chars; + for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { + std::stringstream ss; + ss << "ch" << i; + expr_ref ch(mk_str_var(ss.str()), m); + pathChars.push_back(ch); + pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); + new_chars.push_back(ch); + } + string_chars.insert(stringTerm, new_chars); + } // modification of code in seq_rewriter::mk_str_in_regexp() expr_ref_vector trail(m); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 45df675fb..86929d4d3 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -415,6 +415,7 @@ protected: obj_map regex_length_attempt_count; obj_map regex_fail_count; obj_map regex_intersection_fail_count; + obj_map > string_chars; // S --> [S_0, S_1, ...] for character terms S_i svector char_set; std::map charSetLookupTable; From 191cc30e2a6116b1f3c03c05495bb52da762275a Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 15 Jan 2018 15:30:12 -0500 Subject: [PATCH 12/36] intersection of regex constraints produces a conflict clause --- src/smt/theory_str.cpp | 56 +++++++++++++++++++++++++++++++++++++----- src/smt/theory_str.h | 10 ++++---- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 1fc04c77f..4c36b0f45 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9637,7 +9637,13 @@ namespace smt { regex_axiom_add = true; // increment LengthAttemptCount - regex_inc_counter(regex_length_attempt_count, str_in_re); + regex_inc_counter(regex_length_attempt_count, re); + + { + unsigned v = regex_get_counter(regex_length_attempt_count, re); + TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;); + } + continue; } else { // no automata available, or else all bounds assumptions are invalid @@ -9910,8 +9916,6 @@ namespace smt { } } } - - // NOT_IMPLEMENTED_YET(); } // foreach (entry in regex_terms) for (obj_map >::iterator it = regex_terms_by_string.begin(); @@ -9999,11 +10003,19 @@ namespace smt { continue; } + { + unsigned v = regex_get_counter(regex_length_attempt_count, aut.get_regex_term()); + TRACE("str", tout << "length attempt count of " << mk_pp(aut.get_regex_term(), m) << " is " << v + << ", threshold is " << m_params.m_RegexAutomata_LengthAttemptThreshold << std::endl;); + } + if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) { unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) { - lbool current_assignment = ctx.get_assignment(aut.get_regex_term()); + + expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); + lbool current_assignment = ctx.get_assignment(str_in_re_term); // if the assignment is consistent with our assumption, use the automaton directly; // otherwise, complement it (and save that automaton for next time) // TODO we should cache these intermediate results @@ -10013,7 +10025,12 @@ namespace smt { aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton()); } else { // need to complement first - NOT_IMPLEMENTED_YET(); + expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m); + eautomaton * aut_c = m_mk_aut(rc); + regex_automata.push_back(aut_c); + // TODO is there any way to build a complement automaton from an existing one? + // this discards length information + aut_inter = m_mk_aut.mk_product(aut_inter, aut_c); } used_intersect_constraints.push_back(aut); if (aut_inter->is_empty()) { @@ -10028,7 +10045,34 @@ namespace smt { TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); if (aut_inter != NULL && aut_inter->is_empty()) { TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); - NOT_IMPLEMENTED_YET(); + expr_ref_vector conflict_terms(m); + + for (svector::iterator aut_it = used_intersect_constraints.begin(); + aut_it != used_intersect_constraints.end(); ++aut_it) { + regex_automaton_under_assumptions aut = *aut_it; + expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); + lbool current_assignment = ctx.get_assignment(str_in_re_term); + if (current_assignment == l_true) { + conflict_terms.push_back(str_in_re_term); + } else if (current_assignment == l_false) { + conflict_terms.push_back(m.mk_not(str_in_re_term)); + } + // add length assumptions, if any + rational ub; + if (aut.get_upper_bound(ub)) { + expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m); + conflict_terms.push_back(ub_term); + } + rational lb; + if (aut.get_lower_bound(lb)) { + expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m); + conflict_terms.push_back(lb_term); + } + } + + expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m); + assert_axiom(conflict_clause); + regex_axiom_add = true; } } // foreach (entry in regex_terms_by_string) diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 86929d4d3..acab8e019 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -168,7 +168,7 @@ public: class regex_automaton_under_assumptions { protected: - expr * str_in_re; + expr * re_term; eautomaton * aut; bool polarity; @@ -179,11 +179,11 @@ protected: rational upper_bound; public: regex_automaton_under_assumptions() : - str_in_re(NULL), aut(NULL), polarity(false), + re_term(NULL), aut(NULL), polarity(false), assume_lower_bound(false), assume_upper_bound(false) {} - regex_automaton_under_assumptions(expr * str_in_re, eautomaton * aut, bool polarity) : - str_in_re(str_in_re), aut(aut), polarity(polarity), + regex_automaton_under_assumptions(expr * re_term, eautomaton * aut, bool polarity) : + re_term(re_term), aut(aut), polarity(polarity), assume_lower_bound(false), assume_upper_bound(false) {} void set_lower_bound(rational & lb) { @@ -221,7 +221,7 @@ public: } eautomaton * get_automaton() const { return aut; } - expr * get_regex_term() const { return str_in_re; } + expr * get_regex_term() const { return re_term; } bool get_polarity() const { return polarity; } virtual ~regex_automaton_under_assumptions() { From 6c4c9a10e4be2ae1baf646580f635a831f319b02 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 16 Jan 2018 13:16:31 -0500 Subject: [PATCH 13/36] regex length linearity check WIP --- src/smt/theory_str.cpp | 34 ++++++++++++++++++++++++++++++++++ src/smt/theory_str.h | 2 ++ 2 files changed, 36 insertions(+) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 4c36b0f45..c19b17aec 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6604,6 +6604,40 @@ namespace smt { return 1; } } + + // Check whether a regex translates well to a linear set of length constraints. + bool theory_str::check_regex_length_linearity(expr * re) { + return check_regex_length_linearity_helper(re, false); + } + + bool theory_str::check_regex_length_linearity_helper(expr * re, bool already_star) { + expr * sub1; + expr * sub2; + if (u.re.is_to_re(re)) { + return true; + } else if (u.re.is_concat(re, sub1, sub2)) { + return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); + } else if (u.re.is_union(re, sub1, sub2)) { + return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); + } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { + if (already_star) { + return false; + } else { + return check_regex_length_linearity_helper(sub1, true); + } + } else if (u.re.is_range(re)) { + return true; + } else if (u.re.is_full(re)) { + return true; + } else if (u.re.is_complement(re)) { + // TODO can we do better? + return false; + } else { + TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); + UNREACHABLE(); return false; + } + } + /* * Infer all length constraints implied by the given regular expression `re` * in order to constrain `lenVar` (which must be of sort Int). diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index acab8e019..26a6acf8d 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -544,6 +544,8 @@ protected: // regex automata and length-aware regex unsigned estimate_regex_complexity(expr * re); unsigned estimate_regex_complexity_under_complement(expr * re); + bool check_regex_length_linearity(expr * re); + bool check_regex_length_linearity_helper(expr * re, bool already_star); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); From 153701eabe3562fd169d4bc863ebb1b5827fe842 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 16 Jan 2018 13:56:01 -0500 Subject: [PATCH 14/36] regex length term assertion WIP --- src/smt/theory_str.cpp | 49 ++++++++++++++++++++++++++++++++++++++++-- src/smt/theory_str.h | 3 +++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index c19b17aec..59b3b7753 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6735,8 +6735,6 @@ namespace smt { bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) { ENSURE(aut != NULL); - ast_manager & m = get_manager(); - if (aut->final_states().size() < 1) { // no solutions at all refined_lower_bound = rational::minus_one(); @@ -9610,6 +9608,53 @@ namespace smt { if (current_assignment == l_undef) { continue; } + + if (!regex_terms_with_length_constraints.contains(str_in_re)) { + if (current_assignment == l_true && check_regex_length_linearity(re)) { + TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;); + + if (regex_term_to_length_constraint.contains(str_in_re)) { + // use existing length constraint + expr * top_level_length_constraint; + regex_term_to_length_constraint.find(str_in_re, top_level_length_constraint); + + ptr_vector extra_length_vars; + regex_term_to_extra_length_vars.find(str_in_re, extra_length_vars); + + assert_axiom(top_level_length_constraint); + for(ptr_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { + expr * v = *it; + refresh_theory_var(v); + expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); + assert_axiom(len_constraint); + } + } else { + // generate new length constraint + expr_ref_vector extra_length_vars(m); + expr_ref top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars); + TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;); + // assert and track length constraint + assert_axiom(top_level_length_constraint); + for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { + expr * v = *it; + expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); + assert_axiom(len_constraint); + } + + regex_term_to_length_constraint.insert(str_in_re, top_level_length_constraint); + ptr_vector vtmp; + for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { + vtmp.push_back(*it); + } + regex_term_to_extra_length_vars.insert(str_in_re, vtmp); + } + + regex_terms_with_length_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_length_constraints, str_in_re)); + regex_axiom_add = true; + } + } // re not in regex_terms_with_length_constraints + rational exact_length_value; if (get_len_value(str, exact_length_value)) { TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 26a6acf8d..0d56211cd 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -409,6 +409,9 @@ protected: obj_map > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ] std::map regex_nfa_cache; // Regex term --> NFA obj_hashtable regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope + obj_hashtable regex_terms_with_length_constraints; // set of regex terms which had had length constraints asserted in the current scope + obj_map regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R) + obj_map > regex_term_to_extra_length_vars; // extra length vars used in regex_term_to_length_constraint entries // each counter maps a (str.in.re) expression to an integer. // use helper functions regex_inc_counter() and regex_get_counter() to access From e5585ecf4c65e1c7427339a6a86998afad450220 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 16 Jan 2018 18:15:29 -0500 Subject: [PATCH 15/36] regex fail count and automaton fallback --- src/smt/theory_str.cpp | 29 +++++++++++++++++++++-------- src/smt/theory_str.h | 1 + 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 59b3b7753..474dd9c39 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9854,8 +9854,11 @@ namespace smt { } else { // TODO check negation? // TODO construct a partial automaton for R to the given upper bound? - // TODO increment failure count if we can't - NOT_IMPLEMENTED_YET(); + if (false) { + + } else { + regex_inc_counter(regex_fail_count, str_in_re); + } } continue; } @@ -9866,8 +9869,8 @@ namespace smt { } } else { // !upper_bound_exists // no upper bound information - if (lower_bound_exists) { - // lower bound, no upper bound + if (lower_bound_exists && !lower_bound_value.is_zero()) { + // nonzero lower bound, no upper bound // check current assumptions if (regex_automaton_assumptions.contains(re) && @@ -9967,7 +9970,11 @@ namespace smt { // TODO check negation? // TODO construct a partial automaton for R to the given lower bound? // TODO increment failure count - NOT_IMPLEMENTED_YET(); + if (false) { + + } else { + regex_inc_counter(regex_fail_count, str_in_re); + } } continue; } @@ -9976,10 +9983,12 @@ namespace smt { // check for existing automata; // try to construct an automaton if we don't have one yet // and doing so without bounds is not difficult - NOT_IMPLEMENTED_YET(); - if (true) { + bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()); + bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); + if (!existingAutomata || failureThresholdExceeded) { unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold + || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9991,7 +10000,11 @@ namespace smt { TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; // TODO immediately attempt to learn lower/upper bound info here + } else { + regex_inc_counter(regex_fail_count, str_in_re); } + } else { + regex_inc_counter(regex_fail_count, str_in_re); } } } diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 0d56211cd..7398f599b 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -418,6 +418,7 @@ protected: obj_map regex_length_attempt_count; obj_map regex_fail_count; obj_map regex_intersection_fail_count; + obj_map > string_chars; // S --> [S_0, S_1, ...] for character terms S_i svector char_set; From 26ab91a4480f60a0c8ecd7a6d5cac8bc502eecc9 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 17 Jan 2018 13:02:32 -0500 Subject: [PATCH 16/36] check duplicate bounds info for regex terms --- src/smt/theory_str.cpp | 32 ++++++++++++++++++++++++++++++-- src/smt/theory_str.h | 5 +++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 474dd9c39..3dccc5681 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9755,7 +9755,32 @@ namespace smt { CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;); CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;); - if (upper_bound_exists) { + bool new_lower_bound_info = true; + bool new_upper_bound_info = true; + // check last seen lower/upper bound to avoid performing duplicate work + if (regex_last_lower_bound.contains(str)) { + rational last_lb_value; + regex_last_lower_bound.find(str, last_lb_value); + if (last_lb_value == lower_bound_value) { + new_lower_bound_info = false; + } + } + if (regex_last_upper_bound.contains(str)) { + rational last_ub_value; + regex_last_upper_bound.find(str, last_ub_value); + if (last_ub_value == upper_bound_value) { + new_upper_bound_info = false; + } + } + + if (new_lower_bound_info) { + regex_last_lower_bound.insert(str, lower_bound_value); + } + if (new_upper_bound_info) { + regex_last_upper_bound.insert(str, upper_bound_value); + } + + if (upper_bound_exists && new_upper_bound_info) { // check current assumptions if (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()){ @@ -9869,7 +9894,7 @@ namespace smt { } } else { // !upper_bound_exists // no upper bound information - if (lower_bound_exists && !lower_bound_value.is_zero()) { + if (lower_bound_exists && !lower_bound_value.is_zero() && new_lower_bound_info) { // nonzero lower bound, no upper bound // check current assumptions @@ -9930,10 +9955,13 @@ namespace smt { rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true))); } else { // If there are solutions at and above the lower bound, add an additional bound. + // DISABLED as this is causing non-termination in the integer solver. --mtrberzi + /* rhs.push_back(m.mk_or( ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)), m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)) )); + */ } } else { if (refined_lower_bound.is_minus_one()) { diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 7398f599b..cf3958ceb 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -413,6 +413,11 @@ protected: obj_map regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R) obj_map > regex_term_to_extra_length_vars; // extra length vars used in regex_term_to_length_constraint entries + // keep track of the last lower/upper bound we saw for each string term + // so we don't perform duplicate work + obj_map regex_last_lower_bound; + obj_map regex_last_upper_bound; + // each counter maps a (str.in.re) expression to an integer. // use helper functions regex_inc_counter() and regex_get_counter() to access obj_map regex_length_attempt_count; From c0ed683882c77143539050149cb1e6ef27cbd735 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 17 Jan 2018 13:32:44 -0500 Subject: [PATCH 17/36] disable regex length constraint generation as it currently makes unsound axioms --- src/smt/theory_str.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 3dccc5681..9aec6fab7 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9609,6 +9609,8 @@ namespace smt { continue; } + // DISABLED due to bug -- star-over-union (and possibly others) results in unsound constraints + /* if (!regex_terms_with_length_constraints.contains(str_in_re)) { if (current_assignment == l_true && check_regex_length_linearity(re)) { TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;); @@ -9654,6 +9656,7 @@ namespace smt { regex_axiom_add = true; } } // re not in regex_terms_with_length_constraints + */ rational exact_length_value; if (get_len_value(str, exact_length_value)) { From c2b268c64555199e9134948474899e87eb7b9707 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 17 Jan 2018 18:26:31 -0500 Subject: [PATCH 18/36] short path for length-0 regex terms --- src/smt/theory_str.cpp | 324 ++++++++++++++++++++++++++--------------- src/smt/theory_str.h | 1 + 2 files changed, 206 insertions(+), 119 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 9aec6fab7..e25dc84f3 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6646,6 +6646,9 @@ namespace smt { * In some cases, the returned formula requires one or more free integer variables to be created. * These variables are returned in the reference parameter `freeVariables`. * Extra assertions should be made for these free variables constraining them to be non-negative. + * + * TODO: star unrolling? + * TODO: generate stars "correctly" as a linear combination of all possible subterm lengths */ expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { ENSURE(u.is_re(re)); @@ -6725,6 +6728,59 @@ namespace smt { } } + /* + * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding + * to the automaton `aut`. + * This asserts a constraint of the form: + * str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub + * where the upper bound clause is omitted if the upper bound doesn't exist + * and the equality with 0 is based on whether solutions of length 0 are allowed. + */ + void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) { + ENSURE(aut != NULL); + context & ctx = get_context(); + ast_manager & m = get_manager(); + + expr_ref_vector rhs(m); + expr * str; + expr * re; + u.str.is_in_re(str_in_re, str, re); + expr_ref strlen(mk_strlen(str), m); + + // lower bound first + rational nonzero_lower_bound; + bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound); + if (zero_sol_exists) { + regex_last_lower_bound.insert(str, rational::zero()); + // solution at 0 + if (!nonzero_lower_bound.is_minus_one()) { + expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m); + expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); + rhs.push_back(m.mk_or(rhs1, rhs2)); + } else { + // shouldn't happen + UNREACHABLE(); + } + } else { + // no solution at 0 + if (!nonzero_lower_bound.is_minus_one()) { + regex_last_lower_bound.insert(str, nonzero_lower_bound); + expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); + rhs.push_back(rhs2); + } else { + // shouldn't happen + UNREACHABLE(); + } + } + // TODO upper bound check + + if (!rhs.empty()) { + expr_ref lhs(str_in_re, m); + expr_ref _rhs(mk_and(rhs), m); + assert_implication(lhs, _rhs); + } + } + /* * Refine the lower bound on the length of a solution to a given automaton. * The method returns TRUE if a solution of length `current_lower_bound` exists, @@ -9596,7 +9652,6 @@ namespace smt { // regex automata if (m_params.m_RegexAutomata) { // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here - // TODO learning of linear length constraints in the style of length automata, if possible? bool regex_axiom_add = false; for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { expr * str_in_re = *it; @@ -9667,89 +9722,96 @@ namespace smt { continue; } - // find a consistent automaton for this term - bool found = false; - regex_automaton_under_assumptions assumption; - if (regex_automaton_assumptions.contains(re) && - !regex_automaton_assumptions[re].empty()){ - for (svector::iterator it = regex_automaton_assumptions[re].begin(); - it != regex_automaton_assumptions[re].end(); ++it) { - regex_automaton_under_assumptions autA = *it; - rational assumed_upper_bound, assumed_lower_bound; - bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); - bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); - if (!assumes_upper_bound && !assumes_lower_bound) { - // automaton with no assumptions is always usable - assumption = autA; - found = true; - break; - } - // check consistency of bounds assumptions - } // foreach(a in regex_automaton_assumptions) - } - if (found) { - expr_ref pathConstraint(m); - expr_ref characterConstraints(m); - pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); - TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;); - TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); - - expr_ref_vector lhs_terms(m); - if (current_assignment == l_true) { - lhs_terms.push_back(str_in_re); - } else { - lhs_terms.push_back(m.mk_not(str_in_re)); - } - lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); - expr_ref lhs(mk_and(lhs_terms), m); - // If the automaton was built with the same polarity as the constraint, - // assert directly. Otherwise, negate the path constraint - if ( (current_assignment == l_true && assumption.get_polarity()) - || (current_assignment == l_false && !assumption.get_polarity())) { - TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); - expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); - assert_implication(lhs, rhs); - } else { - TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); - expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); - assert_implication(lhs, rhs); - } - regex_terms_with_path_constraints.insert(str_in_re); - m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); - regex_axiom_add = true; - - // increment LengthAttemptCount - regex_inc_counter(regex_length_attempt_count, re); - - { - unsigned v = regex_get_counter(regex_length_attempt_count, re); - TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;); - } - - continue; + if (exact_length_value.is_zero()) { + // shortcut + expr_ref lhs(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m); + expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m); + assert_implication(lhs, rhs); } else { - // no automata available, or else all bounds assumptions are invalid - unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { - CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, - tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); - eautomaton * aut = m_mk_aut(re); - aut->compress(); - regex_automata.push_back(aut); - regex_automaton_under_assumptions new_aut(re, aut, true); - if (!regex_automaton_assumptions.contains(re)) { - regex_automaton_assumptions.insert(re, svector()); - } - regex_automaton_assumptions[re].push_back(new_aut); - TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); - regex_axiom_add = true; - // TODO immediately attempt to learn lower/upper bound info here - } else { - regex_inc_counter(regex_fail_count, str_in_re); + // find a consistent automaton for this term + bool found = false; + regex_automaton_under_assumptions assumption; + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + rational assumed_upper_bound, assumed_lower_bound; + bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); + bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); + if (!assumes_upper_bound && !assumes_lower_bound) { + // automaton with no assumptions is always usable + assumption = autA; + found = true; + break; + } + // check consistency of bounds assumptions + } // foreach(a in regex_automaton_assumptions) } - continue; - } - } + if (found) { + expr_ref pathConstraint(m); + expr_ref characterConstraints(m); + pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); + TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;); + TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); + + expr_ref_vector lhs_terms(m); + if (current_assignment == l_true) { + lhs_terms.push_back(str_in_re); + } else { + lhs_terms.push_back(m.mk_not(str_in_re)); + } + lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); + expr_ref lhs(mk_and(lhs_terms), m); + // If the automaton was built with the same polarity as the constraint, + // assert directly. Otherwise, negate the path constraint + if ( (current_assignment == l_true && assumption.get_polarity()) + || (current_assignment == l_false && !assumption.get_polarity())) { + TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); + expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); + assert_implication(lhs, rhs); + } else { + TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); + expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); + assert_implication(lhs, rhs); + } + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + regex_axiom_add = true; + + // increment LengthAttemptCount + regex_inc_counter(regex_length_attempt_count, re); + + { + unsigned v = regex_get_counter(regex_length_attempt_count, re); + TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;); + } + + continue; + } else { + // no automata available, or else all bounds assumptions are invalid + unsigned expected_complexity = estimate_regex_complexity(re); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { + CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, + tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + find_automaton_initial_bounds(str_in_re, aut); + } else { + regex_inc_counter(regex_fail_count, str_in_re); + } + continue; + } + } // !length is zero + } // get_len_value() expr_ref str_len(mk_strlen(str), m); rational lower_bound_value; rational upper_bound_value; @@ -9858,10 +9920,12 @@ namespace smt { } } - expr_ref lhs_terms(mk_and(lhs), m); - expr_ref rhs_terms(mk_and(rhs), m); - assert_implication(lhs_terms, rhs_terms); - regex_axiom_add = true; + if (!rhs.empty()) { + expr_ref lhs_terms(mk_and(lhs), m); + expr_ref rhs_terms(mk_and(rhs), m); + assert_implication(lhs_terms, rhs_terms); + regex_axiom_add = true; + } } } else { // no existing automata/assumptions. @@ -9878,7 +9942,7 @@ namespace smt { regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; - // TODO immediately attempt to learn lower/upper bound info here + find_automaton_initial_bounds(str_in_re, aut); } else { // TODO check negation? // TODO construct a partial automaton for R to the given upper bound? @@ -9976,10 +10040,12 @@ namespace smt { } } - expr_ref lhs_terms(mk_and(lhs), m); - expr_ref rhs_terms(mk_and(rhs), m); - assert_implication(lhs_terms, rhs_terms); - regex_axiom_add = true; + if (!rhs.empty()) { + expr_ref lhs_terms(mk_and(lhs), m); + expr_ref rhs_terms(mk_and(rhs), m); + assert_implication(lhs_terms, rhs_terms); + regex_axiom_add = true; + } } } else { // no existing automata/assumptions. @@ -9996,7 +10062,7 @@ namespace smt { regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; - // TODO immediately attempt to learn lower/upper bound info here + find_automaton_initial_bounds(str_in_re, aut); } else { // TODO check negation? // TODO construct a partial automaton for R to the given lower bound? @@ -10016,7 +10082,7 @@ namespace smt { // and doing so without bounds is not difficult bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()); bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); - if (!existingAutomata || failureThresholdExceeded) { + if (!existingAutomata) { unsigned expected_complexity = estimate_regex_complexity(re); if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { @@ -10030,7 +10096,7 @@ namespace smt { regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; - // TODO immediately attempt to learn lower/upper bound info here + find_automaton_initial_bounds(str_in_re, aut); } else { regex_inc_counter(regex_fail_count, str_in_re); } @@ -10165,42 +10231,62 @@ namespace smt { } } } // foreach(entry in intersect_constraints) + aut_inter->compress(); TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); - if (aut_inter != NULL && aut_inter->is_empty()) { - TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); - expr_ref_vector conflict_terms(m); - for (svector::iterator aut_it = used_intersect_constraints.begin(); - aut_it != used_intersect_constraints.end(); ++aut_it) { - regex_automaton_under_assumptions aut = *aut_it; - expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); - lbool current_assignment = ctx.get_assignment(str_in_re_term); - if (current_assignment == l_true) { - conflict_terms.push_back(str_in_re_term); - } else if (current_assignment == l_false) { - conflict_terms.push_back(m.mk_not(str_in_re_term)); - } - // add length assumptions, if any - rational ub; - if (aut.get_upper_bound(ub)) { - expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m); - conflict_terms.push_back(ub_term); - } - rational lb; - if (aut.get_lower_bound(lb)) { - expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m); - conflict_terms.push_back(lb_term); + expr_ref_vector conflict_terms(m); + expr_ref conflict_lhs(m); + for (svector::iterator aut_it = used_intersect_constraints.begin(); + aut_it != used_intersect_constraints.end(); ++aut_it) { + regex_automaton_under_assumptions aut = *aut_it; + expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); + lbool current_assignment = ctx.get_assignment(str_in_re_term); + if (current_assignment == l_true) { + conflict_terms.push_back(str_in_re_term); + } else if (current_assignment == l_false) { + conflict_terms.push_back(m.mk_not(str_in_re_term)); + } + // add length assumptions, if any + rational ub; + if (aut.get_upper_bound(ub)) { + expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m); + conflict_terms.push_back(ub_term); + } + rational lb; + if (aut.get_lower_bound(lb)) { + expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m); + conflict_terms.push_back(lb_term); + } + } + conflict_lhs = mk_and(conflict_terms); + + if (used_intersect_constraints.size() > 1 && aut_inter != NULL) { + // check whether the intersection is only the empty string + unsigned initial_state = aut_inter->init(); + if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) { + // initial state is final and it is the only final state + // if there are no moves from the initial state, + // the only solution is the empty string + if (aut_inter->get_moves_from(initial_state).empty()) { + TRACE("str", tout << "product automaton only accepts empty string" << std::endl;); + expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m); + expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m); + expr_ref rhs(m.mk_and(rhs1, rhs2), m); + assert_implication(conflict_lhs, rhs); + regex_axiom_add = true; } } + } + if (aut_inter != NULL && aut_inter->is_empty()) { + TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m); assert_axiom(conflict_clause); regex_axiom_add = true; } } // foreach (entry in regex_terms_by_string) - if (regex_axiom_add) { - return FC_CONTINUE; + //return FC_CONTINUE; } } // RegexAutomata diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index cf3958ceb..4e790e838 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -556,6 +556,7 @@ protected: bool check_regex_length_linearity(expr * re); bool check_regex_length_linearity_helper(expr * re, bool already_star); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); + void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints); From dbb15f65b5d6deba9cefbd4f9298a3e8fe4c6fe5 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 17 Jan 2018 19:26:42 -0500 Subject: [PATCH 19/36] correct generation of linear length constraints for regex star terms --- src/smt/theory_str.cpp | 97 ++++++++++++++++++++++++++++++++++++++++-- src/smt/theory_str.h | 3 ++ 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index e25dc84f3..c4e6a9471 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6638,6 +6638,68 @@ namespace smt { } } + // note: returns an empty set `lens` if something went wrong + void theory_str::check_subterm_lengths(expr * re, integer_set & lens) { + expr * sub1; + expr * sub2; + if (u.re.is_to_re(re, sub1)) { + SASSERT(u.str.is_string(sub1)); + zstring(str); + u.str.is_string(sub1, str); + lens.insert(str.length()); + } else if (u.re.is_concat(re, sub1, sub2)) { + integer_set lens_1, lens_2; + check_subterm_lengths(sub1, lens_1); + check_subterm_lengths(sub2, lens_2); + if (lens_1.empty() || lens_2.empty()) { + lens.reset(); + } else { + // take all pairwise lengths + for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { + for(integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { + int l1 = *it1; + int l2 = *it2; + lens.insert(l1 + l2); + } + } + } + } else if (u.re.is_union(re, sub1, sub2)) { + integer_set lens_1, lens_2; + check_subterm_lengths(sub1, lens_1); + check_subterm_lengths(sub2, lens_2); + if (lens_1.empty() || lens_2.empty()) { + lens.reset(); + } else { + // take all possibilities from either side + for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { + lens.insert(*it1); + } + for (integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { + lens.insert(*it2); + } + } + } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { + // this is bad -- term generation requires this not to appear + lens.reset(); + } else if (u.re.is_range(re, sub1, sub2)) { + SASSERT(u.str.is_string(sub1)); + SASSERT(u.str.is_string(sub2)); + zstring str1, str2; + u.str.is_string(sub1, str1); + u.str.is_string(sub2, str2); + SASSERT(str1.length() == 1); + SASSERT(str2.length() == 1); + lens.insert(1); + } else if (u.re.is_full(re)) { + lens.insert(1); + } else if (u.re.is_complement(re)) { + lens.reset(); + } else { + TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); + lens.reset(); + } + } + /* * Infer all length constraints implied by the given regular expression `re` * in order to constrain `lenVar` (which must be of sort Int). @@ -6648,7 +6710,6 @@ namespace smt { * Extra assertions should be made for these free variables constraining them to be non-negative. * * TODO: star unrolling? - * TODO: generate stars "correctly" as a linear combination of all possible subterm lengths */ expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { ENSURE(u.is_re(re)); @@ -6682,6 +6743,9 @@ namespace smt { expr_ref retval(mk_and(finalResult), m); return retval; } else if (u.re.is_star(re, sub1)) { + // stars are generated as a linear combination of all possible subterm lengths; + // this requires that there are no stars under this one + /* expr * v = mk_int_var("rlen"); expr * n = mk_int_var("rstar"); freeVariables.push_back(v); @@ -6692,7 +6756,32 @@ namespace smt { finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); expr_ref retval(mk_and(finalResult), m); return retval; + */ + integer_set subterm_lens; + check_subterm_lengths(sub1, subterm_lens); + if (subterm_lens.empty()) { + // somehow generation was impossible + expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); + return retval; + } else { + TRACE("str", tout << "subterm lengths:"; + for(integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { + tout << " " << *it; + } + tout << std::endl;); + expr_ref_vector sum_terms(m); + for (integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { + rational lenOption(*it); + expr * n = mk_int_var("rstar"); + freeVariables.push_back(n); + expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m); + sum_terms.push_back(term); + } + expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m); + return retval; + } } else if (u.re.is_plus(re, sub1)) { + /* expr * v = mk_int_var("rlen"); expr * n = mk_int_var("rstar"); freeVariables.push_back(v); @@ -6704,6 +6793,9 @@ namespace smt { finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true))); expr_ref retval(mk_and(finalResult), m); return retval; + */ + // TODO this should really be done as a sub-case of star + NOT_IMPLEMENTED_YET(); } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); @@ -9664,8 +9756,6 @@ namespace smt { continue; } - // DISABLED due to bug -- star-over-union (and possibly others) results in unsound constraints - /* if (!regex_terms_with_length_constraints.contains(str_in_re)) { if (current_assignment == l_true && check_regex_length_linearity(re)) { TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;); @@ -9711,7 +9801,6 @@ namespace smt { regex_axiom_add = true; } } // re not in regex_terms_with_length_constraints - */ rational exact_length_value; if (get_len_value(str, exact_length_value)) { diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index 4e790e838..ea4449b13 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -20,6 +20,7 @@ #include "util/trail.h" #include "util/union_find.h" #include "util/scoped_ptr_vector.h" +#include "util/hashtable.h" #include "ast/ast_pp.h" #include "ast/arith_decl_plugin.h" #include "ast/rewriter/th_rewriter.h" @@ -37,6 +38,7 @@ namespace smt { typedef hashtable symbol_set; +typedef int_hashtable > integer_set; class str_value_factory : public value_factory { seq_util u; @@ -556,6 +558,7 @@ protected: bool check_regex_length_linearity(expr * re); bool check_regex_length_linearity_helper(expr * re, bool already_star); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); + void check_subterm_lengths(expr * re, integer_set & lens); void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); From 5727950a3ced8c32a3a7d19680cd0dc5717fee7e Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 18 Jan 2018 17:52:55 -0500 Subject: [PATCH 20/36] zero-length automaton solution fix --- src/smt/theory_str.cpp | 148 ++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 48 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index c4e6a9471..1f84b5faf 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9811,33 +9811,85 @@ namespace smt { continue; } - if (exact_length_value.is_zero()) { - // shortcut - expr_ref lhs(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m); - expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m); - assert_implication(lhs, rhs); - } else { - // find a consistent automaton for this term - bool found = false; - regex_automaton_under_assumptions assumption; - if (regex_automaton_assumptions.contains(re) && - !regex_automaton_assumptions[re].empty()){ - for (svector::iterator it = regex_automaton_assumptions[re].begin(); - it != regex_automaton_assumptions[re].end(); ++it) { - regex_automaton_under_assumptions autA = *it; - rational assumed_upper_bound, assumed_lower_bound; - bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); - bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); - if (!assumes_upper_bound && !assumes_lower_bound) { - // automaton with no assumptions is always usable - assumption = autA; - found = true; - break; + + // find a consistent automaton for this term + bool found = false; + regex_automaton_under_assumptions assumption; + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + rational assumed_upper_bound, assumed_lower_bound; + bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); + bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); + if (!assumes_upper_bound && !assumes_lower_bound) { + // automaton with no assumptions is always usable + assumption = autA; + found = true; + break; + } + // check consistency of bounds assumptions + } // foreach(a in regex_automaton_assumptions) + } + if (found) { + if (exact_length_value.is_zero()) { + // check consistency of 0-length solution with automaton + eautomaton * aut = assumption.get_automaton(); + bool zero_solution = false; + unsigned initial_state = aut->init(); + if (aut->is_final_state(initial_state)) { + zero_solution = true; + } else { + unsigned_vector eps_states; + aut->get_epsilon_closure(initial_state, eps_states); + for (unsigned_vector::iterator it = eps_states.begin(); it != eps_states.end(); ++it) { + unsigned state = *it; + if (aut->is_final_state(state)) { + zero_solution = true; + break; + } } - // check consistency of bounds assumptions - } // foreach(a in regex_automaton_assumptions) - } - if (found) { + } + + // now check polarity of automaton wrt. original term + if ( (current_assignment == l_true && !assumption.get_polarity()) + || (current_assignment == l_false && assumption.get_polarity())) { + // invert sense + NOT_IMPLEMENTED_YET(); + } + + if (zero_solution) { + TRACE("str", tout << "zero-length solution OK -- asserting empty path constraint" << std::endl;); + expr_ref_vector lhs_terms(m); + if (current_assignment == l_true) { + lhs_terms.push_back(str_in_re); + } else { + lhs_terms.push_back(m.mk_not(str_in_re)); + } + lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); + expr_ref lhs(mk_and(lhs_terms), m); + expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m); + assert_implication(lhs, rhs); + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + } else { + TRACE("str", tout << "zero-length solution not admitted by this automaton -- asserting conflict clause" << std::endl;); + expr_ref_vector lhs_terms(m); + if (current_assignment == l_true) { + lhs_terms.push_back(str_in_re); + } else { + lhs_terms.push_back(m.mk_not(str_in_re)); + } + lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); + expr_ref lhs(mk_and(lhs_terms), m); + expr_ref conflict(m.mk_not(lhs), m); + assert_axiom(conflict); + } + regex_axiom_add = true; + regex_inc_counter(regex_length_attempt_count, re); + continue; + } else { expr_ref pathConstraint(m); expr_ref characterConstraints(m); pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); @@ -9876,30 +9928,30 @@ namespace smt { TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;); } - continue; - } else { - // no automata available, or else all bounds assumptions are invalid - unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { - CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, - tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); - eautomaton * aut = m_mk_aut(re); - aut->compress(); - regex_automata.push_back(aut); - regex_automaton_under_assumptions new_aut(re, aut, true); - if (!regex_automaton_assumptions.contains(re)) { - regex_automaton_assumptions.insert(re, svector()); - } - regex_automaton_assumptions[re].push_back(new_aut); - TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); - regex_axiom_add = true; - find_automaton_initial_bounds(str_in_re, aut); - } else { - regex_inc_counter(regex_fail_count, str_in_re); - } continue; } - } // !length is zero + } else { + // no automata available, or else all bounds assumptions are invalid + unsigned expected_complexity = estimate_regex_complexity(re); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { + CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, + tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + find_automaton_initial_bounds(str_in_re, aut); + } else { + regex_inc_counter(regex_fail_count, str_in_re); + } + continue; + } } // get_len_value() expr_ref str_len(mk_strlen(str), m); rational lower_bound_value; From a9fda81d0350fa586f9ef12c10309adab65847b0 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 18 Jan 2018 17:53:42 -0500 Subject: [PATCH 21/36] check polarity --- src/smt/theory_str.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 1f84b5faf..dceb17a06 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9856,7 +9856,7 @@ namespace smt { if ( (current_assignment == l_true && !assumption.get_polarity()) || (current_assignment == l_false && assumption.get_polarity())) { // invert sense - NOT_IMPLEMENTED_YET(); + zero_solution = !zero_solution; } if (zero_solution) { From 2065ea88ee3000c24d3d3fc2fade98c899f261d6 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 19 Jan 2018 14:56:06 -0500 Subject: [PATCH 22/36] fix null pointer dereference --- src/smt/theory_str.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index dceb17a06..5d66a4ec3 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -10372,7 +10372,9 @@ namespace smt { } } } // foreach(entry in intersect_constraints) - aut_inter->compress(); + if (aut_inter != NULL) { + aut_inter->compress(); + } TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); expr_ref_vector conflict_terms(m); From d9d3ef78d2a4c8e643e33cdfaf5ae3b30c1c2a4d Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 19 Jan 2018 16:14:56 -0500 Subject: [PATCH 23/36] temporarily disable final check progress checking it is interfering with regex automata solving --- src/smt/theory_str.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 5d66a4ec3..933796d9c 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -35,7 +35,7 @@ namespace smt { m_params(params), /* Options */ opt_EagerStringConstantLengthAssertions(true), - opt_VerifyFinalCheckProgress(true), + opt_VerifyFinalCheckProgress(false), opt_LCMUnrollStep(2), opt_NoQuickReturn_IntegerTheory(false), opt_DisableIntegerTheoryIntegration(false), From d648f95f633626ac66b29c9ff1322ca995065e5d Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 24 Jan 2018 21:25:45 -0500 Subject: [PATCH 24/36] fix setup of path constraints when the path constraint is False --- src/smt/theory_str.cpp | 49 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 933796d9c..680500229 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9904,21 +9904,44 @@ namespace smt { } lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); expr_ref lhs(mk_and(lhs_terms), m); - // If the automaton was built with the same polarity as the constraint, - // assert directly. Otherwise, negate the path constraint - if ( (current_assignment == l_true && assumption.get_polarity()) - || (current_assignment == l_false && !assumption.get_polarity())) { - TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); - expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); - assert_implication(lhs, rhs); + + // If the path constraint comes out as "false", this means there are no paths of that length + // in the automaton. If the polarity is the same, we can assert a conflict clause. + // If the polarity is opposite, we ignore the path constraint. + + if (m.is_false(pathConstraint)) { + if ( (current_assignment == l_true && assumption.get_polarity()) + || (current_assignment == l_false && !assumption.get_polarity())) { + // automaton and constraint have same polarity -- assert conflict clause + TRACE("str", tout << "path constraint is false with matching polarity; asserting conflict clause" << std::endl;); + expr_ref conflict(m.mk_not(mk_and(lhs_terms)), m); + assert_axiom(conflict); + // don't set up "regex_terms_with_path_constraints" as a conflict clause is not a path constraint + } else { + // automaton and constraint have opposite polarity -- ignore path constraint + TRACE("str", tout << "path constraint is false with opposite polarity; ignoring path constraint" << std::endl;); + assert_implication(lhs, characterConstraints); + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + } + regex_axiom_add = true; } else { - TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); - expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); - assert_implication(lhs, rhs); + // If the automaton was built with the same polarity as the constraint, + // assert directly. Otherwise, negate the path constraint + if ( (current_assignment == l_true && assumption.get_polarity()) + || (current_assignment == l_false && !assumption.get_polarity())) { + TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); + expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); + assert_implication(lhs, rhs); + } else { + TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); + expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); + assert_implication(lhs, rhs); + } + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + regex_axiom_add = true; } - regex_terms_with_path_constraints.insert(str_in_re); - m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); - regex_axiom_add = true; // increment LengthAttemptCount regex_inc_counter(regex_length_attempt_count, re); From 8d5065d35d310b0b2e1d1950797c983df4f4f8a4 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 24 Jan 2018 22:02:00 -0500 Subject: [PATCH 25/36] fix constant eqc bug in mk_concat --- src/smt/theory_str.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 680500229..a9eac7dd4 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -709,6 +709,12 @@ namespace smt { bool n2HasEqcValue = false; expr * v1 = get_eqc_value(n1, n1HasEqcValue); expr * v2 = get_eqc_value(n2, n2HasEqcValue); + if (u.str.is_string(v1)) { + n1HasEqcValue = true; + } + if (u.str.is_string(v2)) { + n2HasEqcValue = true; + } if (n1HasEqcValue && n2HasEqcValue) { zstring n1_str; u.str.is_string(v1, n1_str); From 852e0e0892718dbbf5b9faf7544f93a6a0e210f4 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 25 Jan 2018 15:25:36 -0500 Subject: [PATCH 26/36] fix regex difficulty overflow bug; fix final check on regex terms that don't get path constraints --- src/smt/theory_str.cpp | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index a9eac7dd4..d30d406b2 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6523,6 +6523,9 @@ namespace smt { if (a == UINT_MAX || b == UINT_MAX) { return UINT_MAX; } + if (a == 0 || b == 0) { + return 0; + } unsigned result = a * b; if (result < a || result < b) { return UINT_MAX; @@ -9817,7 +9820,6 @@ namespace smt { continue; } - // find a consistent automaton for this term bool found = false; regex_automaton_under_assumptions assumption; @@ -9835,7 +9837,7 @@ namespace smt { found = true; break; } - // check consistency of bounds assumptions + // TODO check consistency of bounds assumptions } // foreach(a in regex_automaton_assumptions) } if (found) { @@ -10101,7 +10103,8 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -10221,7 +10224,8 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -10518,6 +10522,27 @@ namespace smt { return FC_CONTINUE; } + // We must be be 100% certain that if there are any regex constraints, + // the string assignment for each variable is consistent with the automaton. + // The (probably) easiest way to do this is to ensure + // that we have path constraints set up for every assigned regex term. + if (m_params.m_RegexAutomata && !regex_terms.empty()) { + for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { + expr * str_in_re = *it; + expr * str; + expr * re; + u.str.is_in_re(str_in_re, str, re); + lbool current_assignment = ctx.get_assignment(str_in_re); + if (current_assignment == l_undef) { + continue; + } + if (!regex_terms_with_path_constraints.contains(str_in_re)) { + TRACE("str", tout << "assigned regex term " << mk_pp(str_in_re, m) << " has no path constraints -- continuing search" << std::endl;); + return FC_CONTINUE; + } + } // foreach (str.in.re in regex_terms) + } + if (unused_internal_variables.empty()) { TRACE("str", tout << "All variables are assigned. Done!" << std::endl;); return FC_DONE; From 5c3f35dc4407fb0d4d86fdf9247b7d4401de26ed Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 25 Jan 2018 15:52:57 -0500 Subject: [PATCH 27/36] always rewrite regex length constraints as they are sometimes malformed --- src/smt/theory_str.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index d30d406b2..9de0375b9 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9787,7 +9787,10 @@ namespace smt { } else { // generate new length constraint expr_ref_vector extra_length_vars(m); - expr_ref top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars); + expr_ref _top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars); + expr_ref top_level_length_constraint(_top_level_length_constraint, m); + th_rewriter rw(m); + rw(top_level_length_constraint); TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;); // assert and track length constraint assert_axiom(top_level_length_constraint); From c01dda4e313f3fd72a5fa8453e112be6a79ad3ef Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 25 Jan 2018 16:11:31 -0500 Subject: [PATCH 28/36] experimental str.to.int fix --- src/smt/theory_str.cpp | 50 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 9de0375b9..2cbade1d5 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -9371,7 +9371,7 @@ namespace smt { bool Ival_exists = get_arith_value(a, Ival); if (Ival_exists) { TRACE("str", tout << "integer theory assigns " << mk_pp(a, m) << " = " << Ival.to_string() << std::endl;); - // if that value is not -1, we can assert (str.to-int S) = Ival --> S = "Ival" + // if that value is not -1, we can assert (str.to.int S) = Ival --> S = "Ival" if (!Ival.is_minus_one()) { zstring Ival_str(Ival.to_string().c_str()); expr_ref premise(ctx.mk_eq_atom(a, m_autil.mk_numeral(Ival, true)), m); @@ -9393,6 +9393,54 @@ namespace smt { // NOT_IMPLEMENTED_YET(); } + bool S_hasEqcValue; + expr * S_str = get_eqc_value(S, S_hasEqcValue); + if (S_hasEqcValue) { + zstring str; + u.str.is_string(S_str, str); + bool valid = true; + rational convertedRepresentation(0); + rational ten(10); + if (str.length() == 0) { + valid = false; + } else { + for (unsigned i = 0; i < str.length(); ++i) { + if (!('0' <= str[i] && str[i] <= '9')) { + valid = false; + break; + } else { + // accumulate + char digit = (int)str[i]; + std::string sDigit(1, digit); + int val = atoi(sDigit.c_str()); + convertedRepresentation = (ten * convertedRepresentation) + rational(val); + } + } + } + // TODO this duplicates code a bit, we can simplify the branch on "conclusion" only + if (valid) { + expr_ref premise(ctx.mk_eq_atom(S, mk_string(str)), m); + expr_ref conclusion(ctx.mk_eq_atom(a, m_autil.mk_numeral(convertedRepresentation, true)), m); + expr_ref axiom(rewrite_implication(premise, conclusion), m); + if (!string_int_axioms.contains(axiom)) { + string_int_axioms.insert(axiom); + assert_axiom(axiom); + m_trail_stack.push(insert_obj_trail(string_int_axioms, axiom)); + axiomAdd = true; + } + } else { + expr_ref premise(ctx.mk_eq_atom(S, mk_string(str)), m); + expr_ref conclusion(ctx.mk_eq_atom(a, m_autil.mk_numeral(rational::minus_one(), true)), m); + expr_ref axiom(rewrite_implication(premise, conclusion), m); + if (!string_int_axioms.contains(axiom)) { + string_int_axioms.insert(axiom); + assert_axiom(axiom); + m_trail_stack.push(insert_obj_trail(string_int_axioms, axiom)); + axiomAdd = true; + } + } + } + return axiomAdd; } From 1ee5ce96b8745ec9fa1ea37e2a0dbb2482a373dc Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 26 Jan 2018 14:52:18 -0500 Subject: [PATCH 29/36] use regex instead of head/tail split for string-integer conversion; check sort of refreshed vars; add intersection difficulty estimation --- src/smt/theory_str.cpp | 26 +++++++++++++++++--------- src/smt/theory_str.h | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 2cbade1d5..5bc0590e6 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -287,10 +287,13 @@ namespace smt { } void theory_str::refresh_theory_var(expr * e) { + ast_manager & m = get_manager(); enode * en = ensure_enode(e); theory_var v = mk_var(en); (void)v; TRACE("str", tout << "refresh " << mk_pp(e, get_manager()) << ": v#" << v << std::endl;); - m_basicstr_axiom_todo.push_back(en); + if (m.get_sort(e) == u.str.mk_string_sort()) { + m_basicstr_axiom_todo.push_back(en); + } } theory_var theory_str::mk_var(enode* n) { @@ -1628,12 +1631,11 @@ namespace smt { { expr_ref premise(m_autil.mk_ge(ex, m_autil.mk_numeral(rational::one(), true)), m); - expr_ref hd(mk_str_var("hd"), m); - expr_ref tl(mk_str_var("tl"), m); - expr_ref conclusion1(ctx.mk_eq_atom(S, mk_concat(hd, tl)), m); - expr_ref conclusion2(ctx.mk_eq_atom(mk_strlen(hd), m_autil.mk_numeral(rational::one(), true)), m); - expr_ref conclusion3(mk_not(m, ctx.mk_eq_atom(hd, mk_string("0"))), m); - expr_ref conclusion(m.mk_and(conclusion1, conclusion2, conclusion3), m); + // S >= 1 --> S in [1-9][0-9]* + expr_ref re_positiveInteger(u.re.mk_concat( + u.re.mk_range(mk_string("1"), mk_string("9")), + u.re.mk_star(u.re.mk_range(mk_string("0"), mk_string("9")))), m); + expr_ref conclusion(mk_RegexIn(S, re_positiveInteger), m); SASSERT(premise); SASSERT(conclusion); assert_implication(premise, conclusion); @@ -6614,6 +6616,12 @@ namespace smt { } } + unsigned theory_str::estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2) { + ENSURE(aut1 != NULL); + ENSURE(aut2 != NULL); + return _qmul(aut1->num_states(), aut2->num_states()); + } + // Check whether a regex translates well to a linear set of length constraints. bool theory_str::check_regex_length_linearity(expr * re) { return check_regex_length_linearity_helper(re, false); @@ -10291,7 +10299,6 @@ namespace smt { } else { // TODO check negation? // TODO construct a partial automaton for R to the given lower bound? - // TODO increment failure count if (false) { } else { @@ -10424,7 +10431,8 @@ namespace smt { } if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) { - unsigned intersectionDifficulty = 0; // TODO EstimateIntersectionDifficulty + unsigned intersectionDifficulty = estimate_automata_intersection_difficulty(aut_inter, aut.get_automaton()); + TRACE("str", tout << "intersection difficulty is " << intersectionDifficulty << std::endl;); if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) { diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index ea4449b13..1330f9904 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -555,6 +555,7 @@ protected: // regex automata and length-aware regex unsigned estimate_regex_complexity(expr * re); unsigned estimate_regex_complexity_under_complement(expr * re); + unsigned estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2); bool check_regex_length_linearity(expr * re); bool check_regex_length_linearity_helper(expr * re, bool already_star); expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); From 762129d4c7df6563cca451f3d33210cc094bae3d Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Mon, 12 Feb 2018 17:45:07 -0500 Subject: [PATCH 30/36] fixups to theory_str for regex --- src/smt/theory_str.cpp | 18 +++++++++++++----- src/smt/theory_str.h | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 8157ceea9..e68a7fa8a 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6571,7 +6571,7 @@ namespace smt { SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); return 1 + str2[0] - str1[0]; - } else if (u.re.is_full(re)) { + } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { return 1; } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); @@ -6613,7 +6613,7 @@ namespace smt { SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); return 1 + str2[0] - str1[0]; - } else if (u.re.is_full(re)) { + } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { return 1; } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); @@ -6649,8 +6649,10 @@ namespace smt { } } else if (u.re.is_range(re)) { return true; - } else if (u.re.is_full(re)) { + } else if (u.re.is_full_char(re)) { return true; + } else if (u.re.is_full_seq(re)) { + return false; // generally unbounded } else if (u.re.is_complement(re)) { // TODO can we do better? return false; @@ -6712,8 +6714,10 @@ namespace smt { SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); lens.insert(1); - } else if (u.re.is_full(re)) { + } else if (u.re.is_full_char(re)) { lens.insert(1); + } else if (u.re.is_full_seq(re)) { + lens.reset(); } else if (u.re.is_complement(re)) { lens.reset(); } else { @@ -6828,9 +6832,13 @@ namespace smt { SASSERT(str2.length() == 1); expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); return retval; - } else if (u.re.is_full(re)) { + } else if (u.re.is_full_char(re)) { expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); return retval; + } else if (u.re.is_full_seq(re)) { + // match any unbounded string + expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); + return retval; } else if (u.re.is_complement(re)) { // skip complement for now, in general this is difficult to predict expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index ef0bbf0ee..506726bc6 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -745,6 +745,7 @@ protected: void new_diseq_eh(theory_var, theory_var) override; theory* mk_fresh(context*) override { return alloc(theory_str, get_manager(), m_params); } + void init(context * ctx) override; void init_search_eh() override; void add_theory_assumptions(expr_ref_vector & assumptions) override; lbool validate_unsat_core(expr_ref_vector & unsat_core) override; From 11a339c490e06dc4a8342c11004e7909423c7e1b Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Sun, 11 Mar 2018 23:26:30 -0400 Subject: [PATCH 31/36] fix include path --- src/smt/theory_str.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 305f83df7..4ea96bd1f 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -25,7 +25,7 @@ #include "smt/theory_seq_empty.h" #include "smt/theory_arith.h" #include "ast/ast_util.h" -#include "seq_rewriter.h" +#include "ast/rewriter/seq_rewriter.h" #include "smt_kernel.h" namespace smt { From 6bb9a82425e7d2b9a17dfe77a5ae250b11c17ab4 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 15 Mar 2018 13:56:44 -0400 Subject: [PATCH 32/36] experimental axiom-persist for regex conflict clauses --- src/smt/theory_str.cpp | 23 ++++++++++++++++++++--- src/smt/theory_str.h | 6 ++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 4ea96bd1f..c9e484b02 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -57,6 +57,8 @@ namespace smt { tmpXorVarCount(0), tmpLenTestVarCount(0), tmpValTestVarCount(0), + m_persisted_axioms(m), + m_persisted_axiom_todo(m), avoidLoopCut(true), loopDetected(false), m_theoryStrOverlapAssumption_term(m), @@ -827,7 +829,8 @@ namespace smt { return !m_basicstr_axiom_todo.empty() || !m_str_eq_todo.empty() || !m_concat_axiom_todo.empty() || !m_concat_eval_todo.empty() || !m_library_aware_axiom_todo.empty() - || !m_delayed_axiom_setup_terms.empty(); + || !m_delayed_axiom_setup_terms.empty() + || !m_persisted_axiom_todo.empty() ; } @@ -895,7 +898,11 @@ namespace smt { set_up_axioms(m_delayed_axiom_setup_terms[i].get()); } m_delayed_axiom_setup_terms.reset(); - } + for (expr * a : m_persisted_axiom_todo) { + assert_axiom(a); + } + m_persisted_axiom_todo.reset(); + } // can_propagate } /* @@ -6696,7 +6703,7 @@ namespace smt { } else if (u.re.is_full_char(re)) { return true; } else if (u.re.is_full_seq(re)) { - return false; // generally unbounded + return true; } else if (u.re.is_complement(re)) { // TODO can we do better? return false; @@ -8474,6 +8481,10 @@ namespace smt { } } + void theory_str::add_persisted_axiom(expr * a) { + m_persisted_axioms.push_back(a); + } + void theory_str::pop_scope_eh(unsigned num_scopes) { sLevel -= num_scopes; TRACE("str", tout << "pop " << num_scopes << " to " << sLevel << std::endl;); @@ -8519,6 +8530,11 @@ namespace smt { m_basicstr_axiom_todo.reset(); m_basicstr_axiom_todo = new_m_basicstr; + for (expr * e : m_persisted_axioms) { + TRACE("str", tout << "persist axiom: " << mk_pp(e, get_manager()) << std::endl;); + m_persisted_axiom_todo.push_back(e); + } + m_trail_stack.pop_scope(num_scopes); theory::pop_scope_eh(num_scopes); @@ -10574,6 +10590,7 @@ namespace smt { TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m); assert_axiom(conflict_clause); + add_persisted_axiom(conflict_clause); regex_axiom_add = true; } } // foreach (entry in regex_terms_by_string) diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index da9ed663d..7188da25c 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -352,6 +352,10 @@ protected: // enode lists for library-aware/high-level string terms (e.g. substr, contains) ptr_vector m_library_aware_axiom_todo; + // list of axioms that are re-asserted every time the scope is popped + expr_ref_vector m_persisted_axioms; + expr_ref_vector m_persisted_axiom_todo; + // hashtable of all exprs for which we've already set up term-specific axioms -- // this prevents infinite recursive descent with respect to axioms that // include an occurrence of the term for which axioms are being generated @@ -545,6 +549,8 @@ protected: void instantiate_axiom_str_to_int(enode * e); void instantiate_axiom_int_to_str(enode * e); + void add_persisted_axiom(expr * a); + expr * mk_RegexIn(expr * str, expr * regexp); void instantiate_axiom_RegexIn(enode * e); app * mk_unroll(expr * n, expr * bound); From 6a3ce301b748280f6bb8c1eef959382848b83bf8 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Tue, 3 Apr 2018 12:51:03 -0400 Subject: [PATCH 33/36] fix collection error --- src/smt/theory_str.cpp | 3 +++ src/smt/theory_str.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 21b6e7b26..811b66798 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -1854,6 +1854,9 @@ namespace smt { std::pair key1(ex->get_arg(0), regexStr); // skip Z3str's map check, because we already check if we set up axioms on this term regex_in_bool_map[key1] = ex; + if (!regex_in_var_reg_str_map.contains(ex->get_arg(0))) { + regex_in_var_reg_str_map.insert(ex->get_arg(0), std::set()); + } regex_in_var_reg_str_map[ex->get_arg(0)].insert(regexStr); } diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index d68d99abc..e9607585f 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -399,7 +399,7 @@ protected: obj_hashtable regex_terms; obj_map > regex_terms_by_string; // S --> [ (str.in.re S *) ] obj_map > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ] - std::map regex_nfa_cache; // Regex term --> NFA + obj_map regex_nfa_cache; // Regex term --> NFA obj_hashtable regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope obj_hashtable regex_terms_with_length_constraints; // set of regex terms which had had length constraints asserted in the current scope obj_map regex_term_to_length_constraint; // (str.in.re S R) -> (length constraint over S wrt. R) From 45f48123e76b6e5717927b24d54db51d467b276d Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 6 Apr 2018 11:39:08 -0400 Subject: [PATCH 34/36] add re.plus length enumeration; fix reordering warning --- src/smt/theory_str.cpp | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 811b66798..64cba43fb 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -54,12 +54,12 @@ namespace smt { m_unused_id(0), m_delayed_axiom_setup_terms(m), m_delayed_assertions_todo(m), + m_persisted_axioms(m), + m_persisted_axiom_todo(m), tmpStringVarCount(0), tmpXorVarCount(0), tmpLenTestVarCount(0), tmpValTestVarCount(0), - m_persisted_axioms(m), - m_persisted_axiom_todo(m), avoidLoopCut(true), loopDetected(false), m_theoryStrOverlapAssumption_term(m), @@ -6865,7 +6865,7 @@ namespace smt { finalResult.push_back(r2); expr_ref retval(mk_and(finalResult), m); return retval; - } else if (u.re.is_star(re, sub1)) { + } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { // stars are generated as a linear combination of all possible subterm lengths; // this requires that there are no stars under this one /* @@ -6898,27 +6898,16 @@ namespace smt { expr * n = mk_int_var("rstar"); freeVariables.push_back(n); expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m); - sum_terms.push_back(term); + expr_ref term2(term, m); + if (u.re.is_plus(re)) { + // n effectively starts at 1 + term2 = m_autil.mk_add(m_autil.mk_numeral(lenOption, true), term); + } + sum_terms.push_back(term2); } expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m); return retval; } - } else if (u.re.is_plus(re, sub1)) { - /* - expr * v = mk_int_var("rlen"); - expr * n = mk_int_var("rstar"); - freeVariables.push_back(v); - freeVariables.push_back(n); - expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables); - expr_ref_vector finalResult(m); - finalResult.push_back(rsub); - finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); - finalResult.push_back(m_autil.mk_ge(n, m_autil.mk_numeral(rational::one(), true))); - expr_ref retval(mk_and(finalResult), m); - return retval; - */ - // TODO this should really be done as a sub-case of star - NOT_IMPLEMENTED_YET(); } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); From 27f2b542df088f060fff7540703896e32e284cb8 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Fri, 6 Apr 2018 12:13:53 -0400 Subject: [PATCH 35/36] remove comment --- src/smt/theory_str.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 64cba43fb..0a3089c62 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6831,8 +6831,6 @@ namespace smt { * In some cases, the returned formula requires one or more free integer variables to be created. * These variables are returned in the reference parameter `freeVariables`. * Extra assertions should be made for these free variables constraining them to be non-negative. - * - * TODO: star unrolling? */ expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { ENSURE(u.is_re(re)); From 3cfb32cd2d18acaca3b5cf5449c8c2c7c090aad9 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Thu, 12 Apr 2018 14:35:29 -0400 Subject: [PATCH 36/36] fix regex automata leaked memory --- src/smt/theory_str.cpp | 5 +++++ src/smt/theory_str.h | 1 + 2 files changed, 6 insertions(+) diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 0a3089c62..60b365d20 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -77,6 +77,9 @@ namespace smt { theory_str::~theory_str() { m_trail_stack.reset(); + for (eautomaton * aut : regex_automata) { + dealloc(aut); + } regex_automata.clear(); } @@ -10559,6 +10562,7 @@ namespace smt { if ( (current_assignment == l_true && aut.get_polarity()) || (current_assignment == l_false && !aut.get_polarity())) { aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton()); + m_automata.push_back(aut_inter); } else { // need to complement first expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m); @@ -10567,6 +10571,7 @@ namespace smt { // TODO is there any way to build a complement automaton from an existing one? // this discards length information aut_inter = m_mk_aut.mk_product(aut_inter, aut_c); + m_automata.push_back(aut_inter); } used_intersect_constraints.push_back(aut); if (aut_inter->is_empty()) { diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index e9607585f..419084091 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -395,6 +395,7 @@ protected: obj_map > regex_in_var_reg_str_map; // regex automata + scoped_ptr_vector m_automata; ptr_vector regex_automata; obj_hashtable regex_terms; obj_map > regex_terms_by_string; // S --> [ (str.in.re S *) ]