From 6b799706b5adf865cc1b3e73d00d37a86610c619 Mon Sep 17 00:00:00 2001 From: Murphy Berzish Date: Wed, 10 Jan 2018 17:24:47 -0500 Subject: [PATCH] add path constraint generation for regex terms --- src/smt/params/smt_params_helper.pyg | 1 + src/smt/params/theory_str_params.cpp | 1 + src/smt/params/theory_str_params.h | 9 +- src/smt/theory_str.cpp | 165 +++++++++++++++++++++------ src/smt/theory_str.h | 4 +- 5 files changed, 145 insertions(+), 35 deletions(-) diff --git a/src/smt/params/smt_params_helper.pyg b/src/smt/params/smt_params_helper.pyg index 122d09f32..7ab0d52f2 100644 --- a/src/smt/params/smt_params_helper.pyg +++ b/src/smt/params/smt_params_helper.pyg @@ -80,6 +80,7 @@ def_module_params(module_name='smt', ('str.finite_overlap_models', BOOL, False, 'attempt a finite model search for overlapping variables instead of completely giving up on the arrangement'), ('str.overlap_priority', DOUBLE, -0.1, 'theory-aware priority for overlapping variable cases; use smt.theory_aware_branching=true'), ('str.regex_automata', BOOL, True, 'use automata-based reasoning for regular expressions (Z3str3 only)'), + ('str.regex_automata_difficulty_threshold', UINT, 1000, 'difficulty threshold for regex automata heuristics'), ('core.minimize', BOOL, False, 'minimize unsat core produced by SMT context'), ('core.extend_patterns', BOOL, False, 'extend unsat core with literals that trigger (potential) quantifier instances'), ('core.extend_patterns.max_distance', UINT, UINT_MAX, 'limits the distance of a pattern-extended unsat core'), diff --git a/src/smt/params/theory_str_params.cpp b/src/smt/params/theory_str_params.cpp index b5451b9dc..fa6862161 100644 --- a/src/smt/params/theory_str_params.cpp +++ b/src/smt/params/theory_str_params.cpp @@ -32,4 +32,5 @@ void theory_str_params::updt_params(params_ref const & _p) { m_BinarySearchInitialUpperBound = p.str_binary_search_start(); m_OverlapTheoryAwarePriority = p.str_overlap_priority(); m_RegexAutomata = p.str_regex_automata(); + m_RegexAutomata_DifficultyThreshold = p.str_regex_automata_difficulty_threshold(); } diff --git a/src/smt/params/theory_str_params.h b/src/smt/params/theory_str_params.h index 4135cf0d6..48c0205ee 100644 --- a/src/smt/params/theory_str_params.h +++ b/src/smt/params/theory_str_params.h @@ -87,6 +87,12 @@ struct theory_str_params { */ bool m_RegexAutomata; + /* + * RegexAutomata_DifficultyThreshold is the lowest difficulty above which Z3str3 + * will not eagerly construct an automaton for a regular expression term. + */ + unsigned m_RegexAutomata_DifficultyThreshold; + theory_str_params(params_ref const & p = params_ref()): m_StrongArrangements(true), m_AggressiveLengthTesting(false), @@ -99,7 +105,8 @@ struct theory_str_params { m_UseBinarySearch(false), m_BinarySearchInitialUpperBound(64), m_OverlapTheoryAwarePriority(-0.1), - m_RegexAutomata(true) + m_RegexAutomata(true), + m_RegexAutomata_DifficultyThreshold(1000) { updt_params(p); } diff --git a/src/smt/theory_str.cpp b/src/smt/theory_str.cpp index 54636b8df..5910d9d95 100644 --- a/src/smt/theory_str.cpp +++ b/src/smt/theory_str.cpp @@ -6918,9 +6918,11 @@ namespace smt { /* * Create finite path constraints for the string variable `str` with respect to the automaton `aut`. * The returned expression is the right-hand side of a constraint of the form - * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> RHS + * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints). + * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...), + * are returned in `characterConstraints`. */ - expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal) { + expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) { ENSURE(aut != NULL); context & ctx = get_context(); ast_manager & m = get_manager(); @@ -7060,29 +7062,23 @@ namespace smt { expr_ref result(mk_or(ors)); TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";); - // if the path constraint is instantly false, then we know that the LHS must be false, - // so negate it and instantly assert a conflict clause - if (m.is_false(result)) { - expr_ref conflict(m.mk_false(), m); - return conflict; + expr_ref concat_rhs(m); + if (pathChars.size() == 1) { + concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); } else { - expr_ref concat_rhs(m); - if (pathChars.size() == 1) { - concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); - } else { - expr_ref acc(pathChars.get(0), m); - for (unsigned i = 1; i < pathChars.size(); ++i) { - acc = mk_concat(acc, pathChars.get(i)); - } - concat_rhs = ctx.mk_eq_atom(stringTerm, acc); + expr_ref acc(pathChars.get(0), m); + for (unsigned i = 1; i < pathChars.size(); ++i) { + acc = mk_concat(acc, pathChars.get(i)); } - - expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); - //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); - //regex_automata_assertions.insert(stringTerm, final_axiom); - //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); - return toplevel_rhs; + concat_rhs = ctx.mk_eq_atom(stringTerm, acc); } + + //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); + characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs); + //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); + //regex_automata_assertions.insert(stringTerm, final_axiom); + //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); + return result; } /* @@ -8241,6 +8237,27 @@ namespace smt { ); } + // returns true if needle appears as a subterm anywhere under haystack, + // or if needle appears in the same EQC as a subterm anywhere under haystack + bool theory_str::term_appears_as_subterm(expr * needle, expr * haystack) { + if (in_same_eqc(needle, haystack)) { + return true; + } + + if (is_app(haystack)) { + app * a_haystack = to_app(haystack); + for (unsigned i = 0; i < a_haystack->get_num_args(); ++i) { + expr * subterm = a_haystack->get_arg(i); + if (term_appears_as_subterm(needle, subterm)) { + return true; + } + } + } + + // not found + return false; + } + void theory_str::classify_ast_by_type(expr * node, std::map & varMap, std::map & concatMap, std::map & unrollMap) { @@ -9512,8 +9529,78 @@ namespace smt { rational exact_length_value; if (get_len_value(str, exact_length_value)) { TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); - // TODO generate_regex_path_constraints(); - NOT_IMPLEMENTED_YET(); + + if (regex_terms_with_path_constraints.contains(str_in_re)) { + TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;); + continue; + } + + // find a consistent automaton for this term + bool found = false; + regex_automaton_under_assumptions assumption; + if (regex_automaton_assumptions.contains(re) && + !regex_automaton_assumptions[re].empty()){ + for (svector::iterator it = regex_automaton_assumptions[re].begin(); + it != regex_automaton_assumptions[re].end(); ++it) { + regex_automaton_under_assumptions autA = *it; + rational assumed_upper_bound, assumed_lower_bound; + bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); + bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); + if (!assumes_upper_bound && !assumes_lower_bound) { + // automaton with no assumptions is always usable + assumption = autA; + found = true; + break; + } + // check consistency of bounds assumptions + } // foreach(a in regex_automaton_assumptions) + } + if (found) { + expr_ref pathConstraint(m); + expr_ref characterConstraints(m); + pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); + TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;); + TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); + + expr_ref_vector lhs_terms(m); + lhs_terms.push_back(str_in_re); + lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); + expr_ref lhs(mk_and(lhs_terms), m); + // If the automaton was built with the same polarity as the constraint, + // assert directly. Otherwise, negate the path constraint + if ( (current_assignment == l_true && assumption.get_polarity()) + || (current_assignment == l_false && !assumption.get_polarity())) { + expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); + assert_implication(lhs, rhs); + } else { + expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); + assert_implication(lhs, rhs); + } + regex_terms_with_path_constraints.insert(str_in_re); + m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); + regex_axiom_add = true; + continue; + } else { + // no automata available, or else all bounds assumptions are invalid + unsigned expected_complexity = estimate_regex_complexity(re); + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { + eautomaton * aut = m_mk_aut(re); + aut->compress(); + regex_automata.push_back(aut); + regex_automaton_under_assumptions new_aut(re, aut, true); + if (!regex_automaton_assumptions.contains(re)) { + regex_automaton_assumptions.insert(re, svector()); + } + regex_automaton_assumptions[re].push_back(new_aut); + TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); + regex_axiom_add = true; + // TODO immediately attempt to learn lower/upper bound info here + } else { + // TODO increment failure count + NOT_IMPLEMENTED_YET(); + } + continue; + } } expr_ref str_len(mk_strlen(str), m); rational lower_bound_value; @@ -9607,8 +9694,7 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9622,9 +9708,11 @@ namespace smt { // TODO immediately attempt to learn lower/upper bound info here } else { // TODO check negation? - // construct a partial automaton for R to the given upper bound + // TODO construct a partial automaton for R to the given upper bound? + // TODO increment failure count if we can't NOT_IMPLEMENTED_YET(); } + continue; } // if we have *any* automaton for R, and the upper bound is not too large, // finitize the automaton (if we have not already done so) and assert all solutions @@ -9718,8 +9806,7 @@ namespace smt { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -9733,9 +9820,11 @@ namespace smt { // TODO immediately attempt to learn lower/upper bound info here } else { // TODO check negation? - // construct a partial automaton for R to the given upper bound + // TODO construct a partial automaton for R to the given lower bound? + // TODO increment failure count NOT_IMPLEMENTED_YET(); } + continue; } } else { // !lower_bound_exists // no bounds information @@ -9745,8 +9834,7 @@ namespace smt { NOT_IMPLEMENTED_YET(); if (true) { unsigned expected_complexity = estimate_regex_complexity(re); - unsigned threshold = 1000; // TODO better metric - if (expected_complexity <= threshold) { + if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); @@ -11265,7 +11353,18 @@ namespace smt { // this length tester is in any way constrained by regex terms, // do not perform value testing -- this term is not a free variable. if (m_params.m_RegexAutomata) { - NOT_IMPLEMENTED_YET(); // TODO + std::map, expr*>::iterator it = regex_in_bool_map.begin(); + for (; it != regex_in_bool_map.end(); ++it) { + expr * re = it->second; + expr * re_str = to_app(re)->get_arg(0); + // recursive descent through all subterms of re_str to see if any of them are + // - the same as freeVar + // - in the same EQC as freeVar + if (term_appears_as_subterm(freeVar, re_str)) { + TRACE("str", tout << "prevent value testing on free var " << mk_pp(freeVar, m) << " as it belongs to one or more regex constraints." << std::endl;); + return NULL; + } + } } TRACE("str", tout << "length is fixed; generating models for free var" << std::endl;); diff --git a/src/smt/theory_str.h b/src/smt/theory_str.h index c6dad045f..c64eec838 100644 --- a/src/smt/theory_str.h +++ b/src/smt/theory_str.h @@ -407,6 +407,7 @@ protected: obj_map > regex_terms_by_string; // S --> [ (str.in.re S *) ] obj_map > regex_automaton_assumptions; // RegEx --> [ aut+assumptions ] std::map regex_nfa_cache; // Regex term --> NFA + obj_hashtable regex_terms_with_path_constraints; // set of string terms which have had path constraints asserted in the current scope svector char_set; std::map charSetLookupTable; @@ -538,7 +539,7 @@ protected: expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables); bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound); bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound); - expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal); + expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints); void aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond); expr_ref aut_path_rewrite_constraint(expr * cond, expr * ch_var); @@ -629,6 +630,7 @@ protected: std::map > & concat_eq_concat_map, std::map > & unrollGroupMap); + bool term_appears_as_subterm(expr * needle, expr * haystack); void classify_ast_by_type(expr * node, std::map & varMap, std::map & concatMap, std::map & unrollMap); void classify_ast_by_type_in_positive_context(std::map & varMap,