/*++ Module Name: theory_str_regex.cpp Abstract: Regular expression components for Z3str3 (theory_str). Author: Murphy Berzish (2019-10-25) Revision History: --*/ #include "smt/theory_str.h" namespace smt { // saturating unsigned addition unsigned inline _qadd(unsigned a, unsigned b) { if (a == UINT_MAX || b == UINT_MAX) { return UINT_MAX; } unsigned result = a + b; if (result < a || result < b) { return UINT_MAX; } return result; } // saturating unsigned multiply unsigned inline _qmul(unsigned a, unsigned b) { if (a == UINT_MAX || b == UINT_MAX) { return UINT_MAX; } if (a == 0 || b == 0) { return 0; } unsigned result = a * b; if (result < a || result < b) { return UINT_MAX; } return result; } void theory_str::solve_regex_automata() { context & ctx = get_context(); ast_manager & m = get_manager(); // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here bool regex_axiom_add = false; for (obj_hashtable::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) { expr * str_in_re = *it; expr * str = nullptr; expr * re = nullptr; u.str.is_in_re(str_in_re, str, re); if (!ctx.b_internalized(str_in_re)) { TRACE("str", tout << "regex term " << mk_pp(str_in_re, m) << " not internalized; fixing and continuing" << std::endl;); ctx.internalize(str_in_re, false); finalCheckProgressIndicator = true; continue; } lbool current_assignment = ctx.get_assignment(str_in_re); TRACE("str", tout << "regex term: " << mk_pp(str, m) << " in " << mk_pp(re, m) << " : " << current_assignment << std::endl;); if (current_assignment == l_undef) { continue; } if (!regex_terms_with_length_constraints.contains(str_in_re)) { if (current_assignment == l_true && check_regex_length_linearity(re)) { TRACE("str", tout << "regex length constraints expected to be linear -- generating and asserting them" << std::endl;); if (regex_term_to_length_constraint.contains(str_in_re)) { // use existing length constraint expr * top_level_length_constraint = nullptr; regex_term_to_length_constraint.find(str_in_re, top_level_length_constraint); ptr_vector extra_length_vars; regex_term_to_extra_length_vars.find(str_in_re, extra_length_vars); assert_axiom(top_level_length_constraint); for(ptr_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { expr * v = *it; refresh_theory_var(v); expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); assert_axiom(len_constraint); } } else { // generate new length constraint expr_ref_vector extra_length_vars(m); expr_ref _top_level_length_constraint = infer_all_regex_lengths(mk_strlen(str), re, extra_length_vars); expr_ref top_level_length_constraint(_top_level_length_constraint, m); th_rewriter rw(m); rw(top_level_length_constraint); TRACE("str", tout << "top-level length constraint: " << mk_pp(top_level_length_constraint, m) << std::endl;); // assert and track length constraint assert_axiom(top_level_length_constraint); for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { expr * v = *it; expr_ref len_constraint(m_autil.mk_ge(v, m_autil.mk_numeral(rational::zero(), true)), m); assert_axiom(len_constraint); } regex_term_to_length_constraint.insert(str_in_re, top_level_length_constraint); ptr_vector vtmp; for(expr_ref_vector::iterator it = extra_length_vars.begin(); it != extra_length_vars.end(); ++it) { vtmp.push_back(*it); } regex_term_to_extra_length_vars.insert(str_in_re, vtmp); } regex_terms_with_length_constraints.insert(str_in_re); m_trail_stack.push(insert_obj_trail(regex_terms_with_length_constraints, str_in_re)); regex_axiom_add = true; } } // re not in regex_terms_with_length_constraints rational exact_length_value; if (get_len_value(str, exact_length_value)) { TRACE("str", tout << "exact length of " << mk_pp(str, m) << " is " << exact_length_value << std::endl;); if (regex_terms_with_path_constraints.contains(str_in_re)) { TRACE("str", tout << "term " << mk_pp(str_in_re, m) << " already has path constraints set up" << std::endl;); continue; } // find a consistent automaton for this term bool found = false; regex_automaton_under_assumptions assumption; if (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()){ for (svector::iterator it = regex_automaton_assumptions[re].begin(); it != regex_automaton_assumptions[re].end(); ++it) { regex_automaton_under_assumptions autA = *it; rational assumed_upper_bound, assumed_lower_bound; bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound); bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound); if (!assumes_upper_bound && !assumes_lower_bound) { // automaton with no assumptions is always usable assumption = autA; found = true; break; } // TODO check consistency of bounds assumptions } // foreach(a in regex_automaton_assumptions) } if (found) { if (exact_length_value.is_zero()) { // check consistency of 0-length solution with automaton eautomaton * aut = assumption.get_automaton(); bool zero_solution = false; unsigned initial_state = aut->init(); if (aut->is_final_state(initial_state)) { zero_solution = true; } else { unsigned_vector eps_states; aut->get_epsilon_closure(initial_state, eps_states); for (unsigned_vector::iterator it = eps_states.begin(); it != eps_states.end(); ++it) { unsigned state = *it; if (aut->is_final_state(state)) { zero_solution = true; break; } } } // now check polarity of automaton wrt. original term if ( (current_assignment == l_true && !assumption.get_polarity()) || (current_assignment == l_false && assumption.get_polarity())) { // invert sense zero_solution = !zero_solution; } if (zero_solution) { TRACE("str", tout << "zero-length solution OK -- asserting empty path constraint" << std::endl;); expr_ref_vector lhs_terms(m); if (current_assignment == l_true) { lhs_terms.push_back(str_in_re); } else { lhs_terms.push_back(m.mk_not(str_in_re)); } lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); expr_ref lhs(mk_and(lhs_terms), m); expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m); assert_implication(lhs, rhs); regex_terms_with_path_constraints.insert(str_in_re); m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); } else { TRACE("str", tout << "zero-length solution not admitted by this automaton -- asserting conflict clause" << std::endl;); expr_ref_vector lhs_terms(m); if (current_assignment == l_true) { lhs_terms.push_back(str_in_re); } else { lhs_terms.push_back(m.mk_not(str_in_re)); } lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); expr_ref lhs(mk_and(lhs_terms), m); expr_ref conflict(m.mk_not(lhs), m); assert_axiom(conflict); } regex_axiom_add = true; regex_inc_counter(regex_length_attempt_count, re); continue; } else { expr_ref pathConstraint(m); expr_ref characterConstraints(m); pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints); TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;); TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;); expr_ref_vector lhs_terms(m); if (current_assignment == l_true) { lhs_terms.push_back(str_in_re); } else { lhs_terms.push_back(m.mk_not(str_in_re)); } lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true))); expr_ref lhs(mk_and(lhs_terms), m); // If the path constraint comes out as "false", this means there are no paths of that length // in the automaton. If the polarity is the same, we can assert a conflict clause. // If the polarity is opposite, we ignore the path constraint. if (m.is_false(pathConstraint)) { if ( (current_assignment == l_true && assumption.get_polarity()) || (current_assignment == l_false && !assumption.get_polarity())) { // automaton and constraint have same polarity -- assert conflict clause TRACE("str", tout << "path constraint is false with matching polarity; asserting conflict clause" << std::endl;); expr_ref conflict(m.mk_not(mk_and(lhs_terms)), m); assert_axiom(conflict); // don't set up "regex_terms_with_path_constraints" as a conflict clause is not a path constraint } else { // automaton and constraint have opposite polarity -- ignore path constraint TRACE("str", tout << "path constraint is false with opposite polarity; ignoring path constraint" << std::endl;); assert_implication(lhs, characterConstraints); regex_terms_with_path_constraints.insert(str_in_re); m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); } regex_axiom_add = true; } else { // If the automaton was built with the same polarity as the constraint, // assert directly. Otherwise, negate the path constraint if ( (current_assignment == l_true && assumption.get_polarity()) || (current_assignment == l_false && !assumption.get_polarity())) { TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;); expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m); assert_implication(lhs, rhs); } else { TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;); expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m); assert_implication(lhs, rhs); } regex_terms_with_path_constraints.insert(str_in_re); m_trail_stack.push(insert_obj_trail(regex_terms_with_path_constraints, str_in_re)); regex_axiom_add = true; } // increment LengthAttemptCount regex_inc_counter(regex_length_attempt_count, re); TRACE("str", { unsigned v = regex_get_counter(regex_length_attempt_count, re); tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl; }); continue; } } else { // no automata available, or else all bounds assumptions are invalid unsigned expected_complexity = estimate_regex_complexity(re); if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) { CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold, tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;); eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); regex_automaton_under_assumptions new_aut(re, aut, true); if (!regex_automaton_assumptions.contains(re)) { regex_automaton_assumptions.insert(re, svector()); } regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; find_automaton_initial_bounds(str_in_re, aut); } else { regex_inc_counter(regex_fail_count, str_in_re); } continue; } } // get_len_value() expr_ref str_len(mk_strlen(str), m); rational lower_bound_value; rational upper_bound_value; bool lower_bound_exists = lower_bound(str_len, lower_bound_value); bool upper_bound_exists = upper_bound(str_len, upper_bound_value); CTRACE("str", lower_bound_exists, tout << "lower bound of " << mk_pp(str, m) << " is " << lower_bound_value << std::endl;); CTRACE("str", upper_bound_exists, tout << "upper bound of " << mk_pp(str, m) << " is " << upper_bound_value << std::endl;); bool new_lower_bound_info = true; bool new_upper_bound_info = true; // check last seen lower/upper bound to avoid performing duplicate work if (regex_last_lower_bound.contains(str)) { rational last_lb_value; regex_last_lower_bound.find(str, last_lb_value); if (last_lb_value == lower_bound_value) { new_lower_bound_info = false; } } if (regex_last_upper_bound.contains(str)) { rational last_ub_value; regex_last_upper_bound.find(str, last_ub_value); if (last_ub_value == upper_bound_value) { new_upper_bound_info = false; } } if (new_lower_bound_info) { regex_last_lower_bound.insert(str, lower_bound_value); } if (new_upper_bound_info) { regex_last_upper_bound.insert(str, upper_bound_value); } if (upper_bound_exists && new_upper_bound_info) { // check current assumptions if (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()){ // one or more existing assumptions. // see if the (current best) upper bound can be refined // (note that if we have an automaton with no assumption, // this automatically counts as best) bool need_assumption = true; regex_automaton_under_assumptions last_assumption; rational last_ub = rational::minus_one(); for (svector::iterator it = regex_automaton_assumptions[re].begin(); it != regex_automaton_assumptions[re].end(); ++it) { regex_automaton_under_assumptions autA = *it; if ((current_assignment == l_true && autA.get_polarity() == false) || (current_assignment == l_false && autA.get_polarity() == true)) { // automaton uses incorrect polarity continue; } rational this_ub; if (autA.get_upper_bound(this_ub)) { if (last_ub == rational::minus_one() || this_ub < last_ub) { last_ub = this_ub; last_assumption = autA; } } else { need_assumption = false; last_assumption = autA; break; } } if (!last_ub.is_minus_one() || !need_assumption) { CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); CTRACE("str", need_assumption, tout << "using automaton with assumed upper bound of " << last_ub << std::endl;); rational refined_upper_bound; bool solution_at_upper_bound = refine_automaton_upper_bound(last_assumption.get_automaton(), upper_bound_value, refined_upper_bound); TRACE("str", tout << "refined upper bound is " << refined_upper_bound << (solution_at_upper_bound?", solution at upper bound":", no solution at upper bound") << std::endl;); expr_ref_vector lhs(m); if (current_assignment == l_false) { lhs.push_back(m.mk_not(str_in_re)); } else { lhs.push_back(str_in_re); } if (need_assumption) { lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(last_ub, true))); } lhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true))); expr_ref_vector rhs(m); if (solution_at_upper_bound) { if (refined_upper_bound.is_minus_one()) { // If there are solutions at the upper bound but not below it, make the bound exact. rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true))); } else { // If there are solutions at and below the upper bound, add an additional bound. rhs.push_back(m.mk_or( ctx.mk_eq_atom(str_len, m_autil.mk_numeral(upper_bound_value, true)), m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true)) )); } } else { if (refined_upper_bound.is_minus_one()) { // If there are no solutions at or below the upper bound, assert a conflict clause. rhs.push_back(m.mk_not(m_autil.mk_le(str_len, m_autil.mk_numeral(upper_bound_value, true)))); } else { // If there are solutions below the upper bound but not at it, refine the bound. rhs.push_back(m_autil.mk_le(str_len, m_autil.mk_numeral(refined_upper_bound, true))); } } if (!rhs.empty()) { expr_ref lhs_terms(mk_and(lhs), m); expr_ref rhs_terms(mk_and(rhs), m); assert_implication(lhs_terms, rhs_terms); regex_axiom_add = true; } } } else { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); regex_automaton_under_assumptions new_aut(re, aut, true); if (!regex_automaton_assumptions.contains(re)) { regex_automaton_assumptions.insert(re, svector()); } regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; find_automaton_initial_bounds(str_in_re, aut); } else { // TODO check negation? // TODO construct a partial automaton for R to the given upper bound? if (false) { } else { regex_inc_counter(regex_fail_count, str_in_re); } } continue; } // if we have *any* automaton for R, and the upper bound is not too large, // finitize the automaton (if we have not already done so) and assert all solutions if (upper_bound_value < 50) { // TODO better metric for threshold // NOT_IMPLEMENTED_YET(); // TODO(mtrberzi) } } else { // !upper_bound_exists // no upper bound information if (lower_bound_exists && !lower_bound_value.is_zero() && new_lower_bound_info) { // nonzero lower bound, no upper bound // check current assumptions if (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()){ // one or more existing assumptions. // see if the (current best) lower bound can be refined // (note that if we have an automaton with no assumption, // this automatically counts as best) bool need_assumption = true; regex_automaton_under_assumptions last_assumption; rational last_lb = rational::zero(); // the default for (svector::iterator it = regex_automaton_assumptions[re].begin(); it != regex_automaton_assumptions[re].end(); ++it) { regex_automaton_under_assumptions autA = *it; if ((current_assignment == l_true && autA.get_polarity() == false) || (current_assignment == l_false && autA.get_polarity() == true)) { // automaton uses incorrect polarity continue; } rational this_lb; if (autA.get_lower_bound(this_lb)) { if (this_lb > last_lb) { last_lb = this_lb; last_assumption = autA; } } else { need_assumption = false; last_assumption = autA; break; } } if (!last_lb.is_zero() || !need_assumption) { CTRACE("str", !need_assumption, tout << "using automaton with full length information" << std::endl;); CTRACE("str", need_assumption, tout << "using automaton with assumed lower bound of " << last_lb << std::endl;); rational refined_lower_bound; bool solution_at_lower_bound = refine_automaton_lower_bound(last_assumption.get_automaton(), lower_bound_value, refined_lower_bound); TRACE("str", tout << "refined lower bound is " << refined_lower_bound << (solution_at_lower_bound?", solution at lower bound":", no solution at lower bound") << std::endl;); expr_ref_vector lhs(m); if (current_assignment == l_false) { lhs.push_back(m.mk_not(str_in_re)); } else { lhs.push_back(str_in_re); } if (need_assumption) { lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(last_lb, true))); } lhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true))); expr_ref_vector rhs(m); if (solution_at_lower_bound) { if (refined_lower_bound.is_minus_one()) { // If there are solutions at the lower bound but not above it, make the bound exact. rhs.push_back(ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true))); } else { // If there are solutions at and above the lower bound, add an additional bound. // DISABLED as this is causing non-termination in the integer solver. --mtrberzi /* rhs.push_back(m.mk_or( ctx.mk_eq_atom(str_len, m_autil.mk_numeral(lower_bound_value, true)), m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true)) )); */ } } else { if (refined_lower_bound.is_minus_one()) { // If there are no solutions at or above the lower bound, assert a conflict clause. rhs.push_back(m.mk_not(m_autil.mk_ge(str_len, m_autil.mk_numeral(lower_bound_value, true)))); } else { // If there are solutions above the lower bound but not at it, refine the bound. rhs.push_back(m_autil.mk_ge(str_len, m_autil.mk_numeral(refined_lower_bound, true))); } } if (!rhs.empty()) { expr_ref lhs_terms(mk_and(lhs), m); expr_ref rhs_terms(mk_and(rhs), m); assert_implication(lhs_terms, rhs_terms); regex_axiom_add = true; } } } else { // no existing automata/assumptions. // if it's easy to construct a full automaton for R, do so unsigned expected_complexity = estimate_regex_complexity(re); bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); regex_automaton_under_assumptions new_aut(re, aut, true); if (!regex_automaton_assumptions.contains(re)) { regex_automaton_assumptions.insert(re, svector()); } regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; find_automaton_initial_bounds(str_in_re, aut); } else { // TODO check negation? // TODO construct a partial automaton for R to the given lower bound? if (false) { } else { regex_inc_counter(regex_fail_count, str_in_re); } } continue; } } else { // !lower_bound_exists // no bounds information // check for existing automata; // try to construct an automaton if we don't have one yet // and doing so without bounds is not difficult bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()); bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold); if (!existingAutomata) { unsigned expected_complexity = estimate_regex_complexity(re); if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || failureThresholdExceeded) { eautomaton * aut = m_mk_aut(re); aut->compress(); regex_automata.push_back(aut); regex_automaton_under_assumptions new_aut(re, aut, true); if (!regex_automaton_assumptions.contains(re)) { regex_automaton_assumptions.insert(re, svector()); } regex_automaton_assumptions[re].push_back(new_aut); TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;); regex_axiom_add = true; find_automaton_initial_bounds(str_in_re, aut); } else { regex_inc_counter(regex_fail_count, str_in_re); } } else { regex_inc_counter(regex_fail_count, str_in_re); } } } } // foreach (entry in regex_terms) for (obj_map >::iterator it = regex_terms_by_string.begin(); it != regex_terms_by_string.end(); ++it) { // TODO do we need to check equivalence classes of strings here? expr * str = it->m_key; ptr_vector str_in_re_terms = it->m_value; svector intersect_constraints; // we may find empty intersection before checking every constraint; // this vector keeps track of which ones actually take part in intersection svector used_intersect_constraints; // choose an automaton/assumption for each assigned (str.in.re) // that's consistent with the current length information for (ptr_vector::iterator term_it = str_in_re_terms.begin(); term_it != str_in_re_terms.end(); ++term_it) { expr * _unused = nullptr; expr * re = nullptr; SASSERT(u.str.is_in_re(*term_it)); u.str.is_in_re(*term_it, _unused, re); rational exact_len; bool has_exact_len = get_len_value(str, exact_len); rational lb, ub; bool has_lower_bound = lower_bound(mk_strlen(str), lb); bool has_upper_bound = upper_bound(mk_strlen(str), ub); if (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty()){ for (svector::iterator aut_it = regex_automaton_assumptions[re].begin(); aut_it != regex_automaton_assumptions[re].end(); ++aut_it) { regex_automaton_under_assumptions aut = *aut_it; rational aut_ub; bool assume_ub = aut.get_upper_bound(aut_ub); rational aut_lb; bool assume_lb = aut.get_lower_bound(aut_lb); bool consistent = true; if (assume_ub) { // check consistency of assumed upper bound if (has_exact_len) { if (exact_len > aut_ub) { consistent = false; } } else { if (has_upper_bound && ub > aut_ub) { consistent = false; } } } if (assume_lb) { // check consistency of assumed lower bound if (has_exact_len) { if (exact_len < aut_lb) { consistent = false; } } else { if (has_lower_bound && lb < aut_lb) { consistent = false; } } } if (consistent) { intersect_constraints.push_back(aut); break; } } } } // foreach(term in str_in_re_terms) eautomaton * aut_inter = nullptr; CTRACE("str", !intersect_constraints.empty(), tout << "check intersection of automata constraints for " << mk_pp(str, m) << std::endl;); for (svector::iterator aut_it = intersect_constraints.begin(); aut_it != intersect_constraints.end(); ++aut_it) { regex_automaton_under_assumptions aut = *aut_it; if (aut_inter == nullptr) { // start somewhere aut_inter = aut.get_automaton(); used_intersect_constraints.push_back(aut); continue; } TRACE("str", { unsigned v = regex_get_counter(regex_length_attempt_count, aut.get_regex_term()); tout << "length attempt count of " << mk_pp(aut.get_regex_term(), m) << " is " << v << ", threshold is " << m_params.m_RegexAutomata_LengthAttemptThreshold << std::endl; }); if (regex_get_counter(regex_length_attempt_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_LengthAttemptThreshold) { unsigned intersectionDifficulty = estimate_automata_intersection_difficulty(aut_inter, aut.get_automaton()); TRACE("str", tout << "intersection difficulty is " << intersectionDifficulty << std::endl;); if (intersectionDifficulty <= m_params.m_RegexAutomata_IntersectionDifficultyThreshold || regex_get_counter(regex_intersection_fail_count, aut.get_regex_term()) >= m_params.m_RegexAutomata_FailedIntersectionThreshold) { expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); lbool current_assignment = ctx.get_assignment(str_in_re_term); // if the assignment is consistent with our assumption, use the automaton directly; // otherwise, complement it (and save that automaton for next time) // TODO we should cache these intermediate results // TODO do we need to push the intermediates into a vector for deletion anyway? if ( (current_assignment == l_true && aut.get_polarity()) || (current_assignment == l_false && !aut.get_polarity())) { aut_inter = m_mk_aut.mk_product(aut_inter, aut.get_automaton()); m_automata.push_back(aut_inter); } else { // need to complement first expr_ref rc(u.re.mk_complement(aut.get_regex_term()), m); eautomaton * aut_c = m_mk_aut(rc); regex_automata.push_back(aut_c); // TODO is there any way to build a complement automaton from an existing one? // this discards length information aut_inter = m_mk_aut.mk_product(aut_inter, aut_c); m_automata.push_back(aut_inter); } used_intersect_constraints.push_back(aut); if (aut_inter->is_empty()) { break; } } else { // failed intersection regex_inc_counter(regex_intersection_fail_count, aut.get_regex_term()); } } } // foreach(entry in intersect_constraints) if (aut_inter != nullptr) { aut_inter->compress(); } TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;); expr_ref_vector conflict_terms(m); expr_ref conflict_lhs(m); for (svector::iterator aut_it = used_intersect_constraints.begin(); aut_it != used_intersect_constraints.end(); ++aut_it) { regex_automaton_under_assumptions aut = *aut_it; expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term())); lbool current_assignment = ctx.get_assignment(str_in_re_term); if (current_assignment == l_true) { conflict_terms.push_back(str_in_re_term); } else if (current_assignment == l_false) { conflict_terms.push_back(m.mk_not(str_in_re_term)); } // add length assumptions, if any rational ub; if (aut.get_upper_bound(ub)) { expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m); conflict_terms.push_back(ub_term); } rational lb; if (aut.get_lower_bound(lb)) { expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m); conflict_terms.push_back(lb_term); } } conflict_lhs = mk_and(conflict_terms); if (used_intersect_constraints.size() > 1 && aut_inter != nullptr) { // check whether the intersection is only the empty string unsigned initial_state = aut_inter->init(); if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) { // initial state is final and it is the only final state // if there are no moves from the initial state, // the only solution is the empty string if (aut_inter->get_moves_from(initial_state).empty()) { TRACE("str", tout << "product automaton only accepts empty string" << std::endl;); expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m); expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m); expr_ref rhs(m.mk_and(rhs1, rhs2), m); assert_implication(conflict_lhs, rhs); regex_axiom_add = true; } } } if (aut_inter != nullptr && aut_inter->is_empty()) { TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;); expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m); assert_axiom(conflict_clause); add_persisted_axiom(conflict_clause); regex_axiom_add = true; } } // foreach (entry in regex_terms_by_string) if (regex_axiom_add) { //return FC_CONTINUE; } } unsigned theory_str::estimate_regex_complexity(expr * re) { ENSURE(u.is_re(re)); expr * sub1; expr * sub2; unsigned lo, hi; if (u.re.is_to_re(re, sub1)) { if (!u.str.is_string(sub1)) throw default_exception("regular expressions must be built from string literals"); zstring str; u.str.is_string(sub1, str); return str.length(); } else if (u.re.is_complement(re, sub1)) { return estimate_regex_complexity_under_complement(sub1); } else if (u.re.is_concat(re, sub1, sub2)) { unsigned cx1 = estimate_regex_complexity(sub1); unsigned cx2 = estimate_regex_complexity(sub2); return _qadd(cx1, cx2); } else if (u.re.is_union(re, sub1, sub2)) { unsigned cx1 = estimate_regex_complexity(sub1); unsigned cx2 = estimate_regex_complexity(sub2); return _qadd(cx1, cx2); } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { unsigned cx = estimate_regex_complexity(sub1); return _qmul(2, cx); } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { unsigned cx = estimate_regex_complexity(sub1); return _qadd(lo, cx); } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); zstring str1, str2; u.str.is_string(sub1, str1); u.str.is_string(sub2, str2); SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); return 1 + str2[0] - str1[0]; } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { return 1; } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); return 1; } } unsigned theory_str::estimate_regex_complexity_under_complement(expr * re) { ENSURE(u.is_re(re)); expr * sub1; expr * sub2; unsigned lo, hi; if (u.re.is_to_re(re, sub1)) { SASSERT(u.str.is_string(sub1)); zstring str; u.str.is_string(sub1, str); return str.length(); } else if (u.re.is_complement(re, sub1)) { // Why don't we return the regular complexity here? // We could, but this might be called from under another complemented subexpression. // It's better to give a worst-case complexity. return estimate_regex_complexity_under_complement(sub1); } else if (u.re.is_concat(re, sub1, sub2)) { unsigned cx1 = estimate_regex_complexity_under_complement(sub1); unsigned cx2 = estimate_regex_complexity_under_complement(sub2); return _qadd(_qmul(2, cx1), cx2); } else if (u.re.is_union(re, sub1, sub2)) { unsigned cx1 = estimate_regex_complexity_under_complement(sub1); unsigned cx2 = estimate_regex_complexity_under_complement(sub2); return _qmul(cx1, cx2); } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1) || u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { unsigned cx = estimate_regex_complexity_under_complement(sub1); return _qmul(2, cx); } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); zstring str1, str2; u.str.is_string(sub1, str1); u.str.is_string(sub2, str2); SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); return 1 + str2[0] - str1[0]; } else if (u.re.is_full_char(re) || u.re.is_full_seq(re)) { return 1; } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); return 1; } } unsigned theory_str::estimate_automata_intersection_difficulty(eautomaton * aut1, eautomaton * aut2) { ENSURE(aut1 != nullptr); ENSURE(aut2 != nullptr); return _qmul(aut1->num_states(), aut2->num_states()); } // Check whether a regex translates well to a linear set of length constraints. bool theory_str::check_regex_length_linearity(expr * re) { return check_regex_length_linearity_helper(re, false); } bool theory_str::check_regex_length_linearity_helper(expr * re, bool already_star) { expr * sub1; expr * sub2; unsigned lo, hi; if (u.re.is_to_re(re)) { return true; } else if (u.re.is_concat(re, sub1, sub2)) { return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); } else if (u.re.is_union(re, sub1, sub2)) { return check_regex_length_linearity_helper(sub1, already_star) && check_regex_length_linearity_helper(sub2, already_star); } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { if (already_star) { return false; } else { return check_regex_length_linearity_helper(sub1, true); } } else if (u.re.is_range(re)) { return true; } else if (u.re.is_full_char(re)) { return true; } else if (u.re.is_full_seq(re)) { return true; } else if (u.re.is_complement(re)) { // TODO can we do better? return false; } else if (u.re.is_loop(re, sub1, lo, hi) || u.re.is_loop(re, sub1, lo)) { return check_regex_length_linearity_helper(sub1, already_star); } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); UNREACHABLE(); return false; } } // note: returns an empty set `lens` if something went wrong void theory_str::check_subterm_lengths(expr * re, integer_set & lens) { expr * sub1; expr * sub2; unsigned lo, hi; if (u.re.is_to_re(re, sub1)) { SASSERT(u.str.is_string(sub1)); zstring str; u.str.is_string(sub1, str); lens.insert(str.length()); } else if (u.re.is_concat(re, sub1, sub2)) { integer_set lens_1, lens_2; check_subterm_lengths(sub1, lens_1); check_subterm_lengths(sub2, lens_2); if (lens_1.empty() || lens_2.empty()) { lens.reset(); } else { // take all pairwise lengths for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { for(integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { int l1 = *it1; int l2 = *it2; lens.insert(l1 + l2); } } } } else if (u.re.is_union(re, sub1, sub2)) { integer_set lens_1, lens_2; check_subterm_lengths(sub1, lens_1); check_subterm_lengths(sub2, lens_2); if (lens_1.empty() || lens_2.empty()) { lens.reset(); } else { // take all possibilities from either side for (integer_set::iterator it1 = lens_1.begin(); it1 != lens_1.end(); ++it1) { lens.insert(*it1); } for (integer_set::iterator it2 = lens_2.begin(); it2 != lens_2.end(); ++it2) { lens.insert(*it2); } } } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { // this is bad -- term generation requires this not to appear lens.reset(); } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); zstring str1, str2; u.str.is_string(sub1, str1); u.str.is_string(sub2, str2); SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); lens.insert(1); } else if (u.re.is_full_char(re)) { lens.insert(1); } else if (u.re.is_full_seq(re)) { lens.reset(); } else if (u.re.is_complement(re)) { lens.reset(); } else if (u.re.is_loop(re, sub1, lo, hi)) { integer_set lens_1; check_subterm_lengths(sub1, lens_1); for (unsigned i = lo; i <= hi; ++i) { for (auto j : lens_1) { lens.insert(i * j); } } } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, get_manager()) << std::endl;); lens.reset(); } } /* * Infer all length constraints implied by the given regular expression `re` * in order to constrain `lenVar` (which must be of sort Int). * This assumes that `re` appears in a positive context. * Returns a Boolean formula expressing the appropriate constraints over `lenVar`. * In some cases, the returned formula requires one or more free integer variables to be created. * These variables are returned in the reference parameter `freeVariables`. * Extra assertions should be made for these free variables constraining them to be non-negative. */ expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) { ENSURE(u.is_re(re)); context & ctx = get_context(); ast_manager & m = get_manager(); expr * sub1; expr * sub2; unsigned lo, hi; if (u.re.is_to_re(re, sub1)) { if (!u.str.is_string(sub1)) throw default_exception("regular expressions must be built from string literals"); zstring str; u.str.is_string(sub1, str); rational strlen(str.length()); expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(strlen, true)), m); return retval; } else if (u.re.is_union(re, sub1, sub2)) { expr_ref r1 = infer_all_regex_lengths(lenVar, sub1, freeVariables); expr_ref r2 = infer_all_regex_lengths(lenVar, sub2, freeVariables); expr_ref retval(m.mk_or(r1, r2), m); return retval; } else if (u.re.is_concat(re, sub1, sub2)) { expr * v1 = mk_int_var("rlen1"); expr * v2 = mk_int_var("rlen2"); freeVariables.push_back(v1); freeVariables.push_back(v2); expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables); expr_ref r2 = infer_all_regex_lengths(v2, sub2, freeVariables); expr_ref_vector finalResult(m); finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_add(v1, v2))); finalResult.push_back(r1); finalResult.push_back(r2); expr_ref retval(mk_and(finalResult), m); return retval; } else if (u.re.is_star(re, sub1) || u.re.is_plus(re, sub1)) { // stars are generated as a linear combination of all possible subterm lengths; // this requires that there are no stars under this one /* expr * v = mk_int_var("rlen"); expr * n = mk_int_var("rstar"); freeVariables.push_back(v); freeVariables.push_back(n); expr_ref rsub = infer_all_regex_lengths(v, sub1, freeVariables); expr_ref_vector finalResult(m); finalResult.push_back(rsub); finalResult.push_back(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(v, n))); expr_ref retval(mk_and(finalResult), m); return retval; */ integer_set subterm_lens; check_subterm_lengths(sub1, subterm_lens); if (subterm_lens.empty()) { // somehow generation was impossible expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); return retval; } else { TRACE("str", tout << "subterm lengths:"; for(integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { tout << " " << *it; } tout << std::endl;); expr_ref_vector sum_terms(m); for (integer_set::iterator it = subterm_lens.begin(); it != subterm_lens.end(); ++it) { rational lenOption(*it); expr * n = mk_int_var("rstar"); freeVariables.push_back(n); expr_ref term(m_autil.mk_mul(m_autil.mk_numeral(lenOption, true), n), m); expr_ref term2(term, m); if (u.re.is_plus(re)) { // n effectively starts at 1 term2 = m_autil.mk_add(m_autil.mk_numeral(lenOption, true), term); } sum_terms.push_back(term2); } expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_add_simplify(sum_terms)), m); return retval; } } else if (u.re.is_loop(re, sub1, lo, hi)) { expr * v1 = mk_int_var("rlen"); freeVariables.push_back(v1); expr_ref r1 = infer_all_regex_lengths(v1, sub1, freeVariables); expr_ref_vector v1_choices(m); for (unsigned i = lo; i <= hi; ++i) { rational rI(i); expr_ref v1_i(ctx.mk_eq_atom(lenVar, m_autil.mk_mul(m_autil.mk_numeral(rI, true), v1)), m); v1_choices.push_back(v1_i); } expr_ref_vector finalResult(m); finalResult.push_back(r1); finalResult.push_back(mk_or(v1_choices)); expr_ref retval(mk_and(finalResult), m); SASSERT(retval); return retval; } else if (u.re.is_range(re, sub1, sub2)) { SASSERT(u.str.is_string(sub1)); SASSERT(u.str.is_string(sub2)); zstring str1, str2; u.str.is_string(sub1, str1); u.str.is_string(sub2, str2); SASSERT(str1.length() == 1); SASSERT(str2.length() == 1); expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); return retval; } else if (u.re.is_full_char(re)) { expr_ref retval(ctx.mk_eq_atom(lenVar, m_autil.mk_numeral(rational::one(), true)), m); return retval; } else if (u.re.is_full_seq(re)) { // match any unbounded string expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); return retval; } else if (u.re.is_complement(re)) { // skip complement for now, in general this is difficult to predict expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); return retval; } else { TRACE("str", tout << "WARNING: unknown regex term " << mk_pp(re, m) << std::endl;); expr_ref retval(m_autil.mk_ge(lenVar, m_autil.mk_numeral(rational::zero(), true)), m); return retval; } } /* * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding * to the automaton `aut`. * This asserts a constraint of the form: * str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub * where the upper bound clause is omitted if the upper bound doesn't exist * and the equality with 0 is based on whether solutions of length 0 are allowed. */ void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) { ENSURE(aut != nullptr); context & ctx = get_context(); ast_manager & m = get_manager(); expr_ref_vector rhs(m); expr * str = nullptr; expr * re = nullptr; u.str.is_in_re(str_in_re, str, re); expr_ref strlen(mk_strlen(str), m); // lower bound first rational nonzero_lower_bound; bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound); if (zero_sol_exists) { regex_last_lower_bound.insert(str, rational::zero()); // solution at 0 if (!nonzero_lower_bound.is_minus_one()) { expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m); expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); rhs.push_back(m.mk_or(rhs1, rhs2)); } else { // shouldn't happen UNREACHABLE(); } } else { // no solution at 0 if (!nonzero_lower_bound.is_minus_one()) { regex_last_lower_bound.insert(str, nonzero_lower_bound); expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m); rhs.push_back(rhs2); } else { // shouldn't happen UNREACHABLE(); } } // TODO upper bound check if (!rhs.empty()) { expr_ref lhs(str_in_re, m); expr_ref _rhs(mk_and(rhs), m); assert_implication(lhs, _rhs); } } /* * Refine the lower bound on the length of a solution to a given automaton. * The method returns TRUE if a solution of length `current_lower_bound` exists, * and FALSE otherwise. In addition, the reference parameter `refined_lower_bound` * is assigned the length of the shortest solution longer than `current_lower_bound` * if it exists, or -1 otherwise. */ bool theory_str::refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound) { ENSURE(aut != nullptr); if (aut->final_states().empty()) { // no solutions at all refined_lower_bound = rational::minus_one(); return false; } // from here we assume that there is a final state reachable from the initial state unsigned_vector search_queue; // populate search_queue with all states reachable from the epsilon-closure of start state aut->get_epsilon_closure(aut->init(), search_queue); unsigned search_depth = 0; hashtable> next_states; unsigned_vector next_search_queue; bool found_solution_at_lower_bound = false; while (!search_queue.empty()) { // if we are at the lower bound, check for final states if (search_depth == current_lower_bound.get_unsigned()) { for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned state = *it; if (aut->is_final_state(state)) { found_solution_at_lower_bound = true; break; } } // end phase 1 break; } next_states.reset(); next_search_queue.clear(); // move one step along all states for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned src = *it; eautomaton::moves next_moves; aut->get_moves_from(src, next_moves, true); for (eautomaton::moves::iterator move_it = next_moves.begin(); move_it != next_moves.end(); ++move_it) { unsigned dst = move_it->dst(); if (!next_states.contains(dst)) { next_states.insert(dst); next_search_queue.push_back(dst); } } } search_queue.clear(); search_queue.append(next_search_queue); search_depth += 1; } // !search_queue.empty() // if we got here before reaching the lower bound, // there aren't any solutions at or above it, so stop if (search_depth < current_lower_bound.get_unsigned()) { refined_lower_bound = rational::minus_one(); return false; } // phase 2: continue exploring the automaton above the lower bound SASSERT(search_depth == current_lower_bound.get_unsigned()); while (!search_queue.empty()) { if (search_depth > current_lower_bound.get_unsigned()) { // check if we have found a solution above the lower bound for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned state = *it; if (aut->is_final_state(state)) { // this is a solution at a depth higher than the lower bound refined_lower_bound = rational(search_depth); return found_solution_at_lower_bound; } } } next_states.reset(); next_search_queue.clear(); // move one step along all states for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned src = *it; eautomaton::moves next_moves; aut->get_moves_from(src, next_moves, true); for (eautomaton::moves::iterator move_it = next_moves.begin(); move_it != next_moves.end(); ++move_it) { unsigned dst = move_it->dst(); if (!next_states.contains(dst)) { next_states.insert(dst); next_search_queue.push_back(dst); } } } search_queue.clear(); search_queue.append(next_search_queue); search_depth += 1; } // if we reached this point, we explored the whole automaton and didn't find any // solutions above the lower bound refined_lower_bound = rational::minus_one(); return found_solution_at_lower_bound; } /* * Refine the upper bound on the length of a solution to a given automaton. * The method returns TRUE if a solution of length `current_upper_bound` exists, * and FALSE otherwise. In addition, the reference parameter `refined_upper_bound` * is assigned the length of the longest solution shorter than `current_upper_bound`, * if a shorter solution exists, or -1 otherwise. */ bool theory_str::refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound) { ENSURE(aut != nullptr); if (aut->final_states().empty()) { // no solutions at all! refined_upper_bound = rational::minus_one(); return false; } // from here we assume there is a final state reachable from the initial state unsigned_vector search_queue; // populate search queue with all states reachable from the epsilon-closure of the start state aut->get_epsilon_closure(aut->init(), search_queue); rational last_solution_depth = rational::minus_one(); bool found_solution_at_upper_bound = false; unsigned search_depth = 0; hashtable > next_states; unsigned_vector next_search_queue; while(!search_queue.empty()) { // see if any of the current states are final for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned src = *it; if (aut->is_final_state(src)) { if (search_depth == current_upper_bound.get_unsigned()) { found_solution_at_upper_bound = true; } else { last_solution_depth = rational(search_depth); } break; } } if (search_depth == current_upper_bound.get_unsigned()) { break; } next_states.reset(); next_search_queue.clear(); // move one step along all states for (unsigned_vector::iterator it = search_queue.begin(); it != search_queue.end(); ++it) { unsigned src = *it; eautomaton::moves next_moves; aut->get_moves_from(src, next_moves, true); for (eautomaton::moves::iterator moves_it = next_moves.begin(); moves_it != next_moves.end(); ++moves_it) { unsigned dst = moves_it->dst(); if (!next_states.contains(dst)) { next_states.insert(dst); next_search_queue.push_back(dst); } } } search_queue.clear(); search_queue.append(next_search_queue); search_depth += 1; } //!search_queue.empty() refined_upper_bound = last_solution_depth; return found_solution_at_upper_bound; } void theory_str::aut_path_add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond) { expr* acc; if (!get_manager().is_true(cond) && next.find(idx, acc)) { expr* args[2] = { cond, acc }; cond = mk_or(get_manager(), 2, args); } trail.push_back(cond); next.insert(idx, cond); } expr_ref theory_str::aut_path_rewrite_constraint(expr * cond, expr * ch_var) { context & ctx = get_context(); ast_manager & m = get_manager(); expr_ref retval(m); unsigned char_val = 0; expr * lhs; expr * rhs; if (u.is_const_char(cond, char_val)) { SASSERT(char_val < 256); TRACE("str", tout << "rewrite character constant " << char_val << std::endl;); zstring str_const(char_val); retval = u.str.mk_string(str_const); return retval; } else if (is_var(cond)) { TRACE("str", tout << "substitute var" << std::endl;); retval = ch_var; return retval; } else if (m.is_eq(cond, lhs, rhs)) { // handle this specially because the sort of the equality will change expr_ref new_lhs(aut_path_rewrite_constraint(lhs, ch_var), m); SASSERT(new_lhs); expr_ref new_rhs(aut_path_rewrite_constraint(rhs, ch_var), m); SASSERT(new_rhs); retval = ctx.mk_eq_atom(new_lhs, new_rhs); return retval; } else if (m.is_bool(cond)) { TRACE("str", tout << "rewrite boolean term " << mk_pp(cond, m) << std::endl;); app * a_cond = to_app(cond); expr_ref_vector rewritten_args(m); for (unsigned i = 0; i < a_cond->get_num_args(); ++i) { expr * argI = a_cond->get_arg(i); expr_ref new_arg(aut_path_rewrite_constraint(argI, ch_var), m); SASSERT(new_arg); rewritten_args.push_back(new_arg); } retval = m.mk_app(a_cond->get_decl(), rewritten_args.c_ptr()); TRACE("str", tout << "final rewritten term is " << mk_pp(retval, m) << std::endl;); return retval; } else { TRACE("str", tout << "ERROR: unrecognized automaton path constraint " << mk_pp(cond, m) << ", cannot translate" << std::endl;); retval = nullptr; return retval; } } /* * Create finite path constraints for the string variable `str` with respect to the automaton `aut`. * The returned expression is the right-hand side of a constraint of the form * (str in re) AND (|str| = len) AND (any applicable length assumptions on aut) -> (rhs AND character constraints). * The character constraints, which are (str = c0 . c1 . (...) . cn) and (|c0| = 1, ...), * are returned in `characterConstraints`. */ expr_ref theory_str::generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints) { ENSURE(aut != nullptr); context & ctx = get_context(); ast_manager & m = get_manager(); if (lenVal.is_zero()) { // if any state in the epsilon-closure of the start state is accepting, // then the empty string is in this language unsigned_vector states; bool has_final = false; aut->get_epsilon_closure(aut->init(), states); for (unsigned i = 0; i < states.size() && !has_final; ++i) { has_final = aut->is_final_state(states[i]); } if (has_final) { // empty string is OK, assert axiom expr_ref rhs(ctx.mk_eq_atom(stringTerm, mk_string("")), m); SASSERT(rhs); //regex_automata_assertions.insert(stringTerm, final_axiom); //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); return rhs; } else { // negate -- the empty string isn't in the language //expr_ref conflict(m.mk_not(mk_and(toplevel_lhs)), m); //assert_axiom(conflict); expr_ref conflict(m.mk_false(), m); return conflict; } } // lenVal.is_zero() expr_ref_vector pathChars(m); expr_ref_vector pathChars_len_constraints(m); // reuse character terms over the same string if (string_chars.contains(stringTerm)) { // find out whether we have enough characters already ptr_vector old_chars; string_chars.find(stringTerm, old_chars); if (old_chars.size() < lenVal.get_unsigned()) { for (unsigned i = old_chars.size(); i < lenVal.get_unsigned(); ++i) { std::stringstream ss; ss << "ch" << i; expr_ref ch(mk_str_var(ss.str()), m); m_trail.push_back(ch); old_chars.push_back(ch); } } string_chars.insert(stringTerm, old_chars); // now we're guaranteed to have at least the right number of characters in old_chars for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { expr_ref ch(old_chars.get(i), m); refresh_theory_var(ch); pathChars.push_back(ch); pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); } } else { ptr_vector new_chars; for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { std::stringstream ss; ss << "ch" << i; expr_ref ch(mk_str_var(ss.str()), m); pathChars.push_back(ch); pathChars_len_constraints.push_back(ctx.mk_eq_atom(mk_strlen(ch), m_autil.mk_numeral(rational::one(), true))); new_chars.push_back(ch); } string_chars.insert(stringTerm, new_chars); } // modification of code in seq_rewriter::mk_str_in_regexp() expr_ref_vector trail(m); u_map maps[2]; bool select_map = false; expr_ref ch(m), cond(m); eautomaton::moves mvs; maps[0].insert(aut->init(), m.mk_true()); // is_accepted(a, aut) & some state in frontier is final. for (unsigned i = 0; i < lenVal.get_unsigned(); ++i) { u_map& frontier = maps[select_map]; u_map& next = maps[!select_map]; select_map = !select_map; ch = pathChars.get(i); next.reset(); u_map::iterator it = frontier.begin(), end = frontier.end(); for (; it != end; ++it) { mvs.reset(); unsigned state = it->m_key; expr* acc = it->m_value; aut->get_moves_from(state, mvs, false); for (unsigned j = 0; j < mvs.size(); ++j) { eautomaton::move const& mv = mvs[j]; SASSERT(mv.t()); if (mv.t()->is_char() && m.is_value(mv.t()->get_char())) { // change this to a string constraint expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_char(), ch); SASSERT(cond_rhs); cond = ctx.mk_eq_atom(ch, cond_rhs); SASSERT(cond); expr * args[2] = {cond, acc}; cond = mk_and(m, 2, args); aut_path_add_next(next, trail, mv.dst(), cond); } else if (mv.t()->is_range()) { expr_ref range_lo(mv.t()->get_lo(), m); expr_ref range_hi(mv.t()->get_hi(), m); unsigned lo_val, hi_val; if (u.is_const_char(range_lo, lo_val) && u.is_const_char(range_hi, hi_val)) { TRACE("str", tout << "make range predicate from " << lo_val << " to " << hi_val << std::endl;); expr_ref cond_rhs(m); if (hi_val < lo_val) { // NSB: why? The range would be empty. std::swap(lo_val, hi_val); } expr_ref_vector cond_rhs_terms(m); for (unsigned i = lo_val; i <= hi_val; ++i) { zstring str_const(i); expr_ref str_expr(u.str.mk_string(str_const), m); cond_rhs_terms.push_back(ctx.mk_eq_atom(ch, str_expr)); } cond_rhs = mk_or(cond_rhs_terms); SASSERT(cond_rhs); expr * args[2] = {cond_rhs, acc}; cond = mk_and(m, 2, args); aut_path_add_next(next, trail, mv.dst(), cond); } else { TRACE("str", tout << "warning: non-bitvectors in automaton range predicate" << std::endl;); UNREACHABLE(); } } else if (mv.t()->is_pred()) { // rewrite this constraint over string terms expr_ref cond_rhs = aut_path_rewrite_constraint(mv.t()->get_pred(), ch); SASSERT(cond_rhs); if (m.is_false(cond_rhs)) { continue; } else if (m.is_true(cond_rhs)) { aut_path_add_next(next, trail, mv.dst(), acc); continue; } expr * args[2] = {cond_rhs, acc}; cond = mk_and(m, 2, args); aut_path_add_next(next, trail, mv.dst(), cond); } } } } u_map const& frontier = maps[select_map]; u_map::iterator it = frontier.begin(), end = frontier.end(); expr_ref_vector ors(m); for (; it != end; ++it) { unsigned_vector states; bool has_final = false; aut->get_epsilon_closure(it->m_key, states); for (unsigned i = 0; i < states.size() && !has_final; ++i) { has_final = aut->is_final_state(states[i]); } if (has_final) { ors.push_back(it->m_value); } } expr_ref result(mk_or(ors)); TRACE("str", tout << "regex path constraint: " << mk_pp(result, m) << "\n";); expr_ref concat_rhs(m); if (pathChars.size() == 1) { concat_rhs = ctx.mk_eq_atom(stringTerm, pathChars.get(0)); } else { expr_ref acc(pathChars.get(0), m); for (unsigned i = 1; i < pathChars.size(); ++i) { acc = mk_concat(acc, pathChars.get(i)); } concat_rhs = ctx.mk_eq_atom(stringTerm, acc); } //expr_ref toplevel_rhs(m.mk_and(result, mk_and(pathChars_len_constraints), concat_rhs), m); characterConstraints = m.mk_and(mk_and(pathChars_len_constraints), concat_rhs); //expr_ref final_axiom(rewrite_implication(mk_and(toplevel_lhs), toplevel_rhs), m); //regex_automata_assertions.insert(stringTerm, final_axiom); //m_trail_stack.push(insert_obj_map(regex_automata_assertions, stringTerm) ); return result; } void theory_str::regex_inc_counter(obj_map & counter_map, expr * key) { unsigned old_v; if (counter_map.find(key, old_v)) { unsigned new_v = old_v += 1; counter_map.insert(key, new_v); } else { counter_map.insert(key, 1); } } unsigned theory_str::regex_get_counter(obj_map & counter_map, expr * key) { unsigned v; if (counter_map.find(key, v)) { return v; } else { counter_map.insert(key, 0); return 0; } } }; /* namespace smt */