short path for length-0 regex terms

2025-04-22 16:45:31 +00:00 · 2018-01-17 18:26:31 -05:00 · 2018-01-17 18:26:31 -05:00 · c2b268c645
commit c2b268c645
parent c0ed683882
2 changed files with 206 additions and 119 deletions
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@ -6646,6 +6646,9 @@ namespace smt {
     * In some cases, the returned formula requires one or more free integer variables to be created.
     * These variables are returned in the reference parameter `freeVariables`.
     * Extra assertions should be made for these free variables constraining them to be non-negative.
+     *
+     * TODO: star unrolling?
+     * TODO: generate stars "correctly" as a linear combination of all possible subterm lengths
     */
    expr_ref theory_str::infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables) {
        ENSURE(u.is_re(re));
@ -6725,6 +6728,59 @@ namespace smt {
        }
    }

+    /*
+     * Assert initial lower and upper bounds for the positive constraint (str in re) corresponding
+     * to the automaton `aut`.
+     * This asserts a constraint of the form:
+     *   str_in_re --> (len(str) ?= 0 OR len(str) >= lb) AND len(str) <= ub
+     * where the upper bound clause is omitted if the upper bound doesn't exist
+     * and the equality with 0 is based on whether solutions of length 0 are allowed.
+     */
+    void theory_str::find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut) {
+        ENSURE(aut != NULL);
+        context & ctx = get_context();
+        ast_manager & m = get_manager();
+
+        expr_ref_vector rhs(m);
+        expr * str;
+        expr * re;
+        u.str.is_in_re(str_in_re, str, re);
+        expr_ref strlen(mk_strlen(str), m);
+
+        // lower bound first
+        rational nonzero_lower_bound;
+        bool zero_sol_exists = refine_automaton_lower_bound(aut, rational::zero(), nonzero_lower_bound);
+        if (zero_sol_exists) {
+            regex_last_lower_bound.insert(str, rational::zero());
+            // solution at 0
+            if (!nonzero_lower_bound.is_minus_one()) {
+                expr_ref rhs1(ctx.mk_eq_atom(strlen, m_autil.mk_numeral(rational::zero(), true)), m);
+                expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
+                rhs.push_back(m.mk_or(rhs1, rhs2));
+            } else {
+                // shouldn't happen
+                UNREACHABLE();
+            }
+        } else {
+            // no solution at 0
+            if (!nonzero_lower_bound.is_minus_one()) {
+                regex_last_lower_bound.insert(str, nonzero_lower_bound);
+                expr_ref rhs2(m_autil.mk_ge(strlen, m_autil.mk_numeral(nonzero_lower_bound, true)), m);
+                rhs.push_back(rhs2);
+            } else {
+                // shouldn't happen
+                UNREACHABLE();
+            }
+        }
+        // TODO upper bound check
+
+        if (!rhs.empty()) {
+            expr_ref lhs(str_in_re, m);
+            expr_ref _rhs(mk_and(rhs), m);
+            assert_implication(lhs, _rhs);
+        }
+    }
+
    /*
     * Refine the lower bound on the length of a solution to a given automaton.
     * The method returns TRUE if a solution of length `current_lower_bound` exists,
@ -9596,7 +9652,6 @@ namespace smt {
        // regex automata
        if (m_params.m_RegexAutomata) {
            // TODO since heuristics might fail, the "no progress" flag might need to be handled specially here
-            // TODO learning of linear length constraints in the style of length automata, if possible?
            bool regex_axiom_add = false;
            for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
                expr * str_in_re = *it;
@ -9667,89 +9722,96 @@ namespace smt {
                        continue;
                    }

-                    // find a consistent automaton for this term
-                    bool found = false;
-                    regex_automaton_under_assumptions assumption;
-                    if (regex_automaton_assumptions.contains(re) &&
-                                                !regex_automaton_assumptions[re].empty()){
-                        for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
-                                it != regex_automaton_assumptions[re].end(); ++it) {
-                            regex_automaton_under_assumptions autA = *it;
-                            rational assumed_upper_bound, assumed_lower_bound;
-                            bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
-                            bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
-                            if (!assumes_upper_bound && !assumes_lower_bound) {
-                                // automaton with no assumptions is always usable
-                                assumption = autA;
-                                found = true;
-                                break;
-                            }
-                            // check consistency of bounds assumptions
-                        } // foreach(a in regex_automaton_assumptions)
-                    }
-                    if (found) {
-                        expr_ref pathConstraint(m);
-                        expr_ref characterConstraints(m);
-                        pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
-                        TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
-                        TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
-
-                        expr_ref_vector lhs_terms(m);
-                        if (current_assignment == l_true) {
-                            lhs_terms.push_back(str_in_re);
-                        } else {
-                            lhs_terms.push_back(m.mk_not(str_in_re));
-                        }
-                        lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
-                        expr_ref lhs(mk_and(lhs_terms), m);
-                        // If the automaton was built with the same polarity as the constraint,
-                        // assert directly. Otherwise, negate the path constraint
-                        if ( (current_assignment == l_true && assumption.get_polarity())
-                                || (current_assignment == l_false && !assumption.get_polarity())) {
-                            TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
-                            expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
-                            assert_implication(lhs, rhs);
-                        } else {
-                            TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
-                            expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
-                            assert_implication(lhs, rhs);
-                        }
-                        regex_terms_with_path_constraints.insert(str_in_re);
-                        m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
-                        regex_axiom_add = true;
-
-                        // increment LengthAttemptCount
-                        regex_inc_counter(regex_length_attempt_count, re);
-
-                        {
-                            unsigned v = regex_get_counter(regex_length_attempt_count, re);
-                            TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
-                        }
-
-                        continue;
+                    if (exact_length_value.is_zero()) {
+                        // shortcut
+                        expr_ref lhs(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
+                        expr_ref rhs(ctx.mk_eq_atom(str, mk_string("")), m);
+                        assert_implication(lhs, rhs);
                    } else {
-                        // no automata available, or else all bounds assumptions are invalid
-                        unsigned expected_complexity = estimate_regex_complexity(re);
-                        if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
-                            CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
-                                    tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
-                            eautomaton * aut = m_mk_aut(re);
-                            aut->compress();
-                            regex_automata.push_back(aut);
-                            regex_automaton_under_assumptions new_aut(re, aut, true);
-                            if (!regex_automaton_assumptions.contains(re)) {
-                                regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
-                            }
-                            regex_automaton_assumptions[re].push_back(new_aut);
-                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
-                            regex_axiom_add = true;
-                            // TODO immediately attempt to learn lower/upper bound info here
-                        } else {
-                            regex_inc_counter(regex_fail_count, str_in_re);
+                        // find a consistent automaton for this term
+                        bool found = false;
+                        regex_automaton_under_assumptions assumption;
+                        if (regex_automaton_assumptions.contains(re) &&
+                                !regex_automaton_assumptions[re].empty()){
+                            for (svector<regex_automaton_under_assumptions>::iterator it = regex_automaton_assumptions[re].begin();
+                                    it != regex_automaton_assumptions[re].end(); ++it) {
+                                regex_automaton_under_assumptions autA = *it;
+                                rational assumed_upper_bound, assumed_lower_bound;
+                                bool assumes_upper_bound = autA.get_upper_bound(assumed_upper_bound);
+                                bool assumes_lower_bound = autA.get_lower_bound(assumed_lower_bound);
+                                if (!assumes_upper_bound && !assumes_lower_bound) {
+                                    // automaton with no assumptions is always usable
+                                    assumption = autA;
+                                    found = true;
+                                    break;
+                                }
+                                // check consistency of bounds assumptions
+                            } // foreach(a in regex_automaton_assumptions)
                        }
-                        continue;
-                    }
-                }
+                        if (found) {
+                            expr_ref pathConstraint(m);
+                            expr_ref characterConstraints(m);
+                            pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
+                            TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
+                            TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
+
+                            expr_ref_vector lhs_terms(m);
+                            if (current_assignment == l_true) {
+                                lhs_terms.push_back(str_in_re);
+                            } else {
+                                lhs_terms.push_back(m.mk_not(str_in_re));
+                            }
+                            lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
+                            expr_ref lhs(mk_and(lhs_terms), m);
+                            // If the automaton was built with the same polarity as the constraint,
+                            // assert directly. Otherwise, negate the path constraint
+                            if ( (current_assignment == l_true && assumption.get_polarity())
+                                    || (current_assignment == l_false && !assumption.get_polarity())) {
+                                TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
+                                expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
+                                assert_implication(lhs, rhs);
+                            } else {
+                                TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
+                                expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
+                                assert_implication(lhs, rhs);
+                            }
+                            regex_terms_with_path_constraints.insert(str_in_re);
+                            m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
+                            regex_axiom_add = true;
+
+                            // increment LengthAttemptCount
+                            regex_inc_counter(regex_length_attempt_count, re);
+
+                            {
+                                unsigned v = regex_get_counter(regex_length_attempt_count, re);
+                                TRACE("str", tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;);
+                            }
+
+                            continue;
+                        } else {
+                            // no automata available, or else all bounds assumptions are invalid
+                            unsigned expected_complexity = estimate_regex_complexity(re);
+                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold || regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold) {
+                                CTRACE("str", regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold,
+                                        tout << "failed automaton threshold reached for " << mk_pp(str_in_re, m) << " -- automatically constructing full automaton" << std::endl;);
+                                eautomaton * aut = m_mk_aut(re);
+                                aut->compress();
+                                regex_automata.push_back(aut);
+                                regex_automaton_under_assumptions new_aut(re, aut, true);
+                                if (!regex_automaton_assumptions.contains(re)) {
+                                    regex_automaton_assumptions.insert(re, svector<regex_automaton_under_assumptions>());
+                                }
+                                regex_automaton_assumptions[re].push_back(new_aut);
+                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
+                                regex_axiom_add = true;
+                                find_automaton_initial_bounds(str_in_re, aut);
+                            } else {
+                                regex_inc_counter(regex_fail_count, str_in_re);
+                            }
+                            continue;
+                        }
+                    } // !length is zero
+                } // get_len_value()
                expr_ref str_len(mk_strlen(str), m);
                rational lower_bound_value;
                rational upper_bound_value;
@ -9858,10 +9920,12 @@ namespace smt {
                                }
                            }

-                            expr_ref lhs_terms(mk_and(lhs), m);
-                            expr_ref rhs_terms(mk_and(rhs), m);
-                            assert_implication(lhs_terms, rhs_terms);
-                            regex_axiom_add = true;
+                            if (!rhs.empty()) {
+                                expr_ref lhs_terms(mk_and(lhs), m);
+                                expr_ref rhs_terms(mk_and(rhs), m);
+                                assert_implication(lhs_terms, rhs_terms);
+                                regex_axiom_add = true;
+                            }
                        }
                    } else {
                        // no existing automata/assumptions.
@ -9878,7 +9942,7 @@ namespace smt {
                            regex_automaton_assumptions[re].push_back(new_aut);
                            TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                            regex_axiom_add = true;
-                            // TODO immediately attempt to learn lower/upper bound info here
+                            find_automaton_initial_bounds(str_in_re, aut);
                        } else {
                            // TODO check negation?
                            // TODO construct a partial automaton for R to the given upper bound?
@ -9976,10 +10040,12 @@ namespace smt {
                                    }
                                }

-                                expr_ref lhs_terms(mk_and(lhs), m);
-                                expr_ref rhs_terms(mk_and(rhs), m);
-                                assert_implication(lhs_terms, rhs_terms);
-                                regex_axiom_add = true;
+                                if (!rhs.empty()) {
+                                    expr_ref lhs_terms(mk_and(lhs), m);
+                                    expr_ref rhs_terms(mk_and(rhs), m);
+                                    assert_implication(lhs_terms, rhs_terms);
+                                    regex_axiom_add = true;
+                                }
                            }
                        } else {
                            // no existing automata/assumptions.
@ -9996,7 +10062,7 @@ namespace smt {
                                regex_automaton_assumptions[re].push_back(new_aut);
                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                                regex_axiom_add = true;
-                                // TODO immediately attempt to learn lower/upper bound info here
+                                find_automaton_initial_bounds(str_in_re, aut);
                            } else {
                                // TODO check negation?
                                // TODO construct a partial automaton for R to the given lower bound?
@ -10016,7 +10082,7 @@ namespace smt {
                        // and doing so without bounds is not difficult
                        bool existingAutomata = (regex_automaton_assumptions.contains(re) && !regex_automaton_assumptions[re].empty());
                        bool failureThresholdExceeded = (regex_get_counter(regex_fail_count, str_in_re) >= m_params.m_RegexAutomata_FailedAutomatonThreshold);
-                        if (!existingAutomata || failureThresholdExceeded) {
+                        if (!existingAutomata) {
                            unsigned expected_complexity = estimate_regex_complexity(re);
                            if (expected_complexity <= m_params.m_RegexAutomata_DifficultyThreshold
                                    || failureThresholdExceeded) {
@ -10030,7 +10096,7 @@ namespace smt {
                                regex_automaton_assumptions[re].push_back(new_aut);
                                TRACE("str", tout << "add new automaton for " << mk_pp(re, m) << ": no assumptions" << std::endl;);
                                regex_axiom_add = true;
-                                // TODO immediately attempt to learn lower/upper bound info here
+                                find_automaton_initial_bounds(str_in_re, aut);
                            } else {
                                regex_inc_counter(regex_fail_count, str_in_re);
                            }
@ -10165,42 +10231,62 @@ namespace smt {
                        }
                    }
                } // foreach(entry in intersect_constraints)
+                aut_inter->compress();
                TRACE("str", tout << "intersected " << used_intersect_constraints.size() << " constraints" << std::endl;);
-                if (aut_inter != NULL && aut_inter->is_empty()) {
-                    TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
-                    expr_ref_vector conflict_terms(m);

-                    for (svector<regex_automaton_under_assumptions>::iterator aut_it = used_intersect_constraints.begin();
-                            aut_it != used_intersect_constraints.end(); ++aut_it) {
-                        regex_automaton_under_assumptions aut = *aut_it;
-                        expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
-                        lbool current_assignment = ctx.get_assignment(str_in_re_term);
-                        if (current_assignment == l_true) {
-                            conflict_terms.push_back(str_in_re_term);
-                        } else if (current_assignment == l_false) {
-                            conflict_terms.push_back(m.mk_not(str_in_re_term));
-                        }
-                        // add length assumptions, if any
-                        rational ub;
-                        if (aut.get_upper_bound(ub)) {
-                            expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
-                            conflict_terms.push_back(ub_term);
-                        }
-                        rational lb;
-                        if (aut.get_lower_bound(lb)) {
-                            expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
-                            conflict_terms.push_back(lb_term);
+                expr_ref_vector conflict_terms(m);
+                expr_ref conflict_lhs(m);
+                for (svector<regex_automaton_under_assumptions>::iterator aut_it = used_intersect_constraints.begin();
+                        aut_it != used_intersect_constraints.end(); ++aut_it) {
+                    regex_automaton_under_assumptions aut = *aut_it;
+                    expr * str_in_re_term(u.re.mk_in_re(str, aut.get_regex_term()));
+                    lbool current_assignment = ctx.get_assignment(str_in_re_term);
+                    if (current_assignment == l_true) {
+                        conflict_terms.push_back(str_in_re_term);
+                    } else if (current_assignment == l_false) {
+                        conflict_terms.push_back(m.mk_not(str_in_re_term));
+                    }
+                    // add length assumptions, if any
+                    rational ub;
+                    if (aut.get_upper_bound(ub)) {
+                        expr_ref ub_term(m_autil.mk_le(mk_strlen(str), m_autil.mk_numeral(ub, true)), m);
+                        conflict_terms.push_back(ub_term);
+                    }
+                    rational lb;
+                    if (aut.get_lower_bound(lb)) {
+                        expr_ref lb_term(m_autil.mk_ge(mk_strlen(str), m_autil.mk_numeral(lb, true)), m);
+                        conflict_terms.push_back(lb_term);
+                    }
+                }
+                conflict_lhs = mk_and(conflict_terms);
+
+                if (used_intersect_constraints.size() > 1 && aut_inter != NULL) {
+                    // check whether the intersection is only the empty string
+                    unsigned initial_state = aut_inter->init();
+                    if (aut_inter->final_states().size() == 1 && aut_inter->is_final_state(initial_state)) {
+                        // initial state is final and it is the only final state
+                        // if there are no moves from the initial state,
+                        // the only solution is the empty string
+                        if (aut_inter->get_moves_from(initial_state).empty()) {
+                            TRACE("str", tout << "product automaton only accepts empty string" << std::endl;);
+                            expr_ref rhs1(ctx.mk_eq_atom(str, mk_string("")), m);
+                            expr_ref rhs2(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(rational::zero(), true)), m);
+                            expr_ref rhs(m.mk_and(rhs1, rhs2), m);
+                            assert_implication(conflict_lhs, rhs);
+                            regex_axiom_add = true;
                        }
                    }
+                }

+                if (aut_inter != NULL && aut_inter->is_empty()) {
+                    TRACE("str", tout << "product automaton is empty; asserting conflict clause" << std::endl;);
                    expr_ref conflict_clause(m.mk_not(mk_and(conflict_terms)), m);
                    assert_axiom(conflict_clause);
                    regex_axiom_add = true;
                }
            } // foreach (entry in regex_terms_by_string)
-
            if (regex_axiom_add) {
-                return FC_CONTINUE;
+                //return FC_CONTINUE;
            }
        } // RegexAutomata

--- a/src/smt/theory_str.h
+++ b/src/smt/theory_str.h
@ -556,6 +556,7 @@ protected:
    bool check_regex_length_linearity(expr * re);
    bool check_regex_length_linearity_helper(expr * re, bool already_star);
    expr_ref infer_all_regex_lengths(expr * lenVar, expr * re, expr_ref_vector & freeVariables);
+    void find_automaton_initial_bounds(expr * str_in_re, eautomaton * aut);
    bool refine_automaton_lower_bound(eautomaton * aut, rational current_lower_bound, rational & refined_lower_bound);
    bool refine_automaton_upper_bound(eautomaton * aut, rational current_upper_bound, rational & refined_upper_bound);
    expr_ref generate_regex_path_constraints(expr * stringTerm, eautomaton * aut, rational lenVal, expr_ref & characterConstraints);