z3str3: make counterexamples less naive, and check regex membership more efficiently (#4358)

* z3str3: make counterexamples less naive, and check regex membership more efficiently
* z3str3: construct even better counterexamples for regex membership
2025-08-05 19:00:25 +00:00 · 2020-05-28 11:57:08 -05:00 · 2020-05-28 11:57:08 -05:00 · f3b2a082ae
commit f3b2a082ae
parent 56bf4c144b
3 changed files with 94 additions and 83 deletions
--- a/src/smt/theory_str.cpp
+++ b/src/smt/theory_str.cpp
@ -8633,11 +8633,9 @@ namespace smt {

            // We must be be 100% certain that if there are any regex constraints,
            // the string assignment for each variable is consistent with the automaton.
-            // The (probably) easiest way to do this is to ensure
-            // that we have path constraints set up for every assigned regex term.
+            bool regexOK = true;
            if (!regex_terms.empty()) {
-                for (obj_hashtable<expr>::iterator it = regex_terms.begin(); it != regex_terms.end(); ++it) {
-                    expr * str_in_re = *it;
+                for (auto& str_in_re : regex_terms) {
                    expr * str;
                    expr * re;
                    u.str.is_in_re(str_in_re, str, re);
@ -8645,25 +8643,53 @@ namespace smt {
                    if (current_assignment == l_undef) {
                        continue;
                    }
-                    if (!regex_terms_with_path_constraints.contains(str_in_re)) {
-                        TRACE("str", tout << "assigned regex term " << mk_pp(str_in_re, m) << " has no path constraints -- continuing search" << std::endl;);
-                        return FC_CONTINUE;
+                    zstring strValue;
+                    if (get_string_constant_eqc(str, strValue)) {
+                        // try substituting the current assignment and solving the regex
+                        expr_ref valueInRe(u.re.mk_in_re(mk_string(strValue), re), m);
+                        ctx.get_rewriter()(valueInRe);
+                        if (m.is_true(valueInRe)) {
+                            if (current_assignment == l_false) {
+                                TRACE("str", tout << "regex conflict: " << mk_pp(str, m) << " = \"" << strValue << "\" but must not be in the language " << mk_pp(re, m) << std::endl;);
+                                expr_ref conflictClause(m.mk_or(m.mk_not(ctx.mk_eq_atom(str, mk_string(strValue))), str_in_re), m);
+                                assert_axiom(conflictClause);
+                                add_persisted_axiom(conflictClause);
+                                return FC_CONTINUE;
+                            }
+                        } else if (m.is_false(valueInRe)) {
+                            if (current_assignment == l_true) {
+                                TRACE("str", tout << "regex conflict: " << mk_pp(str, m) << " = \"" << strValue << "\" but must be in the language " << mk_pp(re, m) << std::endl;);
+                                expr_ref conflictClause(m.mk_or(m.mk_not(ctx.mk_eq_atom(str, mk_string(strValue))), m.mk_not(str_in_re)), m);
+                                assert_axiom(conflictClause);
+                                add_persisted_axiom(conflictClause);
+                                return FC_CONTINUE;
+                            }
+                        } else {
+                            // try to keep going, but don't assume the current assignment is right or wrong
+                            regexOK = false;
+                            break;
+                        }
+                    } else {
+                        regexOK = false;
+                        break;
                    }
                } // foreach (str.in.re in regex_terms)
            }
-
-            if (unused_internal_variables.empty()) {
-                TRACE("str", tout << "All variables are assigned. Done!" << std::endl;);
-                m_stats.m_solved_by = 2;
-                return FC_DONE;
-            } else {
-                TRACE("str", tout << "Assigning decoy values to free internal variables." << std::endl;);
-                for (std::set<expr*>::iterator it = unused_internal_variables.begin(); it != unused_internal_variables.end(); ++it) {
-                    expr * var = *it;
-                    expr_ref assignment(m.mk_eq(var, mk_string("**unused**")), m);
-                    assert_axiom(assignment);
+            // we're not done if some variable in a regex membership predicate was unassigned
+            if (regexOK) {
+                if (unused_internal_variables.empty()) {
+                    TRACE("str", tout << "All variables are assigned. Done!" << std::endl;);
+                    m_stats.m_solved_by = 2;
+                    return FC_DONE;
+                } else {
+                    TRACE("str", tout << "Assigning decoy values to free internal variables." << std::endl;);
+                    for (std::set<expr*>::iterator it = unused_internal_variables.begin(); it != unused_internal_variables.end(); ++it) {
+                        expr * var = *it;
+                        expr_ref assignment(m.mk_eq(var, mk_string("**unused**")), m);
+                        assert_axiom(assignment);
+                    }
+                    return FC_CONTINUE;
                }
-                return FC_CONTINUE;
            }
        }

--- a/src/smt/theory_str_mc.cpp
+++ b/src/smt/theory_str_mc.cpp
@ -571,7 +571,33 @@ namespace smt {
                // If the membership constraint is true, we assert a conflict clause.
                // If the membership constraint is false, we ignore the constraint.
                if (polarity) {
-                    cex = m.mk_or(m.mk_not(f), m.mk_not(ctx.mk_eq_atom(mk_strlen(str), mk_int(str_chars.size()))));
+                    // Decompose `str` into its components if it is a concatenation of terms.
+                    // This fixes cases where the length of S in (S in RE) might be correct
+                    // if the lengths of components of S are assigned in a different way.
+                    expr_ref_vector str_terms(m);
+                    expr_ref_vector str_terms_eq_len(m);
+                    str_terms.push_back(str);
+                    while (!str_terms.empty()) {
+                        expr* str_term = str_terms.back();
+                        str_terms.pop_back();
+                        expr* arg0;
+                        expr* arg1;
+                        if (u.str.is_concat(str_term, arg0, arg1)) {
+                            str_terms.push_back(arg0);
+                            str_terms.push_back(arg1);
+                        } else {
+                            rational termLen;
+                            if (fixed_length_get_len_value(str_term, termLen)) {
+                                str_terms_eq_len.push_back(ctx.mk_eq_atom(mk_strlen(str_term), mk_int(termLen)));
+                            } else {
+                                // this is strange, since we knew the length of `str` in order to get here
+                                cex = expr_ref(m_autil.mk_ge(mk_strlen(str_term), mk_int(0)), m);
+                                return false;
+                            }
+                        }
+                    }
+
+                    cex = m.mk_or(m.mk_not(f), m.mk_not(mk_and(str_terms_eq_len)));
                    ctx.get_rewriter()(cex);
                    return false;
                } else {
@ -887,6 +913,9 @@ namespace smt {
        uninterpreted_to_char_subterm_map.reset();
        fixed_length_lesson.reset();

+        // All reduced Boolean formulas in the current assignment
+        expr_ref_vector fixed_length_reduced_boolean_formulas(m);
+
        // Boolean formulas on which to apply abstraction refinement.
        expr_ref_vector abstracted_boolean_formulas(m);

@ -951,6 +980,7 @@ namespace smt {
                            add_persisted_axiom(cex);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    } else {
                        TRACE("str_fl", tout << "skip reducing formula " << mk_pp(f, m) << ", not an equality over strings" << std::endl;);
                    }
@ -963,6 +993,7 @@ namespace smt {
                        add_persisted_axiom(cex_clause);
                        return l_undef;
                    }
+                    fixed_length_reduced_boolean_formulas.push_back(f);
                } else if (u.str.is_contains(f)) {
                    // TODO in some cases (e.g. len(haystack) is only slightly greater than len(needle))
                    // we might be okay to assert the full disjunction because there are very few disjuncts
@ -978,6 +1009,7 @@ namespace smt {
                            add_persisted_axiom(cex);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    }
                } else if (u.str.is_prefix(f)) {
                    TRACE("str_fl", tout << "reduce positive prefix: " << mk_pp(f, m) << std::endl;);
@ -988,6 +1020,7 @@ namespace smt {
                        add_persisted_axiom(cex);
                        return l_undef;
                    }
+                    fixed_length_reduced_boolean_formulas.push_back(f);
                } else if (u.str.is_suffix(f)) {
                    TRACE("str_fl", tout << "reduce positive suffix: " << mk_pp(f, m) << std::endl;);
                    expr_ref cex(m);
@ -997,6 +1030,7 @@ namespace smt {
                        add_persisted_axiom(cex);
                        return l_undef;
                    }
+                    fixed_length_reduced_boolean_formulas.push_back(f);
                }else if (m.is_not(f, subterm)) {
                    // if subterm is a string formula such as an equality, reduce it as a disequality
                    if (m.is_eq(subterm, lhs, rhs)) {
@ -1012,6 +1046,7 @@ namespace smt {
                                add_persisted_axiom(cex);
                                return l_undef;
                            }
+                            fixed_length_reduced_boolean_formulas.push_back(f);
                        }
                    } else if (u.str.is_in_re(subterm)) {
                        TRACE("str_fl", tout << "reduce negative regex membership: " << mk_pp(f, m) << std::endl;);
@ -1022,6 +1057,7 @@ namespace smt {
                            add_persisted_axiom(cex_clause);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    } else if (u.str.is_contains(subterm)) {
                        TRACE("str_fl", tout << "reduce negative contains: " << mk_pp(subterm, m) << std::endl;);
                        expr_ref cex(m);
@ -1031,6 +1067,7 @@ namespace smt {
                            add_persisted_axiom(cex);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    } else if (u.str.is_prefix(subterm)) {
                        TRACE("str_fl", tout << "reduce negative prefix: " << mk_pp(subterm, m) << std::endl;);
                        expr_ref cex(m);
@ -1040,6 +1077,7 @@ namespace smt {
                            add_persisted_axiom(cex);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    } else if (u.str.is_suffix(subterm)) {
                        TRACE("str_fl", tout << "reduce negative suffix: " << mk_pp(subterm, m) << std::endl;);
                        expr_ref cex(m);
@ -1049,6 +1087,7 @@ namespace smt {
                            add_persisted_axiom(cex);
                            return l_undef;
                        }
+                        fixed_length_reduced_boolean_formulas.push_back(f);
                    } else {
                        TRACE("str_fl", tout << "skip reducing formula " << mk_pp(f, m) << ", not a boolean formula we handle" << std::endl;);
                    }
@ -1081,6 +1120,10 @@ namespace smt {
                }
                tout << std::endl;
            }
+            tout << "reduced boolean formulas:" << std::endl;
+              for (auto e : fixed_length_reduced_boolean_formulas) {
+                  tout << mk_pp(e, m) << std::endl;
+              }
        );

        TRACE("str_fl", tout << "calling subsolver" << std::endl;);
@ -1184,6 +1227,9 @@ namespace smt {
                    rational val = e.get_value();
                    cex.push_back(m.mk_eq(u.str.mk_length(var), mk_int(val)));
                }
+                for (auto e : fixed_length_reduced_boolean_formulas) {
+                    cex.push_back(e);
+                }
                return l_false;
            } else {
                TRACE("str_fl", tout << "subsolver found UNSAT; reconstructing unsat core" << std::endl;);
--- a/src/smt/theory_str_regex.cpp
+++ b/src/smt/theory_str_regex.cpp
@ -206,69 +206,8 @@ namespace smt {
                        regex_inc_counter(regex_length_attempt_count, re);
                        continue;
                    } else {
-                        expr_ref pathConstraint(m);
-                        expr_ref characterConstraints(m);
-                        pathConstraint = generate_regex_path_constraints(str, assumption.get_automaton(), exact_length_value, characterConstraints);
-                        TRACE("str", tout << "generated regex path constraint " << mk_pp(pathConstraint, m) << std::endl;);
-                        TRACE("str", tout << "character constraints are " << mk_pp(characterConstraints, m) << std::endl;);
-
-                        expr_ref_vector lhs_terms(m);
-                        if (current_assignment == l_true) {
-                            lhs_terms.push_back(str_in_re);
-                        } else {
-                            lhs_terms.push_back(m.mk_not(str_in_re));
-                        }
-                        lhs_terms.push_back(ctx.mk_eq_atom(mk_strlen(str), m_autil.mk_numeral(exact_length_value, true)));
-                        expr_ref lhs(mk_and(lhs_terms), m);
-
-                        // If the path constraint comes out as "false", this means there are no paths of that length
-                        // in the automaton. If the polarity is the same, we can assert a conflict clause.
-                        // If the polarity is opposite, we ignore the path constraint.
-
-                        if (m.is_false(pathConstraint)) {
-                            if ( (current_assignment == l_true && assumption.get_polarity())
-                                    || (current_assignment == l_false && !assumption.get_polarity())) {
-                                // automaton and constraint have same polarity -- assert conflict clause
-                                TRACE("str", tout << "path constraint is false with matching polarity; asserting conflict clause" << std::endl;);
-                                expr_ref conflict(m.mk_not(mk_and(lhs_terms)), m);
-                                assert_axiom(conflict);
-                                // don't set up "regex_terms_with_path_constraints" as a conflict clause is not a path constraint
-                            } else {
-                                // automaton and constraint have opposite polarity -- ignore path constraint
-                                TRACE("str", tout << "path constraint is false with opposite polarity; ignoring path constraint" << std::endl;);
-                                assert_implication(lhs, characterConstraints);
-                                regex_terms_with_path_constraints.insert(str_in_re);
-                                m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
-                            }
-                            regex_axiom_add = true;
-                        } else {
-                            // If the automaton was built with the same polarity as the constraint,
-                            // assert directly. Otherwise, negate the path constraint
-                            if ( (current_assignment == l_true && assumption.get_polarity())
-                                    || (current_assignment == l_false && !assumption.get_polarity())) {
-                                TRACE("str", tout << "automaton and regex term have same polarity" << std::endl;);
-                                expr_ref rhs(m.mk_and(pathConstraint, characterConstraints), m);
-                                assert_implication(lhs, rhs);
-                            } else {
-                                TRACE("str", tout << "automaton and regex term have opposite polarity" << std::endl;);
-                                expr_ref rhs(m.mk_and(m.mk_not(pathConstraint), characterConstraints), m);
-                                assert_implication(lhs, rhs);
-                            }
-                            regex_terms_with_path_constraints.insert(str_in_re);
-                            m_trail_stack.push(insert_obj_trail<theory_str, expr>(regex_terms_with_path_constraints, str_in_re));
-                            regex_axiom_add = true;
-                        }
-
-                        // increment LengthAttemptCount
-                        regex_inc_counter(regex_length_attempt_count, re);
-
-                        TRACE("str",
-                              {
-                                  unsigned v = regex_get_counter(regex_length_attempt_count, re);
-                                  tout << "length attempt count for " << mk_pp(re, m) << " is " << v << std::endl;
-                              });
-
-                        continue;
+                        // fixed-length model construction handles path constraints on our behalf, and with a better reduction
+                        return;
                    }
                } else {
                    // no automata available, or else all bounds assumptions are invalid