3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-03-17 18:43:45 +00:00

Fix nseq regex gap: increase BFS limit, eager length propagation, relax precheck SAT condition

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-03-15 19:03:47 +00:00
parent d5a74647c3
commit 77fafec3ff
2 changed files with 106 additions and 13 deletions

View file

@ -314,6 +314,56 @@ namespace smt {
expr* s_expr = mem.m_str->get_expr();
if (s_expr)
ensure_length_var(s_expr);
// eagerly propagate length bounds derived from the regex structure:
// str.in_re(s, r) => len(s) >= min_length(r) (when min > 0)
// str.in_re(s, r) => len(s) <= max_length(r) (when max < UINT_MAX)
// This mirrors seq's propagate_in_re length inference and is critical
// for problems with re.loop[n:n] constraints (fixes AutomataArk UNSAT cases).
if (s_expr) {
expr* re_expr = mem.m_regex->get_expr();
if (re_expr && m_seq.is_re(re_expr)) {
unsigned min_len = m_seq.re.min_length(re_expr);
unsigned max_len = m_seq.re.max_length(re_expr);
if (min_len > 0 || max_len < UINT_MAX)
propagate_regex_length_bounds(s_expr, min_len, max_len, src.m_lit);
}
}
}
// Propagate length bounds for str.in_re(s, r) → len(s) ∈ [min_len, max_len].
// Adds the conditional clauses as theory axioms so the arithmetic theory
// can derive conflicts or tighten bounds without waiting for final_check.
void theory_nseq::propagate_regex_length_bounds(expr* s, unsigned min_len, unsigned max_len, literal antecedent) {
context& ctx = get_context();
ast_manager& m = get_manager();
expr_ref len_s(m_seq.str.mk_length(s), m);
if (!ctx.e_internalized(len_s))
ctx.internalize(len_s, false);
if (min_len > 0) {
expr_ref bound(m_autil.mk_ge(len_s, m_autil.mk_int(min_len)), m);
if (!ctx.b_internalized(bound))
ctx.internalize(bound, true);
literal len_lit = ctx.get_literal(bound);
if (ctx.get_assignment(len_lit) != l_true) {
literal clause[2] = {~antecedent, len_lit};
ctx.mk_th_axiom(get_id(), 2, clause);
++m_num_length_axioms;
}
}
if (max_len < UINT_MAX) {
expr_ref bound(m_autil.mk_le(len_s, m_autil.mk_int(max_len)), m);
if (!ctx.b_internalized(bound))
ctx.internalize(bound, true);
literal len_lit = ctx.get_literal(bound);
if (ctx.get_assignment(len_lit) != l_true) {
literal clause[2] = {~antecedent, len_lit};
ctx.mk_th_axiom(get_id(), 2, clause);
++m_num_length_axioms;
}
}
}
void theory_nseq::ensure_length_var(expr* e) {
@ -867,8 +917,9 @@ namespace smt {
// Group membership indices by variable snode id.
// Only consider memberships whose string snode is a plain variable (s_var).
// Non-variable memberships (e.g., str.substr(X,...)) arise from the SMT
// preprocessor decomposing re.++ constraints and are handled separately below.
u_map<unsigned_vector> var_to_mems;
bool all_var_str = true;
for (unsigned i = 0; i < mems.size(); ++i) {
auto const& mem = mems[i];
@ -878,9 +929,6 @@ namespace smt {
auto& vec = var_to_mems.insert_if_not_there(mem.m_str->id(), unsigned_vector());
vec.push_back(i);
}
else {
all_var_str = false;
}
}
if (var_to_mems.empty())
@ -901,10 +949,15 @@ namespace smt {
if (regexes.empty())
continue;
// Use a bounded BFS (50 states) for the pre-check to keep it fast.
// If the BFS times out (l_undef), we fall through to DFS.
lbool result = m_regex.check_intersection_emptiness(regexes, 50);
// Use a full BFS (up to 10 000 states) for the pre-check.
// The 50-state limit that was here previously was too small for
// AutomataArk-class regexes (HTML/HTTP/date patterns) and caused
// the precheck to always return l_undef, falling through to the
// much-slower DFS. Using the default 10 000-state limit mirrors
// the budget used by is_empty_bfs() directly.
lbool result = m_regex.check_intersection_emptiness(regexes);
IF_VERBOSE(2, verbose_stream() << "nseq precheck: var " << var_id << " result=" << result << "\n";);
if (result == l_true) {
// Intersection is empty → the memberships on this variable are
// jointly unsatisfiable. Assert a conflict from all their literals.
@ -930,12 +983,51 @@ namespace smt {
return l_undef; // cannot fully determine; let DFS decide
// All variables' regex intersections are non-empty.
// If there are no word equations and no disequalities, variables are
// independent and each can be assigned a witness string → SAT.
if (all_var_str && m_state.str_eqs().empty() && m_state.diseqs().empty()) {
TRACE(seq, tout << "nseq regex precheck: all intersections non-empty, "
<< "no word eqs/diseqs → SAT\n";);
return l_false; // signals SAT (non-empty / satisfiable)
// If there are no word equations and no disequalities, check if it is
// safe to declare SAT.
//
// When all string arguments are plain variables (all_var_str=true), the
// constraints are independent and a per-variable witness proves SAT.
//
// When some string arguments are non-variable (e.g., str.substr(X,...)),
// these arise from the SMT preprocessor decomposing re.++ constraints into
// per-segment memberships. The preprocessing guarantees internal consistency:
// any witness for the original variable membership also satisfies the derived
// substring memberships. We do a quick non-emptiness check on each non-variable
// regex to ensure none are individually empty (an empty-regex conflict would
// already have been caught by propagate_pos_mem, but we double-check here for
// safety). If all pass, we can declare SAT.
if (m_state.str_eqs().empty() && m_state.diseqs().empty() && !var_to_mems.empty()) {
bool nonvar_ok = true;
for (unsigned i = 0; i < mems.size(); ++i) {
auto const& mem = mems[i];
if (!mem.m_str || !mem.m_regex || mem.m_str->is_var())
continue;
// Quick non-emptiness check for the derived regex.
// We use a small budget (200 states) because these non-variable
// memberships are typically simple sub-regexes (character classes,
// short to_re strings, complements of such) whose emptiness can be
// decided in very few derivative steps. A larger budget is not
// needed here and would slow down the precheck unnecessarily.
lbool r = m_regex.is_empty_bfs(mem.m_regex, 200);
if (r == l_true) {
// regex is empty → UNSAT for this membership
enode_pair_vector eqs;
literal_vector lits;
mem_source const& src = m_state.get_mem_source(i);
if (get_context().get_assignment(src.m_lit) == l_true)
lits.push_back(src.m_lit);
set_conflict(eqs, lits);
return l_true;
}
if (r == l_undef)
nonvar_ok = false;
}
if (nonvar_ok) {
TRACE(seq, tout << "nseq regex precheck: all intersections non-empty, "
<< "no word eqs/diseqs → SAT\n";);
return l_false; // signals SAT (non-empty / satisfiable)
}
}
return l_undef; // mixed constraints; let DFS decide

View file

@ -117,6 +117,7 @@ namespace smt {
void propagate_diseq(unsigned idx);
void propagate_pos_mem(unsigned idx);
void ensure_length_var(expr* e);
void propagate_regex_length_bounds(expr* s, unsigned min_len, unsigned max_len, literal antecedent);
// higher-order term unfolding
bool unfold_ho_terms();