From faeb936a8358c68177f796c5656133c71bb032f8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Mar 2026 02:25:29 +0000 Subject: [PATCH] Continue nseq: regex synthesis, model generation, length propagation Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/theory_nseq.cpp | 330 ++++++++++++++++++++++++++++++++++++---- src/smt/theory_nseq.h | 6 + 2 files changed, 305 insertions(+), 31 deletions(-) diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index b1128317e..368f5f310 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -94,8 +94,11 @@ namespace smt { if (check_disequalities()) return FC_CONTINUE; - if (all_eqs_solved() && m_state.mems().empty()) - return FC_DONE; + if (all_eqs_solved()) { + // All word equations solved; check if remaining memberships are handled + if (all_mems_checked()) + return FC_DONE; + } return FC_GIVEUP; } @@ -195,6 +198,12 @@ namespace smt { if (m_find.find(v1) != m_find.find(v2)) m_find.merge(v1, v2); m_state.add_eq(o1, o2, dep); + // Propagate length equality: len(o1) = len(o2) + ensure_length_axiom(o1); + ensure_length_axiom(o2); + expr_ref len1 = mk_len(o1); + expr_ref len2 = mk_len(o2); + add_axiom(mk_eq(len1, len2, false)); IF_VERBOSE(3, verbose_stream() << "new_eq_eh: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m) << " (total eqs: " << m_state.eqs().size() << ")\n"); TRACE(seq, tout << "new eq: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m) << "\n";); @@ -1397,6 +1406,84 @@ namespace smt { return r; } + // DFS: find a string of exactly target_len in regex r. + // On success, appends characters to 'result' and returns true. + // cur_len is how many characters have already been appended. + bool theory_nseq::dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result) { + if (cur_len == target_len) { + expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m)); + m_rewrite(nullable); + return m.is_true(nullable); + } + // Try printable characters: lowercase, uppercase, digits, then common punctuation + static const unsigned char priority[] = { + 'a','b','c','d','e','f','g','h','i','j','k','l','m', + 'n','o','p','q','r','s','t','u','v','w','x','y','z', + '0','1','2','3','4','5','6','7','8','9', + 'A','B','C','D','E','F','G','H','I','J','K','L','M', + 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z', + ' ','.', ',', '!', '?', '-', '_', '@', '#', '$', '%', '&', '*' + }; + for (unsigned c : priority) { + expr_ref ch(m_util.mk_char(c), m); + expr_ref dr = m_seq_rewrite.mk_derivative(ch, expr_ref(r, m)); + m_rewrite(dr); + if (m_util.re.is_empty(dr)) + continue; + result = result + zstring(c); + if (dfs_regex(dr, target_len, cur_len + 1, result)) + return true; + // Backtrack: remove last character + result = result.extract(0, result.length() - 1); + } + return false; + } + + // Synthesize a concrete string of exactly `len` accepted by `regex`. + // Returns true and sets `result` on success. + bool theory_nseq::synthesize_regex_string(expr* regex, unsigned len, zstring& result) { + result = zstring(); + return dfs_regex(regex, len, 0, result); + } + + // Add length-bound axioms from regex structure: len(s) >= min_length(r). + // Also adds: len(s) = 0 → nullable(r) (otherwise conflict). + bool theory_nseq::add_regex_length_axioms(nseq_mem const& mem) { + expr* s = mem.str(); + expr* r = mem.regex(); + bool sign = mem.sign(); + nseq_dependency* dep = mem.dep(); + bool progress = false; + + if (sign) { + // s ∈ r → len(s) >= min_length(r) + unsigned minlen = m_util.re.min_length(r); + if (minlen > 0) { + expr_ref len_s = mk_len(s); + ensure_length_axiom(s); + literal lb = mk_literal(m_autil.mk_ge(len_s, m_autil.mk_int(minlen))); + if (ctx.get_assignment(lb) != l_true) { + add_axiom(lb); + progress = true; + } + } + // If r is not nullable and len(s) = 0, conflict + expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m)); + m_rewrite(nullable); + if (m.is_false(nullable)) { + // s ∈ r → s ≠ "" + expr_ref emp(m_util.str.mk_empty(s->get_sort()), m); + ensure_enode(emp); + literal s_empty = mk_eq(s, emp, false); + if (ctx.get_assignment(~s_empty) != l_true) { + add_axiom(~s_empty); + progress = true; + } + } + } + return progress; + } + bool theory_nseq::check_regex_memberships() { bool progress = false; for (auto const& mem : m_state.mems()) { @@ -1414,43 +1501,99 @@ namespace smt { bool sign = mem.sign(); nseq_dependency* dep = mem.dep(); + // Add structural length axioms from the regex + if (add_regex_length_axioms(mem)) + return true; + // Try to determine the string value from the e-graph zstring sval; - if (!is_ground_string(s, sval)) + if (is_ground_string(s, sval)) { + // Compute derivatives for the known string + expr_ref derived = derive_regex(r, sval); + + // Check nullable: does the derived regex accept the empty word? + expr_ref nullable = m_seq_rewrite.is_nullable(derived); + m_rewrite(nullable); + + if (sign) { + // Positive membership: s must be in r + if (m.is_false(nullable) || m_util.re.is_empty(derived)) { + set_conflict(dep); + return true; + } + } + else { + // Negative membership: s must NOT be in r + if (m.is_true(nullable)) { + set_conflict(dep); + return true; + } + } return false; + } - // Compute derivatives for the known string - expr_ref derived = derive_regex(r, sval); - - // Check nullable: does the derived regex accept the empty word? - expr_ref nullable = m_seq_rewrite.is_nullable(derived); - m_rewrite(nullable); - + // String is not yet ground. For positive membership: + // if we know the exact length, synthesize a concrete string. if (sign) { - // Positive membership: s must be in r - if (m.is_false(nullable)) { - // String is fully determined but regex rejects → conflict - set_conflict(dep); - return true; - } - } - else { - // Negative membership: s must NOT be in r - if (m.is_true(nullable)) { - // String is fully determined and regex accepts → conflict - set_conflict(dep); - return true; + rational len_val; + if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) { + unsigned len = len_val.get_unsigned(); + // Feasibility check only: detect conflicts but don't force a specific value. + // Model generation (mk_value) will synthesize the actual string. + zstring synth; + if (!synthesize_regex_string(r, len, synth)) { + // No string of that length accepted by regex - conflict + IF_VERBOSE(3, verbose_stream() << " regex conflict: no string of len=" << len + << " in " << mk_bounded_pp(r, m, 2) << "\n";); + set_conflict(dep); + return true; + } + IF_VERBOSE(3, verbose_stream() << " regex feasible: len=" << len + << " has witness \"" << synth << "\"\n";); } } - // Check if the derived regex is the empty set - if (sign && m_util.re.is_empty(derived)) { - set_conflict(dep); - return true; - } return false; } + // Returns true if every membership constraint is already verified + // (ground strings checked) or provably feasible (non-ground with known length). + bool theory_nseq::all_mems_checked() { + for (auto const& mem : m_state.mems()) { + expr* s = mem.str(); + expr* r = mem.regex(); + bool sign = mem.sign(); + nseq_dependency* dep = mem.dep(); + zstring sval; + if (is_ground_string(s, sval)) { + // Verify the ground string + expr_ref derived = derive_regex(r, sval); + expr_ref nullable = m_seq_rewrite.is_nullable(derived); + m_rewrite(nullable); + if (sign && !m.is_true(nullable)) + return false; + if (!sign && !m.is_false(nullable)) + return false; + } + else if (sign) { + // Non-ground positive membership: check feasibility by length + rational len_val; + if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) { + zstring synth; + if (!synthesize_regex_string(r, len_val.get_unsigned(), synth)) + return false; // infeasible; conflict should have been set by check_regex_mem + // Feasible - model generator will assign the right value + } + else { + // Length unknown and regex is empty: unconditionally infeasible + if (m_util.re.is_empty(r)) + return false; + } + } + } + return true; + } + // ------------------------------------------------------- // Length reasoning // ------------------------------------------------------- @@ -1495,7 +1638,34 @@ namespace smt { bool theory_nseq::get_length(expr* e, rational& val) { expr_ref len = mk_len(e); - return m_arith_value.get_value(len, val); + if (m_arith_value.get_value(len, val)) + return true; + // Structural fallback: for str.substr(s, off, n) with concrete off and n, + // use the explicit length argument if it's a numeral and s has sufficient length. + expr* s = nullptr, *offset = nullptr, *len_arg = nullptr; + if (m_util.str.is_extract(e, s, offset, len_arg)) { + rational len_n; + if (m_autil.is_numeral(len_arg, len_n) && len_n.is_nonneg()) { + rational off_n; + if (m_autil.is_numeral(offset, off_n) && off_n.is_nonneg()) { + rational parent_len; + if (get_length(s, parent_len) && + off_n + len_n <= parent_len) { + val = len_n; + return true; + } + } + } + } + // Fallback via lower/upper bound: if lo == hi, that's the value + rational lo, hi; + bool s_lo, s_hi; + if (m_arith_value.get_lo(len, lo, s_lo) && !s_lo && + m_arith_value.get_up(len, hi, s_hi) && !s_hi && lo == hi) { + val = lo; + return true; + } + return false; } bool theory_nseq::lower_bound(expr* e, rational& lo) { @@ -1603,8 +1773,60 @@ namespace smt { st.update("nseq axioms", s.m_num_axioms); } + // Find the most constrained regex for a given string expression + // by scanning memberships that apply to e or its equivalence class root. + expr* theory_nseq::find_regex_for(expr* e) { + if (!ctx.e_internalized(e)) + return nullptr; + enode* root = ctx.get_enode(e)->get_root(); + for (auto const& mem : m_state.mems()) { + if (!mem.sign()) continue; + expr* ms = mem.str(); + if (!ctx.e_internalized(ms)) continue; + if (ctx.get_enode(ms)->get_root() == root) + return mem.regex(); + } + return nullptr; + } + + // Find any positive membership for a string expression whose base + // is `base` and which starts at position `pos`. Returns the regex + // and sets `cover_len` to how many characters it covers. + // Used to synthesize sub-range values during model generation. + expr* theory_nseq::find_substr_regex(expr* base, unsigned pos, + unsigned total_len, unsigned& cover_len) { + if (!ctx.e_internalized(base)) + return nullptr; + enode* base_root = ctx.get_enode(base)->get_root(); + for (auto const& mem : m_state.mems()) { + if (!mem.sign()) continue; + expr* ms = mem.str(); + if (!ctx.e_internalized(ms)) continue; + // Check if ms is str.substr(base, pos, k) for some k + expr* s = nullptr, *off = nullptr, *len_arg = nullptr; + if (!m_util.str.is_extract(ms, s, off, len_arg)) continue; + if (!ctx.e_internalized(s)) continue; + if (ctx.get_enode(s)->get_root() != base_root) continue; + // Check offset matches + rational off_v; + if (!m_autil.is_numeral(off, off_v) || !off_v.is_nonneg() || + !off_v.is_unsigned() || off_v.get_unsigned() != pos) continue; + // Get the length (may be symbolic, use get_length) + rational len_v; + if (m_autil.is_numeral(len_arg, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) { + cover_len = len_v.get_unsigned(); + return mem.regex(); + } + if (get_length(ms, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) { + cover_len = len_v.get_unsigned(); + return mem.regex(); + } + } + return nullptr; + } + // ------------------------------------------------------- - // Model generation (stub) + // Model generation // ------------------------------------------------------- model_value_proc* theory_nseq::mk_value(enode* n, model_generator& mg) { @@ -1636,11 +1858,57 @@ namespace smt { curr = curr->get_next(); } while (curr != root); - // Try to construct a string of the correct length + // Try to construct a string satisfying any regex membership constraint rational len_val; if (get_length(e, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) { unsigned len = len_val.get_unsigned(); + // Check if there's a direct regex constraint for this string + expr* regex = find_regex_for(e); + if (regex) { + zstring synth; + if (synthesize_regex_string(regex, len, synth)) { + expr_ref val(m_util.str.mk_string(synth), m); + m_fresh_values.push_back(val); + return alloc(expr_wrapper_proc, to_app(val)); + } + } + // Try to synthesize character by character from str.at(e, i) memberships. + // Preprocessors often split x ∈ r into (str.at x 0) ∈ r0, (str.substr x 1 k) ∈ r1, etc. zstring s; + bool from_chars = true; + unsigned pos = 0; + while (pos < len && from_chars) { + // Option A: there's a membership for str.at(e, pos) (length-1 char) + expr_ref at_i(m_util.str.mk_at(e, m_autil.mk_int(pos)), m); + expr* r = find_regex_for(at_i); + if (r) { + zstring ch; + if (synthesize_regex_string(r, 1, ch)) { + s = s + ch; + pos++; + continue; + } + } + // Option B: find a membership for str.substr(e, pos, k) and synthesize k chars + unsigned cover_len = 0; + expr* rs = find_substr_regex(e, pos, len, cover_len); + if (rs && cover_len > 0) { + zstring piece; + if (synthesize_regex_string(rs, cover_len, piece)) { + s = s + piece; + pos += cover_len; + continue; + } + } + from_chars = false; + } + if (from_chars && s.length() == len) { + expr_ref val(m_util.str.mk_string(s), m); + m_fresh_values.push_back(val); + return alloc(expr_wrapper_proc, to_app(val)); + } + // No regex constraint (or synthesis failed): use default fill + s = zstring(); for (unsigned i = 0; i < len; i++) s = s + zstring("A"); expr_ref val(m_util.str.mk_string(s), m); diff --git a/src/smt/theory_nseq.h b/src/smt/theory_nseq.h index 7830c08b9..72a2b29cb 100644 --- a/src/smt/theory_nseq.h +++ b/src/smt/theory_nseq.h @@ -115,8 +115,14 @@ namespace smt { // Regex membership bool check_regex_memberships(); bool check_regex_mem(nseq_mem const& mem); + bool add_regex_length_axioms(nseq_mem const& mem); + bool synthesize_regex_string(expr* regex, unsigned len, zstring& result); + bool dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result); bool is_ground_string(expr* e, zstring& s); expr_ref derive_regex(expr* regex, zstring const& prefix); + bool all_mems_checked(); + expr* find_regex_for(expr* e); + expr* find_substr_regex(expr* base, unsigned pos, unsigned total_len, unsigned& cover_len); // String operation reductions void reduce_op(expr* e);