3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-03-07 22:04:53 +00:00

Continue nseq: regex synthesis, model generation, length propagation

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-03-01 02:25:29 +00:00
parent a65b64ae63
commit faeb936a83
2 changed files with 305 additions and 31 deletions

View file

@ -94,8 +94,11 @@ namespace smt {
if (check_disequalities())
return FC_CONTINUE;
if (all_eqs_solved() && m_state.mems().empty())
return FC_DONE;
if (all_eqs_solved()) {
// All word equations solved; check if remaining memberships are handled
if (all_mems_checked())
return FC_DONE;
}
return FC_GIVEUP;
}
@ -195,6 +198,12 @@ namespace smt {
if (m_find.find(v1) != m_find.find(v2))
m_find.merge(v1, v2);
m_state.add_eq(o1, o2, dep);
// Propagate length equality: len(o1) = len(o2)
ensure_length_axiom(o1);
ensure_length_axiom(o2);
expr_ref len1 = mk_len(o1);
expr_ref len2 = mk_len(o2);
add_axiom(mk_eq(len1, len2, false));
IF_VERBOSE(3, verbose_stream() << "new_eq_eh: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m)
<< " (total eqs: " << m_state.eqs().size() << ")\n");
TRACE(seq, tout << "new eq: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m) << "\n";);
@ -1397,6 +1406,84 @@ namespace smt {
return r;
}
// DFS: find a string of exactly target_len in regex r.
// On success, appends characters to 'result' and returns true.
// cur_len is how many characters have already been appended.
bool theory_nseq::dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result) {
if (cur_len == target_len) {
expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m));
m_rewrite(nullable);
return m.is_true(nullable);
}
// Try printable characters: lowercase, uppercase, digits, then common punctuation
static const unsigned char priority[] = {
'a','b','c','d','e','f','g','h','i','j','k','l','m',
'n','o','p','q','r','s','t','u','v','w','x','y','z',
'0','1','2','3','4','5','6','7','8','9',
'A','B','C','D','E','F','G','H','I','J','K','L','M',
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
' ','.', ',', '!', '?', '-', '_', '@', '#', '$', '%', '&', '*'
};
for (unsigned c : priority) {
expr_ref ch(m_util.mk_char(c), m);
expr_ref dr = m_seq_rewrite.mk_derivative(ch, expr_ref(r, m));
m_rewrite(dr);
if (m_util.re.is_empty(dr))
continue;
result = result + zstring(c);
if (dfs_regex(dr, target_len, cur_len + 1, result))
return true;
// Backtrack: remove last character
result = result.extract(0, result.length() - 1);
}
return false;
}
// Synthesize a concrete string of exactly `len` accepted by `regex`.
// Returns true and sets `result` on success.
bool theory_nseq::synthesize_regex_string(expr* regex, unsigned len, zstring& result) {
result = zstring();
return dfs_regex(regex, len, 0, result);
}
// Add length-bound axioms from regex structure: len(s) >= min_length(r).
// Also adds: len(s) = 0 → nullable(r) (otherwise conflict).
bool theory_nseq::add_regex_length_axioms(nseq_mem const& mem) {
expr* s = mem.str();
expr* r = mem.regex();
bool sign = mem.sign();
nseq_dependency* dep = mem.dep();
bool progress = false;
if (sign) {
// s ∈ r → len(s) >= min_length(r)
unsigned minlen = m_util.re.min_length(r);
if (minlen > 0) {
expr_ref len_s = mk_len(s);
ensure_length_axiom(s);
literal lb = mk_literal(m_autil.mk_ge(len_s, m_autil.mk_int(minlen)));
if (ctx.get_assignment(lb) != l_true) {
add_axiom(lb);
progress = true;
}
}
// If r is not nullable and len(s) = 0, conflict
expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m));
m_rewrite(nullable);
if (m.is_false(nullable)) {
// s ∈ r → s ≠ ""
expr_ref emp(m_util.str.mk_empty(s->get_sort()), m);
ensure_enode(emp);
literal s_empty = mk_eq(s, emp, false);
if (ctx.get_assignment(~s_empty) != l_true) {
add_axiom(~s_empty);
progress = true;
}
}
}
return progress;
}
bool theory_nseq::check_regex_memberships() {
bool progress = false;
for (auto const& mem : m_state.mems()) {
@ -1414,43 +1501,99 @@ namespace smt {
bool sign = mem.sign();
nseq_dependency* dep = mem.dep();
// Add structural length axioms from the regex
if (add_regex_length_axioms(mem))
return true;
// Try to determine the string value from the e-graph
zstring sval;
if (!is_ground_string(s, sval))
if (is_ground_string(s, sval)) {
// Compute derivatives for the known string
expr_ref derived = derive_regex(r, sval);
// Check nullable: does the derived regex accept the empty word?
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
m_rewrite(nullable);
if (sign) {
// Positive membership: s must be in r
if (m.is_false(nullable) || m_util.re.is_empty(derived)) {
set_conflict(dep);
return true;
}
}
else {
// Negative membership: s must NOT be in r
if (m.is_true(nullable)) {
set_conflict(dep);
return true;
}
}
return false;
}
// Compute derivatives for the known string
expr_ref derived = derive_regex(r, sval);
// Check nullable: does the derived regex accept the empty word?
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
m_rewrite(nullable);
// String is not yet ground. For positive membership:
// if we know the exact length, synthesize a concrete string.
if (sign) {
// Positive membership: s must be in r
if (m.is_false(nullable)) {
// String is fully determined but regex rejects → conflict
set_conflict(dep);
return true;
}
}
else {
// Negative membership: s must NOT be in r
if (m.is_true(nullable)) {
// String is fully determined and regex accepts → conflict
set_conflict(dep);
return true;
rational len_val;
if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
unsigned len = len_val.get_unsigned();
// Feasibility check only: detect conflicts but don't force a specific value.
// Model generation (mk_value) will synthesize the actual string.
zstring synth;
if (!synthesize_regex_string(r, len, synth)) {
// No string of that length accepted by regex - conflict
IF_VERBOSE(3, verbose_stream() << " regex conflict: no string of len=" << len
<< " in " << mk_bounded_pp(r, m, 2) << "\n";);
set_conflict(dep);
return true;
}
IF_VERBOSE(3, verbose_stream() << " regex feasible: len=" << len
<< " has witness \"" << synth << "\"\n";);
}
}
// Check if the derived regex is the empty set
if (sign && m_util.re.is_empty(derived)) {
set_conflict(dep);
return true;
}
return false;
}
// Returns true if every membership constraint is already verified
// (ground strings checked) or provably feasible (non-ground with known length).
bool theory_nseq::all_mems_checked() {
for (auto const& mem : m_state.mems()) {
expr* s = mem.str();
expr* r = mem.regex();
bool sign = mem.sign();
nseq_dependency* dep = mem.dep();
zstring sval;
if (is_ground_string(s, sval)) {
// Verify the ground string
expr_ref derived = derive_regex(r, sval);
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
m_rewrite(nullable);
if (sign && !m.is_true(nullable))
return false;
if (!sign && !m.is_false(nullable))
return false;
}
else if (sign) {
// Non-ground positive membership: check feasibility by length
rational len_val;
if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
zstring synth;
if (!synthesize_regex_string(r, len_val.get_unsigned(), synth))
return false; // infeasible; conflict should have been set by check_regex_mem
// Feasible - model generator will assign the right value
}
else {
// Length unknown and regex is empty: unconditionally infeasible
if (m_util.re.is_empty(r))
return false;
}
}
}
return true;
}
// -------------------------------------------------------
// Length reasoning
// -------------------------------------------------------
@ -1495,7 +1638,34 @@ namespace smt {
bool theory_nseq::get_length(expr* e, rational& val) {
expr_ref len = mk_len(e);
return m_arith_value.get_value(len, val);
if (m_arith_value.get_value(len, val))
return true;
// Structural fallback: for str.substr(s, off, n) with concrete off and n,
// use the explicit length argument if it's a numeral and s has sufficient length.
expr* s = nullptr, *offset = nullptr, *len_arg = nullptr;
if (m_util.str.is_extract(e, s, offset, len_arg)) {
rational len_n;
if (m_autil.is_numeral(len_arg, len_n) && len_n.is_nonneg()) {
rational off_n;
if (m_autil.is_numeral(offset, off_n) && off_n.is_nonneg()) {
rational parent_len;
if (get_length(s, parent_len) &&
off_n + len_n <= parent_len) {
val = len_n;
return true;
}
}
}
}
// Fallback via lower/upper bound: if lo == hi, that's the value
rational lo, hi;
bool s_lo, s_hi;
if (m_arith_value.get_lo(len, lo, s_lo) && !s_lo &&
m_arith_value.get_up(len, hi, s_hi) && !s_hi && lo == hi) {
val = lo;
return true;
}
return false;
}
bool theory_nseq::lower_bound(expr* e, rational& lo) {
@ -1603,8 +1773,60 @@ namespace smt {
st.update("nseq axioms", s.m_num_axioms);
}
// Find the most constrained regex for a given string expression
// by scanning memberships that apply to e or its equivalence class root.
expr* theory_nseq::find_regex_for(expr* e) {
if (!ctx.e_internalized(e))
return nullptr;
enode* root = ctx.get_enode(e)->get_root();
for (auto const& mem : m_state.mems()) {
if (!mem.sign()) continue;
expr* ms = mem.str();
if (!ctx.e_internalized(ms)) continue;
if (ctx.get_enode(ms)->get_root() == root)
return mem.regex();
}
return nullptr;
}
// Find any positive membership for a string expression whose base
// is `base` and which starts at position `pos`. Returns the regex
// and sets `cover_len` to how many characters it covers.
// Used to synthesize sub-range values during model generation.
expr* theory_nseq::find_substr_regex(expr* base, unsigned pos,
unsigned total_len, unsigned& cover_len) {
if (!ctx.e_internalized(base))
return nullptr;
enode* base_root = ctx.get_enode(base)->get_root();
for (auto const& mem : m_state.mems()) {
if (!mem.sign()) continue;
expr* ms = mem.str();
if (!ctx.e_internalized(ms)) continue;
// Check if ms is str.substr(base, pos, k) for some k
expr* s = nullptr, *off = nullptr, *len_arg = nullptr;
if (!m_util.str.is_extract(ms, s, off, len_arg)) continue;
if (!ctx.e_internalized(s)) continue;
if (ctx.get_enode(s)->get_root() != base_root) continue;
// Check offset matches
rational off_v;
if (!m_autil.is_numeral(off, off_v) || !off_v.is_nonneg() ||
!off_v.is_unsigned() || off_v.get_unsigned() != pos) continue;
// Get the length (may be symbolic, use get_length)
rational len_v;
if (m_autil.is_numeral(len_arg, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) {
cover_len = len_v.get_unsigned();
return mem.regex();
}
if (get_length(ms, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) {
cover_len = len_v.get_unsigned();
return mem.regex();
}
}
return nullptr;
}
// -------------------------------------------------------
// Model generation (stub)
// Model generation
// -------------------------------------------------------
model_value_proc* theory_nseq::mk_value(enode* n, model_generator& mg) {
@ -1636,11 +1858,57 @@ namespace smt {
curr = curr->get_next();
} while (curr != root);
// Try to construct a string of the correct length
// Try to construct a string satisfying any regex membership constraint
rational len_val;
if (get_length(e, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
unsigned len = len_val.get_unsigned();
// Check if there's a direct regex constraint for this string
expr* regex = find_regex_for(e);
if (regex) {
zstring synth;
if (synthesize_regex_string(regex, len, synth)) {
expr_ref val(m_util.str.mk_string(synth), m);
m_fresh_values.push_back(val);
return alloc(expr_wrapper_proc, to_app(val));
}
}
// Try to synthesize character by character from str.at(e, i) memberships.
// Preprocessors often split x ∈ r into (str.at x 0) ∈ r0, (str.substr x 1 k) ∈ r1, etc.
zstring s;
bool from_chars = true;
unsigned pos = 0;
while (pos < len && from_chars) {
// Option A: there's a membership for str.at(e, pos) (length-1 char)
expr_ref at_i(m_util.str.mk_at(e, m_autil.mk_int(pos)), m);
expr* r = find_regex_for(at_i);
if (r) {
zstring ch;
if (synthesize_regex_string(r, 1, ch)) {
s = s + ch;
pos++;
continue;
}
}
// Option B: find a membership for str.substr(e, pos, k) and synthesize k chars
unsigned cover_len = 0;
expr* rs = find_substr_regex(e, pos, len, cover_len);
if (rs && cover_len > 0) {
zstring piece;
if (synthesize_regex_string(rs, cover_len, piece)) {
s = s + piece;
pos += cover_len;
continue;
}
}
from_chars = false;
}
if (from_chars && s.length() == len) {
expr_ref val(m_util.str.mk_string(s), m);
m_fresh_values.push_back(val);
return alloc(expr_wrapper_proc, to_app(val));
}
// No regex constraint (or synthesis failed): use default fill
s = zstring();
for (unsigned i = 0; i < len; i++)
s = s + zstring("A");
expr_ref val(m_util.str.mk_string(s), m);

View file

@ -115,8 +115,14 @@ namespace smt {
// Regex membership
bool check_regex_memberships();
bool check_regex_mem(nseq_mem const& mem);
bool add_regex_length_axioms(nseq_mem const& mem);
bool synthesize_regex_string(expr* regex, unsigned len, zstring& result);
bool dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result);
bool is_ground_string(expr* e, zstring& s);
expr_ref derive_regex(expr* regex, zstring const& prefix);
bool all_mems_checked();
expr* find_regex_for(expr* e);
expr* find_substr_regex(expr* base, unsigned pos, unsigned total_len, unsigned& cover_len);
// String operation reductions
void reduce_op(expr* e);