/*++ Copyright (c) 2026 Microsoft Corporation Module Name: seq_model.cpp Abstract: Implementation of seq_model: model construction for the Nielsen-based string solver. Author: Clemens Eisenhofer 2026-03-01 Nikolaj Bjorner (nbjorner) 2026-03-01 Notes: We end up with a set of substitutions and membership constraints x -> rx y -> ry z in R_z u in R_u Plan: Compute solutions for variables in rx, ry store them in m_var_values We can compute these solutions on demand. When evaluating x, use dependencies from rx. This can include character variables that are assigned values by other theories. Reconstruct value for x using value for rx. Model construction in z3 is designed to be hierarchical. During model initialization solvers register depenendencies between enodes for model construction. The dependencies should be acyclic to enable bottom-up model construction. Values for dependencies are accessed in the model_value_proc class. For strings/sequences we have a natural way to record dependencies. unit/character nodes depend on the elements they contain. --*/ #include "smt/seq_model.h" #include "smt/smt_context.h" #include "smt/smt_model_generator.h" #include "smt/smt_arith_value.h" #include "smt/proto_model/proto_model.h" #include "ast/ast_pp.h" namespace smt { class seq_snode_value_proc : public model_value_proc { seq_model& m_owner; enode* m_node; euf::snode* m_snode; ptr_vector m_dependencies; public: seq_snode_value_proc(seq_model& owner, enode* node, euf::snode* snode) : m_owner(owner), m_node(node), m_snode(snode) { m_owner.collect_dependencies(m_snode, m_dependencies); } void get_dependencies(buffer& result) override { for (enode* d : m_dependencies) result.push_back(model_value_dependency(d)); } app* mk_value(model_generator& mg, expr_ref_vector const& values) override { SASSERT(values.size() == m_dependencies.size()); expr_ref val = m_owner.snode_to_value(m_snode, values); if (!val) val = m_owner.m_seq.str.mk_empty(m_node->get_expr()->get_sort()); m_owner.m_trail.push_back(val); m_owner.m_factory->add_trail(val); TRACE(seq, tout << "nseq seq_snode_value_proc: " << mk_pp(m_node->get_expr(), m_owner.m) << " -> " << mk_pp(val, m_owner.m) << "\n";); return to_app(val); } }; seq_model::seq_model(ast_manager& m, context& ctx, seq_util& seq, seq_rewriter& rw, euf::sgraph& sg) : m(m), m_ctx(ctx), m_seq(seq), m_rewriter(rw), m_sg(sg), m_trail(m) {} void seq_model::init(model_generator& mg, seq::nielsen_graph& nielsen) { TRACE(seq, nielsen.display(tout << nielsen.to_dot() << "\n")); m_var_values.reset(); m_var_replacement.reset(); m_var_regex.reset(); m_trail.reset(); m_factory = alloc(seq_factory, m, m_seq.get_family_id(), mg.get_model()); mg.register_factory(m_factory); register_existing_values(nielsen); seq::nielsen_node* sat_node = nielsen.sat_node(); SASSERT(sat_node); // in case we report sat, this has to point to a satisfied Nielsen node! collect_var_regex_constraints(sat_node); // solve integer constraints from the sat_path FIRST so that // m_int_model is available when snode_to_value evaluates power exponents // VERIFY(nielsen.solve_sat_path_raw(m_int_model)); // extract variable assignments from the satisfying leaf's substitution path extract_assignments(nielsen.sat_path()); } model_value_proc* seq_model::mk_value(enode* n, model_generator& mg) { app* e = n->get_expr(); if (!m_seq.is_seq(e) && !m_seq.is_re(e) && !m_seq.str.is_nth_u(e)) return nullptr; // For regex-sorted enodes, return the expression itself as a model value. // Regexes are interpreted as themselves in the model. if (m_seq.is_re(e)) { m_trail.push_back(e); return alloc(expr_wrapper_proc, e); } // For nth_u (underspecified nth), return a fresh value of the element sort. // NSB review: this looks plain wrong. if (m_seq.str.is_nth_u(e)) { sort *srt = e->get_sort(); expr *val = m_factory->get_fresh_value(srt); if (val) { m_trail.push_back(val); return alloc(expr_wrapper_proc, to_app(val)); } return nullptr; } // look up snode for this expression euf::snode* sn = m_sg.find(e); IF_VERBOSE(2, { verbose_stream() << "nseq mk_value: expr=" << mk_bounded_pp(e, m, 2); if (sn) verbose_stream() << " snode[" << sn->id() << "] kind=" << (int)sn->kind(); else verbose_stream() << " snode=null"; verbose_stream() << "\n"; }); if (sn) return alloc(seq_snode_value_proc, *this, n, sn); expr_ref val(m); if (m_seq.is_seq(e)) // no assignment found — default to empty string val = m_seq.str.mk_empty(e->get_sort()); else val = e; m_trail.push_back(val); m_factory->add_trail(val); return alloc(expr_wrapper_proc, to_app(val)); } void seq_model::finalize(model_generator& mg) { m_var_values.reset(); m_var_regex.reset(); m_var_replacement.reset(); m_trail.reset(); m_factory = nullptr; } void seq_model::extract_assignments(ptr_vector const& sat_path) { IF_VERBOSE(1, verbose_stream() << "nseq extract_assignments: path length=" << sat_path.size() << "\n";); // compose substitutions root-to-leaf. // bindings[i] = (var_snode, current_value_snode). // When a new substitution (s.m_var -> s.m_replacement) is applied, // substitute s.m_var in all existing values, then record the new binding. vector> bindings; for (seq::nielsen_edge* e : sat_path) { for (seq::nielsen_subst const& s : e->subst()) { if (!s.m_var) continue; IF_VERBOSE(1, { verbose_stream() << " subst: snode[" << s.m_var->id() << "]"; if (s.m_var->get_expr()) verbose_stream() << "=" << mk_bounded_pp(s.m_var->get_expr(), m, 2); verbose_stream() << " -> snode[" << (s.m_replacement ? (int)s.m_replacement->id() : -1) << "]"; if (s.m_replacement && s.m_replacement->get_expr()) verbose_stream() << "=" << mk_bounded_pp(s.m_replacement->get_expr(), m, 2); verbose_stream() << "\n"; }); for (auto& b : bindings) b.second = m_sg.subst(b.second, s.m_var, s.m_replacement); bindings.push_back({s.m_var, s.m_replacement}); } } IF_VERBOSE(1, verbose_stream() << "nseq extract_assignments: " << bindings.size() << " bindings\n";); for (auto const &[var, replacement] : bindings) { SASSERT(var); unsigned id = var->first()->id(); // TODO - first or just var->id()? if (!m_var_replacement.contains(id)) m_var_replacement.insert(id, replacement); } } void seq_model::collect_dependencies(euf::snode *n, ptr_vector &deps) const { uint_set seen; buffer> todo; todo.push_back({n, false}); while (!todo.empty()) { auto [curr, is_recursive] = todo.back(); todo.pop_back(); if (seen.contains(curr->id())) continue; seen.insert(curr->id()); if (m.is_value(curr->get_expr())) ; else if (curr->is_empty()) ; else if (curr->is_char_or_unit()) { expr *e = curr->arg(0)->get_expr(); if (m_ctx.e_internalized(e)) deps.push_back(m_ctx.get_enode(e)); } else if (curr->is_concat()) { for (unsigned i = 0; i < curr->num_args(); ++i) todo.push_back({curr->arg(i), is_recursive}); } else if (curr->is_power()) { // pretend there are no dependencies // TODO - may not be sufficient if the exponent is a variable with a binding that contains dependencies } else if (curr->is_var()) { // we could have a binding n |-> replacement // we want to collect all elements in replacement as dependencies // when using the dependencies to build a value for n we should // map the values that are passed in to the sub-terms that are listed as dependencies. // sub-terms are under concat, power and unit euf::snode *replacement = nullptr; if (!is_recursive && m_var_replacement.find(curr->id(), replacement)) todo.push_back({replacement, true}); } else { IF_VERBOSE(0, { verbose_stream() << "nseq collect_dependencies_rec: unhandled snode kind " << (int)curr->kind() << "\n"; verbose_stream() << " curr: snode[" << curr->id() << "]"; if (curr->get_expr()) verbose_stream() << " expr=" << mk_bounded_pp(curr->get_expr(), m, 2); verbose_stream() << "\n"; }); UNREACHABLE(); } } // verbose_stream() << "collect " << mk_pp(n->get_expr(), m) << " " << deps.size() << "\n"; } expr_ref seq_model::snode_to_value(euf::snode* n, expr_ref_vector const& values) { // insert var2value in the same order that dependencies were traversed uint_set seen; u_map var2value; buffer> todo; todo.push_back({n, false}); unsigned idx = 0; arith_util a(m); expr_ref_vector args(m), pinned(m); todo.push_back({n, false}); // verbose_stream() << "extract " << mk_pp(n->get_expr(), m) << " " << values.size() << "\n"; while (!todo.empty()) { SASSERT(idx <= values.size()); auto [curr, is_recursive] = todo.back(); todo.pop_back(); if (seen.contains(curr->id())) continue; seen.insert(curr->id()); if (m.is_value(curr->get_expr())) var2value.insert(curr->id(), curr->get_expr()); else if (curr->is_empty()) var2value.insert(curr->id(), curr->get_expr()); else if (curr->is_char_or_unit()) { auto arg = curr->arg(0); expr *e = arg->get_expr(); expr *val = nullptr; if (m_ctx.e_internalized(e)) { val = values[idx++]; } else if (m_seq.str.is_nth_u(e)) { expr* var_value = get_var_value(arg->arg(0)); auto index = int_value(arg->arg(1)->get_expr()); val = m_seq.str.mk_nth(var_value, a.mk_int(index)); } else { NOT_IMPLEMENTED_YET(); } val = m_seq.str.mk_unit(val); var2value.insert(curr->id(), val); pinned.push_back(val); } else if (curr->is_concat()) { for (unsigned i = 0; i < curr->num_args(); ++i) todo.push_back({curr->arg(i), is_recursive}); } else if (curr->is_power()) { SASSERT(curr->num_args() == 2); } else if (curr->is_var()) { euf::snode *replacement = nullptr; if (!is_recursive && m_var_replacement.find(n->id(), replacement)) todo.push_back({replacement, true}); } else UNREACHABLE(); } // then reconstruct the value for replacement based on the collected sub-term values. SASSERT(values.size() == idx); todo.push_back({n, false}); expr *val = nullptr; while (!todo.empty()) { auto [curr, is_recursive] = todo.back(); if (var2value.contains(curr->id())) { todo.pop_back(); continue; } if (curr->is_empty()) val = curr->get_expr(); else if (m.is_value(curr->get_expr())) val = curr->get_expr(); else if (curr->is_power()) { auto ival = int_value(curr->arg(1)->get_expr()); val = m_seq.str.mk_power(curr->arg(0)->get_expr(), a.mk_int(ival)); } else if (curr->is_concat()) { args.reset(); for (unsigned i = 0; i < curr->num_args(); ++i) { auto arg = curr->arg(i); if (var2value.find(arg->id(), val)) args.push_back(val); else todo.push_back({arg, is_recursive}); } if (args.size() == curr->num_args()) val = m_seq.str.mk_concat(args, curr->get_sort()); else continue; // not all arguments have been processed yet, will reconstruct in a later iteration } else if (curr->is_var()) { euf::snode *replacement = nullptr; if (!is_recursive && m_var_replacement.find(curr->id(), replacement)) { if (!var2value.find(replacement->id(), val)) { todo.push_back({replacement, true}); continue; } } else val = get_var_value(curr); } else { IF_VERBOSE(0, verbose_stream() << "not handled " << mk_pp(curr->get_expr(), m) << "\n"); UNREACHABLE(); } var2value.insert(curr->id(), val); pinned.push_back(val); todo.pop_back(); } return expr_ref(var2value.find(n->id()), m); } void seq_model::register_existing_values(seq::nielsen_graph& nielsen) { seq::nielsen_node const* root = nielsen.root(); if (!root) return; // TODO - need to traverse sub-expressions for values. } expr* seq_model::get_var_value(euf::snode* var) { SASSERT(var); unsigned key = var->first()->id(); expr* val = nullptr; if (m_var_values.find(key, val)) return val; // unconstrained or regex-constrained: delegate to mk_fresh_value val = mk_fresh_value(var); if (val) { m_trail.push_back(val); m_var_values.insert(key, val); } return val; } // Gets the arithmetic value of e. In QF_SLIA mode theory_lra does not register // numeral constants as LP variables, so get_value_equiv misses cases like // (str.len w) being equivalent to numeral 5 via EUF. Walking the EUF class // and checking for numerals directly handles this. bool seq_model::get_arith_value(expr* e, rational& val) const { if (!m_ctx.e_internalized(e)) return false; arith_util a(m); enode* root = m_ctx.get_enode(e); enode* it = root; do { if (a.is_numeral(it->get_expr(), val)) return true; it = it->get_next(); } while (it != root); arith_value avalue(m); avalue.init(const_cast(&m_ctx)); return avalue.get_value_equiv(e, val); } rational seq_model::int_value(expr *_e) { expr_ref e(_e, m); m_ctx.get_rewriter()(e); rational val; arith_util a(m); if (a.is_numeral(e, val)) return val; bool has_val = get_arith_value(e, val); CTRACE(seq, !has_val, tout << "no value associated with " << mk_pp(e, m) << "\n";); return val; } expr* seq_model::mk_fresh_value(euf::snode* var) { SASSERT(var->get_expr()); if (!m_seq.is_seq(var->get_expr())) return nullptr; auto srt = var->get_expr()->get_sort(); // check if this variable has regex constraints euf::snode* re = nullptr; unsigned key = var->first()->id(); if (m_var_regex.find(key, re) && re) { expr* re_expr = re->get_expr(); SASSERT(re_expr); arith_util arith(m); expr_ref len_expr(m_seq.str.mk_length(var->get_expr()), m); rational len_val; bool has_len = get_arith_value(len_expr, len_val); CTRACE(seq, !has_len, tout << "no value associated with " << mk_pp(len_expr, m) << "\n";); if (has_len && len_val.is_unsigned()) { unsigned n = len_val.get_unsigned(); expr_ref loop(m_seq.re.mk_loop(m_seq.re.mk_full_char(re_expr->get_sort()), n, n), m); re_expr = m_seq.re.mk_inter(re_expr, loop); } expr_ref witness(m); // We checked non-emptiness during Nielsen already lbool wr = m_rewriter.some_seq_in_re(re_expr, witness); if (wr == l_true && witness) { m_trail.push_back(witness); m_factory->register_value(witness); return witness; } IF_VERBOSE(1, verbose_stream() << "witness extraction failed for " << mk_pp(var->get_expr(), m) << " : " << wr << " with len " << (has_len ? len_val.to_string() : "unknown") << "\n" << mk_pp(re_expr, m) << "\n"); UNREACHABLE(); } // No regex constraint: try to respect the assigned length for the variable. // This prevents invalid models such as len(x)=1 with x="". arith_util arith(m); expr_ref len_expr(m_seq.str.mk_length(var->get_expr()), m); rational len_val; bool has_len = get_arith_value(len_expr, len_val); CTRACE(seq, !has_len, tout << "no value associated with " << mk_pp(len_expr, m) << "\n";); if (has_len) { if (!len_val.is_int() || len_val.is_neg()) len_val = rational(0); if (len_val.is_zero()) return m_seq.str.mk_empty(srt); constexpr unsigned MAX_CONCRETE_WITNESS = 1024; if (len_val.is_unsigned() && len_val.get_unsigned() <= MAX_CONCRETE_WITNESS) { unsigned n = len_val.get_unsigned(); zstring w; for (unsigned i = 0; i < n; ++i) w += zstring('0'); expr* witness = m_seq.str.mk_string(w); m_factory->register_value(witness); return witness; } expr* base = m_seq.str.mk_string("0"); expr* witness = m_seq.str.mk_power(base, arith.mk_int(len_val)); m_factory->register_value(witness); return witness; } // no regex constraint or witness generation failed: use empty string return m_seq.str.mk_empty(srt); } void seq_model::collect_var_regex_constraints(seq::nielsen_node const* sat_node) { SASSERT(sat_node); for (auto const& mem : sat_node->str_mems()) { SASSERT(mem.well_formed()); if (mem.is_trivial(sat_node)) continue; // empty string in nullable regex: already satisfied, no variable to constrain VERIFY(mem.is_primitive()); // everything else should have been eliminated already euf::snode* first = mem.m_str->first(); unsigned id = first->id(); euf::snode* existing = nullptr; if (m_var_regex.find(id, existing) && existing) { // intersect with existing constraint: // build re.inter(existing, new_regex) expr* e1 = existing->get_expr(); expr* e2 = mem.m_regex->get_expr(); if (e1 && e2) { expr_ref inter(m_seq.re.mk_inter(e1, e2), m); euf::snode* inter_sn = m_sg.mk(inter); SASSERT(inter_sn); m_var_regex.insert(id, inter_sn); } } else m_var_regex.insert(id, mem.m_regex); } } bool seq_model::validate_regex(tracked_str_mem const& mem, ::proto_model& mdl) { SASSERT(mem.well_formed()); expr* s_expr = mem.m_str->get_expr(); expr* r_expr = mem.m_regex->get_expr(); expr_ref in_re(m_seq.re.mk_in_re(s_expr, r_expr), m); if (mdl.is_false(in_re)) { IF_VERBOSE(0, verbose_stream() << "nseq model: positive membership violated: " << mk_bounded_pp(s_expr, m, 3) << " in " << mk_bounded_pp(r_expr, m, 3) << "\n";); return false; } return true; } } #if 0 expr_ref seq_model::snode_to_value(euf::snode* n, expr_ref_vector const& values) { SASSERT(n); if (n->is_empty()) { sort* srt = n->get_sort(); if (!srt) srt = m_seq.str.mk_string_sort(); return expr_ref(m_seq.str.mk_empty(srt), m); } arith_util arith(m); if (m.is_value(n->get_expr())) return expr_ref(n->get_expr(), m); if (n->is_char_or_unit()) { expr *e = nullptr; VERIFY(m_seq.str.is_unit(n->get_expr(), e)); if (values.size() == 1) { unsigned c; expr *dval = values.get(0); if (m_seq.is_const_char(dval, c)) return expr_ref(m_seq.str.mk_string(zstring(c)), m); return expr_ref(m_seq.str.mk_unit(dval), m); } else if (m_seq.str.is_nth_u(e)) { auto arg = n->arg(0); auto var_val = get_var_value(arg->arg(0)); auto index_val = int_value(arg->arg(1)->get_expr()); expr_ref val(m_seq.str.mk_nth(var_val, arith.mk_int(index_val)), m); return expr_ref(m_seq.str.mk_unit(val), m); } else NOT_IMPLEMENTED_YET(); } if (n->is_var()) { euf::snode *replacement = nullptr; if (!m_var_replacement.find(n->id(), replacement)) return expr_ref(get_var_value(n), m); return mk_value_with_dependencies(replacement, values); } if (n->is_concat()) { expr_ref_vector es(m), vals(m); unsigned idx = 0; m_seq.str.get_concat(n->get_expr(), es); for (auto e : es) { if (m.is_value(e)) vals.push_back(e); else vals.push_back(values[idx++]); } return expr_ref(m_seq.str.mk_concat(vals, n->get_sort()), m); } if (n->is_power()) { SASSERT(n->num_args() == 2); SASSERT(values.size() == 0); // Evaluate the base and exponent to produce a concrete string. // The base is a string snode; the exponent is an integer expression // whose value comes from the sat_path integer model. expr *base_val = n->arg(0)->get_expr(); expr *exp_expr = n->arg(1)->get_expr(); rational exp_val = int_value(exp_expr); if (exp_val.is_neg()) exp_val = rational(0); // Build the repeated string: base^exp_val if (exp_val == 0) return expr_ref(m_seq.str.mk_empty(n->get_sort()), m); TRACE(seq, tout << mk_pp(n->get_expr(), m) << '\n'); // For small exponents, concatenate directly; for large ones, // return mk_power to avoid enormous AST chains. constexpr unsigned POWER_EXPAND_LIMIT = 10; if (exp_val > POWER_EXPAND_LIMIT) return expr_ref(m_seq.str.mk_power(base_val, arith.mk_int(exp_val)), m); unsigned n_val = exp_val.get_unsigned(); expr_ref acc(base_val, m); for (unsigned i = 1; i < n_val; ++i) acc = m_seq.str.mk_concat(acc, base_val); return acc; } // fallback: use the underlying expression return expr_ref(n->get_expr(), m); } #endif