From 466bfea60436fbe1639b3169d33e72da388de345 Mon Sep 17 00:00:00 2001 From: Nikolaj Bjorner Date: Fri, 1 May 2026 11:07:27 -0700 Subject: [PATCH] add draft for model construction Signed-off-by: Nikolaj Bjorner --- src/smt/seq_model.cpp.draft | 468 ++++++++++++++++++++++++++++++++++++ src/smt/seq_model.h.draft | 134 +++++++++++ 2 files changed, 602 insertions(+) create mode 100644 src/smt/seq_model.cpp.draft create mode 100644 src/smt/seq_model.h.draft diff --git a/src/smt/seq_model.cpp.draft b/src/smt/seq_model.cpp.draft new file mode 100644 index 000000000..c70bb6903 --- /dev/null +++ b/src/smt/seq_model.cpp.draft @@ -0,0 +1,468 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_model.cpp + +Abstract: + + Implementation of seq_model: model construction for the + Nielsen-based string solver. + +Author: + + Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 + +--*/ +#include "smt/seq_model.h" +#include "smt/smt_context.h" +#include "smt/smt_model_generator.h" +#include "smt/smt_arith_value.h" +#include "smt/proto_model/proto_model.h" +#include "ast/ast_pp.h" + +namespace smt { + + static enode* find_root_enode(context& ctx, expr* e) { + if (!e) + return nullptr; + enode* n = ctx.find_enode(e); + return n ? n->get_root() : nullptr; + } + + static bool is_model_dependency(context& ctx, enode* n) { + if (!n) + return false; + seq_util seq(ctx.get_manager()); + if (seq.is_seq(n->get_sort()) || seq.is_re(n->get_sort())) + return false; + return ctx.is_relevant(n) || ctx.get_manager().is_value(n->get_expr()); + } + + class seq_snode_value_proc : public model_value_proc { + seq_model& m_owner; + enode* m_node; + euf::snode* m_snode; + ptr_vector m_dependencies; + + public: + seq_snode_value_proc(seq_model& owner, enode* node, euf::snode* snode) + : m_owner(owner), m_node(node), m_snode(snode) { + m_owner.collect_dependencies(m_snode, m_dependencies); + } + + void get_dependencies(buffer& result) override { + for (enode* d : m_dependencies) + result.push_back(model_value_dependency(d)); + } + + app* mk_value(model_generator& mg, expr_ref_vector const& values) override { + SASSERT(values.size() == m_dependencies.size()); + + expr_ref val = m_owner.snode_to_value(m_snode, values); + if (!val) + val = m_owner.m_seq.str.mk_empty(m_node->get_expr()->get_sort()); + + m_owner.m_trail.push_back(val); + m_owner.m_factory->add_trail(val); + TRACE(seq, tout << "nseq seq_snode_value_proc: " << mk_pp(m_node->get_expr(), m_owner.m) << " -> " << mk_pp(val, m_owner.m) << "\n";); + return to_app(val); + } + }; + + seq_model::seq_model(ast_manager& m, context& ctx, seq_util& seq, + seq_rewriter& rw, euf::sgraph& sg) + : m(m), m_ctx(ctx), m_seq(seq), m_rewriter(rw), m_sg(sg), m_trail(m) + {} + + void seq_model::init(model_generator& mg, seq::nielsen_graph& nielsen) { + + TRACE(seq, nielsen.display(tout << nielsen.to_dot() << "\n")); + m_var_values.reset(); + m_var_regex.reset(); + m_trail.reset(); + m_mg = &mg; + + m_factory = alloc(seq_factory, m, m_seq.get_family_id(), mg.get_model()); + mg.register_factory(m_factory); + + register_existing_values(nielsen); + seq::nielsen_node* sat_node = nielsen.sat_node(); + SASSERT(sat_node); // in case we report sat, this has to point to a satisfied Nielsen node! + collect_var_regex_constraints(sat_node); + + // solve integer constraints from the sat_path FIRST so that + // m_int_model is available when snode_to_value evaluates power exponents + // VERIFY(nielsen.solve_sat_path_raw(m_int_model)); + + // extract variable assignments from the satisfying leaf's substitution path + extract_assignments(nielsen.sat_path()); + + } + + model_value_proc* seq_model::mk_value(enode* n, model_generator& mg) { + app* e = n->get_expr(); + if (!m_seq.is_seq(e) && !m_seq.is_re(e) && !m_seq.str.is_nth_u(e)) + return nullptr; + + // For regex-sorted enodes, return the expression itself as a model value. + // Regexes are interpreted as themselves in the model. + if (m_seq.is_re(e)) { + m_trail.push_back(e); + return alloc(expr_wrapper_proc, e); + } + + // For nth_u (underspecified nth), return a fresh value of the element sort. + if (m_seq.str.is_nth_u(e)) { + sort* srt = e->get_sort(); + expr* val = m_factory->get_fresh_value(srt); + if (val) { + m_trail.push_back(val); + return alloc(expr_wrapper_proc, to_app(val)); + } + return nullptr; + } + + // look up snode for this expression + euf::snode* sn = m_sg.find(e); + IF_VERBOSE(2, { + verbose_stream() << "nseq mk_value: expr=" << mk_bounded_pp(e, m, 2); + if (sn) verbose_stream() << " snode[" << sn->id() << "] kind=" << (int)sn->kind(); + else verbose_stream() << " snode=null"; + verbose_stream() << "\n"; + }); + expr_ref val(m); + if (sn) + return alloc(seq_snode_value_proc, *this, n, sn); + + if (!val) { + // no assignment found — default to empty string + val = m_seq.str.mk_empty(e->get_sort()); + } + + if (val) { + m_trail.push_back(val); + m_factory->add_trail(val); + return alloc(expr_wrapper_proc, to_app(val)); + } + + return alloc(expr_wrapper_proc, to_app(m_seq.str.mk_empty(e->get_sort()))); + } + + void seq_model::finalize(model_generator& mg) { + m_var_values.reset(); + m_var_regex.reset(); + m_trail.reset(); + m_mg = nullptr; + m_factory = nullptr; + } + + void seq_model::extract_assignments(ptr_vector const& sat_path) { + IF_VERBOSE(1, verbose_stream() << "nseq extract_assignments: path length=" << sat_path.size() << "\n";); + + // compose substitutions root-to-leaf. + // bindings[i] = (var_snode, current_value_snode). + // When a new substitution (s.m_var -> s.m_replacement) is applied, + // substitute s.m_var in all existing values, then record the new binding. + vector> bindings; + for (seq::nielsen_edge* e : sat_path) { + for (seq::nielsen_subst const& s : e->subst()) { + if (!s.m_var) + continue; + IF_VERBOSE(1, { + verbose_stream() << " subst: snode[" << s.m_var->id() << "]"; + if (s.m_var->get_expr()) verbose_stream() << "=" << mk_bounded_pp(s.m_var->get_expr(), m, 2); + verbose_stream() << " -> snode[" << (s.m_replacement ? (int)s.m_replacement->id() : -1) << "]"; + if (s.m_replacement && s.m_replacement->get_expr()) verbose_stream() << "=" << mk_bounded_pp(s.m_replacement->get_expr(), m, 2); + verbose_stream() << "\n"; + }); + for (auto& b : bindings) + b.second = m_sg.subst(b.second, s.m_var, s.m_replacement); + bindings.push_back({s.m_var, s.m_replacement}); + } + } + + IF_VERBOSE(1, verbose_stream() << "nseq extract_assignments: " << bindings.size() << " bindings\n";); + for (auto const &[var, replacement] : bindings) { + SASSERT(var); + unsigned id = var->first()->id(); + if (m_var_values.contains(id)) + continue; + expr_ref_vector values(m); + // TODO - this doesn't work. + // we need a way to track that var depends on replacement + // so the value computed for var is based on replacement's value. + expr_ref val = snode_to_value(replacement, values); // TODO + IF_VERBOSE(1, { + verbose_stream() << " var snode[" << id << "]"; + if (var->get_expr()) verbose_stream() << "=" << mk_bounded_pp(var->get_expr(), m, 2); + verbose_stream() << " -> "; + if (val) verbose_stream() << mk_bounded_pp(val, m, 3); else verbose_stream() << "(null)"; + verbose_stream() << "\n"; + }); + if (val) { + m_trail.push_back(val); + m_var_values.insert(id, val); + } + } + } + + expr_ref seq_model::snode_to_value(euf::snode* n, expr_ref_vector const& values) { + SASSERT(n); + + if (n->is_empty()) { + sort* srt = n->get_sort(); + if (!srt) + srt = m_seq.str.mk_string_sort(); + return expr_ref(m_seq.str.mk_empty(srt), m); + } + + // NSB review + // we have to carefully figure out what to do/redo here. + // model construction in z3 is designed to be hierarchical. + // during model initialization solvers register depenendencies between enodes for model construction. + // The dependencies should be acyclic to enable bottom-up model construction. + // Values for dependencies are accessed in the model_value_proc class. + // For strings/sequences we have a natural way to record dependencies. + // unit/character nodes depend on the elements they contain. + if (n->is_char() || n->is_unit()) { + expr *e = n->get_expr(); + SASSERT(m_seq.str.is_unit(e)); + e = to_app(e)->get_arg(0); + unsigned c; + + expr *dval = values.get(0); + if (m_seq.is_const_char(dval, c)) + return expr_ref(m_seq.str.mk_string(zstring(c)), m); + return expr_ref(m_seq.str.mk_unit(dval), m); + } + + if (n->is_var()) + return expr_ref(get_var_value(n), m); + + if (n->is_concat()) { + return expr_ref(m_seq.str.mk_concat(values, n->get_sort()), m); + } + + if (n->is_power()) { + SASSERT(n->num_args() == 2); + SASSERT(values.size() == 2); + // Evaluate the base and exponent to produce a concrete string. + // The base is a string snode; the exponent is an integer expression + // whose value comes from the sat_path integer model. + expr* base_val = values.get(0); + expr *exp_expr = values.get(1); + + rational exp_val(0); + arith_util arith(m); + + + // Try to evaluate exponent: first check if it's a numeral, + // then try the int model from sat_path constraints, + // finally fall back to the proto_model from model_generator. + bool has_val = false; + if (exp_expr && arith.is_numeral(exp_expr, exp_val)) { + has_val = true; + // already concrete + } + + if (!has_val) { + arith_value avalue(m); + avalue.init(&m_ctx); + avalue.get_value(exp_expr, exp_val); + } + + if (exp_val.is_neg()) + exp_val = rational(0); + + // Build the repeated string: base^exp_val + if (exp_val == 0) { + sort* srt = n->get_sort(); + SASSERT(srt); + return expr_ref(m_seq.str.mk_empty(srt), m); + } + if (exp_val.is_one()) + return expr_ref(base_val, m); + + // For small exponents, concatenate directly; for large ones, + // return mk_power to avoid enormous AST chains. + constexpr unsigned POWER_EXPAND_LIMIT = 100; + if (exp_val > POWER_EXPAND_LIMIT) + return expr_ref(m_seq.str.mk_power(base_val, arith.mk_int(exp_val)), m); + unsigned n_val = exp_val.get_unsigned(); + expr_ref acc(base_val, m); + for (unsigned i = 1; i < n_val; ++i) + acc = m_seq.str.mk_concat(acc, base_val); + return acc; + } + + // fallback: use the underlying expression + return expr_ref(n->get_expr(), m); + } + + void seq_model::collect_dependencies(euf::snode* n, ptr_vector& deps) const { + if (n->is_char() || n->is_unit()) + deps.push_back(m_ctx.get_enode(n->arg(0)->get_expr())); + else if (n->is_concat()) { + for (unsigned i = 0; i < n->num_args(); ++i) + deps.push_back(m_ctx.get_enode(n->arg(i)->get_expr())); + } + else if (n->is_power()) { + deps.push_back(m_ctx.get_enode(n->arg(0)->get_expr())); + deps.push_back(m_ctx.get_enode(n->arg(1)->get_expr())); + } + } + + void seq_model::register_existing_values(seq::nielsen_graph& nielsen) { + seq::nielsen_node const* root = nielsen.root(); + if (!root) + return; + for (auto const& eq : root->str_eqs()) { + if (eq.m_lhs && eq.m_lhs->get_expr()) + m_factory->register_value(eq.m_lhs->get_expr()); + if (eq.m_rhs && eq.m_rhs->get_expr()) + m_factory->register_value(eq.m_rhs->get_expr()); + } + } + + expr* seq_model::get_var_value(euf::snode* var) { + SASSERT(var); + unsigned key = var->first()->id(); + expr* val = nullptr; + if (m_var_values.find(key, val)) + return val; + + // unconstrained or regex-constrained: delegate to mk_fresh_value + val = mk_fresh_value(var); + if (val) { + m_trail.push_back(val); + m_var_values.insert(key, val); + } + return val; + } + + expr* seq_model::mk_fresh_value(euf::snode* var) { + SASSERT(var->get_expr()); + if (!m_seq.is_seq(var->get_expr())) + return nullptr; + auto srt = var->get_expr()->get_sort(); + + // check if this variable has regex constraints + euf::snode* re = nullptr; + unsigned key = var->first()->id(); + if (m_var_regex.find(key, re) && re) { + expr* re_expr = re->get_expr(); + SASSERT(re_expr); + + arith_util arith(m); + arith_value avalue(m); + avalue.init(&m_ctx); + expr_ref len_expr(m_seq.str.mk_length(var->get_expr()), m); + rational len_val; + bool has_len = avalue.get_value(len_expr, len_val); + + if (has_len && len_val.is_unsigned()) { + unsigned n = len_val.get_unsigned(); + expr_ref loop(m_seq.re.mk_loop(m_seq.re.mk_full_char(re_expr->get_sort()), n, n), m); + re_expr = m_seq.re.mk_inter(re_expr, loop); + } + + expr_ref witness(m); + // We checked non-emptiness during Nielsen already + lbool wr = m_rewriter.some_seq_in_re(re_expr, witness); + if (wr == l_true && witness) { + m_trail.push_back(witness); + m_factory->register_value(witness); + return witness; + } + IF_VERBOSE(1, verbose_stream() << "witness extraction failed for " << mk_pp(var->get_expr(), m) + << " : " << wr << " with len " << (has_len ? len_val.to_string() : "unknown") << "\n" << mk_pp(re_expr, m) << "\n"); + UNREACHABLE(); + } + + // No regex constraint: try to respect the assigned length for the variable. + // This prevents invalid models such as len(x)=1 with x="". + arith_util arith(m); + expr_ref len_expr(m_seq.str.mk_length(var->get_expr()), m); + rational len_val; + + arith_value avalue(m); + avalue.init(&m_ctx); + bool has_len = avalue.get_value(len_expr, len_val); + + + if (has_len) { + if (!len_val.is_int() || len_val.is_neg()) + len_val = rational(0); + if (len_val.is_zero()) + return m_seq.str.mk_empty(srt); + + constexpr unsigned MAX_CONCRETE_WITNESS = 1024; + if (len_val.is_unsigned() && len_val.get_unsigned() <= MAX_CONCRETE_WITNESS) { + unsigned n = len_val.get_unsigned(); + zstring w; + for (unsigned i = 0; i < n; ++i) + w += zstring('0'); + expr* witness = m_seq.str.mk_string(w); + m_factory->register_value(witness); + return witness; + } + + expr* base = m_seq.str.mk_string("0"); + expr* witness = m_seq.str.mk_power(base, arith.mk_int(len_val)); + m_factory->register_value(witness); + return witness; + } + + // no regex constraint or witness generation failed: use empty string + return m_seq.str.mk_empty(srt); + } + + void seq_model::collect_var_regex_constraints(seq::nielsen_node const* sat_node) { + SASSERT(sat_node); + for (auto const& mem : sat_node->str_mems()) { + SASSERT(mem.m_str && mem.m_regex); + if (mem.is_trivial(sat_node)) + continue; // empty string in nullable regex: already satisfied, no variable to constrain + VERIFY(mem.is_primitive()); // everything else should have been eliminated already + euf::snode* first = mem.m_str->first(); + unsigned id = first->id(); + euf::snode* existing = nullptr; + if (m_var_regex.find(id, existing) && existing) { + // intersect with existing constraint: + // build re.inter(existing, new_regex) + expr* e1 = existing->get_expr(); + expr* e2 = mem.m_regex->get_expr(); + if (e1 && e2) { + expr_ref inter(m_seq.re.mk_inter(e1, e2), m); + euf::snode* inter_sn = m_sg.mk(inter); + SASSERT(inter_sn); + m_var_regex.insert(id, inter_sn); + } + } + else + m_var_regex.insert(id, mem.m_regex); + } + } + + bool seq_model::validate_regex(tracked_str_mem const& mem, ::proto_model& mdl) { + if (!mem.m_str || !mem.m_regex) + return true; + expr* s_expr = mem.m_str->get_expr(); + expr* r_expr = mem.m_regex->get_expr(); + if (!s_expr || !r_expr) + return true; + + expr_ref in_re(m_seq.re.mk_in_re(s_expr, r_expr), m); + if (mdl.is_false(in_re)) { + IF_VERBOSE(0, verbose_stream() << "nseq model: positive membership violated: " + << mk_bounded_pp(s_expr, m, 3) + << " in " << mk_bounded_pp(r_expr, m, 3) << "\n";); + return false; + } + return true; + } + +} diff --git a/src/smt/seq_model.h.draft b/src/smt/seq_model.h.draft new file mode 100644 index 000000000..c88ff9ae5 --- /dev/null +++ b/src/smt/seq_model.h.draft @@ -0,0 +1,134 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_model.h + +Abstract: + + Model construction for the Nielsen-based string solver (theory_nseq). + + After the Nielsen graph search returns sat, this module extracts + variable-to-value assignments from the satisfying leaf node and + builds model_value_proc callbacks for the SMT model generator. + + The workflow is: + 1. init() — allocate seq_factory, register existing string literals, + and extract variable assignments from the satisfying Nielsen node. + 2. mk_value(enode*) — return a model_value_proc that lazily builds + the concrete value for a given enode. + 3. finalize() — clean up temporary state. + +Author: + + Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 + +--*/ +#pragma once + +#include "smt_context.h" +#include "ast/seq_decl_plugin.h" +#include "ast/rewriter/seq_rewriter.h" +#include "ast/euf/euf_sgraph.h" +#include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_state.h" // tracked_str_mem +#include "model/seq_factory.h" + +class proto_model; + +namespace smt { + + class enode; + class model_generator; + struct tracked_str_mem; + class model_value_proc; + class seq_snode_value_proc; + + class seq_model { + friend class seq_snode_value_proc; + + ast_manager& m; + context& m_ctx; + seq_util& m_seq; + seq_rewriter& m_rewriter; + euf::sgraph& m_sg; + + // factory for generating fresh string/regex values + seq_factory* m_factory = nullptr; + + // variable assignments extracted from the satisfying Nielsen node. + // maps snode id -> expr* (concrete value) + u_map m_var_values; + + // trail for GC protection of generated expressions + expr_ref_vector m_trail; + + // integer variable model from sat_path constraints + model_ref m_int_model; + model_generator* m_mg = nullptr; + + // per-variable regex constraints: maps snode id -> intersected regex snode. + // collected during init() from the state's str_mem list. + u_map m_var_regex; + + public: + seq_model(ast_manager& m, context& ctx, seq_util& seq, + seq_rewriter& rw, euf::sgraph& sg); + + // Phase 1: initialize model construction. + // Allocates seq_factory, registers it with mg, collects + // existing string literals, and extracts variable assignments + // from the satisfying Nielsen leaf node. + void init(model_generator& mg, seq::nielsen_graph& nielsen); + + // Phase 2: build a model_value_proc for the given enode. + // Returns nullptr if the enode is not a sequence/string sort. + model_value_proc* mk_value(enode* n, model_generator& mg); + + // Phase 3: clean up temporary model construction state. + void finalize(model_generator& mg); + + // Validate that model assignments satisfy all regex membership + // constraints from the state. Checks positive and negative + // memberships. Returns true if all constraints pass. + bool validate_regex(tracked_str_mem const& mem, ::proto_model& mdl); + + private: + // extract variable assignments from the sat path (root-to-leaf edges). + // Composes substitutions along the path to compute final var values. + void extract_assignments(ptr_vector const& sat_path); + + // recursively substitute known variable assignments into an snode tree. + // Returns a concrete Z3 expression. + // Optionally uses pre-evaluated model values for + // enode dependencies (provided by model_generator). + expr_ref snode_to_value(euf::snode* n, expr_ref_vector const& values); + + // Collect enode dependencies required to evaluate an snode value. + void collect_dependencies(euf::snode* n, ptr_vector& deps) const; + + // register all string literals appearing in the constraint store + // with the factory to avoid collisions with fresh values. + void register_existing_values(seq::nielsen_graph& nielsen); + + // look up or compute the value for an snode variable. + // If no assignment exists, delegates to mk_fresh_value. + expr* get_var_value(euf::snode* var); + + // generate a fresh value for a variable, respecting regex + // membership constraints. If the variable has associated + // regex constraints (collected during init), generates a + // witness satisfying the intersection; otherwise falls back + // to a plain fresh value from the factory. + expr* mk_fresh_value(euf::snode* var); + + // collect per-variable regex constraints from the state. + // For each positive str_mem, records the regex (or intersects + // with existing) into m_var_regex keyed by the string snode id. + void collect_var_regex_constraints(seq::nielsen_node const* sat_node); + + }; + +}