From b74f0bbb0032681437e3c4e3518edc4dd89ad7e4 Mon Sep 17 00:00:00 2001 From: CEisenhofer Date: Tue, 24 Mar 2026 13:20:30 +0100 Subject: [PATCH] Signature splits Fixed dot printing errorfor Skolems --- src/params/smt_params.cpp | 1 + src/params/smt_params.h | 1 + src/params/smt_params_helper.pyg | 1 + src/smt/seq/seq_nielsen.cpp | 162 +++++++++++++++++++++++++++++++ src/smt/seq/seq_nielsen.h | 7 ++ src/smt/seq/seq_nielsen_pp.cpp | 4 +- src/smt/theory_nseq.cpp | 3 + 7 files changed, 178 insertions(+), 1 deletion(-) diff --git a/src/params/smt_params.cpp b/src/params/smt_params.cpp index 55ae9b2f8..7e148a061 100644 --- a/src/params/smt_params.cpp +++ b/src/params/smt_params.cpp @@ -57,6 +57,7 @@ void smt_params::updt_local_params(params_ref const & _p) { m_nseq_max_nodes = p.nseq_max_nodes(); m_nseq_parikh = p.nseq_parikh(); m_nseq_regex_precheck = p.nseq_regex_precheck(); + m_nseq_signature = p.nseq_signature(); m_up_persist_clauses = p.up_persist_clauses(); validate_string_solver(m_string_solver); if (_p.get_bool("arith.greatest_error_pivot", false)) diff --git a/src/params/smt_params.h b/src/params/smt_params.h index aa593fd90..14e03a114 100644 --- a/src/params/smt_params.h +++ b/src/params/smt_params.h @@ -252,6 +252,7 @@ struct smt_params : public preprocessor_params, unsigned m_nseq_max_nodes = 0; bool m_nseq_parikh = false; bool m_nseq_regex_precheck = true; + bool m_nseq_signature = false; smt_params(params_ref const & p = params_ref()): m_string_solver(symbol("auto")){ diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg index 226c5ccbb..c9257b19f 100644 --- a/src/params/smt_params_helper.pyg +++ b/src/params/smt_params_helper.pyg @@ -127,6 +127,7 @@ def_module_params(module_name='smt', ('nseq.max_nodes', UINT, 0, 'maximum number of DFS nodes explored by theory_nseq per solve() call (0 = unlimited)'), ('nseq.parikh', BOOL, False, 'enable Parikh image checks in nseq solver'), ('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'), + ('nseq.signature', BOOL, False, 'enable heuristic signature-based string equation splitting in Nielsen solver'), ('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'), ('seq.split_w_len', BOOL, True, 'enable splitting guided by length constraints'), ('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'), diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 810198963..308dc8772 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -37,6 +37,7 @@ NSB review: #include #include #include +#include namespace seq { @@ -1880,6 +1881,12 @@ namespace seq { if (!node->is_extended()) { bool ext = generate_extensions(node); IF_VERBOSE(0, display(verbose_stream(), node)); + if (!ext) { +#ifdef Z3DEBUG + std::string dot = to_dot(); + std::cout << dot << std::endl; +#endif + } VERIFY(ext); node->set_extended(true); ++m_stats.m_num_extensions; @@ -2414,6 +2421,10 @@ namespace seq { if (apply_const_nielsen(node)) return ++m_stats.m_mod_const_nielsen, true; + // Priority 9: SignatureSplit - heuristic string equation splitting + if (m_signature_split && apply_signature_split(node)) + return ++m_stats.m_mod_signature_split, true; + // Priority 10: RegexVarSplit - split str_mem by minterms if (apply_regex_var_split(node)) return ++m_stats.m_mod_regex_var_split, true; @@ -3116,6 +3127,156 @@ namespace seq { return true; } + // ----------------------------------------------------------------------- + // Modifier: apply_signature_split + // Heuristic equation split based on a shortest prefix signature (i, j): + // prefixes u[0..i-1], v[0..j-1] must contain at least one variable and + // every variable in one prefix must occur in the other prefix. + // ----------------------------------------------------------------------- + + bool nielsen_graph::apply_signature_split(nielsen_node* node) { + auto first_var_pos = [](euf::snode_vector const& toks) { + for (unsigned k = 0; k < toks.size(); ++k) + if (toks[k]->is_var()) + return k; + return toks.size(); + }; + + auto const& eqs = node->str_eqs(); + for (unsigned eq_idx = 0; eq_idx < eqs.size(); ++eq_idx) { + str_eq const& eq = eqs[eq_idx]; + SASSERT(eq.m_lhs && eq.m_rhs); + if (eq.is_trivial()) + continue; + + euf::snode_vector lhs_toks, rhs_toks; + eq.m_lhs->collect_tokens(lhs_toks); + eq.m_rhs->collect_tokens(rhs_toks); + + // Start from the first variable on each side; if one side has no + // variable, this equation has no usable signature. + unsigned i0 = first_var_pos(lhs_toks); + unsigned j0 = first_var_pos(rhs_toks); + if (i0 == lhs_toks.size() || j0 == rhs_toks.size()) + continue; + + std::unordered_map lhs_first, rhs_first; + lhs_first.reserve(lhs_toks.size()); + rhs_first.reserve(rhs_toks.size()); + + for (unsigned k = 0; k < lhs_toks.size(); ++k) { + if (!lhs_toks[k]->is_var()) + continue; + expr* x = lhs_toks[k]->get_expr(); + if (!lhs_first.contains(x)) + lhs_first.emplace(x, k); + } + for (unsigned k = 0; k < rhs_toks.size(); ++k) { + if (!rhs_toks[k]->is_var()) + continue; + expr* x = rhs_toks[k]->get_expr(); + if (!rhs_first.contains(x)) + rhs_first.emplace(x, k); + } + + svector lhs_need_j(lhs_toks.size(), UINT_MAX); + svector rhs_need_i(rhs_toks.size(), UINT_MAX); + + // Prefix summary arrays: + // lhs_need_j[k] = maximum first-occurrence index in rhs for any + // variable seen in lhs[0..k]. Symmetric for rhs_need_i. + // A value of UINT_MAX means "no variable requirement yet". + // A value of UINT_MAX-1 means "fail: some prefix variable does not + // occur on the opposite side at all". + unsigned const FAIL_MARK = UINT_MAX - 1; + unsigned need = UINT_MAX; + for (unsigned k = 0; k < lhs_toks.size(); ++k) { + if (lhs_toks[k]->is_var()) { + auto it = rhs_first.find(lhs_toks[k]->get_expr()); + if (it == rhs_first.end()) + need = FAIL_MARK; + else if (need != FAIL_MARK) + need = (need == UINT_MAX) ? it->second : std::max(need, it->second); + } + lhs_need_j[k] = need; + } + + need = UINT_MAX; + for (unsigned k = 0; k < rhs_toks.size(); ++k) { + if (rhs_toks[k]->is_var()) { + auto it = lhs_first.find(rhs_toks[k]->get_expr()); + if (it == lhs_first.end()) + need = FAIL_MARK; + else if (need != FAIL_MARK) + need = (need == UINT_MAX) ? it->second : std::max(need, it->second); + } + rhs_need_i[k] = need; + } + + unsigned i = i0 + 1; + unsigned j = j0 + 1; + + // Compute least fixpoint for (i, j): grow one side only when the + // current prefix on the other side requires it. + bool changed = true; + while (changed) { + changed = false; + + unsigned req_j = lhs_need_j[i - 1]; + if (req_j == FAIL_MARK) { + i = lhs_toks.size(); + break; + } + if (req_j != UINT_MAX && req_j + 1 > j) { + j = req_j + 1; + changed = true; + } + + unsigned req_i = rhs_need_i[j - 1]; + if (req_i == FAIL_MARK) { + j = rhs_toks.size(); + break; + } + if (req_i != UINT_MAX && req_i + 1 > i) { + i = req_i + 1; + changed = true; + } + } + + if (i >= lhs_toks.size() || j >= rhs_toks.size()) + continue; + + // Decompose u = u1·u2 and v = v1·v2 at signature indices. + euf::snode* u1 = m_sg.drop_right(eq.m_lhs, lhs_toks.size() - i); + euf::snode* u2 = m_sg.drop_left(eq.m_lhs, i); + euf::snode* v1 = m_sg.drop_right(eq.m_rhs, rhs_toks.size() - j); + euf::snode* v2 = m_sg.drop_left(eq.m_rhs, j); + euf::snode* x = mk_fresh_var(eq.m_lhs->get_sort()); + + for (unsigned branch = 0; branch < 2; ++branch) { + nielsen_node* child = mk_child(node); + mk_edge(node, child, true); + + auto& child_eqs = child->str_eqs(); + child_eqs[eq_idx] = child_eqs.back(); + child_eqs.pop_back(); + + // Two-way split: + // (1) u1·x = v1 and u2 = x·v2 + // (2) u1 = v1·x and x·u2 = v2 + if (branch == 0) { + child_eqs.push_back(str_eq(m_sg.mk_concat(u1, x), v1, eq.m_dep)); + child_eqs.push_back(str_eq(u2, m_sg.mk_concat(x, v2), eq.m_dep)); + } + else { + child_eqs.push_back(str_eq(u1, m_sg.mk_concat(v1, x), eq.m_dep)); + child_eqs.push_back(str_eq(m_sg.mk_concat(x, u2), v2, eq.m_dep)); + } + } + return true; + } + return false; + } // ----------------------------------------------------------------------- // Modifier: apply_regex_var_split // For str_mem x·s ∈ R where x is a variable, split using minterms: @@ -4033,6 +4194,7 @@ namespace seq { st.update("nseq mod star intr", m_stats.m_mod_star_intr); st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr); st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen); + st.update("nseq mod signature split", m_stats.m_mod_signature_split); st.update("nseq mod regex var", m_stats.m_mod_regex_var_split); st.update("nseq mod power split", m_stats.m_mod_power_split); st.update("nseq mod var nielsen", m_stats.m_mod_var_nielsen); diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 927e39e1e..1b3d90acd 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -739,6 +739,7 @@ namespace seq { unsigned m_mod_gpower_intr = 0; unsigned m_mod_const_nielsen = 0; unsigned m_mod_regex_var_split = 0; + unsigned m_mod_signature_split = 0; unsigned m_mod_power_split = 0; unsigned m_mod_var_nielsen = 0; unsigned m_mod_var_num_unwinding = 0; @@ -763,6 +764,7 @@ namespace seq { unsigned m_max_search_depth = 0; unsigned m_max_nodes = 0; // 0 = unlimited bool m_parikh_enabled = true; + bool m_signature_split = false; unsigned m_next_mem_id = 0; unsigned m_fresh_cnt = 0; nielsen_stats m_stats; @@ -886,6 +888,8 @@ namespace seq { // enable/disable Parikh image verification constraints void set_parikh_enabled(bool e) { m_parikh_enabled = e; } + void set_signature_split(bool e) { m_signature_split = e; } + // generate next unique regex membership id unsigned next_mem_id() { return m_next_mem_id++; } @@ -1064,6 +1068,9 @@ namespace seq { bool fire_gpower_intro(nielsen_node* node, str_eq const& eq, euf::snode* var, euf::snode_vector const& ground_prefix_orig, bool fwd); + // heuristic string equation splitting. Left to right scanning for shortest prefix with matching variables. + bool apply_signature_split(nielsen_node* node); + // regex variable split: for str_mem x·s ∈ R where x is a variable, // split using minterms: x → ε, or x → c·x' for each minterm c. // More general than regex_char_split, uses minterm partitioning. diff --git a/src/smt/seq/seq_nielsen_pp.cpp b/src/smt/seq/seq_nielsen_pp.cpp index 4ad2c21b6..8226c490f 100644 --- a/src/smt/seq/seq_nielsen_pp.cpp +++ b/src/smt/seq/seq_nielsen_pp.cpp @@ -393,7 +393,9 @@ namespace seq { if (!e) { result += "#" + std::to_string(tok->id()); } else if (tok->is_var()) { - result += dot_html_escape(to_app(e)->get_decl()->get_name().str()); + std::ostringstream os; + os << mk_pp(e, m); + result += dot_html_escape(os.str()); } else if (tok->is_unit()) { // seq.unit with non-literal character: show the character expression expr* ch = to_app(e)->get_arg(0); diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index d09842ab7..2c7eab148 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -173,6 +173,7 @@ namespace smt { void theory_nseq::new_eq_eh(theory_var v1, theory_var v2) { expr* e1 = get_enode(v1)->get_expr(); expr* e2 = get_enode(v2)->get_expr(); + // std::cout << mk_pp(e1, m) << " = " << mk_pp(e2, m) << std::endl; if (m_seq.is_re(e1)) { push_unhandled_pred(); return; @@ -185,6 +186,7 @@ namespace smt { seq::dep_tracker dep = nullptr; ctx.push_trail(restore_vector(m_prop_queue)); m_prop_queue.push_back(eq_item(s1, s2, get_enode(v1), get_enode(v2), dep)); + std::cout << "Enqueuing equation " << seq::snode_label_html(s1, m) << " = " << seq::snode_label_html(s2, m) << std::endl; } } @@ -520,6 +522,7 @@ namespace smt { m_nielsen.set_max_search_depth(get_fparams().m_nseq_max_depth); m_nielsen.set_max_nodes(get_fparams().m_nseq_max_nodes); m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh); + m_nielsen.set_signature_split(get_fparams().m_nseq_signature); // Regex membership pre-check: before running DFS, check intersection // emptiness for each variable's regex constraints. This handles