/*++ Copyright (c) 2026 Microsoft Corporation Module Name: seq_nielsen.cpp Abstract: Nielsen graph implementation for string constraint solving. Ports the constraint types and Nielsen graph structures from ZIPT (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) Author: Clemens Eisenhofer 2026-03-02 Nikolaj Bjorner (nbjorner) 2026-03-02 NSB review: ostrich\substring2b.smt2, ostrich\substring.smt2 - We are missing rewrites for unit(x) = unit('a') that would eliminate x by a. --*/ #include "smt/seq/seq_nielsen.h" #include "smt/seq/seq_parikh.h" #include "smt/seq/seq_regex.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "ast/ast_util.h" #include "ast/rewriter/seq_rewriter.h" #include "ast/rewriter/th_rewriter.h" #include "ast/rewriter/seq_skolem.h" #include "ast/rewriter/var_subst.h" #include "util/statistics.h" #include #include #include #include #include #include #include namespace seq { void deps_to_lits(dep_manager &dep_mgr, const dep_tracker deps, svector &eqs, svector &lits) { vector vs; dep_mgr.linearize(deps, vs); for (dep_source const &d : vs) { if (std::holds_alternative(d)) { eqs.push_back(std::get(d)); } else if (std::holds_alternative(d)) lits.push_back(std::get(d)); else UNREACHABLE(); } } // Normalize an arithmetic expression using th_rewriter. // Simplifies e.g. (n - 1 + 1) to n, preventing unbounded growth // of power exponents during unwind/merge cycles. static expr_ref normalize_arith(ast_manager &m, expr *e) { expr_ref result(e, m); th_rewriter rw(m); rw(result); return result; } std::pair split_membership(euf::snode const *str, euf::snode const *regex, euf::sgraph& sg, unsigned threshold, split_set& result) { seq_util& seq = sg.get_seq_util(); ast_manager& m = sg.get_manager(); euf::snode const* first = str->first(); SASSERT(first); SASSERT(!first->is_char()); // constants are consumed earlier // Choose the factorization boundary so the tail starts with the // LONGEST run of concrete characters c — this gives the split-engine // lookahead oracle the most pruning information. head = u' (tokens // before the run), tail = c · u''' (tokens from the run onward). euf::snode_vector toks; str->collect_tokens(toks); const unsigned total = toks.size(); unsigned run_start = 0, run_len = 0; for (unsigned i = 0; i < total; ) { if (!toks[i]->is_char()) { ++i; continue; } unsigned j = i; while (j < total && toks[j]->is_char()) ++j; if (j - i > run_len) { run_len = j - i; run_start = i; } i = j; } // No constant run → fall back to splitting off the first token. const unsigned p = run_len == 0 ? 1 : run_start; SASSERT(p >= 1); euf::snode const* head = p == 1 ? first : sg.drop_right(str, total - p); euf::snode const* tail = sg.drop_left(str, p); SASSERT(head && tail); // Build the constant lookahead c and (if non-empty) an oracle that // prunes splits whose ∇ cannot match c. zstring c; for (unsigned i = 0; i < run_len; ++i) { expr* ch; unsigned cv; VERIFY(seq.str.is_unit(toks[run_start + i]->get_expr(), ch)); VERIFY(seq.is_const_char(ch, cv)); c = c + zstring(cv); } split_oracle oracle; if (!c.empty()) oracle = [&sg, c](expr*, expr* n) { return split_lookahead_viable(n, sg, c); }; // Decompose the regex into a split-set via the shared seq_split engine // (sigma from the paper): head ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩, with the // lookahead oracle pruning non-viable ∇ during generation. seq_rewriter rw(m); if (!rw.split(regex->get_expr(), result, threshold, split_mode::strong, oracle)) { result.clear(); return { nullptr, nullptr }; } rw.simplify_split(result); // Eagerly consume the constant run c from the tail by taking the c-derivative // of each ∇: tail = c·u''' ∈ ∇ ⟺ u''' ∈ δ_c(∇) (Brzozowski). // This makes progress — the returned tail becomes u''' (c consumed) — and // drops any split whose ∇ cannot start with c, since there δ_c(∇) = ∅ // (e.g. the star rule's ⟨ε,ε⟩: δ_c(ε) = ∅ for non-empty c). This is sound // because ∇ is now a complete top-level component (no factor appended). if (!c.empty()) { unsigned w = 0; for (unsigned i = 0; i < result.size(); ++i) { euf::snode const* d = sg.mk(result[i].m_n); for (unsigned k = 0; d && !d->is_fail() && k < c.length(); ++k) { d = sg.brzozowski_deriv(d, sg.mk_char(c[k])); } SASSERT(d); if (d->is_fail()) continue; // ∇ can't start with c → infeasible split, drop result[w++] = split_pair(result[i].m_d, d->get_expr(), m); } result.shrink(w); tail = sg.drop_left(str, run_start + run_len); // u''' (c consumed) } return { head, tail }; } bool split_lookahead_viable(expr* n_regex, euf::sgraph& sg, zstring const& c) { euf::snode const* cur = sg.mk(n_regex); SASSERT(cur); for (unsigned i = 0; i < c.length(); i++) { if (sg.re_nullable(cur) == l_true) return true; // N accepts the prefix c[0..i) → a suffix completes it cur = sg.brzozowski_deriv(cur, sg.mk_char(c[i])); SASSERT(cur); if (cur->is_fail()) return false; // N went (syntactically) dead before reaching c } return !cur->is_fail(); } // Directional helpers mirroring ZIPT's fwd flag: // fwd=true -> left-to-right (prefix/head) // fwd=false -> right-to-left (suffix/tail) static euf::snode const* dir_token(euf::snode const* s, const bool fwd) { if (!s) return nullptr; return fwd ? s->first() : s->last(); } static euf::snode const* dir_drop(euf::sgraph &sg, euf::snode const* s, const unsigned count, const bool fwd) { if (!s || count == 0) return s; return fwd ? sg.drop_left(s, count) : sg.drop_right(s, count); } static euf::snode const* dir_concat(euf::sgraph &sg, euf::snode const* a, euf::snode const* b, const bool fwd) { if (!a) return b; if (!b) return a; return fwd ? sg.mk_concat(a, b) : sg.mk_concat(b, a); } static void collect_tokens_dir(euf::snode const* s, const bool fwd, euf::snode_vector &toks) { toks.reset(); if (!s) return; s->collect_tokens(toks); if (!fwd) toks.reverse(); } // Right-derivative helper used by backward str_mem simplification: // dR(re, c) = reverse( derivative(c, reverse(re)) ). static euf::snode const* reverse_brzozowski_deriv(euf::sgraph &sg, euf::snode const* re, euf::snode const* elem) { if (!re || !elem || !re->get_expr() || !elem->get_expr()) return nullptr; ast_manager &m = sg.get_manager(); seq_util &seq = sg.get_seq_util(); seq_rewriter rw(m); expr *elem_expr = elem->get_expr(); expr *ch = nullptr; if (seq.str.is_unit(elem_expr, ch)) elem_expr = ch; const expr_ref re_rev(seq.re.mk_reverse(re->get_expr()), m); const expr_ref d = rw.mk_derivative(elem_expr, re_rev); if (!d.get()) return nullptr; expr_ref result(seq.re.mk_reverse(d), m); th_rewriter tr(m); tr(result); return sg.mk(result); } // ----------------------------------------------- // str_eq // ----------------------------------------------- void str_eq::sort() { if (m_lhs && m_rhs && m_lhs->id() > m_rhs->id()) { std::swap(m_lhs, m_rhs); } SASSERT(!m_lhs || !m_rhs || m_lhs->id() <= m_rhs->id()); } bool str_eq::is_trivial() const { return m_lhs == m_rhs || (m_lhs && m_rhs && m_lhs->is_empty() && m_rhs->is_empty()); } bool str_eq::contains_var(euf::snode const* var) const { if (!var) return false; // check if var appears in the token list of lhs or rhs if (m_lhs) { euf::snode_vector tokens; m_lhs->collect_tokens(tokens); for (euf::snode const* t : tokens) { if (t == var) return true; } } if (m_rhs) { euf::snode_vector tokens; m_rhs->collect_tokens(tokens); for (euf::snode const* t : tokens) { if (t == var) return true; } } return false; } // ----------------------------------------------- // str_mem // ----------------------------------------------- bool str_mem::is_primitive() const { // A still-unresolved symbolic-derivative residual (ite) is not a settled // primitive — apply_regex_if_split must resolve it first. return m_str && m_str->length() == 1 && m_str->is_var() && m_regex->is_ground() && m_regex->kind() != euf::snode_kind::s_ite; } bool str_mem::is_trivial(nielsen_node const* n) const { if (!(m_str && m_regex)) return false; if (m_kind == mem_kind::no_loop) // guard: discharged ⇒ Σ* (accepts all); ε has no non-empty lap-prefix. return m_discharged || m_str->is_empty(); if (!m_str->is_empty()) return false; if (m_kind == mem_kind::stab_view) // ε ∈ stab(root,Q) iff current state ≡ root (i.e. root ∈ F={root}). return m_regex == m_root; return n->graph().sg().re_nullable(m_regex) == l_true; } bool str_mem::is_contradiction(nielsen_node const* n) const { if (!(m_str && m_regex && m_str->is_empty())) return false; if (m_kind == mem_kind::no_loop) return false; // guard acceptance is always true on the empty word if (m_kind == mem_kind::stab_view) return m_regex != m_root; // ε ∉ stab(root,Q) when state ≢ root return n->graph().sg().re_nullable(m_regex) == l_false; } bool str_mem::contains_var(euf::snode const* var) const { SASSERT(var); if (m_str) { euf::snode_vector tokens; m_str->collect_tokens(tokens); return any_of(tokens, [var](auto t) { return t == var; }); } return false; } // ----------------------------------------------- // nielsen_subst // ----------------------------------------------- bool nielsen_subst::is_eliminating() const { SASSERT(m_var && m_replacement); // check if var appears in replacement euf::snode_vector tokens; m_replacement->collect_tokens(tokens); return all_of(tokens, [this](auto t) { return t != m_var; }); } bool nielsen_subst::is_char_subst() const { SASSERT(m_var && m_replacement); SASSERT(!m_var->is_unit() || m_replacement->is_char_or_unit()); return m_var->is_unit(); } // ----------------------------------------------- // nielsen_edge // ----------------------------------------------- nielsen_edge::nielsen_edge(nielsen_node* src, nielsen_node* tgt, const char* rule, const bool is_progress): m_src(src), m_tgt(tgt), m_rule_name(rule), m_is_progress(is_progress) { } void nielsen_edge::add_subst(nielsen_subst const& s) { m_subst.push_back(s); } // ----------------------------------------------- // nielsen_node // ----------------------------------------------- nielsen_node::nielsen_node(nielsen_graph& graph, const unsigned id): m_id(id), m_graph(graph), m_is_progress(true) { } void nielsen_node::clone_from(nielsen_node const& parent) { m_str_eq.reset(); m_str_deq.reset(); m_str_mem.reset(); m_constraints.reset(); m_char_ranges.reset(); m_str_eq.append(parent.m_str_eq); m_str_deq.append(parent.m_str_deq); m_str_mem.append(parent.m_str_mem); m_constraints.append(parent.m_constraints); // clone character ranges for (auto const &[k, v] : parent.m_char_ranges) m_char_ranges.insert(k, std::make_pair(v.first.clone(), v.second)); // clone regex occurrence tracking m_regex_occurrence = parent.m_regex_occurrence; SASSERT(m_str_eq.size() == parent.m_str_eq.size()); SASSERT(m_str_deq.size() == parent.m_str_deq.size()); SASSERT(m_str_mem.size() == parent.m_str_mem.size()); SASSERT(m_constraints.size() == parent.m_constraints.size()); } bool nielsen_node::track_regex_occurrence(unsigned regex_id, unsigned mem_id) { const auto key = std::make_pair(regex_id, mem_id); const auto it = m_regex_occurrence.find(key); if (it != m_regex_occurrence.end()) { // Already seen — cycle detected return true; } m_regex_occurrence[key] = m_id; return false; } void nielsen_node::add_str_eq(str_eq const& eq) { SASSERT(eq.m_lhs != nullptr); SASSERT(eq.m_rhs != nullptr); if (eq.is_trivial()) return; m_str_eq.push_back(eq); } void nielsen_node::add_str_deq(str_deq const& deq) { SASSERT(deq.m_lhs != nullptr); SASSERT(deq.m_rhs != nullptr); m_str_deq.push_back(deq); } void nielsen_node::add_str_mem(str_mem const& mem) { SASSERT(mem.m_str != nullptr); SASSERT(mem.m_regex != nullptr); if (mem.is_trivial(this)) return; m_str_mem.push_back(mem); } bool nielsen_node::lower_bound(expr *e, rational &lo, dep_tracker &dep) { literal_vector lits; enode_pair_vector eqs; if (m_graph.a.is_numeral(e, lo)) return true; if (!m_graph.m_context_solver.lower_bound(e, lo, lits, eqs)) return false; for (auto lit : lits) dep = m_graph.dep_mgr().mk_join(dep, m_graph.dep_mgr().mk_leaf(lit)); for (auto eq : eqs) dep = m_graph.dep_mgr().mk_join(dep, m_graph.dep_mgr().mk_leaf(eq)); const expr_ref lo_expr(m_graph.a.mk_int(lo), m_graph.m); m_graph.add_le_dependency(dep, this, lo_expr, e); return true; } bool nielsen_node::upper_bound(expr *e, rational &up, dep_tracker &dep) { literal_vector lits; enode_pair_vector eqs; if (m_graph.a.is_numeral(e, up)) return true; if (!m_graph.m_context_solver.upper_bound(e, up, lits, eqs)) return false; for (auto lit : lits) dep = m_graph.dep_mgr().mk_join(dep, m_graph.dep_mgr().mk_leaf(lit)); for (auto eq : eqs) dep = m_graph.dep_mgr().mk_join(dep, m_graph.dep_mgr().mk_leaf(eq)); const expr_ref up_expr(m_graph.a.mk_int(up), m_graph.m); m_graph.add_le_dependency(dep, this, e, up_expr); return true; } void nielsen_node::add_constraint(constraint const &c) { auto& m = graph().get_manager(); if (m.is_true(c.fml)) return; // TODO: Is it possible that we miss a conflict if we decompose? if (m.is_and(c.fml)) { // We have to add all - even if some of it conflict // [otw. we would leave the node partially initialized] for (const auto f : *to_app(c.fml)) { add_constraint(constraint(f, c.dep, m)); } return; } m_constraints.push_back(c); } void nielsen_node::apply_subst(euf::sgraph& sg, nielsen_subst const& s) { SASSERT(!s.m_var->is_char_or_unit() || s.m_replacement->is_char_or_unit()); SASSERT(s.m_var); SASSERT(s.m_replacement != nullptr); for (auto &eq : m_str_eq) { const auto new_lhs = sg.subst(eq.m_lhs, s.m_var, s.m_replacement); const auto new_rhs = sg.subst(eq.m_rhs, s.m_var, s.m_replacement); if (new_lhs != eq.m_lhs || new_rhs != eq.m_rhs) { eq.m_lhs = new_lhs; eq.m_rhs = new_rhs; eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, s.m_dep); eq.sort(); } } for (auto &deq : m_str_deq) { const auto new_lhs = sg.subst(deq.m_lhs, s.m_var, s.m_replacement); const auto new_rhs = sg.subst(deq.m_rhs, s.m_var, s.m_replacement); if (new_lhs != deq.m_lhs || new_rhs != deq.m_rhs) { deq.m_lhs = new_lhs; deq.m_rhs = new_rhs; deq.m_dep = m_graph.dep_mgr().mk_join(deq.m_dep, s.m_dep); deq.sort(); } } for (auto &mem : m_str_mem) { const auto new_str = sg.subst(mem.m_str, s.m_var, s.m_replacement); const auto new_regex = sg.subst(mem.m_regex, s.m_var, s.m_replacement); if (new_str != mem.m_str || new_regex != mem.m_regex) { mem.m_str = new_str; mem.m_regex = new_regex; mem.m_dep = m_graph.dep_mgr().mk_join(mem.m_dep, s.m_dep); } } const unsigned var_id = s.m_var->id(); ast_manager& m = graph().get_manager(); if (s.is_char_subst()) { expr* var_c_expr = s.m_var->arg0()->get_expr(); expr* repl_c_expr = s.m_replacement->arg0()->get_expr(); add_constraint( constraint(m.mk_eq(var_c_expr, repl_c_expr), s.m_dep, m)); if (m_char_ranges.contains(var_id)) { const auto range = m_char_ranges.find(var_id); // copy exactly m_char_ranges.remove(var_id); add_char_range(s.m_replacement, range.first, m_graph.dep_mgr().mk_join(range.second, s.m_dep)); } } } void nielsen_node::add_char_range(euf::snode const* sym_char, char_set const& range, dep_tracker dep) { if (sym_char->is_char()) { // for a concrete character just check if it matches const expr * val = sym_char->get_expr(); unsigned ch; expr* ch_expr; VERIFY(graph().seq().str.is_unit(val, ch_expr)); VERIFY(graph().seq().is_const_char(ch_expr, ch)); for (unsigned i = 0; i < range.ranges().size(); i++) { if (range.ranges()[i].contains(ch)) return; // matches, no conflict } set_conflict(backtrack_reason::character_range, dep); set_general_conflict(); return; } SASSERT(sym_char && sym_char->is_unit()); const unsigned id = sym_char->id(); if (m_char_ranges.contains(id)) { auto& existing = m_char_ranges.find(id); char_set inter = existing.first.intersect_with(range); existing = std::make_pair(inter, graph().dep_mgr().mk_join(existing.second, dep)); if (inter.is_empty()) { set_conflict(backtrack_reason::character_range, existing.second); set_general_conflict(); } } else m_char_ranges.insert(id, std::make_pair(range.clone(), dep)); auto& ranges = range.ranges(); auto& m = graph().get_manager(); const auto & seq = graph().seq(); expr* var = sym_char->get_expr(); SASSERT(seq.str.is_unit(var)); var = to_app(var)->get_arg(0); ptr_vector cases; cases.reserve(ranges.size()); for (unsigned i = 0; i < ranges.size(); ++i) { cases[i] = m.mk_and( seq.mk_le(seq.mk_char(ranges[i].m_lo), var), seq.mk_le(var, seq.mk_char(ranges[i].m_hi - 1))); } add_constraint(constraint(m.mk_or(cases), dep, m)); } // ----------------------------------------------- // nielsen_graph // ----------------------------------------------- nielsen_graph::nielsen_graph(euf::sgraph &sg, sub_solver_i &solver, context_solver_i &ctx_solver) : m(sg.get_manager()), a(sg.get_manager()), m_seq(sg.get_seq_util()), m_sg(sg), m_rw(m), m_sk(m, m_rw), m_length_solver(solver), m_context_solver(ctx_solver), m_partial_dfa_pin(sg.get_manager()), m_parikh(alloc(seq_parikh, sg)), m_seq_regex(alloc(seq::seq_regex, sg)) { } nielsen_graph::~nielsen_graph() { dealloc(m_parikh); dealloc(m_seq_regex); reset(); } bool nielsen_graph::projection_state_in_Q(expr *state, unsigned nu) { if (!state || nu == 0) return false; const unsigned sid = state->get_id(); // r ∈ Q_nu iff r is incident to a partial-DFA edge whose extraction // index lies in [1, nu] (the "edges marked ≤ ν" subautomaton of the // implementation-aspects section of the paper). auto incident = [&](std::unordered_map const &adj) { auto it = adj.find(sid); if (it == adj.end()) return false; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; const unsigned pidx = m_partial_dfa_edges[edge_idx].m_projection_idx; if (pidx != 0 && pidx <= nu) return true; } return false; }; return incident(m_partial_dfa_out) || incident(m_partial_dfa_in); } nielsen_node *nielsen_graph::mk_node() { const unsigned id = m_nodes.size(); nielsen_node *n = alloc(nielsen_node, *this, id); m_nodes.push_back(n); SASSERT(n->id() == m_nodes.size() - 1); return n; } nielsen_node *nielsen_graph::mk_child(nielsen_node *parent) { nielsen_node *child = mk_node(); child->clone_from(*parent); child->m_parent_ic_count = parent->constraints().size(); return child; } nielsen_edge *nielsen_graph::mk_edge(nielsen_node *src, nielsen_node *tgt, const char *rule, const bool is_progress) { SASSERT(src != nullptr); SASSERT(tgt != nullptr); nielsen_edge *e = alloc(nielsen_edge, src, tgt, rule, is_progress); m_edges.push_back(e); src->add_outgoing(e); return e; } void nielsen_graph::add_str_eq(euf::snode const* lhs, euf::snode const* rhs, smt::enode *l, smt::enode *r) const { const dep_tracker dep = m_dep_mgr.mk_leaf(enode_pair(l, r)); str_eq eq(lhs, rhs, dep); eq.sort(); // check if root node contains this equation already if (std::ranges::any_of(m_root->str_eqs(), [&](const str_eq &e) { return e.m_lhs == eq.m_lhs && e.m_rhs == eq.m_rhs; })) // already present, no need to add again return; m_root->add_str_eq(eq); } void nielsen_graph::add_str_deq(euf::snode const* lhs, euf::snode const* rhs, sat::literal l) const { const dep_tracker dep = m_dep_mgr.mk_leaf(l); str_deq deq(lhs, rhs, dep); // check if root node contains this equation already if (std::ranges::any_of(m_root->str_deqs(), [&](const str_deq &e) { return e.m_lhs == deq.m_lhs && e.m_rhs == deq.m_rhs; })) // already present, no need to add again return; m_root->add_str_deq(deq); } void nielsen_graph::add_str_mem(euf::snode const* str, euf::snode const* regex, sat::literal l) const { // check if root node contains this membership constraint already if (std::ranges::any_of(m_root->str_mems(), [&](const str_mem &e) { return e.m_regex == regex && e.m_str == str; })) // already present, no need to add again return; const dep_tracker dep = m_dep_mgr.mk_leaf(l); m_root->add_str_mem(str_mem(str, regex, dep)); } // test-friendly overloads (no external dependency tracking) void nielsen_graph::add_str_eq(euf::snode const* lhs, euf::snode const* rhs) { const dep_tracker dep = m_dep_mgr.mk_leaf(enode_pair(nullptr, nullptr)); str_eq eq(lhs, rhs, dep); eq.sort(); m_root->add_str_eq(eq); } void nielsen_graph::add_str_deq(euf::snode const* lhs, euf::snode const* rhs) { const dep_tracker dep = m_dep_mgr.mk_leaf(enode_pair(nullptr, nullptr)); str_deq deq(lhs, rhs, dep); m_root->add_str_deq(deq); } void nielsen_graph::add_str_mem(euf::snode const* str, euf::snode const* regex) { const dep_tracker dep = nullptr; m_root->add_str_mem(str_mem(str, regex, dep)); } void nielsen_graph::reset() { for (nielsen_node *n : m_nodes) { dealloc(n); } for (nielsen_edge *e : m_edges) { dealloc(e); } m_nodes.reset(); m_edges.reset(); m_root = nullptr; m_sat_node = nullptr; m_sat_path.reset(); m_depth_bound = 0; m_fresh_cnt = 0; m_root_constraints_asserted = false; // m_mod_cnt.reset(); m_partial_dfa_edges.reset(); m_partial_dfa_out.clear(); m_partial_dfa_in.clear(); m_partial_dfa_edge_index.clear(); m_partial_dfa_pin.reset(); m_projection_extract_idx = 0; m_explored_automaton.reset(); m_unsat_node_cache.clear(); m_num_cache_hits = 0; // m_length_trail.reset(); // m_length_info.reset(); m_dep_mgr.reset(); m_length_solver.reset(); SASSERT(m_nodes.empty()); SASSERT(m_edges.empty()); SASSERT(m_root == nullptr); SASSERT(m_sat_node == nullptr); } void nielsen_graph::add_le_dependency(const dep_tracker dep, nielsen_node *n, expr *lhs, expr *rhs) const { SASSERT(lhs); SASSERT(rhs); const expr_ref le(a.mk_le(lhs, rhs), m); // just assume it to be correct // Just add the constraint - we do not have to recompute it // [also it is on the set of side-conditions if we assert a satisfied node] n->add_constraint(constraint(le, dep, m)); } // ----------------------------------------------------------------------- // nielsen_node: simplify_and_init // ----------------------------------------------------------------------- bool nielsen_node::check_empty_side_conflict(euf::sgraph& sg, euf::snode const* non_empty_side, dep_tracker const& dep) { euf::snode_vector tokens; non_empty_side->collect_tokens(tokens); const bool has_char = std::ranges::any_of(tokens, [](euf::snode const* t){ return t->is_char(); }); const bool all_eliminable = std::ranges::all_of(tokens, [](euf::snode const* t){ return t->is_var() || t->is_power(); }); if (has_char || !all_eliminable) { set_general_conflict(); set_conflict(backtrack_reason::symbol_clash, dep); return true; } return false; } // ----------------------------------------------------------------------- // Power simplification helpers (mirrors ZIPT's MergePowers, // SimplifyPowerElim/CommPower, SimplifyPowerSingle) // ----------------------------------------------------------------------- // Check if exponent b equals exponent a + diff for some rational constant diff. // Uses syntactic matching on Z3 expression structure: pointer equality // detects shared sub-expressions created during ConstNumUnwinding. // static bool get_const_power_diff(expr* b, expr* a, arith_util& arith, rational& diff) { if (a == b) { diff = rational(0); return true; } expr* x = nullptr, *y = nullptr; rational val; // b = (+ a k) ? if (arith.is_add(b, x, y)) { if (x == a && arith.is_numeral(y, val)) { diff = val; return true; } if (y == a && arith.is_numeral(x, val)) { diff = val; return true; } } // a = (+ b k) → diff = -k if (arith.is_add(a, x, y)) { if (x == b && arith.is_numeral(y, val)) { diff = -val; return true; } if (y == b && arith.is_numeral(x, val)) { diff = -val; return true; } } // b = (- a k) → diff = -k if (arith.is_sub(b, x, y) && x == a && arith.is_numeral(y, val)) { diff = -val; return true; } // a = (- b k) → diff = k if (arith.is_sub(a, x, y) && x == b && arith.is_numeral(y, val)) { diff = val; return true; } return false; } // Get the base expression of a power snode. static expr* get_power_base_expr(euf::snode const* power, seq_util& seq) { if (!power || !power->is_power()) return nullptr; const expr * e = power->get_expr(); expr* base = nullptr, *exp = nullptr; return (e && seq.str.is_power(e, base, exp)) ? base : nullptr; } // Get the exponent expression of a power snode. static expr* get_power_exp_expr(euf::snode const* power, seq_util& seq) { if (!power->is_power()) return nullptr; const expr * e = power->get_expr(); expr* base = nullptr, *exp = nullptr; return (e && seq.str.is_power(e, base, exp)) ? exp : nullptr; } // Merge adjacent tokens with the same power base on one side of an equation. // Handles: char(c) · power(c^e) → power(c^(e+1)), // power(c^e) · char(c) → power(c^(e+1)), // power(c^e1) · power(c^e2) → power(c^(e1+e2)). // Returns new snode if merging happened, nullptr otherwise. static euf::snode const* merge_adjacent_powers(euf::sgraph& sg, euf::snode const* side) { if (!side || side->is_empty() || side->is_token()) return nullptr; euf::snode_vector tokens; side->collect_tokens(tokens); if (tokens.size() < 2) return nullptr; ast_manager& m = sg.get_manager(); arith_util arith(m); seq_util& seq = sg.get_seq_util(); bool merged = false; euf::snode_vector result; unsigned i = 0; while (i < tokens.size()) { euf::snode const* tok = tokens[i]; // Case 1: current is a power token — absorb following same-base tokens. // Skip at leading position (i == 0) to keep exponents small: CommPower // cross-side cancellation works better with unmerged leading powers // (e.g., w^k trivially ≤ 1+k, but w^(2k) vs 1+k requires k ≥ 1). if (tok->is_power() && i > 0) { expr* base_e = get_power_base_expr(tok, seq); expr* exp_acc = get_power_exp_expr(tok, seq); if (!base_e || !exp_acc) { result.push_back(tok); ++i; continue; } bool local_merged = false; unsigned j = i + 1; while (j < tokens.size()) { euf::snode const* next = tokens[j]; if (next->is_power()) { const expr * nb = get_power_base_expr(next, seq); if (nb == base_e) { exp_acc = arith.mk_add(exp_acc, get_power_exp_expr(next, seq)); local_merged = true; ++j; continue; } } if (next->is_char() && next->get_expr() == base_e) { exp_acc = arith.mk_add(exp_acc, arith.mk_int(1)); local_merged = true; ++j; continue; } break; } if (local_merged) { merged = true; expr_ref norm_exp = normalize_arith(m, exp_acc); expr_ref new_pow(seq.str.mk_power(base_e, norm_exp), m); result.push_back(sg.mk(new_pow)); } else result.push_back(tok); i = j; continue; } // Case 2: current is a char — check if next is a same-base power. // Skip at leading position (i == 0) to avoid undoing power unwinding: // unwind produces u · u^(n-1); merging it back to u^n creates an infinite cycle. if (i > 0 && tok->is_char() && tok->get_expr() && i + 1 < tokens.size()) { euf::snode const* next = tokens[i + 1]; if (next->is_power() && get_power_base_expr(next, seq) == tok->get_expr()) { expr* base_e = tok->get_expr(); // Use same arg order as Case 1: add(exp, 1), not add(1, exp), // so that merging "c · c^e" and "c^e · c" both produce add(e, 1) // and the resulting power expression is hash-consed identically. expr* exp_acc = arith.mk_add(get_power_exp_expr(next, seq), arith.mk_int(1)); unsigned j = i + 2; while (j < tokens.size()) { euf::snode const* further = tokens[j]; if (further->is_power() && get_power_base_expr(further, seq) == base_e) { exp_acc = arith.mk_add(exp_acc, get_power_exp_expr(further, seq)); ++j; continue; } if (further->is_char() && further->get_expr() == base_e) { exp_acc = arith.mk_add(exp_acc, arith.mk_int(1)); ++j; continue; } break; } merged = true; expr_ref norm_exp = normalize_arith(m, exp_acc); expr_ref new_pow(seq.str.mk_power(base_e, norm_exp), m); result.push_back(sg.mk(new_pow)); i = j; continue; } } result.push_back(tok); ++i; } if (!merged) return nullptr; euf::snode const* rebuilt = nullptr; for (const auto tok : result) rebuilt = rebuilt ? sg.mk_concat(rebuilt, tok) : tok; if (!rebuilt) rebuilt = sg.mk_empty_seq(side->get_sort()); return rebuilt; } // Simplify constant-exponent powers: base^0 → ε, base^1 → base. // Returns new snode if any simplification happened, nullptr otherwise. static euf::snode const* simplify_const_powers(nielsen_node* node, euf::sgraph& sg, euf::snode const* side, dep_tracker& dep) { dep = nullptr; SASSERT(side); if (side->is_empty()) return nullptr; euf::snode_vector tokens; side->collect_tokens(tokens); seq_util& seq = sg.get_seq_util(); bool simplified = false; euf::snode_vector result; for (euf::snode const* tok : tokens) { if (tok->is_power()) { expr* exp_e = get_power_exp_expr(tok, seq); rational ub; dep_tracker ub_dep = nullptr; if (exp_e && node->upper_bound(exp_e, ub, ub_dep)) { if (ub.is_zero()) { // base^0 → ε (skip this token entirely) dep = node->graph().dep_mgr().mk_join(dep, ub_dep); simplified = true; continue; } if (ub.is_one()) { // base^1 → base euf::snode const* base_sn = tok->arg0(); if (base_sn) { dep = node->graph().dep_mgr().mk_join(dep, ub_dep); result.push_back(base_sn); simplified = true; continue; } } } } result.push_back(tok); } if (!simplified) return nullptr; euf::snode const* rebuilt = nullptr; for (euf::snode const* tok : result) { rebuilt = rebuilt ? sg.mk_concat(rebuilt, tok) : tok; } if (!rebuilt) rebuilt = sg.mk_empty_seq(side->get_sort()); return rebuilt; } // CommPower: count how many times a power's base pattern appears in // the directional prefix of the other side (fwd=true: left prefix, // fwd=false: right suffix). Mirrors ZIPT's CommPower helper. // Returns (count_expr, num_tokens_consumed). count_expr is nullptr // when no complete base-pattern match is found. static std::pair comm_power( euf::snode const* base_sn, euf::snode const* side, ast_manager& m, seq_util& seq, const bool fwd) { arith_util arith(m); euf::snode_vector base_tokens, side_tokens; collect_tokens_dir(base_sn, fwd, base_tokens); collect_tokens_dir(side, fwd, side_tokens); if (base_tokens.empty() || side_tokens.empty()) return {expr_ref(nullptr, m), 0}; expr* sum = nullptr; unsigned pos = 0; expr* last_stable_sum = nullptr; unsigned last_stable_idx = 0; unsigned i = 0; for (; i < side_tokens.size(); i++) { euf::snode const* t = side_tokens[i]; if (pos == 0) { last_stable_idx = i; last_stable_sum = sum; } // Case 1: direct token match with base pattern if (pos < base_tokens.size() && t == base_tokens[pos]) { pos++; if (pos >= base_tokens.size()) { pos = 0; sum = sum ? arith.mk_add(sum, arith.mk_int(1)) : arith.mk_int(1); } continue; } // Case 2: power token whose base matches our base pattern (at pos == 0) // Skip at leading position (i == 0) to avoid undoing power unwinding: // unwind produces u · u^(n-1); merging it back to u^n creates an infinite cycle. if (i > 0 && t->is_power()) { euf::snode const* pow_base = t->arg0(); if (pow_base) { euf::snode_vector pb_tokens; collect_tokens_dir(pow_base, fwd, pb_tokens); if (pb_tokens.size() == base_tokens.size()) { bool match = true; for (unsigned j = 0; j < pb_tokens.size() && match; j++) match = (pb_tokens[j] == base_tokens[j]); if (match) { expr* pow_exp = get_power_exp_expr(t, seq); if (pow_exp) { sum = sum ? arith.mk_add(sum, pow_exp) : pow_exp; continue; } } } } } break; } // After loop: i = break index or side_tokens.size() if (pos == 0) { last_stable_idx = i; last_stable_sum = sum; } return {expr_ref(last_stable_sum, m), last_stable_idx}; } euf::snode const* nielsen_graph::to_partial_label_regex(euf::snode const* label) const { SASSERT(label && label->get_expr()); expr* e = label->get_expr(); if (m_seq.is_re(e) || m_seq.re.is_full_char(e)) return label; if (m_seq.is_char(e)) { const expr_ref u(m_seq.str.mk_unit(e), m); return m_sg.mk(m_seq.re.mk_to_re(u)); } if (m_seq.str.is_string(e) || m_seq.str.is_unit(e)) return m_sg.mk(m_seq.re.mk_to_re(e)); std::cout << "Unexpected: " << mk_pp(label->get_expr(), m) << std::endl; UNREACHABLE(); return nullptr; } void nielsen_graph::record_partial_derivative_edge(euf::snode const* src_re, euf::snode const* label, euf::snode const* dst_re) { SASSERT(src_re && dst_re); if (!src_re->is_ground() || !dst_re->is_ground()) return; if (src_re->is_fail() || dst_re->is_fail()) return; //euf::snode const* label_re = to_partial_label_regex(label); //SASSERT(label_re); //if (!m_seq.is_re(label_re->get_expr()) || !label_re->is_ground()) // return; expr* src_e = src_re->get_expr(); //expr* label_e = label_re->get_expr(); expr* dst_e = dst_re->get_expr(); // Deduplicate transitions by (src, dst) only — NOT by label. The // projection operator is parameterized by the explored state set Q // (the Brzozowski automaton is deterministic, so the only a-transition // out of a state is to δ_a(state)); edge labels are never consulted by // projection_state_in_Q / collect_scc_for_projection / the projection // derivative. Keying by label would record the SAME transition twice // when it is discovered once as a concrete char (to_re "a") and once as // a minterm range ([a-a]), spuriously inflating the SCC edge count and // re-triggering cycle decomposition as the derivation walks the cycle. partial_dfa_edge_key key{ src_e->get_id(), 0, dst_e->get_id() }; if (m_partial_dfa_edge_index.contains(key)) return; // Pin each expression so the egraph cannot release it on pop while // we still reference it from the cache. Bumping the ref count of an // already-pinned expression is harmless; the wasted slot is bounded // by the unique-edge count. m_partial_dfa_pin.push_back(src_e); //m_partial_dfa_pin.push_back(label_e); m_partial_dfa_pin.push_back(dst_e); unsigned edge_idx = m_partial_dfa_edges.size(); m_partial_dfa_edge_index.emplace(key, edge_idx); partial_dfa_edge e; e.m_src = src_e; //e.m_label = label_e; e.m_dst = dst_e; m_partial_dfa_edges.push_back(e); m_partial_dfa_out[src_e->get_id()].push_back(edge_idx); m_partial_dfa_in[dst_e->get_id()].push_back(edge_idx); } bool nielsen_graph::collect_scc_for_projection(euf::snode const* root_re, uint_set& scc) const { scc.reset(); if (!root_re || !root_re->get_expr()) return false; // scc, fwd, bwd contain expression ids (matching the keys in // m_partial_dfa_out / m_partial_dfa_in). Expression ids are stable // across sgraph pops because we pin them in m_partial_dfa_pin. const unsigned root_id = root_re->get_expr()->get_id(); uint_set fwd, bwd; unsigned_vector stack; stack.push_back(root_id); while (!stack.empty()) { unsigned s = stack.back(); stack.pop_back(); if (fwd.contains(s)) continue; fwd.insert(s); auto it = m_partial_dfa_out.find(s); if (it == m_partial_dfa_out.end()) continue; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; partial_dfa_edge const& e = m_partial_dfa_edges[edge_idx]; if (e.m_dst) stack.push_back(e.m_dst->get_id()); } } stack.push_back(root_id); while (!stack.empty()) { unsigned s = stack.back(); stack.pop_back(); if (bwd.contains(s)) continue; bwd.insert(s); auto it = m_partial_dfa_in.find(s); if (it == m_partial_dfa_in.end()) continue; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; partial_dfa_edge const& e = m_partial_dfa_edges[edge_idx]; if (e.m_src) stack.push_back(e.m_src->get_id()); } } for (const unsigned s : fwd) { if (bwd.contains(s)) scc.insert(s); } if (!scc.contains(root_id)) return false; if (scc.num_elems() > 1) return true; const auto it = m_partial_dfa_out.find(root_id); if (it == m_partial_dfa_out.end()) return false; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; partial_dfa_edge const& e = m_partial_dfa_edges[edge_idx]; if (e.m_dst && e.m_dst->get_id() == root_id) return true; } return false; } unsigned nielsen_graph::mark_scc_projection_edges(uint_set const& scc) { // scc contains expr ids (see collect_scc_for_projection). // // Returns the number of edges *newly* added to the projection coverage // by this call (edges that were previously unmarked). Crucially, the // monotone extraction index is bumped ONLY when there is something new // to mark. This makes "the explored SCC has grown" a *global* property // of the SCC's edge set rather than a per-entry-state one: re-visiting // an already-fully-marked SCC from a different state (e.g. a derivative // state br of r, which shares the SCC {r, br}) marks nothing new and is // therefore not treated as a fresh cycle to decompose. Without this, // each state of the cycle would trigger its own redundant decomposition // as the derivation walks around the SCC. unsigned newly_marked = 0; for (unsigned src_id : scc) { auto it = m_partial_dfa_out.find(src_id); if (it == m_partial_dfa_out.end()) continue; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; partial_dfa_edge const& e = m_partial_dfa_edges[edge_idx]; if (!e.m_dst || !scc.contains(e.m_dst->get_id())) continue; if (e.m_projection_idx == 0) ++newly_marked; } } if (newly_marked == 0) return 0; ++m_projection_extract_idx; const unsigned extract_idx = m_projection_extract_idx; for (unsigned src_id : scc) { auto it = m_partial_dfa_out.find(src_id); if (it == m_partial_dfa_out.end()) continue; for (const unsigned edge_idx : it->second) { if (edge_idx >= m_partial_dfa_edges.size()) continue; partial_dfa_edge& e = m_partial_dfa_edges[edge_idx]; if (!e.m_dst || !scc.contains(e.m_dst->get_id())) continue; if (e.m_projection_idx == 0) e.m_projection_idx = extract_idx; } } return newly_marked; } euf::snode const* nielsen_graph::get_slice(euf::snode const* v, expr* left, expr* right) { SASSERT(v && v->get_expr() && left && right); SASSERT(v->is_var()); expr_ref new_arg(v->get_expr(), m); expr_ref new_l(left, m), new_r(right, m); expr* arg, *l, *r; if (m_sk.is_slice(new_arg, arg, l, r)) { new_l = a.mk_add(left, l); m_rw(new_l); new_r = a.mk_add(right, r); m_rw(new_r); new_arg = arg; } expr_ref slice = m_sk.mk_slice(new_arg, new_l, new_r); return m_sg.mk(slice); } euf::snode const* nielsen_graph::get_tail(euf::snode const* v, expr* cnt, const bool fwd) { if (fwd) return get_slice(v, cnt, a.mk_int(0)); return get_slice(v, a.mk_int(0), cnt); } euf::snode const* nielsen_graph::get_tail(euf::snode const* v, const unsigned cnt, const bool fwd) { return get_tail(v, a.mk_int(cnt), fwd); } simplify_result nielsen_node::simplify_and_init(ptr_vector const& cur_path) { if (m_is_extended) return simplify_result::proceed; euf::sgraph& sg = m_graph.sg(); ast_manager& m = sg.get_manager(); seq_util& seq = this->graph().seq(); bool changed = true; //if (id() == 9) { // std::string dot = graph().to_dot(); // std::cout << dot << std::endl; //} // DON'T add rules here that add new constraints or apply substitutions // add them to apply_det_modifier instead while (changed) { changed = false; // pass 1: remove trivially satisfied equalities and memberships unsigned wi = 0; for (unsigned i = 0; i < m_str_eq.size(); ++i) { str_eq& eq = m_str_eq[i]; if (eq.is_trivial()) continue; m_str_eq[wi++] = eq; } if (wi < m_str_eq.size()) { m_str_eq.shrink(wi); changed = true; } unsigned wj = 0; for (unsigned j = 0; j < m_str_mem.size(); ++j) { str_mem& mem = m_str_mem[j]; if (mem.is_trivial(this)) continue; m_str_mem[wj++] = mem; } if (wj < m_str_mem.size()) { m_str_mem.shrink(wj); changed = true; } auto cannot_be_empty = [&](euf::snode const* side) { euf::snode_vector tokens; side->collect_tokens(tokens); const bool has_char = std::ranges::any_of(tokens, [](euf::snode const* t){ return t->is_char(); }); const bool all_eliminable = std::ranges::all_of(tokens, [](euf::snode const* t){ return t->is_var() || t->is_power(); }); return has_char || !all_eliminable; }; unsigned wk = 0; for (unsigned k = 0; k < m_str_deq.size(); ++k) { str_deq& deq = m_str_deq[k]; if (deq.m_lhs == deq.m_rhs || (deq.m_lhs && deq.m_rhs && deq.m_lhs->is_empty() && deq.m_rhs->is_empty())) { set_general_conflict(); set_conflict(backtrack_reason::symbol_clash, deq.m_dep); return simplify_result::conflict; } if (deq.m_lhs->length() == 1 && deq.m_rhs->length() == 1) { expr* l, *r; if (graph().seq().str.is_unit(deq.m_lhs->get_expr(), l) && graph().seq().str.is_unit(deq.m_rhs->get_expr(), r)) { add_constraint(constraint(m.mk_not(m.mk_eq(l, r)), deq.m_dep, m)); continue; } } if (deq.m_lhs->is_empty() && !deq.m_rhs->is_empty()) { if (cannot_be_empty(deq.m_rhs)) continue; } else if (deq.m_rhs->is_empty() && !deq.m_lhs->is_empty()) { if (cannot_be_empty(deq.m_lhs)) continue; } // simplify constant-exponent powers dep_tracker lhs_pow_dep = nullptr; if (euf::snode const* s = simplify_const_powers(this, sg, deq.m_lhs, lhs_pow_dep)) { deq.m_lhs = s; deq.m_dep = m_graph.dep_mgr().mk_join(deq.m_dep, lhs_pow_dep); changed = true; } dep_tracker rhs_pow_dep = nullptr; if (euf::snode const* s = simplify_const_powers(this, sg, deq.m_rhs, rhs_pow_dep)) { deq.m_rhs = s; deq.m_dep = m_graph.dep_mgr().mk_join(deq.m_dep, rhs_pow_dep); changed = true; } // prefix/suffix cancellation { euf::snode_vector lhs_toks, rhs_toks; deq.m_lhs->collect_tokens(lhs_toks); deq.m_rhs->collect_tokens(rhs_toks); unsigned prefix = 0; while (prefix < lhs_toks.size() && prefix < rhs_toks.size()) { euf::snode const* lt = lhs_toks[prefix]; euf::snode const* rt = rhs_toks[prefix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) { ++prefix; } else if (sg.are_unit_distinct(lt, rt)) { prefix = static_cast(-1); break; } else break; } if (prefix == static_cast(-1)) { continue; } unsigned lsz = lhs_toks.size(), rsz = rhs_toks.size(); unsigned suffix = 0; while (suffix < lsz - prefix && suffix < rsz - prefix) { euf::snode const* lt = lhs_toks[lsz - 1 - suffix]; euf::snode const* rt = rhs_toks[rsz - 1 - suffix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) { ++suffix; } else if (sg.are_unit_distinct(lt, rt)) { suffix = static_cast(-1); break; } else break; } if (suffix == static_cast(-1)) { continue; } if (prefix > 0 || suffix > 0) { deq.m_lhs = sg.drop_left(sg.drop_right(deq.m_lhs, suffix), prefix); deq.m_rhs = sg.drop_left(sg.drop_right(deq.m_rhs, suffix), prefix); changed = true; } } if (deq.m_lhs == deq.m_rhs || (deq.m_lhs->is_empty() && deq.m_rhs->is_empty())) { set_general_conflict(); set_conflict(backtrack_reason::symbol_clash, deq.m_dep); return simplify_result::conflict; } m_str_deq[wk++] = deq; } if (wk < m_str_deq.size()) { m_str_deq.shrink(wk); changed = true; } // pass 2: detect symbol clashes, empty-propagation, and prefix cancellation for (str_eq& eq : m_str_eq) { SASSERT(eq.well_formed()); // one side empty, the other not empty => conflict check // (the actual substitution is done in apply_det_modifier) if (eq.m_lhs->is_empty() && !eq.m_rhs->is_empty()) { if (check_empty_side_conflict(sg, eq.m_rhs, eq.m_dep)) return simplify_result::conflict; continue; } if (eq.m_rhs->is_empty() && !eq.m_lhs->is_empty()) { if (check_empty_side_conflict(sg, eq.m_lhs, eq.m_dep)) return simplify_result::conflict; continue; } // prefix/suffix cancellation: strip common leading and trailing tokens. // Same char or same variable on both sides can always be cancelled. // Two different concrete characters is a symbol clash. { euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); // --- prefix --- unsigned prefix = 0; while (prefix < lhs_toks.size() && prefix < rhs_toks.size()) { euf::snode const* lt = lhs_toks[prefix]; euf::snode const* rt = rhs_toks[prefix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) { ++prefix; } else if (sg.are_unit_distinct(lt, rt)) { set_general_conflict(); set_conflict(backtrack_reason::symbol_clash, eq.m_dep); return simplify_result::conflict; } else break; } // --- suffix (only among the tokens not already consumed by prefix) --- unsigned lsz = lhs_toks.size(), rsz = rhs_toks.size(); unsigned suffix = 0; while (suffix < lsz - prefix && suffix < rsz - prefix) { euf::snode const* lt = lhs_toks[lsz - 1 - suffix]; euf::snode const* rt = rhs_toks[rsz - 1 - suffix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) { ++suffix; } else if (sg.are_unit_distinct(lt, rt)) { set_general_conflict(); set_conflict(backtrack_reason::symbol_clash, eq.m_dep); return simplify_result::conflict; } else break; } if (prefix > 0 || suffix > 0) { eq.m_lhs = sg.drop_left(sg.drop_right(eq.m_lhs, suffix), prefix); eq.m_rhs = sg.drop_left(sg.drop_right(eq.m_rhs, suffix), prefix); changed = true; } } } // pass 3: power simplification (mirrors ZIPT's LcpCompression + // SimplifyPowerElim + SimplifyPowerSingle) for (str_eq& eq : m_str_eq) { SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; // 3a: simplify constant-exponent powers (base^0 → ε, base^1 → base) dep_tracker lhs_pow_dep = nullptr; if (euf::snode const* s = simplify_const_powers(this, sg, eq.m_lhs, lhs_pow_dep)) { eq.m_lhs = s; eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, lhs_pow_dep); changed = true; } dep_tracker rhs_pow_dep = nullptr; if (euf::snode const* s = simplify_const_powers(this, sg, eq.m_rhs, rhs_pow_dep)) { eq.m_rhs = s; eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, rhs_pow_dep); changed = true; } // 3b: merge adjacent same-base tokens into combined powers if (euf::snode const* s = merge_adjacent_powers(sg, eq.m_lhs)) { eq.m_lhs = s; changed = true; } if (euf::snode const* s = merge_adjacent_powers(sg, eq.m_rhs)) { eq.m_rhs = s; changed = true; } // 3c: CommPower-based power elimination — when one side starts // with a power w^p, count base-pattern occurrences c on the // other side's prefix. If we can determine the ordering between // p and c, cancel the matched portion. Mirrors ZIPT's // SimplifyPowerElim calling CommPower. // Spec: CommPower cancellation. // Given: pow_side = w^p · rest_pow and other_side = w^c · rest_other // where c is the number of times the base pattern w occurs in the // directional prefix of other_side. // - If p ≤ c: pow_side := rest_pow, other_side := w^(c-p) · rest_other // - If c ≤ p: pow_side := w^(p-c) · rest_pow, other_side := rest_other // - If p = c: both reduce completely (handled by both conditions above). SASSERT(eq.well_formed()); bool comm_changed = false; for (int side = 0; side < 2 && !comm_changed; ++side) { euf::snode const*& pow_side = side == 0 ? eq.m_lhs : eq.m_rhs; euf::snode const*& other_side = side == 0 ? eq.m_rhs : eq.m_lhs; if (!pow_side || !other_side) continue; for (unsigned od = 0; od < 2 && !comm_changed; ++od) { bool fwd = od == 0; euf::snode const* end_tok = dir_token(pow_side, fwd); if (!end_tok || !end_tok->is_power()) continue; euf::snode const* base_sn = end_tok->arg0(); expr* pow_exp = get_power_exp_expr(end_tok, seq); if (!base_sn || !pow_exp) continue; auto [count, consumed] = comm_power(base_sn, other_side, m, seq, fwd); if (!count.get() || consumed == 0) continue; expr_ref norm_count = normalize_arith(m, count); bool pow_le_count = false, count_le_pow = false; dep_tracker pow_le_dep = nullptr, count_le_dep = nullptr; rational diff; if (get_const_power_diff(norm_count, pow_exp, m_graph.a, diff)) { count_le_pow = diff.is_nonpos(); pow_le_count = diff.is_nonneg(); } else if (!cur_path.empty()) { pow_le_count = m_graph.check_lp_le(pow_exp, norm_count, this, pow_le_dep); count_le_pow = m_graph.check_lp_le(norm_count, pow_exp, this, count_le_dep); } if (!pow_le_count && !count_le_pow) continue; eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, pow_le_dep); eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, count_le_dep); pow_side = dir_drop(sg, pow_side, 1, fwd); other_side = dir_drop(sg, other_side, consumed, fwd); expr* base_e = get_power_base_expr(end_tok, seq); if (pow_le_count && count_le_pow) { // equal: both cancel completely } else if (pow_le_count) { // pow <= count: remainder goes to other_side expr_ref rem = normalize_arith(m, m_graph.a.mk_sub(norm_count, pow_exp)); expr_ref pw(seq.str.mk_power(base_e, rem), m); other_side = dir_concat(sg, sg.mk(pw), other_side, fwd); } else { // count <= pow: remainder goes to pow_side expr_ref rem = normalize_arith(m, m_graph.a.mk_sub(pow_exp, norm_count)); expr_ref pw(seq.str.mk_power(base_e, rem), m); pow_side = dir_concat(sg, sg.mk(pw), pow_side, fwd); } comm_changed = true; } } if (comm_changed) changed = true; // After any change in passes 3a–3c, restart the while loop // before running 3d/3e. This lets 3a simplify new constant- // exponent powers (e.g. base^1 → base created by 3c) before // 3e attempts LP-based elimination which would introduce a // needless fresh variable. if (changed) continue; // 3d: power prefix elimination — when both sides start with a // power of the same base, cancel the common power prefix. // (Subsumed by 3c for many cases, but handles same-base-power // pairs that CommPower may miss when both leading tokens are powers.) SASSERT(eq.well_formed()); for (unsigned od = 0; od < 2 && !changed; ++od) { bool fwd = (od == 0); euf::snode const* lh = dir_token(eq.m_lhs, fwd); euf::snode const* rh = dir_token(eq.m_rhs, fwd); if (!(lh && rh && lh->is_power() && rh->is_power())) continue; expr* lb = get_power_base_expr(lh, seq); expr* rb = get_power_base_expr(rh, seq); if (!(lb && rb && lb == rb)) continue; expr* lp = get_power_exp_expr(lh, seq); expr* rp = get_power_exp_expr(rh, seq); rational diff; if (lp && rp && get_const_power_diff(rp, lp, m_graph.a, diff)) { // rp = lp + diff (constant difference) eq.m_lhs = dir_drop(sg, eq.m_lhs, 1, fwd); eq.m_rhs = dir_drop(sg, eq.m_rhs, 1, fwd); if (diff.is_pos()) { // rp > lp: put base^diff on RHS (direction-aware prepend/append) expr_ref de(m_graph.a.mk_int(diff), m); expr_ref pw(seq.str.mk_power(lb, de), m); eq.m_rhs = dir_concat(sg, sg.mk(pw), eq.m_rhs, fwd); } else if (diff.is_neg()) { // lp > rp: put base^(-diff) on LHS expr_ref de(m_graph.a.mk_int(-diff), m); expr_ref pw(seq.str.mk_power(lb, de), m); eq.m_lhs = dir_concat(sg, sg.mk(pw), eq.m_lhs, fwd); } // diff == 0: both powers cancel completely changed = true; } // 3e: LP-aware power directional elimination else if (lp && rp && !cur_path.empty()) { dep_tracker lp_le_dep = nullptr, rp_le_dep = nullptr; bool lp_le_rp = m_graph.check_lp_le(lp, rp, this, lp_le_dep); bool rp_le_lp = m_graph.check_lp_le(rp, lp, this, rp_le_dep); if (lp_le_rp || rp_le_lp) { if (lp_le_rp) eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, lp_le_dep); if (rp_le_lp) eq.m_dep = m_graph.dep_mgr().mk_join(eq.m_dep, rp_le_dep); expr* smaller_exp = lp_le_rp ? lp : rp; expr* larger_exp = lp_le_rp ? rp : lp; eq.m_lhs = dir_drop(sg, eq.m_lhs, 1, fwd); eq.m_rhs = dir_drop(sg, eq.m_rhs, 1, fwd); if (lp_le_rp && rp_le_lp) { // both ≤ -> equal -> both cancel completely add_constraint(m_graph.mk_constraint(m.mk_eq(lp, rp), eq.m_dep)); } else { // we only know for sure that one is smaller than the other expr_ref d(m_graph.a.mk_sub(larger_exp, smaller_exp), m); expr_ref zero(m_graph.a.mk_int(0), m); add_constraint(m_graph.mk_constraint(m_graph.a.mk_ge(d, zero), eq.m_dep)); expr_ref pw(seq.str.mk_power(lb, d), m); euf::snode const*& larger_side = lp_le_rp ? eq.m_rhs : eq.m_lhs; larger_side = dir_concat(sg, sg.mk(pw), larger_side, fwd); } changed = true; } } } } } // consume concrete characters from str_mem via Brzozowski derivatives // in both directions (left-to-right, then right-to-left), mirroring ZIPT. for (str_mem& mem : m_str_mem) { SASSERT(mem.well_formed()); if (mem.is_primitive() || !mem.is_plain()) continue; for (unsigned od = 0; od < 2; ++od) { bool fwd = od == 0; while (!mem.m_str->is_empty()) { euf::snode const* tok = dir_token(mem.m_str, fwd); if (!tok || !tok->is_char_or_unit()) break; euf::snode const* src_re = mem.m_regex; euf::snode const* deriv = fwd ? sg.brzozowski_deriv(mem.m_regex, tok) : reverse_brzozowski_deriv(sg, mem.m_regex, tok); TRACE(seq, tout << mem_pp(mem, m) << " d: " << spp(deriv, m) << "\n"); if (!deriv) break; if (deriv->is_fail()) { set_general_conflict(); set_conflict(backtrack_reason::regex, mem.m_dep); return simplify_result::conflict; } if (fwd) { if (tok->is_char()) { // concrete char: record single edge directly m_graph.record_partial_derivative_edge(src_re, tok, deriv); } else if (src_re->is_ground()) { // symbolic unit: record all concrete minterm edges for src_re // so cycle_decomp can detect SCCs lazily (mirrors old concrete // per-minterm var_split behaviour). euf::snode_vector mts; sg.compute_minterms(src_re, mts); for (euf::snode const* mt : mts) { euf::snode const* mt_deriv = sg.brzozowski_deriv(src_re, mt); if (mt_deriv && !mt_deriv->is_fail() && mt_deriv->is_ground()) m_graph.record_partial_derivative_edge(src_re, mt, mt_deriv); } } } mem.m_str = dir_drop(sg, mem.m_str, 1, fwd); mem.m_regex = deriv; } } } // consume symbolic characters via uniform derivatives for (str_mem& mem : m_str_mem) { SASSERT(mem.well_formed()); if (mem.is_primitive() || !mem.is_plain()) continue; while (mem.m_str && !mem.m_str->is_empty()) { // TODO: generalize this to work for reverse derivative as well. euf::snode const* tok = mem.m_str->first(); if (!tok || !tok->is_char_or_unit()) break; euf::snode const* src_re = mem.m_regex; euf::snode const* next = nullptr; { seq_rewriter rw(m); expr_ref d(rw.mk_derivative(mem.m_regex->get_expr()), m); // Extract the inner char expression from seq.unit(?inner) expr *inner_char = tok->arg0()->get_expr(); // substitute the inner char for the derivative variable in d var_subst vs(m); d = vs(d, inner_char); th_rewriter thrw(m); thrw(d); // Record concrete minterm edges for src_re so cycle_decomp can // detect SCCs lazily (mirrors the incremental DFA building from // concrete char consumption in the loop above). if (src_re->is_ground()) { euf::snode_vector mts; sg.compute_minterms(src_re, mts); for (euf::snode const* mt : mts) { euf::snode const* mt_deriv = sg.brzozowski_deriv(src_re, mt); if (mt_deriv && !mt_deriv->is_fail()) m_graph.record_partial_derivative_edge(src_re, mt, mt_deriv); } } next = sg.mk(d); } if (next->is_fail()) { TRACE(seq, tout << "empty regex" << spp(mem.m_regex, m) << "\n"); set_general_conflict(); set_conflict(backtrack_reason::regex, mem.m_dep); return simplify_result::conflict; } mem.m_str = sg.drop_left(mem.m_str, 1); mem.m_regex = next; } } // consume leading characters of view / guard memberships (Section 3.3). // m_regex is the current (plain) derivative state; we gate on whether it // lies in Q_ν (projection_state_in_Q) and step with the ordinary // derivative, keeping the view/guard annotation. for (str_mem& mem : m_str_mem) { SASSERT(mem.well_formed()); if (mem.is_plain()) continue; if (consume_view_guard(mem)) return simplify_result::conflict; } // check for regex memberships that are immediately infeasible for (str_mem& mem : m_str_mem) { if (mem.is_contradiction(this)) { TRACE(seq, tout << "contradiction " << mem_pp(mem, m) << "\n"); set_general_conflict(); set_conflict(backtrack_reason::regex, mem.m_dep); return simplify_result::conflict; } } // remove trivial membership constraints once again unsigned wj = 0; for (unsigned j = 0; j < m_str_mem.size(); ++j) { str_mem& mem = m_str_mem[j]; if (mem.is_trivial(this)) continue; m_str_mem[wj++] = mem; } if (wj < m_str_mem.size()) m_str_mem.shrink(wj); // Regex widening: for each remaining str_mem, overapproximate // the string by replacing variables with their regex intersection // and check if the result intersected with the target regex is empty. // Detects infeasible constraints that would otherwise require // expensive exploration. Mirrors ZIPT NielsenNode.CheckRegexWidening. SASSERT(m_graph.m_seq_regex); for (str_mem const& mem : m_str_mem) { SASSERT(mem.well_formed()); if (mem.is_primitive()) continue; dep_tracker dep = mem.m_dep; if (m_graph.check_regex_widening(*this, mem, dep)) { set_general_conflict(); set_conflict(backtrack_reason::regex_widening, dep); return simplify_result::conflict; } } if (is_satisfied()) { // pass 1 removed all trivial str_eq entries; is_satisfied() requires // the remainder to be trivial, so the vector must be empty here. SASSERT(m_str_eq.empty()); return simplify_result::satisfied; } return simplify_result::proceed; } bool nielsen_node::consume_view_guard(str_mem& mem) { SASSERT(!mem.is_plain()); euf::sgraph& sg = m_graph.sg(); ast_manager& m = sg.get_manager(); seq_util& seq = m_graph.seq(); auto set_regex_conflict = [&]() { set_general_conflict(); set_conflict(backtrack_reason::regex, mem.m_dep); }; while (mem.m_str && !mem.m_str->is_empty()) { euf::snode const* tok = mem.m_str->first(); if (!tok || !tok->is_char_or_unit()) break; // leading token is a variable/power — nothing to consume yet euf::snode const* c = mem.m_regex; // The gate tests the CURRENT (plain) state c against Q_ν. An ite // state means a previous symbolic step has not been resolved yet; // leave it for apply_regex_if_split. if (!c->is_ground() || c->kind() == euf::snode_kind::s_ite) break; const bool in_Q = m_graph.projection_state_in_Q(c->get_expr(), mem.m_nu); if (!in_Q) { if (mem.is_guard()) { // The run left Q: no lap from the start can complete within Q // anymore, so the guard is discharged (accepts every suffix). mem.m_discharged = true; return false; } // view: a^{-1} L_{Q,F}(c) = ∅ when c ∉ Q. set_regex_conflict(); return true; } // Step with brzozowski_deriv for BOTH concrete and symbolic tokens. // This is essential: the partial-DFA states (and m_root) are produced // by brzozowski_deriv, so its canonicalization must be used here too — // otherwise the guard's resolved state never equals m_root by snode // identity and laps never close. For a symbolic unit it yields a // canonical ite residual that apply_regex_if_split later resolves. euf::snode const* next = sg.brzozowski_deriv(c, tok); if (!next) break; mem.m_str = sg.drop_left(mem.m_str, 1); mem.m_regex = next; if (next->is_fail()) { // view: derivative collapsed to ∅ — unsatisfiable. // guard: the lap can never close through ∅; treat as discharged. if (mem.is_guard()) { mem.m_discharged = true; return false; } set_regex_conflict(); return true; } if (next->is_ground() && next->kind() != euf::snode_kind::s_ite) { // concrete next state resolved immediately if (mem.is_guard() && next == mem.m_root) { // a non-empty prefix completed a lap r→…→r within Q. set_regex_conflict(); return true; } } else { // symbolic ite residual: defer to apply_regex_if_split, which // resolves the character and (for guards) detects a lap landing // back on the root. break; } } return false; } bool nielsen_node::is_satisfied() const { if (!m_str_deq.empty() || !m_str_eq.empty()) return false; if (any_of(m_str_mem, [](auto const &m) { return !m.is_primitive();})) return false; return true; } euf::snode const* nielsen_graph::mk_rewrite(expr* e) const { expr_ref er(e, m); th_rewriter rw(m); rw(er); return m_sg.mk(er); } // ----------------------------------------------------------------------- // nielsen_graph: search // ----------------------------------------------------------------------- void nielsen_graph::apply_parikh_to_node(nielsen_node& node) const { if (!m_parikh_enabled || node.m_parikh_applied) return; node.m_parikh_applied = true; // Generate modular length constraints (len(str) = min_len + stride·k, etc.) // and append them to the node's integer constraint list. m_parikh->apply_to_node(node); // Lightweight feasibility pre-check: does the Parikh modular constraint // contradict the variable's current integer bounds? If so, mark this // node as a Parikh-image conflict immediately (avoids a solver call). dep_tracker parikh_dep = nullptr; if (!node.is_currently_conflict() && m_parikh->check_parikh_conflict(node, parikh_dep) != nullptr) { node.set_general_conflict(); node.set_conflict(backtrack_reason::parikh_image, parikh_dep); } } void nielsen_graph::assert_root_constraints_to_solver() { if (m_root_constraints_asserted) return; m_root_constraints_asserted = true; // Constraint.Shared: assert all root-level length/Parikh constraints // to m_solver at the base level (no push/pop). These include: // - len(lhs) = len(rhs) for each non-trivial string equality // - len(str) >= min_len and len(str) <= max_len for each regex membership // - len(x) >= 0 for each variable appearing in the root constraints // Making these visible to the solver before the DFS allows arithmetic // pruning at every node, not just the root. vector constraints; generate_length_constraints(constraints); for (auto const& lc : constraints) { m_root->add_constraint(lc.to_constraint()); } } template class scoped_push { T &t; public: scoped_push(T& t) : t(t) { t.push_scope(); } ~scoped_push() { t.pop_scope(1); } }; nielsen_graph::search_result nielsen_graph::solve() { SASSERT(m_root); try { ++m_stats.m_num_solve_calls; clear_sat_node(); TRACE(seq, tout << "Solve call " << m_stats.m_num_solve_calls << "\n"); // Constraint.Shared: assert root-level length/Parikh constraints to the // solver at the base level, so they are visible during all feasibility checks. assert_root_constraints_to_solver(); // Iterative deepening: increment by 1 on each failure. // m_max_search_depth == 0 means unlimited; otherwise stop when bound exceeds it. m_depth_bound = 3; while (true) { if (!m.inc()) { #ifdef Z3DEBUG // Examining the Nielsen graph is probably the best way of debugging const std::string dot = to_dot(); IF_VERBOSE(1, verbose_stream() << dot << "\n";); IF_VERBOSE(1, display(verbose_stream())); #endif break; } if (m_max_search_depth > 0 && m_depth_bound > m_max_search_depth) break; ptr_vector cur_path; // scoped_push _scoped_push(m_dep_mgr); // gc dependencies after search SASSERT(!m_root->is_currently_conflict()); const search_result r = search_dfs(m_root, cur_path); // the main search loop IF_VERBOSE(1, verbose_stream() << " depth_bound=" << m_depth_bound << " dfs_nodes=" << m_stats.m_num_dfs_nodes << " max_depth=" << m_stats.m_max_depth << " extensions=" << m_stats.m_num_extensions << " arith_prune=" << m_stats.m_num_arith_infeasible << " result=" << (r == search_result::sat ? "SAT" : r == search_result::unsat ? "UNSAT" : "UNKNOWN") << "\n";); if (r == search_result::sat) { IF_VERBOSE(1, verbose_stream() << "side constraints: \n"; for (auto const &c : cur_path.back()->side_constraints()) { verbose_stream() << " side constraint: " << c.fml << "\n"; }); ++m_stats.m_num_sat; SASSERT(m_sat_node != nullptr); return r; } if (r == search_result::unsat) { ++m_stats.m_num_unsat; const auto deps = collect_conflict_deps(); m_dep_mgr.linearize(deps, m_conflict_sources); TRACE(seq, display(tout, m_root)); return r; } // depth limit hit – double the bound and retry m_depth_bound *= 2; SASSERT(m_depth_bound < INT_MAX); } ++m_stats.m_num_unknown; return search_result::unknown; } catch(const std::exception&) { #ifdef Z3DEBUG std::string dot = to_dot(); #endif throw; } } // ---- Transposition-table helpers (node memoization) ---------------------- static bool reason_is_string_only(backtrack_reason r) { switch (r) { case backtrack_reason::regex: case backtrack_reason::regex_widening: case backtrack_reason::symbol_clash: case backtrack_reason::character_range: return true; default: // arithmetic, parikh_image, external, children_failed, unevaluated return false; } } bool nielsen_graph::node_unsat_string_only(nielsen_node const* n) const { if (n->m_reason == backtrack_reason::children_failed) return n->m_unsat_cacheable; // set when the children_failed result was formed return reason_is_string_only(n->m_reason); } std::vector nielsen_graph::compute_node_signature(nielsen_node const* n) const { std::vector sig; // string equalities (order-independent) { std::vector> v; for (auto const& e : n->str_eqs()) v.emplace_back(e.m_lhs->id(), e.m_rhs->id()); std::sort(v.begin(), v.end()); sig.push_back(static_cast(v.size())); for (auto const& [a,b] : v) { sig.push_back(a); sig.push_back(b); } } sig.push_back(UINT_MAX); // section separator // string disequalities { std::vector> v; for (auto const& d : n->str_deqs()) v.emplace_back(d.m_lhs->id(), d.m_rhs->id()); std::sort(v.begin(), v.end()); sig.push_back(static_cast(v.size())); for (auto const& [a,b] : v) { sig.push_back(a); sig.push_back(b); } } sig.push_back(UINT_MAX); // regex memberships incl. view/guard metadata { std::vector> v; for (auto const& mm : n->str_mems()) v.push_back({ mm.m_str->id(), mm.m_regex->id(), static_cast(mm.m_kind), mm.m_root ? mm.m_root->id() : UINT_MAX, mm.m_nu, mm.m_discharged ? 1u : 0u }); std::sort(v.begin(), v.end()); sig.push_back(static_cast(v.size())); for (auto const& a : v) for (unsigned x : a) sig.push_back(x); } sig.push_back(UINT_MAX); // character-range constraints (per symbolic unit) { std::vector> v; for (auto const& [uid, cr] : n->char_ranges()) { std::vector entry; entry.push_back(uid); for (auto const& rg : cr.first.ranges()) { entry.push_back(rg.m_lo); entry.push_back(rg.m_hi); } v.push_back(std::move(entry)); } std::sort(v.begin(), v.end()); sig.push_back(static_cast(v.size())); for (auto const& e : v) { sig.push_back(static_cast(e.size())); for (unsigned x : e) sig.push_back(x); } } return sig; } dep_tracker nielsen_graph::node_all_deps(nielsen_node const* n) { dep_tracker d = nullptr; for (auto const& e : n->str_eqs()) d = m_dep_mgr.mk_join(d, e.m_dep); for (auto const& q : n->str_deqs()) d = m_dep_mgr.mk_join(d, q.m_dep); for (auto const& mm : n->str_mems()) d = m_dep_mgr.mk_join(d, mm.m_dep); for (auto const& [uid, cr] : n->char_ranges()) d = m_dep_mgr.mk_join(d, cr.second); return d; } nielsen_graph::search_result nielsen_graph::search_dfs(nielsen_node* node, ptr_vector& cur_path, const unsigned depth) { ++m_stats.m_num_dfs_nodes; // std::cout << m_stats.m_num_dfs_nodes << std::endl; // depth is NOT necessarily the length of the path // Reason: Progress nodes are not counted towards the depth limit // Otw. problems with a lot of variables would barely terminate SASSERT(depth <= cur_path.size()); m_stats.m_max_depth = std::max(m_stats.m_max_depth, depth); if (node->is_general_conflict()) { ++m_stats.m_num_simplify_conflict; return search_result::unsat; } // check for external cancellation (timeout, user interrupt) if (!m.inc()) return search_result::unknown; #ifdef Z3DEBUG if (m_stats.m_num_dfs_nodes % 50 == 0) { std::string dot = to_dot(); std::cout << ""; } #endif // check DFS node budget (0 = unlimited) if (m_max_nodes > 0 && m_stats.m_num_dfs_nodes > m_max_nodes) return search_result::unknown; // we might need to tell the SAT solver about the new integer inequalities // that might have been added by an extension step assert_node_side_constraints(node); if (node->is_currently_conflict()) { ++m_stats.m_num_simplify_conflict; return search_result::unsat; } // simplify constraints (idempotent after first call) const simplify_result sr = node->simplify_and_init(cur_path); if (sr == simplify_result::conflict || node->is_general_conflict()) { SASSERT(node->is_general_conflict()); ++m_stats.m_num_simplify_conflict; node->set_general_conflict(); return search_result::unsat; } // Transposition-table lookup. The node is now canonical (post-simplify). // If an equivalent node was already proven UNSAT for string/regex-only // reasons, this node is unsat too — independently of its (integer) side // constraints — so we prune without re-exploring its subtree. We derive // the conflict from this node's own constraint deps (a sound over-approx). { const std::vector sig = compute_node_signature(node); if (m_unsat_node_cache.contains(sig)) { node->set_conflict(backtrack_reason::regex, node_all_deps(node)); node->set_general_conflict(); node->m_unsat_cacheable = true; ++m_stats.m_num_simplify_conflict; ++m_num_cache_hits; return search_result::unsat; } } // Apply Parikh image filter: generate modular length constraints and // perform a lightweight feasibility pre-check. The filter is guarded // internally (m_parikh_applied) so it only runs once per node. // Note: Parikh filtering is skipped for satisfied nodes (returned above); // a fully satisfied node has no remaining memberships to filter. apply_parikh_to_node(*node); if (node->is_general_conflict()) { ++m_stats.m_num_simplify_conflict; return search_result::unsat; } // Assert any new int_constraints added during simplify_and_init for this // node into the current solver scope. Constraints inherited from the parent // (indices 0..m_parent_ic_count-1) are already present at the enclosing // scope level; only the newly-added tail needs to be asserted here. // Also generate per-node |LHS| = |RHS| length constraints for descendant // equations (root constraints are already at the base level). generate_node_length_constraints(node); assert_node_side_constraints(node); if (node->is_currently_conflict()) { ++m_stats.m_num_simplify_conflict; return search_result::unsat; } // integer feasibility check: the solver now holds all path constraints // incrementally; just query the solver directly if (!cur_path.empty() && !check_int_feasibility()) { const dep_tracker deps = get_subsolver_dependency(node); node->set_conflict(backtrack_reason::arithmetic, deps); node->set_general_conflict(); ++m_stats.m_num_arith_infeasible; return search_result::unsat; } SASSERT(sr != simplify_result::satisfied || node->is_satisfied()); SASSERT(!node->is_currently_conflict()); if (node->is_satisfied()) { // Before declaring SAT, check leaf-node regex feasibility: // for each variable with multiple regex constraints, verify // that the intersection of all its regexes is non-empty. // Mirrors ZIPT NielsenNode.CheckRegex. dep_tracker dep = nullptr; if (!check_leaf_regex(*node, dep)) { node->set_general_conflict(); node->set_conflict(backtrack_reason::regex, dep); // string-only conflict (empty intersection) → memoize. node->m_unsat_cacheable = true; m_unsat_node_cache.insert(compute_node_signature(node)); return search_result::unsat; } assert_node_side_constraints(node); // We need to have everything asserted before reporting SAT // (otw. the outer solver might assume false-assigned literals to be true) if (node->is_currently_conflict()) { ++m_stats.m_num_simplify_conflict; return search_result::unsat; } m_sat_node = node; m_sat_path = cur_path; return search_result::sat; } if (node->is_currently_conflict()) return search_result::unsat; // depth bound check if (depth >= m_depth_bound) return search_result::unknown; SASSERT(!node->is_currently_conflict()); // generate extensions only once per node; children persist across runs if (!node->is_extended()) { const bool ext = generate_extensions(node); IF_VERBOSE(1, display(verbose_stream(), node)); CTRACE(seq, !ext, display(tout, node) << to_dot() << "\n"); if (!ext) { std::cout << "No extensions generated for node " << node->id() << ", but not satisfied or conflict?!" << std::endl; node->to_html(std::cout, m); std::cout << std::endl; display(std::cout, node); } VERIFY(ext); if (node->is_currently_conflict()) // in rare cases, trying to extend can make a complicated conflict visible return search_result::unsat; node->set_extended(true); ++m_stats.m_num_extensions; } // explore children bool any_unknown = false; bool all_general_conflict = true; for (nielsen_edge *e : node->outgoing()) { cur_path.push_back(e); // Push a solver scope for this edge and assert its side integer // constraints. The child's own new constraints will be asserted // inside the recursive call (above). On return, pop the scope so // that backtracking removes those assertions. m_length_solver.push(); // Lazily compute substitution length constraints (|x| = |u|) on first // traversal. This must happen before asserting side_constraints if (!e->len_constraints_computed()) { add_subst_length_constraints(e); e->set_len_constraints_computed(true); for (const auto& sc : e->side_constraints()) { e->tgt()->add_constraint(sc); } } const auto new_depth = depth + (e->is_progress() ? 0 : 1); const search_result r = search_dfs(e->tgt(), cur_path, new_depth); m_length_solver.pop(1); if (r == search_result::sat) return search_result::sat; cur_path.pop_back(); if (r == search_result::unknown) any_unknown = true; if (!e->tgt()->is_general_conflict()) all_general_conflict = false; } if (all_general_conflict) { SASSERT(!any_unknown); // mark it such that we do not have to reconsider it even after a hot-restart node->set_general_conflict(); } if (!any_unknown) { node->set_child_conflict(); // Memoize this internal UNSAT iff its whole subtree closed for // string/regex-only reasons (no child relied on arithmetic / Parikh / // external context). Then a same-signature node found via any other // path is pruned by the lookup above. bool all_string_only = true; for (nielsen_edge* e : node->outgoing()) { if (!node_unsat_string_only(e->tgt())) { all_string_only = false; break; } } node->m_unsat_cacheable = all_string_only; if (all_string_only) m_unsat_node_cache.insert(compute_node_signature(node)); return search_result::unsat; } return search_result::unknown; } // Returns true if variable snode `var` appears anywhere in the token list of `n`. static bool snode_contains_var(euf::snode const* n, euf::snode const* var) { SASSERT(n && var); euf::snode_vector tokens; n->collect_tokens(tokens); return any_of(tokens, [var](auto const &t) { return t == var; }); } bool nielsen_graph::apply_det_modifier(nielsen_node* node) { // resist the temptation to add rules that "simplify" primitive membership constraints! // pretty much all of them could cause divergence! // e.g., x \in aa* => don't apply substitution x / ax even though it looks "safe" to do // there might be another constraint x \in a* and they would just push the "a" back and forth! for (unsigned eq_idx = 0; eq_idx < node->str_eqs().size(); ++eq_idx) { str_eq const& eq = node->str_eqs()[eq_idx]; if (eq.is_trivial()) continue; // We should have simplified it away before auto l = eq.m_lhs, r = eq.m_rhs; if (!l || !r) continue; // 0. empty side propagation if (l->is_empty() || r->is_empty()) { euf::snode const* non_empty_side = l->is_empty() ? r : l; nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); euf::snode_vector tokens; non_empty_side->collect_tokens(tokens); auto& eqs = child->str_eqs(); eqs[eq_idx] = eqs.back(); eqs.pop_back(); for (euf::snode const* t : tokens) { if (t->is_var()) { nielsen_subst s(t, m_sg.mk_empty_seq(t->get_sort()), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } else if (t->is_power()) { expr* expr_node = t->get_expr(); expr* pow_base = nullptr, *pow_exp = nullptr; if (seq().str.is_power(expr_node, pow_base, pow_exp) && pow_exp) e->add_side_constraint(mk_constraint(a.mk_eq(pow_exp, a.mk_int(0)), eq.m_dep)); nielsen_subst s(t, m_sg.mk_empty_seq(t->get_sort()), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } } return true; } // 1. unit equalities produced by unit-unit prefix/suffix splits { euf::snode_vector lhs_toks, rhs_toks; l->collect_tokens(lhs_toks); r->collect_tokens(rhs_toks); // --- prefix --- unsigned prefix = 0; while (prefix < lhs_toks.size() && prefix < rhs_toks.size()) { euf::snode const* lt = lhs_toks[prefix]; euf::snode const* rt = rhs_toks[prefix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) ++prefix; else if (m_sg.are_unit_distinct(lt, rt)) break; else if (lt->is_char_or_unit() && rt->is_char_or_unit()) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); if (lt->is_char()) { std::swap(lt, rt); std::swap(l, r); } SASSERT(lt->is_unit()); euf::snode const* lhs_rest = m_sg.drop_left(l, prefix + 1); euf::snode const* rhs_rest = m_sg.drop_left(r, prefix + 1); auto& eqs = child->str_eqs(); eqs[eq_idx] = eqs.back(); eqs.pop_back(); if (lt->is_char()) std::swap(lt, rt); nielsen_subst subst(lt, rt, eq.m_dep); e->add_subst(subst); child->apply_subst(m_sg, subst); if (!lhs_rest->is_empty() || !rhs_rest->is_empty()) eqs.push_back(str_eq(lhs_rest, rhs_rest, eq.m_dep)); return true; } else break; } // --- suffix --- unsigned lsz = lhs_toks.size(), rsz = rhs_toks.size(); unsigned suffix = 0; while (suffix < lsz - prefix && suffix < rsz - prefix) { euf::snode const* lt = lhs_toks[lsz - 1 - suffix]; euf::snode const* rt = rhs_toks[rsz - 1 - suffix]; if (m.are_equal(lt->get_expr(), rt->get_expr())) ++suffix; else if (m_sg.are_unit_distinct(lt, rt)) break; else if (lt->is_char_or_unit() && rt->is_char_or_unit()) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); euf::snode const* lhs_rest = m_sg.drop_right(l, suffix + 1); euf::snode const* rhs_rest = m_sg.drop_right(r, suffix + 1); auto& eqs = child->str_eqs(); eqs[eq_idx] = eqs.back(); eqs.pop_back(); if (lt->is_char()) std::swap(lt, rt); nielsen_subst subst(lt, rt, eq.m_dep); e->add_subst(subst); child->apply_subst(m_sg, subst); if (!lhs_rest->is_empty() || !rhs_rest->is_empty()) eqs.push_back(str_eq(lhs_rest, rhs_rest, eq.m_dep)); return true; } else break; } } // 2. power-character directional inconsistency for (unsigned od = 0; od < 2; ++od) { bool fwd = (od == 0); euf::snode const* lh = dir_token(l, fwd); euf::snode const* rh = dir_token(r, fwd); for (int side = 0; side < 2; ++side) { euf::snode const* pow_head = (side == 0) ? lh : rh; euf::snode const* other_head = (side == 0) ? rh : lh; if (!pow_head || !pow_head->is_power() || !other_head || !other_head->is_char()) continue; euf::snode const* base_sn = pow_head->arg0(); if (!base_sn) continue; euf::snode const* base_head = dir_token(base_sn, fwd); if (!base_head || !base_head->is_char()) continue; if (m.are_equal(base_head->get_expr(), other_head->get_expr())) continue; // Directional base/head mismatch -> force exponent 0 and power -> ε. nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); nielsen_subst s(pow_head, m_sg.mk_empty_seq(pow_head->get_sort()), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); expr* pow_exp = get_power_exp_expr(pow_head, m_seq); if (pow_exp) { expr* zero = a.mk_int(0); e->add_side_constraint(mk_constraint(a.mk_eq(pow_exp, zero), eq.m_dep)); } return true; } } // 3. variable-character look-ahead substitution for (unsigned od = 0; od < 2; ++od) { bool fwd = od == 0; euf::snode const* var_side = nullptr; euf::snode const* char_side = nullptr; euf::snode const* lhead = dir_token(l, fwd); euf::snode const* rhead = dir_token(r, fwd); if (!lhead || !rhead) continue; if (lhead->is_var() && rhead->is_char()) { var_side = l; char_side = r; } else if (rhead->is_var() && lhead->is_char()) { var_side = r; char_side = l; } else continue; euf::snode_vector var_toks, char_toks; collect_tokens_dir(var_side, fwd, var_toks); collect_tokens_dir(char_side, fwd, char_toks); if (var_toks.size() <= 1 || char_toks.empty()) continue; euf::snode const* var_node = var_toks[0]; SASSERT(var_node->is_var()); unsigned i = 0; for (; i < char_toks.size() && char_toks[i]->is_char(); ++i) { unsigned j1 = 1; unsigned j2 = i; bool failed = false; while (j1 < var_toks.size() && j2 < char_toks.size()) { euf::snode const* st1 = var_toks[j1]; euf::snode const* st2 = char_toks[j2]; if (!st2->is_char()) break; if (st1->is_char()) { if (st1->id() == st2->id()) { j1++; j2++; continue; } failed = true; break; } if (st1->id() != var_node->id()) break; bool inner_indet = false; for (unsigned l_idx = 0; j2 < char_toks.size() && l_idx < i; ++l_idx) { st2 = char_toks[j2]; if (!st2->is_char()) { inner_indet = true; break; } if (st2->id() == char_toks[l_idx]->id()) { j2++; continue; } failed = true; break; } if (inner_indet || failed) break; j1++; } if (failed) continue; break; } if (i == 0) continue; bool skip_dir = false; euf::snode const* next_var = nullptr; for (unsigned k = i; k < char_toks.size(); ++k) { euf::snode const* t = char_toks[k]; if (t->is_power()) { skip_dir = true; break; } if (t->is_var()) { next_var = t; break; } } if (skip_dir) continue; if (next_var) { u_map dep_edges; for (str_eq const& other_eq : node->str_eqs()) { if (other_eq.is_trivial() || !other_eq.m_lhs || !other_eq.m_rhs) continue; euf::snode const* lh2 = dir_token(other_eq.m_lhs, fwd); euf::snode const* rh2 = dir_token(other_eq.m_rhs, fwd); if (!lh2 || !rh2) continue; auto record_dep = [&](euf::snode const* head_var, euf::snode const* other_side) { euf::snode_vector other_toks; collect_tokens_dir(other_side, fwd, other_toks); for (unsigned idx = 0; idx < other_toks.size(); ++idx) { if (other_toks[idx]->is_var()) { if (!dep_edges.contains(head_var->id())) dep_edges.insert(head_var->id(), other_toks[idx]->id()); return; } } }; if (lh2->is_var() && rh2->is_var()) { if (!dep_edges.contains(lh2->id())) dep_edges.insert(lh2->id(), rh2->id()); if (!dep_edges.contains(rh2->id())) dep_edges.insert(rh2->id(), lh2->id()); } else if (lh2->is_var() && !rh2->is_var()) record_dep(lh2, other_eq.m_rhs); else if (rh2->is_var() && !lh2->is_var()) record_dep(rh2, other_eq.m_lhs); } uint_set visited; svector worklist; worklist.push_back(next_var->id()); bool cycle_found = false; while (!worklist.empty() && !cycle_found) { unsigned cur = worklist.back(); worklist.pop_back(); if (cur == var_node->id()) { cycle_found = true; break; } if (visited.contains(cur)) continue; visited.insert(cur); unsigned dep_id; if (dep_edges.find(cur, dep_id)) worklist.push_back(dep_id); } if (cycle_found) continue; } euf::snode const* prefix_sn = char_toks[0]; for (unsigned j = 1; j < i; ++j) { prefix_sn = dir_concat(m_sg, prefix_sn, char_toks[j], fwd); } euf::snode const* tail = get_tail(var_node, compute_length_expr(prefix_sn).get(), fwd); euf::snode const* replacement = dir_concat(m_sg, prefix_sn, tail, fwd); nielsen_subst s(var_node, replacement, eq.m_dep); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); e->add_side_constraint(mk_constraint(a.mk_ge(compute_length_expr(tail), a.mk_int(0)), eq.m_dep)); e->add_subst(s); child->apply_subst(m_sg, s); return true; } // variable definition: x = t where x is a single var and x ∉ vars(t) // → deterministically substitute x → t throughout the node euf::snode const* var = nullptr; euf::snode const* def; if (l->is_var() && !snode_contains_var(r, l)) { var = l; def = r; } else if (r->is_var() && !snode_contains_var(l, r)) { var = r; def = l; } else if (l->is_unit() && r->is_char_or_unit()) { var = l; def = r; } else if (r->is_unit() && l->is_char_or_unit()) { var = r; def = l; } if (var) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "det", true); nielsen_subst s(var, def, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); return true; } } return false; } bool nielsen_graph::apply_const_nielsen(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; SASSERT(eq.well_formed()); for (unsigned od = 0; od < 2; ++od) { const bool fwd = (od == 0); euf::snode const* lhead = dir_token(eq.m_lhs, fwd); euf::snode const* rhead = dir_token(eq.m_rhs, fwd); if (!lhead || !rhead) continue; // char vs var: branch 1: var -> ε, branch 2: var -> char·var (depending on direction) euf::snode const* char_head = lhead->is_char_or_unit() ? lhead : (rhead->is_char_or_unit() ? rhead : nullptr); euf::snode const* var_head = lhead->is_var() ? lhead : (rhead->is_var() ? rhead : nullptr); if (char_head && var_head) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "nielsen const 0", true); const nielsen_subst s1(var_head, m_sg.mk_empty_seq(var_head->get_sort()), eq.m_dep); e->add_subst(s1); child->apply_subst(m_sg, s1); euf::snode const* tail = get_tail(var_head, a.mk_int(1), fwd); euf::snode const* replacement = dir_concat(m_sg, char_head, tail, fwd); child = mk_child(node); e = mk_edge(node, child, "nielsen const >", false); e->add_side_constraint(mk_constraint(a.mk_ge(compute_length_expr(tail), a.mk_int(0)), eq.m_dep)); const nielsen_subst s2(var_head, replacement, eq.m_dep); e->add_subst(s2); child->apply_subst(m_sg, s2); return true; } } } return false; } bool nielsen_graph::apply_var_nielsen(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; SASSERT(eq.well_formed()); for (unsigned od = 0; od < 2; ++od) { const bool fwd = od == 0; euf::snode const* lhead = dir_token(eq.m_lhs, fwd); euf::snode const* rhead = dir_token(eq.m_rhs, fwd); SASSERT(lhead && rhead); SASSERT(lhead->id() != rhead->id()); if (!lhead->is_var() || !rhead->is_var()) continue; // x·A = y·B where x,y are distinct variables (classic Nielsen) // child 1: x → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "nielsen var =l", true); const nielsen_subst s(lhead, m_sg.mk_empty_seq(lhead->get_sort()), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // child 2: y → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "nielsen var =r", true); const nielsen_subst s(rhead, m_sg.mk_empty_seq(rhead->get_sort()), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // child 3: x → y·x (no progress) { euf::snode const* replacement = dir_concat(m_sg, rhead, get_tail(lhead, compute_length_expr(rhead).get(), fwd), fwd); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "nielsen var >", false); const nielsen_subst s(lhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // child 4: y → x·y (no progress) { euf::snode const* replacement = dir_concat(m_sg, lhead, get_tail(rhead, compute_length_expr(lhead).get(), fwd), fwd); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "nielsen var <", false); const nielsen_subst s(rhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } return true; } } return false; } // ----------------------------------------------------------------------- // EqSplit helpers: token length classification // ----------------------------------------------------------------------- bool nielsen_graph::token_has_variable_length(euf::snode const* tok) const { // Chars and units have known constant length 1. if (tok->is_char_or_unit()) return false; // Variables and powers have symbolic/unknown length. if (tok->is_var() || tok->is_power()) return true; // For s_var string literals: check if it's a string literal (known constant length). if (tok->get_expr()) { zstring s; if (m_seq.str.is_string(tok->get_expr(), s)) return false; } // Everything else is treated as variable length. return true; } unsigned nielsen_graph::token_const_length(euf::snode const* tok) const { if (tok->is_char_or_unit()) return 1; if (tok->get_expr()) { zstring s; if (m_seq.str.is_string(tok->get_expr(), s)) return s.length(); } return 0; } // ----------------------------------------------------------------------- // EqSplit: find_eq_split_point // Port of ZIPT's StrEq.SplitEq algorithm. // // Walks tokens from each side tracking two accumulators: // - lhs_has_symbolic / rhs_has_symbolic : whether a variable-length token // has been consumed on that side since the last reset // - const_diff : net constant-length difference (LHS constants − RHS constants) // // A potential split point arises when both accumulators are "zero" // (no outstanding symbolic contributions on either side), meaning // the prefix lengths are determined up to a constant offset (const_diff). // // Among all such split points, we pick the one minimising |const_diff| // (the padding amount). We also require having seen at least one variable- // length token before accepting a split, so that the split is non-trivial. // ----------------------------------------------------------------------- bool nielsen_graph::find_eq_split_point( euf::snode_vector const& lhs_toks, euf::snode_vector const& rhs_toks, unsigned& out_lhs_idx, unsigned& out_rhs_idx, int& out_padding) const { const unsigned lhs_len = lhs_toks.size(); const unsigned rhs_len = rhs_toks.size(); if (lhs_len <= 1 || rhs_len <= 1) return false; // Initialize accumulators with the first token on each side (index 0). bool lhs_has_symbolic = token_has_variable_length(lhs_toks[0]); bool rhs_has_symbolic = token_has_variable_length(rhs_toks[0]); int const_diff = 0; if (!lhs_has_symbolic) const_diff += (int)token_const_length(lhs_toks[0]); if (!rhs_has_symbolic) const_diff -= (int)token_const_length(rhs_toks[0]); bool seen_variable = lhs_has_symbolic || rhs_has_symbolic; // Best confirmed split point. bool has_best = false; unsigned best_lhs = 0, best_rhs = 0; int best_padding = 0; // Pending split (needs confirmation by a subsequent variable token). bool has_pending = false; unsigned pending_lhs = 0, pending_rhs = 0; int pending_padding = 0; unsigned li = 1, ri = 1; while (li < lhs_len || ri < rhs_len) { const bool lhs_zero = !lhs_has_symbolic; const bool rhs_zero = !rhs_has_symbolic; // Potential split point: both symbolic accumulators are zero. if (seen_variable && lhs_zero && rhs_zero) { if (!has_pending || std::abs(const_diff) < std::abs(pending_padding)) { has_pending = true; pending_padding = const_diff; pending_lhs = li; pending_rhs = ri; } } // Decide which side to consume from. // Prefer consuming from the side whose symbolic accumulator is zero // (i.e., that side has no outstanding variable contributions). bool consume_lhs; if (lhs_zero && !rhs_zero) consume_lhs = true; else if (!lhs_zero && rhs_zero) consume_lhs = false; else if (lhs_zero && rhs_zero) consume_lhs = (const_diff <= 0); // consume from side with less accumulated constant else consume_lhs = (li <= ri); // both non-zero: round-robin if (consume_lhs) { if (li >= lhs_len) break; euf::snode const* tok = lhs_toks[li++]; const bool is_var = token_has_variable_length(tok); if (is_var) { // Confirm any pending split point. if (has_pending && (!has_best || std::abs(pending_padding) < std::abs(best_padding))) { has_best = true; best_padding = pending_padding; best_lhs = pending_lhs; best_rhs = pending_rhs; has_pending = false; } seen_variable = true; lhs_has_symbolic = true; } else const_diff += (int)token_const_length(tok); } else { if (ri >= rhs_len) break; euf::snode const* tok = rhs_toks[ri++]; const bool is_var = token_has_variable_length(tok); if (is_var) { if (has_pending && (!has_best || std::abs(pending_padding) < std::abs(best_padding))) { has_best = true; best_padding = pending_padding; best_lhs = pending_lhs; best_rhs = pending_rhs; has_pending = false; } seen_variable = true; rhs_has_symbolic = true; } else const_diff -= (int)token_const_length(tok); } } if (!has_best) return false; // A split at the very start or very end is degenerate — skip. if ((best_lhs == 0 && best_rhs == 0) || (best_lhs == lhs_len && best_rhs == rhs_len)) return false; out_lhs_idx = best_lhs; out_rhs_idx = best_rhs; out_padding = best_padding; return true; } // ----------------------------------------------------------------------- // apply_eq_split — Port of ZIPT's EqSplitModifier.Apply // // For a regex-free equation LHS = RHS, finds a split point and decomposes // into two shorter equations with optional padding variable: // // eq1: LHS[0..lhsIdx] · [pad if padding<0] = [pad if padding>0] · RHS[0..rhsIdx] // eq2: LHS[lhsIdx..] · [pad if padding>0] = [pad if padding<0] · RHS[rhsIdx..] // // Creates a single progress child. // ----------------------------------------------------------------------- bool nielsen_graph::apply_eq_split(nielsen_node* node) { for (unsigned eq_idx = 0; eq_idx < node->str_eqs().size(); ++eq_idx) { str_eq const& eq = node->str_eqs()[eq_idx]; SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; // EqSplit only applies to regex-free equations. if (!eq.m_lhs->is_regex_free() || !eq.m_rhs->is_regex_free()) continue; euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); if (lhs_toks.empty() || rhs_toks.empty()) continue; unsigned split_lhs = 0, split_rhs = 0; int padding = 0; if (!find_eq_split_point(lhs_toks, rhs_toks, split_lhs, split_rhs, padding)) continue; // Split the equation sides into prefix / suffix. euf::snode const* lhs_prefix = m_sg.drop_right(eq.m_lhs, lhs_toks.size() - split_lhs); euf::snode const* lhs_suffix = m_sg.drop_left(eq.m_lhs, split_lhs); euf::snode const* rhs_prefix = m_sg.drop_right(eq.m_rhs, rhs_toks.size() - split_rhs); euf::snode const* rhs_suffix = m_sg.drop_left(eq.m_rhs, split_rhs); // Build the two new equations, incorporating a padding variable if needed. euf::snode const* eq1_lhs = lhs_prefix; euf::snode const* eq1_rhs = rhs_prefix; euf::snode const* eq2_lhs = lhs_suffix; euf::snode const* eq2_rhs = rhs_suffix; th_rewriter rw(m); euf::snode const* pad = nullptr; if (padding != 0) { // NSB review: can we represent pad_var using a string function? expr *pad_var = m_sk.mk("eq-split", a.mk_int(padding), eq.m_lhs->get_expr(), eq.m_rhs->get_expr(), eq.m_lhs->get_sort()); pad = m_sg.mk(pad_var); if (padding > 0) { // LHS prefix is longer by |padding| constants. // Prepend pad to RHS prefix, append pad to LHS suffix. eq1_rhs = m_sg.mk_concat(pad, rhs_prefix); eq2_lhs = m_sg.mk_concat(lhs_suffix, pad); } else { // RHS prefix is longer by |padding| constants. // Append pad to LHS prefix, prepend pad to RHS suffix. eq1_lhs = m_sg.mk_concat(lhs_prefix, pad); eq2_rhs = m_sg.mk_concat(pad, rhs_suffix); } } // Create single progress child. nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "eq split", true); // Remove the original equation and add the two new ones. auto& eqs = child->str_eqs(); eqs[eq_idx] = eqs.back(); eqs.pop_back(); eqs.push_back(str_eq(eq1_lhs, eq1_rhs, eq.m_dep)); eqs.push_back(str_eq(eq2_lhs, eq2_rhs, eq.m_dep)); // Int constraints on the edge. // 1) len(pad) = |padding| (if padding variable was created) if (pad && pad->get_expr()) { const expr_ref len_pad(m_seq.str.mk_length(pad->get_expr()), m); const expr_ref abs_pad(a.mk_int(std::abs(padding)), m); e->add_side_constraint(mk_constraint(m.mk_eq(len_pad, abs_pad), eq.m_dep)); } // 2) len(eq1_lhs) = len(eq1_rhs) const expr_ref l1 = compute_length_expr(eq1_lhs); const expr_ref r1 = compute_length_expr(eq1_rhs); e->add_side_constraint(mk_constraint(m.mk_eq(l1, r1), eq.m_dep)); // 3) len(eq2_lhs) = len(eq2_rhs) const expr_ref l2 = compute_length_expr(eq2_lhs); const expr_ref r2 = compute_length_expr(eq2_rhs); e->add_side_constraint(mk_constraint(m.mk_eq(l2, r2), eq.m_dep)); return true; } return false; } // ----------------------------------------------------------------------- // nielsen_graph: mk_fresh_var // ----------------------------------------------------------------------- euf::snode const* nielsen_graph::mk_fresh_var(sort* s) { ++m_stats.m_num_fresh_vars; const std::string name = "v!" + std::to_string(m_fresh_cnt++); return m_sg.mk_var(symbol(name.c_str()), s); } bool nielsen_graph::generate_extensions(nielsen_node *node) { SASSERT(node != nullptr); SASSERT(!node->is_currently_conflict()); // The first modifier that produces edges is used and returned immediately. // Priority 1: deterministic modifiers (single child, always progress) if (apply_det_modifier(node)) return ++m_stats.m_mod_det, true; // Priority 2: PowerEpsilon - power → ε via base=ε or n=0 if (apply_power_epsilon(node)) return ++m_stats.m_mod_power_epsilon, true; // Priority 3: NumCmp - length comparison branching for power tokens if (apply_num_cmp(node)) return ++m_stats.m_mod_num_cmp, true; // Priority 3b: SplitPowerElim - CommPower-based branching when // one side has a power and the other side has same-base occurrences // but ordering is unknown. if (apply_split_power_elim(node)) return ++m_stats.m_mod_split_power_elim, true; // Priority 4: ConstNumUnwinding - power vs constant: n=0 or peel if (apply_const_num_unwinding(node)) return ++m_stats.m_mod_const_num_unwinding, true; // Priority 5: EqSplit - split equations into two (single progress child) if (apply_eq_split(node)) return ++m_stats.m_mod_eq_split, true; // Priority 5b: CycleSubsumption - eliminate leading variable subsumed by stabilizer if (apply_cycle_subsumption(node)) return ++m_stats.m_mod_cycle_subsumption, true; // Priority 6: CycleDecomp - stabilizer introduction for regex cycles using partial DFA projection if (apply_cycle_decomposition(node)) return ++m_stats.m_mod_star_intr, true; // Priority 7: GPowerIntr - ground power introduction if (apply_gpower_intr(node)) return ++m_stats.m_mod_gpower_intr, true; // Priority 8: Regex Factorization if (apply_regex_factorization(node)) return ++m_stats.m_mod_regex_factorization, true; // Priority 8b: ConstNielsen - char vs var (2 children) if (apply_const_nielsen(node)) return ++m_stats.m_mod_const_nielsen, true; // Priority 9: RegexIfSplit - split str_mem s ∈ ite(c,th,el) by branching on c if (apply_regex_if_split(node)) return ++m_stats.m_mod_regex_if_split, true; // Priority 9b: SignatureSplit - heuristic string equation splitting if (m_signature_split && apply_signature_split(node)) return ++m_stats.m_mod_signature_split, true; // Priority 10: RegexVarSplit - split str_mem by minterms if (apply_regex_var_split(node)) return ++m_stats.m_mod_regex_var_split, true; // Priority 11: PowerSplit - power unwinding with bounded prefix if (apply_power_split(node)) return ++m_stats.m_mod_power_split, true; // Priority 12: VarNielsen - var vs var, all progress (classic Nielsen) if (apply_var_nielsen(node)) return ++m_stats.m_mod_var_nielsen, true; // Priority 13: VarNumUnwinding - variable power unwinding for equality constraints if (apply_var_num_unwinding_eq(node)) return ++m_stats.m_mod_var_num_unwinding_eq, true; // Priority 14: variable power unwinding for membership constraints if (apply_var_num_unwinding_mem(node)) return ++m_stats.m_mod_var_num_unwinding_mem, true; // let's unwindind a disequality if (axiomatize_diseq(node)) return ++m_stats.m_ax_diseq, true; return false; } // ----------------------------------------------------------------------- // Helper: find a power token in any str_eq // ----------------------------------------------------------------------- euf::snode const* nielsen_graph::find_power_token(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; SASSERT(eq.well_formed()); euf::snode_vector toks; eq.m_lhs->collect_tokens(toks); for (euf::snode const* t : toks) { if (t->is_power()) return t; } toks.reset(); eq.m_rhs->collect_tokens(toks); for (euf::snode const* t : toks) { if (t->is_power()) return t; } } return nullptr; } // ----------------------------------------------------------------------- // Helper: find a power token facing a constant (char) head // Returns true if found, sets power, other_head, eq_out. // ----------------------------------------------------------------------- bool nielsen_graph::find_power_vs_non_var(nielsen_node* node, euf::snode const*& power, euf::snode const*& other_head, str_eq const*& eq_out, bool& fwd) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; for (unsigned od = 0; od < 2; ++od) { const bool local_fwd = (od == 0); euf::snode const* lhead = dir_token(eq.m_lhs, local_fwd); euf::snode const* rhead = dir_token(eq.m_rhs, local_fwd); // Match power vs any non-variable, non-empty token (char, unit, // power with different base, etc.). // Same-base power vs power is handled by NumCmp (priority 3). // Power vs variable is handled by PowerSplit (priority 11). // Power vs empty is handled by PowerEpsilon (priority 2). if (lhead && lhead->is_power() && rhead && !rhead->is_var() && !rhead->is_empty()) { power = lhead; other_head = rhead; eq_out = &eq; fwd = local_fwd; return true; } if (rhead && rhead->is_power() && lhead && !lhead->is_var() && !lhead->is_empty()) { power = rhead; other_head = lhead; eq_out = &eq; fwd = local_fwd; return true; } } } return false; } // ----------------------------------------------------------------------- // Helper: find a power token facing a variable head // ----------------------------------------------------------------------- bool nielsen_graph::find_power_vs_var(nielsen_node* node, euf::snode const*& power, euf::snode const*& var_head, str_eq const*& eq_out, bool& fwd) { for (str_eq const& eq : node->str_eqs()) { SASSERT(eq.well_formed() && !eq.is_trivial()); for (unsigned od = 0; od < 2; ++od) { const bool local_fwd = (od == 0); euf::snode const* lhead = dir_token(eq.m_lhs, local_fwd); euf::snode const* rhead = dir_token(eq.m_rhs, local_fwd); if (lhead && lhead->is_power() && rhead && rhead->is_var()) { power = lhead; var_head = rhead; eq_out = &eq; fwd = local_fwd; return true; } if (rhead && rhead->is_power() && lhead && lhead->is_var()) { power = rhead; var_head = lhead; eq_out = &eq; fwd = local_fwd; return true; } } } return false; } bool nielsen_graph::find_power_vs_var(nielsen_node* node, euf::snode const*& power, str_mem const*& mem_out, bool& fwd) { for (str_mem const& mem : node->str_mems()) { if (mem.is_trivial(node)) { std::cout << "Trivial mem: " << mem_pp(mem, node->graph().get_manager()) << std::endl; } SASSERT(mem.well_formed() && !mem.is_trivial(node)); for (unsigned od = 0; od < 2; ++od) { const bool local_fwd = (od == 0); euf::snode const* lhead = dir_token(mem.m_str, local_fwd); if (lhead && lhead->is_power()) { power = lhead; mem_out = &mem; fwd = local_fwd; return true; } } } return false; } // ----------------------------------------------------------------------- // Modifier: apply_power_epsilon // Fires only when one side of an equation is empty and the other side // starts with a power token u^n. In that case, branch: // (1) base u → ε (base is empty, so u^n = ε) // (2) u^n → ε (the power is zero, replace power with empty) // mirrors ZIPT's PowerEpsilonModifier (which requires LHS.IsEmpty()) // ----------------------------------------------------------------------- bool nielsen_graph::apply_power_epsilon(nielsen_node* node) { // Match only when one equation side is empty and the other starts // with a power. This mirrors ZIPT where PowerEpsilonModifier is // constructed only inside the "if (LHS.IsEmpty())" branch of // ExtendDir. When both sides are non-empty and a power faces a // constant, ConstNumUnwinding (priority 4) handles it with both // n=0 and n≥1 branches. euf::snode const* power = nullptr; dep_tracker dep = m_dep_mgr.mk_empty(); for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; SASSERT(eq.well_formed()); if (eq.m_lhs->is_empty() && eq.m_rhs->first() && eq.m_rhs->first()->is_power()) { power = eq.m_rhs->first(); dep = eq.m_dep; break; } if (eq.m_rhs->is_empty() && eq.m_lhs->first() && eq.m_lhs->first()->is_power()) { power = eq.m_lhs->first(); dep = eq.m_dep; break; } } if (!power) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode const* base = power->arg0(); nielsen_node* child; nielsen_edge* e; // Branch 1: base → ε (if base is a variable, substitute it to empty) // This makes u^n = ε^n = ε for any n. if (base->is_var()) { child = mk_child(node); e = mk_edge(node, child, "power power 0", true); const nielsen_subst s1(base, m_sg.mk_empty_seq(base->get_sort()), dep); e->add_subst(s1); child->apply_subst(m_sg, s1); } // Branch 2: replace the power token itself with ε (n = 0 semantics) child = mk_child(node); e = mk_edge(node, child, "power base 0", true); const nielsen_subst s2(power, m_sg.mk_empty_seq(power->get_sort()), dep); e->add_subst(s2); child->apply_subst(m_sg, s2); return true; } // ----------------------------------------------------------------------- // Modifier: apply_num_cmp // For equations involving two power tokens u^m and u^n with the same base, // branch on the numeric relationship: m <= n vs n < m. // Generates proper integer side constraints for each branch. // mirrors ZIPT's NumCmpModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_num_cmp(nielsen_node* node) { // Look for two directional endpoint power tokens with the same base. for (str_eq const& eq : node->str_eqs()) { SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; for (unsigned od = 0; od < 2; ++od) { const bool fwd = (od == 0); euf::snode const* lhead = dir_token(eq.m_lhs, fwd); euf::snode const* rhead = dir_token(eq.m_rhs, fwd); if (!lhead || !rhead) continue; if (!lhead->is_power() || !rhead->is_power()) continue; if (lhead->num_args() < 1 || rhead->num_args() < 1) continue; // same base: compare the two powers if (lhead->arg0() != rhead->arg0()) continue; // Skip if the exponents differ by a constant — simplify_and_init's // directional power elimination already handles that case. expr* exp_m = get_power_exponent(lhead); expr* exp_n = get_power_exponent(rhead); if (!exp_m || !exp_n) continue; rational diff; SASSERT(!get_const_power_diff(exp_n, exp_m, a, diff)); // handled by simplification // Branch 1 (explored first): n < m (add constraint c ≥ p + 1) { nielsen_edge *e = mk_edge(node, mk_child(node), "power cmp <", true); const expr_ref n_plus_1(a.mk_add(exp_n, a.mk_int(1)), m); e->add_side_constraint(mk_constraint(a.mk_ge(exp_m, n_plus_1), eq.m_dep)); } // Branch 2 (explored second): m <= n (add constraint p ≥ c) { nielsen_edge *e = mk_edge(node, mk_child(node), "power cmp >=", true); e->add_side_constraint(mk_constraint(a.mk_ge(exp_n, exp_m), eq.m_dep)); } return true; } } return false; } // ----------------------------------------------------------------------- // Modifier: apply_split_power_elim // When one side starts with a power w^p, call CommPower on the other // side to count base-pattern occurrences c. If c > 0 and the ordering // between p and c cannot be determined, create two branches: // Branch 1: p < c (add constraint c ≥ p + 1) // Branch 2: c ≤ p (add constraint p ≥ c) // After branching, simplify_and_init's CommPower pass (3c) can resolve // the ordering deterministically and cancel the matched portion. // mirrors ZIPT's SplitPowerElim + NumCmpModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_split_power_elim(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; for (int side = 0; side < 2; ++side) { euf::snode const* pow_side = (side == 0) ? eq.m_lhs : eq.m_rhs; euf::snode const* other_side = (side == 0) ? eq.m_rhs : eq.m_lhs; // NB: Shuvendu - this test is always false if (!pow_side || !other_side) continue; for (unsigned od = 0; od < 2; ++od) { const bool fwd = od == 0; euf::snode const* end_tok = dir_token(pow_side, fwd); if (!end_tok || !end_tok->is_power()) continue; euf::snode const* base_sn = end_tok->arg0(); expr* pow_exp = get_power_exp_expr(end_tok, m_seq); // NB: Shuvendu - this test is also redundant if (!base_sn || !pow_exp) continue; auto [count, consumed] = comm_power(base_sn, other_side, m, m_seq, fwd); if (!count.get() || consumed == 0) continue; expr_ref norm_count = normalize_arith(m, count); // Skip if ordering is already deterministic — simplify_and_init // pass 3c should have handled it. rational diff; if (get_const_power_diff(norm_count, pow_exp, a, diff)) continue; // Branch 1: pow_exp < count (i.e., count >= pow_exp + 1) { nielsen_edge *e = mk_edge(node, mk_child(node), "power elim >", true); const expr_ref pow_plus1(a.mk_add(pow_exp, a.mk_int(1)), m); e->add_side_constraint(mk_constraint(a.mk_ge(norm_count, pow_plus1), eq.m_dep)); } // Branch 2: count <= pow_exp (i.e., pow_exp >= count) { nielsen_edge *e = mk_edge(node, mk_child(node), "power elim ≤", true); e->add_side_constraint(mk_constraint(a.mk_ge(pow_exp, norm_count), eq.m_dep)); } return true; } } } return false; } // ----------------------------------------------------------------------- // Modifier: apply_const_num_unwinding // For a power token u^n facing a constant (char) head, // branch: (1) n = 0 → u^n = ε, (2) n >= 1 → peel one u from power. // Generates integer side constraints for each branch. // mirrors ZIPT's ConstNumUnwindingModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_const_num_unwinding(nielsen_node* node) { euf::snode const* power = nullptr; euf::snode const* other_head = nullptr; str_eq const *eq = nullptr; bool fwd = true; if (!find_power_vs_non_var(node, power, other_head, eq, fwd)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode const* base = power->arg0(); expr *exp_n = get_power_exponent(power); expr *zero = a.mk_int(0); expr *one = a.mk_int(1); // Branch 1 (explored first): n = 0 → replace power with ε (progress) // Side constraint: n = 0 nielsen_node *child = mk_child(node); nielsen_edge *e = mk_edge(node, child, "unwinding 0", true); const nielsen_subst s1(power, m_sg.mk_empty_seq(power->get_sort()), eq->m_dep); e->add_subst(s1); child->apply_subst(m_sg, s1); if (exp_n) e->add_side_constraint(mk_constraint(a.mk_eq(exp_n, zero), eq->m_dep)); // Branch 2 (explored second): n >= 1 → peel one u: replace u^n with u · u^(n-1) // Side constraint: n >= 1 // Create a proper nested power base^(n-1) instead of a fresh string variable. // This preserves power structure so that simplify_and_init can merge and // cancel adjacent same-base powers (mirroring ZIPT's SimplifyPowerUnwind). // Explored first because the n≥1 branch is typically more productive // for SAT instances (preserves power structure). const seq_util &seq = m_sg.get_seq_util(); expr *power_e = power->get_expr(); SASSERT(power_e); expr *base_expr = to_app(power_e)->get_arg(0); const expr_ref n_minus_1 = normalize_arith(m, a.mk_sub(exp_n, one)); const expr_ref nested_pow(seq.str.mk_power(base_expr, n_minus_1), m); euf::snode const* nested_power_snode = m_sg.mk(nested_pow); euf::snode const* replacement = dir_concat(m_sg, base, nested_power_snode, fwd); child = mk_child(node); e = mk_edge(node, child, "unwinding >", true); const nielsen_subst s2(power, replacement, eq->m_dep); e->add_subst(s2); child->apply_subst(m_sg, s2); if (exp_n) e->add_side_constraint(mk_constraint(a.mk_ge(exp_n, one), eq->m_dep)); return true; } // ----------------------------------------------------------------------- // Helper: ensure_automaton_explored (lazy, on-demand, cached) // Records the *complete* reachable Brzozowski automaton of root_re into the // partial DFA — but only once per regex component. This is the lazy // Q-completion: instead of re-running a fixed-depth BFS on every cycle // decomposition (the old precompute_partial_dfa(R, depth), which re-explored // R's automaton on each of the ~hundreds of decompositions and, with a // shallow depth, left Q incomplete so guards discharged prematurely and laps // never closed), we explore each component once to saturation and remember // every state we visited. A later decomposition whose head was reached by an // earlier exploration finds it already covered and does no work. // // Completeness matters: the cycle guard / stabilizer view gate on whether the // current state lies in Q (projection_state_in_Q), and that is only sound as // a *bound* when Q contains the whole SCC of the head. The BFS is bounded by // the finite reachable automaton (Brzozowski states modulo ACI), so it // terminates; m.inc() keeps it responsive to the resource limit. // ----------------------------------------------------------------------- void nielsen_graph::ensure_automaton_explored(euf::snode const* root_re) { SASSERT(root_re); if (!root_re->is_ground()) return; if (m_explored_automaton.contains(root_re->get_expr()->get_id())) return; svector queue; queue.push_back(root_re); while (!queue.empty()) { if (!m.inc()) return; // resource limit: leave Q partial (sound under-approx) euf::snode const* re = queue.back(); queue.pop_back(); const unsigned re_eid = re->get_expr()->get_id(); if (m_explored_automaton.contains(re_eid)) continue; // already explored (here or in a previous component) m_explored_automaton.insert(re_eid); euf::snode_vector mts; m_sg.compute_minterms(re, mts); for (euf::snode const* mt : mts) { euf::snode const* deriv = m_sg.brzozowski_deriv(re, mt); if (!deriv || deriv->is_fail()) continue; record_partial_derivative_edge(re, mt, deriv); if (deriv->is_ground() && !m_explored_automaton.contains(deriv->get_expr()->get_id())) queue.push_back(deriv); } } } // ----------------------------------------------------------------------- // Modifier: apply_cycle_subsumption (paper Section "Cycle Subsumption") // For a membership x·u ∈ R (u≠ε) with L(⊓Reg_x) ⊆ stab(R,Q_ν), drop the // leading x: replace x·u ∈ R by u ∈ R. The inclusion is decided as the // product-emptiness test L(⊓Reg_x) ∩ ~stab(R,Q_ν) = ∅ (Section 3.3), // adding one co-view component for ~stab. // ----------------------------------------------------------------------- bool nielsen_graph::apply_cycle_subsumption(nielsen_node* node) { for (unsigned mi = 0; mi < node->str_mems().size(); ++mi) { str_mem const& mem = node->str_mems()[mi]; SASSERT(mem.well_formed()); if (!mem.is_plain() || mem.is_primitive()) continue; euf::snode const* first = mem.m_str->first(); SASSERT(first); if (!first->is_var()) continue; euf::snode const* R = mem.m_regex; // R must lie on a detected cycle with a marked SCC snapshot. uint_set scc; if (!collect_scc_for_projection(R, scc)) continue; const unsigned nu = m_projection_extract_idx; if (nu == 0) continue; // Decide L(⊓Reg_x) ⊆ stab(R,Q_ν) as ⊓Reg_x ∩ ~stab = ∅. vector comps; dep_tracker x_dep = nullptr; collect_var_components(first, *node, comps, x_dep); comps.push_back(prod_comp::mk_view(R, R, nu, /*complemented*/ true)); if (check_product_emptiness(comps, 5000) != l_true) continue; // Subsume: replace x·u ∈ R with u ∈ R. euf::snode const* tail = m_sg.drop_first(mem.m_str); SASSERT(tail); nielsen_node* child = mk_child(node); mk_edge(node, child, "cycle subs", true); for (auto& cm : child->str_mems()) { if (cm == mem) { cm.m_str = tail; cm.m_dep = m_dep_mgr.mk_join(cm.m_dep, x_dep); break; } } TRACE(seq, tout << "cycle_subsumption: dropped x=" << mk_pp(first->get_expr(), m) << " from " << mk_pp(mem.m_str->get_expr(), m) << " ∈ " << mk_pp(R->get_expr(), m) << " nu=" << nu << "\n"); return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_cycle_decomposition (paper Section "Cycle Decomposition") // // For a membership x·u ∈ R whose leading variable x sits on a detected cycle // (R lies in an SCC of the partial DFA) and that does not already carry a // matching cycle guard for the current SCC snapshot, split // x → x'·x'' // and attach the two *view*/*guard* primitive constraints (Section 3.3): // x' ∈ stab(R, Q_ν) -- stabilizer view (F = {R}) // noloop(x'', R, Q_ν) -- cycle guard (two-mode monitor) // The leading x' is immediately subsumed (its only constraint is the view, // and stab ⊆ stab trivially), so it is dropped from the primary constraint. // // Unlike the old projection-operator encoding, the view and guard are kept // as *constraint metadata* over the plain state R and the ν-indexed explored // subautomaton Q_ν — nothing is materialized as a regex, which keeps the // reachable state space finite (termination). // ----------------------------------------------------------------------- bool nielsen_graph::apply_cycle_decomposition(nielsen_node* node) { // Look for a str_mem with a variable-headed string for (unsigned mi = 0; mi < node->str_mems().size(); ++mi) { str_mem const& mem = node->str_mems()[mi]; SASSERT(mem.well_formed()); if (!mem.is_plain() || mem.is_primitive()) continue; euf::snode const* first = mem.m_str->first(); SASSERT(first); if (!first->is_var()) continue; euf::snode const* x = first; euf::snode const* R = mem.m_regex; // Lazily explore R's full automaton (once, cached) so that // collect_scc_for_projection sees the complete SCC of R and the // stabilizer view / cycle guard gate on a complete Q. ensure_automaton_explored(R); // R must sit on a cycle (an SCC of the partial DFA). uint_set scc; if (!collect_scc_for_projection(R, scc)) continue; // Mark the SCC edges; this gives a ν identifying the current Q_SCC. // (We trigger on absence of a matching guard, NOT on novelty.) mark_scc_projection_edges(scc); const unsigned nu = m_projection_extract_idx; if (nu == 0) continue; // Trigger condition: x must not already carry a cycle guard for the // current SCC snapshot ν. All states of one SCC share a single ν, so // the guard is keyed on ν rather than on the (changing) head R: as the // derivation walks the SCC the head moves, but the lineage is already // guarded against re-traversing the cycle. An already-decomposed // variable whose guard refers to a strictly smaller, stale ν is // re-decomposed to adopt the enlarged SCC. bool already_guarded = false; for (str_mem const& g : node->str_mems()) { if (g.is_guard() && g.m_nu >= nu && g.m_str && g.m_str->first() == x) { already_guarded = true; break; } } if (already_guarded) continue; sort* seq_sort = x->get_expr()->get_sort(); // Construct the replacement x = x' x'' euf::snode const* xp = m_sg.mk(m_sk.mk("cycle", x->get_expr(), R->get_expr(), seq_sort)); euf::snode const* xpp = get_tail(x, compute_length_expr(xp).get()); euf::snode const* xp_xpp = m_sg.mk_concat(xp, xpp); nielsen_node* child = mk_child(node); SASSERT(child->m_str_mem[mi] == mem); nielsen_edge* e = mk_edge(node, child, "cycle decomp", false); const nielsen_subst s(x, xp_xpp, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); // remove from mi^th element of the child the leading token, as it is immediately subsumed SASSERT(child->m_str_mem[mi].m_str->first() == xp); child->m_str_mem[mi].m_str = dir_drop(m_sg, child->m_str_mem[mi].m_str, 1, true); // x' ∈ stab(R, Q_ν) (stabilizer view, F = {R}, current state = R) child->add_str_mem(str_mem::mk_view(xp, R, R, nu, mem.m_dep)); // noloop(x'', R, Q_ν) (cycle guard, two-mode monitor, state = R) child->add_str_mem(str_mem::mk_guard(xpp, R, R, nu, mem.m_dep)); TRACE(seq, tout << "cycle_decomp: x=" << mk_pp(x->get_expr(), m) << " R=" << mk_pp(R->get_expr(), m) << " nu=" << nu << "\n"); #ifdef Z3DEBUG std::string dot = partial_dfa_to_dot(R, false); #endif return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_gpower_intr // Generalized power introduction: for an equation where one side's head // is a variable v and the other side has a ground prefix followed by a // variable x that forms a dependency cycle back to v, introduce // v = base^n · suffix where base is the ground prefix. // Generates side constraints n >= 0 and 0 <= len(suffix) < len(base). // mirrors ZIPT's GPowerIntrModifier (SplitGroundPower + TryGetPowerSplitBase) // ----------------------------------------------------------------------- bool nielsen_graph::apply_gpower_intr(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; // Try both directions (ZIPT's ExtendDir(fwd=true/false)). for (unsigned od = 0; od < 2; ++od) { const bool fwd = (od == 0); euf::snode const* lhead = dir_token(eq.m_lhs, fwd); euf::snode const* rhead = dir_token(eq.m_rhs, fwd); if (!lhead || !rhead) continue; // Orientation 1: RHS directional head is var, scan LHS in that // direction for ground prefix + matching cycle var. if (rhead->is_var() && !lhead->is_var()) { euf::snode_vector toks; collect_tokens_dir(eq.m_lhs, fwd, toks); euf::snode_vector ground_prefix; euf::snode const* target_var = nullptr; for (unsigned i = 0; i < toks.size(); ++i) { if (toks[i]->is_var()) { target_var = toks[i]; break; } ground_prefix.push_back(toks[i]); } if (target_var && !ground_prefix.empty() && target_var->id() == rhead->id()) { if (fire_gpower_intro(node, eq, rhead, ground_prefix, fwd)) return true; } } // Orientation 2: LHS directional head is var, scan RHS analogously. if (lhead->is_var() && !rhead->is_var()) { euf::snode_vector toks; collect_tokens_dir(eq.m_rhs, fwd, toks); euf::snode_vector ground_prefix; euf::snode const* target_var = nullptr; for (unsigned i = 0; i < toks.size(); ++i) { if (toks[i]->is_var()) { target_var = toks[i]; break; } ground_prefix.push_back(toks[i]); } if (target_var && !ground_prefix.empty() && target_var->id() == lhead->id()) { if (fire_gpower_intro(node, eq, lhead, ground_prefix, fwd)) return true; } } } // TODO: Extend to transitive cycles across multiple equations // (ZIPT's varDep + HasDepCycle). Currently only self-cycles are detected. } return false; } // ----------------------------------------------------------------------- // Modifier: apply_regex_factorization (Boolean Closure) // ----------------------------------------------------------------------- bool nielsen_graph::apply_regex_factorization(nielsen_node* node) { if (m_regex_factorization_threshold == 0) return false; struct rf_split { euf::snode const* m_p; euf::snode const* m_q; dep_tracker m_dep; }; for (str_mem const& mem : node->str_mems()) { SASSERT(mem.well_formed()); // split() handles all regex forms (incl. complement / intersection), // so the classical restriction is no longer needed. if (mem.m_str->is_empty() || mem.is_primitive()) continue; // View / guard memberships (Section 3.3) are handled by the // cycle machinery and the synchronous product, not by factorization. if (!mem.is_plain()) continue; split_set pairs; auto [head, tail] = split_membership(mem.m_str, mem.m_regex, sg(), m_regex_factorization_threshold, pairs); if (!head) { SASSERT(!tail); continue; } SASSERT(tail); euf::snode const* const first = mem.m_str->first(); vector feasible; dep_tracker eliminated_dep = mem.m_dep; for (auto const &[tp, tq] : pairs) { euf::snode const* sn_p = m_sg.mk(tp); euf::snode const* sn_q = m_sg.mk(tq); // Also check intersection with other primitive constraints on `head`. // Only valid when head is the single token `first`; for a multi-token // head Δ constrains the whole prefix, so we only check Δ ≠ ∅. euf::snode_vector regexes_p; regexes_p.push_back(sn_p); dep_tracker first_filter_dep = nullptr; if (head == first) { for (auto const& prev_mem : node->str_mems()) { if (prev_mem.m_str == first) { regexes_p.push_back(prev_mem.m_regex); first_filter_dep = m_dep_mgr.mk_join(first_filter_dep, prev_mem.m_dep); } } } if (m_seq_regex->check_intersection_emptiness(regexes_p, 100) == l_true) { eliminated_dep = m_dep_mgr.mk_join(eliminated_dep, first_filter_dep); continue; } feasible.push_back({ sn_p, sn_q, m_dep_mgr.mk_join(mem.m_dep, first_filter_dep) }); if (feasible.size() > m_regex_factorization_threshold) break; } if (feasible.empty()) { node->set_general_conflict(); node->set_conflict(backtrack_reason::regex, eliminated_dep); return true; } if (feasible.size() > m_regex_factorization_threshold) continue; for (auto& [m_p, m_q, m_dep] : feasible) { nielsen_node* child = mk_child(node); mk_edge(node, child, "regex fact", true); // remove the original mem from child auto& child_mems = child->str_mems(); for (unsigned k = 0; k < child_mems.size(); ++k) { if (child_mems[k] == mem) { child_mems[k] = child_mems.back(); child_mems.pop_back(); break; } } child->add_str_mem(str_mem(head, m_p, m_dep)); child->add_str_mem(str_mem(tail, m_q, m_dep)); } return true; } return false; } bool nielsen_graph::fire_gpower_intro( nielsen_node* node, str_eq const& eq, euf::snode const* var, euf::snode_vector const& ground_prefix_orig, const bool fwd) { // Compress repeated patterns in the ground prefix (mirrors ZIPT's LcpCompressionFull). // E.g., [a,b,a,b] has minimal period 2 → use [a,b] as the power base. // This ensures we use the minimal repeating unit: x = (ab)^n · suffix // instead of x = (abab)^n · suffix. // (mirrors ZIPT: if b.Length == 1 && b is PowerToken pt => b = pt.Base) euf::snode_vector ground_prefix; const unsigned n = ground_prefix_orig.size(); unsigned period = n; for (unsigned p = 1; p <= n / 2; ++p) { if (n % p != 0) continue; bool match = true; for (unsigned i = p; i < n && match; ++i) match = ground_prefix_orig[i]->id() == ground_prefix_orig[i % p]->id(); if (match) { period = p; break; } } for (unsigned i = 0; i < period; ++i) { ground_prefix.push_back(ground_prefix_orig[i]); } // If the compressed prefix is a single power snode, unwrap it to use // its base tokens, avoiding nested powers. // E.g., [(ab)^3] → [a, b] so we get (ab)^n instead of ((ab)^3)^n. // (mirrors ZIPT: if b.Length == 1 && b is PowerToken pt => b = pt.Base) if (ground_prefix.size() == 1 && ground_prefix[0]->is_power()) { expr* base_e = get_power_base_expr(ground_prefix[0], m_seq); if (base_e) { euf::snode const* base_sn = m_sg.mk(base_e); if (base_sn) { euf::snode_vector base_toks; collect_tokens_dir(base_sn, fwd, base_toks); if (!base_toks.empty()) { ground_prefix.reset(); ground_prefix.append(base_toks); } } } } const unsigned base_len = ground_prefix.size(); // Build base string expression from ground prefix tokens. // Each s_char snode's get_expr() is already seq.unit(ch) (a string). expr_ref base_str(m); for (unsigned i = 0; i < base_len; ++i) { expr* tok_expr = ground_prefix[i]->get_expr(); if (!tok_expr) return false; if (i == 0) base_str = tok_expr; else if (fwd) base_str = m_seq.str.mk_concat(base_str, tok_expr); else base_str = m_seq.str.mk_concat(tok_expr, base_str); } // Create fresh exponent variable and power expression: base^n const expr_ref fresh_n = get_or_create_gpower_n_var(var); const expr_ref power_expr(m_seq.str.mk_power(base_str, fresh_n), m); euf::snode const* power_snode = m_sg.mk(power_expr); if (!power_snode) return false; const expr_ref zero(a.mk_int(0), m); // Generate children mirroring ZIPT's GetDecompose: // P(t0 · t1 · ... · t_{k-1}) = P(t0) | t0·P(t1) | ... | t0·...·t_{k-2}·P(t_{k-1}) // For char tokens P(c) = {ε}, for power tokens P(u^m) = {u^m', 0 ≤ m' ≤ m}. // Child at position i substitutes var → base^n · t0·...·t_{i-1} · P(t_i). for (unsigned i = 0; i < base_len; ++i) { euf::snode const* tok = ground_prefix[i]; // Skip char position when preceding token is a power: // The power case at i-1 with 0 ≤ m' ≤ exp already covers m' = exp, // which produces the same result. Using the original exponent here // creates a rigid coupling that causes cycling. if (!tok->is_power() && i > 0 && ground_prefix[i - 1]->is_power()) continue; // Build full-token prefix: ground_prefix[0..i-1] euf::snode const* prefix_sn = nullptr; for (unsigned j = 0; j < i; ++j) prefix_sn = (j == 0) ? ground_prefix[0] : dir_concat(m_sg, prefix_sn, ground_prefix[j], fwd); euf::snode const* replacement; expr_ref fresh_m(m); if (tok->is_power()) { // Token is a power u^exp: use fresh m' with 0 ≤ m' ≤ exp const expr * inner_exp = get_power_exponent(tok); expr* inner_base = get_power_base_expr(tok, m_seq); if (inner_exp && inner_base) { fresh_m = get_or_create_gpower_m_var(var); expr_ref partial_pow(m_seq.str.mk_power(inner_base, fresh_m), m); euf::snode const* partial_sn = m_sg.mk(partial_pow); euf::snode const* suffix_sn = prefix_sn ? dir_concat(m_sg, prefix_sn, partial_sn, fwd) : partial_sn; replacement = dir_concat(m_sg, power_snode, suffix_sn, fwd); } else { // Fallback: use full token (shouldn't normally happen) euf::snode const* suffix_sn = prefix_sn ? dir_concat(m_sg, prefix_sn, tok, fwd) : tok; replacement = dir_concat(m_sg, power_snode, suffix_sn, fwd); } } else // Token is a char: P(char) = ε, suffix = just the prefix replacement = prefix_sn ? dir_concat(m_sg, power_snode, prefix_sn, fwd) : power_snode; nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "power intr", true); nielsen_subst s(var, replacement, eq.m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); // Side constraint: n >= 0 e->add_side_constraint(mk_constraint(a.mk_ge(fresh_n, zero), eq.m_dep)); // Side constraints for fresh partial exponent if (fresh_m.get()) { expr* inner_exp = get_power_exponent(tok); // m' >= 0 e->add_side_constraint(mk_constraint(a.mk_ge(fresh_m, zero), eq.m_dep)); // m' <= inner_exp e->add_side_constraint(mk_constraint(a.mk_ge(inner_exp, fresh_m), eq.m_dep)); } } return true; } // ----------------------------------------------------------------------- // Modifier: apply_signature_split // Heuristic equation split based on a shortest prefix signature (i, j): // prefixes u[0..i-1], v[0..j-1] must contain at least one variable and // every variable in one prefix must occur in the other prefix. // ----------------------------------------------------------------------- bool nielsen_graph::apply_signature_split(nielsen_node* node) { auto first_var_pos = [](euf::snode_vector const& toks) { for (unsigned k = 0; k < toks.size(); ++k) if (toks[k]->is_var()) return k; return toks.size(); }; auto const& eqs = node->str_eqs(); for (unsigned eq_idx = 0; eq_idx < eqs.size(); ++eq_idx) { str_eq const& eq = eqs[eq_idx]; SASSERT(eq.well_formed()); if (eq.is_trivial()) continue; euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); // Start from the first variable on each side; if one side has no // variable, this equation has no usable signature. const unsigned i0 = first_var_pos(lhs_toks); const unsigned j0 = first_var_pos(rhs_toks); if (i0 == lhs_toks.size() || j0 == rhs_toks.size()) continue; std::unordered_map lhs_first, rhs_first; lhs_first.reserve(lhs_toks.size()); rhs_first.reserve(rhs_toks.size()); for (unsigned k = 0; k < lhs_toks.size(); ++k) { if (!lhs_toks[k]->is_var()) continue; expr* x = lhs_toks[k]->get_expr(); if (!lhs_first.contains(x)) lhs_first.emplace(x, k); } for (unsigned k = 0; k < rhs_toks.size(); ++k) { if (!rhs_toks[k]->is_var()) continue; expr* x = rhs_toks[k]->get_expr(); if (!rhs_first.contains(x)) rhs_first.emplace(x, k); } svector lhs_need_j(lhs_toks.size(), UINT_MAX); svector rhs_need_i(rhs_toks.size(), UINT_MAX); // Prefix summary arrays: // lhs_need_j[k] = maximum first-occurrence index in rhs for any // variable seen in lhs[0..k]. Symmetric for rhs_need_i. // A value of UINT_MAX means "no variable requirement yet". // A value of UINT_MAX-1 means "fail: some prefix variable does not // occur on the opposite side at all". constexpr unsigned FAIL_MARK = UINT_MAX - 1; unsigned need = UINT_MAX; for (unsigned k = 0; k < lhs_toks.size(); ++k) { if (lhs_toks[k]->is_var()) { auto it = rhs_first.find(lhs_toks[k]->get_expr()); if (it == rhs_first.end()) need = FAIL_MARK; else if (need != FAIL_MARK) need = (need == UINT_MAX) ? it->second : std::max(need, it->second); } lhs_need_j[k] = need; } need = UINT_MAX; for (unsigned k = 0; k < rhs_toks.size(); ++k) { if (rhs_toks[k]->is_var()) { auto it = lhs_first.find(rhs_toks[k]->get_expr()); if (it == lhs_first.end()) need = FAIL_MARK; else if (need != FAIL_MARK) need = (need == UINT_MAX) ? it->second : std::max(need, it->second); } rhs_need_i[k] = need; } unsigned i = i0 + 1; unsigned j = j0 + 1; // Compute least fixpoint for (i, j): grow one side only when the // current prefix on the other side requires it. bool changed = true; while (changed) { changed = false; const unsigned req_j = lhs_need_j[i - 1]; if (req_j == FAIL_MARK) { i = lhs_toks.size(); break; } if (req_j != UINT_MAX && req_j + 1 > j) { j = req_j + 1; changed = true; } const unsigned req_i = rhs_need_i[j - 1]; if (req_i == FAIL_MARK) { j = rhs_toks.size(); break; } if (req_i != UINT_MAX && req_i + 1 > i) { i = req_i + 1; changed = true; } } if (i >= lhs_toks.size() || j >= rhs_toks.size()) continue; // Decompose u = u1·u2 and v = v1·v2 at signature indices. euf::snode const* u1 = m_sg.drop_right(eq.m_lhs, lhs_toks.size() - i); euf::snode const* u2 = m_sg.drop_left(eq.m_lhs, i); euf::snode const* v1 = m_sg.drop_right(eq.m_rhs, rhs_toks.size() - j); euf::snode const* v2 = m_sg.drop_left(eq.m_rhs, j); // NSB review: if we keep this skolem function it should include arguments // to not clash with other values of i, j // Why not use // x := str.substr(u2, 0, str.len(u2) - str.len(v1)), const auto x_e = m_sk.mk("signature-split", eq.m_lhs->get_expr(), eq.m_rhs->get_expr(), eq.m_lhs->get_sort()); euf::snode const* x = m_sg.mk(x_e); for (unsigned branch = 0; branch < 2; ++branch) { nielsen_node* child = mk_child(node); mk_edge(node, child, "signature split", true); auto& child_eqs = child->str_eqs(); child_eqs[eq_idx] = child_eqs.back(); child_eqs.pop_back(); // Two-way split: // (1) u1·x = v1 and u2 = x·v2 // (2) u1 = v1·x and x·u2 = v2 if (branch == 0) { child_eqs.push_back(str_eq(m_sg.mk_concat(u1, x), v1, eq.m_dep)); child_eqs.push_back(str_eq(u2, m_sg.mk_concat(x, v2), eq.m_dep)); } else { child_eqs.push_back(str_eq(u1, m_sg.mk_concat(v1, x), eq.m_dep)); child_eqs.push_back(str_eq(m_sg.mk_concat(x, u2), v2, eq.m_dep)); } } return true; } return false; } bool nielsen_graph::apply_regex_if_split(nielsen_node *node) { for (str_mem const &mem : node->str_mems()) { SASSERT(mem.well_formed()); expr *r_expr = mem.m_regex->get_expr(); expr_ref c(m), th(m), el(m); if (!bool_rewriter(m).decompose_ite(r_expr, c, th, el)) continue; bool created = false; // Worklist: (regex_expr, accumulated_conditions). // Call decompose_ite in a loop until no more ite sub-expressions, // branching on c=true and c=false and accumulating conditions. vector> worklist; worklist.push_back({expr_ref(r_expr, m), expr_ref_vector(m)}); while (!worklist.empty()) { auto [r, cs] = std::move(worklist.back()); worklist.pop_back(); if (m_seq.re.is_empty(r)) continue; expr_ref c2(m), th2(m), el2(m); if (!bool_rewriter(m).decompose_ite(r, c2, th2, el2)) { // No ite remaining: leaf → create child node with regex updated to r. // Canonicalize with th_rewriter so that the resolved leaf shares // its snode id with the corresponding partial-DFA state (which is // built by brzozowski_deriv); otherwise un-simplified residuals // like (a|∅)·R≠a·R break view/guard Q-membership and lap checks. euf::snode const* new_regex_snode = mk_rewrite(r); nielsen_node *child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "regex if", true); for (const auto f : cs) { e->add_side_constraint(constraint(f, mem.m_dep, m)); } for (str_mem &cm : child->str_mems()) { if (cm == mem) { cm.m_regex = new_regex_snode; // A guard whose symbolic step lands back on the cycle // head closed a lap within Q → this branch is dead. if (cm.is_guard() && new_regex_snode == cm.m_root) { child->set_general_conflict(); child->set_conflict(backtrack_reason::regex, cm.m_dep); } break; } } created = true; continue; } th_rewriter tr(m); expr_ref c_simp(c2, m); tr(c_simp); if (m.is_true(c_simp)) { if (!m_seq.re.is_empty(th2)) worklist.push_back({th2, std::move(cs)}); } else if (m.is_false(c_simp)) { if (!m_seq.re.is_empty(el2)) worklist.push_back({el2, std::move(cs)}); } else { if (!m_seq.re.is_empty(th2)) { expr_ref_vector cs_true(cs); cs_true.push_back(c2); worklist.push_back({th2, std::move(cs_true)}); } if (!m_seq.re.is_empty(el2)) { expr_ref_vector cs_false(cs); cs_false.push_back(mk_not(c2)); worklist.push_back({el2, std::move(cs_false)}); } } } if (created) return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_regex_var_split // For str_mem x·s ∈ R where x is a variable, split using minterms: // (1) x → ε (empty) // (2) x → c · x' for each minterm character class c // More general than regex_char_split; uses minterm partitioning rather // than just extracting concrete characters. // mirrors ZIPT's RegexVarSplitModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_regex_var_split(nielsen_node* node) { for (str_mem const& mem : node->str_mems()) { SASSERT(mem.well_formed()); if (mem.is_primitive()) continue; euf::snode const* first = mem.m_str->first(); SASSERT(first); // std::cout << "Considering regex: " << spp(mem.m_regex, m) << std::endl; // Branch 1: x → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "regex var split", true); const nielsen_subst s(first, m_sg.mk_empty_seq(first->get_sort()), mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // Branch 2: x → ?c · x, where ?c is a fresh symbolic char. // for variables at mod_count 0 and other terms, use symbolic (str.len expr) // NSB review: // it really is seq.nth (length-of-prefix that was chopped of for first) // assume len(x) contains the expression for the current length of x, // then the suffix where the current x is located is at str.len(x) - len(x) // (seq.nth x (- (str.len x) len(x)) // euf::snode const* fresh_char = m_sg.mk(get_or_create_char_var(first)); euf::snode const* tail = get_tail(first, 1, true); euf::snode const* replacement = m_sg.mk_concat(fresh_char, tail); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "regex var split", false); const nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_power_split // For a variable x facing a power token u^n, branch: // (1) x = u^m · prefix(u) with m < n (bounded power prefix) // (2) x = u^n · x' (the variable extends past the full power) // Generates integer side constraints for the fresh exponent variables. // mirrors ZIPT's PowerSplitModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_power_split(nielsen_node* node) { euf::snode const* power = nullptr; euf::snode const* var_head = nullptr; str_eq const* eq = nullptr; bool fwd = true; if (!find_power_vs_var(node, power, var_head, eq, fwd)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode const* base = power->arg0(); const expr_ref zero(a.mk_int(0), m); // Branch 1: enumerate all decompositions of the base. // x = base^m · prefix_i(base) where 0 <= m < n // Uses the same GetDecompose pattern as fire_gpower_intro. { euf::snode_vector base_toks; collect_tokens_dir(base, fwd, base_toks); const unsigned base_len = base_toks.size(); expr* base_expr = get_power_base_expr(power, m_seq); if (!base_expr || base_len == 0) return false; const expr_ref fresh_m = get_or_create_gpower_n_var(var_head); const expr_ref power_m_expr(m_seq.str.mk_power(base_expr, fresh_m), m); euf::snode const* power_m_sn = m_sg.mk(power_m_expr); if (!power_m_sn) return false; for (unsigned i = 0; i < base_len; ++i) { euf::snode const* tok = base_toks[i]; // Skip char position when preceding token is a power: // the power case at i-1 with 0 <= m' <= exp already covers m' = exp. if (!tok->is_power() && i > 0 && base_toks[i - 1]->is_power()) continue; // Build full-token prefix: base_toks[0..i-1] euf::snode const* prefix_sn = nullptr; for (unsigned j = 0; j < i; ++j) prefix_sn = (j == 0) ? base_toks[0] : dir_concat(m_sg, prefix_sn, base_toks[j], fwd); euf::snode const* replacement; expr_ref fresh_inner_m(m); if (tok->is_power()) { // Token is a power u^exp: decompose with fresh m', 0 <= m' <= exp const expr * inner_exp = get_power_exponent(tok); expr* inner_base = get_power_base_expr(tok, m_seq); if (inner_exp && inner_base) { fresh_inner_m = get_or_create_gpower_m_var(var_head); expr_ref partial_pow(m_seq.str.mk_power(inner_base, fresh_inner_m), m); euf::snode const* partial_sn = m_sg.mk(partial_pow); euf::snode const* suffix_sn = prefix_sn ? dir_concat(m_sg, prefix_sn, partial_sn, fwd) : partial_sn; replacement = dir_concat(m_sg, power_m_sn, suffix_sn, fwd); } else { euf::snode const* suffix_sn = prefix_sn ? dir_concat(m_sg, prefix_sn, tok, fwd) : tok; replacement = dir_concat(m_sg, power_m_sn, suffix_sn, fwd); } } else // Token is a char: P(char) = ε, suffix is just the prefix replacement = prefix_sn ? dir_concat(m_sg, power_m_sn, prefix_sn, fwd) : power_m_sn; nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "power split", true); nielsen_subst s(var_head, replacement, eq->m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); // Side constraint: n >= 0 e->add_side_constraint(mk_constraint(a.mk_ge(fresh_m, zero), eq->m_dep)); // Side constraints for fresh partial exponent if (fresh_inner_m.get()) { expr* inner_exp = get_power_exponent(tok); // m' >= 0 e->add_side_constraint(mk_constraint(a.mk_ge(fresh_inner_m, zero), eq->m_dep)); // m' <= inner_exp e->add_side_constraint(mk_constraint(a.mk_ge(inner_exp, fresh_inner_m), eq->m_dep)); } } } // Branch 2: x = u^n · x' (variable extends past full power, non-progress) // so replace x -> u^n · x { euf::snode const* replacement = dir_concat(m_sg, power, var_head, fwd); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "power split", false); const nielsen_subst s(var_head, replacement, eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } return true; } // ----------------------------------------------------------------------- // Modifier: apply_var_num_unwinding // For a power token u^n facing a variable, branch: // (1) n = 0 → u^n = ε (replace power with empty) // (2) n >= 1 → peel one u from the power // Generates integer side constraints for each branch. // mirrors ZIPT's VarNumUnwindingModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_var_num_unwinding_eq(nielsen_node* node) { euf::snode const* power = nullptr; euf::snode const* var_head = nullptr; str_eq const* eq = nullptr; bool fwd = true; if (!find_power_vs_var(node, power, var_head, eq, fwd)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode const* base = power->arg0(); expr* exp_n = get_power_exponent(power); SASSERT(exp_n); const expr_ref zero(a.mk_int(0), m); const expr_ref one(a.mk_int(1), m); // Branch 1: n = 0 → replace u^n with ε (progress) // Side constraint: n = 0 { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "unwinding eq 0", true); const nielsen_subst s(power, m_sg.mk_empty_seq(power->get_sort()), eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); e->add_side_constraint(mk_constraint(a.mk_ge(exp_n, zero), eq->m_dep)); e->add_side_constraint(mk_constraint(a.mk_le(exp_n, zero), eq->m_dep)); } // Branch 2: n >= 1 → peel one u: u^n → u · u^(n-1) // Side constraint: n >= 1 { const expr_ref power_expr(m_seq.str.mk_power(base->get_expr(), a.mk_sub(exp_n, a.mk_int(1))), m); euf::snode const* power_snode = m_sg.mk(power_expr); euf::snode const* replacement = dir_concat(m_sg, base, power_snode, fwd); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "unwinding eq >", false); const nielsen_subst s(power, replacement, eq->m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); e->add_side_constraint(mk_constraint(a.mk_ge(exp_n, one), eq->m_dep)); } return true; } bool nielsen_graph::apply_var_num_unwinding_mem(nielsen_node* node) { euf::snode const* power = nullptr; str_mem const* mem = nullptr; bool fwd = true; if (!find_power_vs_var(node, power, mem, fwd)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode const* base = power->arg0(); expr* exp_n = get_power_exponent(power); SASSERT(exp_n); const expr_ref zero(a.mk_int(0), m); const expr_ref one(a.mk_int(1), m); // Branch 1: n = 0 → replace u^n with ε (progress) // Side constraint: n = 0 { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "unwinding mem 0", true); const nielsen_subst s(power, m_sg.mk_empty_seq(power->get_sort()), mem->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); e->add_side_constraint(mk_constraint(a.mk_eq(exp_n, zero), mem->m_dep)); } // Branch 2: n >= 1 → peel one u: u^n → u · u^(n-1) // Side constraint: n >= 1 { const expr_ref power_expr(m_seq.str.mk_power(base->get_expr(), a.mk_sub(exp_n, a.mk_int(1))), m); euf::snode const* power_snode = m_sg.mk(power_expr); euf::snode const* replacement = dir_concat(m_sg, base, power_snode, fwd); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "unwinding mem >", false); const nielsen_subst s(power, replacement, mem->m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); e->add_side_constraint(mk_constraint(a.mk_ge(exp_n, one), mem->m_dep)); } return true; } // Cf. axioms::diseq_axiom bool nielsen_graph::axiomatize_diseq(nielsen_node* node) { SASSERT(node->m_str_eq.empty() && std::ranges::all_of(node->m_str_mem, [](str_mem const& mem){ return mem.is_primitive(); })); if (node->m_str_deq.empty()) return false; const str_deq& first = node->m_str_deq.back(); euf::snode const* u = first.m_lhs; euf::snode const* v = first.m_rhs; const expr_ref u_len(compute_length_expr(u), m); const expr_ref v_len(compute_length_expr(v), m); expr_ref len_eq(m.mk_eq(u_len, v_len), m); str_eq eq_uv(u, v, first.m_dep); sort *char_sort = nullptr; VERIFY(seq().is_seq(u->get_sort(), char_sort)); euf::snode const* a = m_sg.mk(seq().str.mk_unit(m_sk.mk("diseq.a", u->get_expr(), v->get_expr(), char_sort).get())); euf::snode const* b = m_sg.mk(seq().str.mk_unit(m_sk.mk("diseq.b", u->get_expr(), v->get_expr(), char_sort).get())); euf::snode const* w = m_sg.mk(m_sk.mk("diseq.w", u->get_expr(), v->get_expr()).get()); euf::snode const* up = m_sg.mk(m_sk.mk("diseq.u'", u->get_expr(), v->get_expr()).get()); euf::snode const* vp = m_sg.mk(m_sk.mk("diseq.v'", u->get_expr(), v->get_expr()).get()); const expr_ref up_len(compute_length_expr(up), m); const expr_ref vp_len(compute_length_expr(vp), m); euf::snode const* wau = dir_concat(m_sg, dir_concat(m_sg, w, a, true), up, true); euf::snode const* wbv = dir_concat(m_sg, dir_concat(m_sg, w, b, true), vp, true); const str_eq u_eq(u, wau, first.m_dep); const str_eq v_eq(v, wbv, first.m_dep); // Branch 1: |u| < |v| { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "diseq I", true); child->m_str_deq.pop_back(); expr_ref cmp(this->a.mk_lt(u_len, v_len), m); m_rw(cmp); e->add_side_constraint(constraint(cmp, first.m_dep, m)); } // Branch 2: |v| < |u| { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "diseq II", true); child->m_str_deq.pop_back(); expr_ref cmp(this->a.mk_lt(v_len, u_len), m); m_rw(cmp); e->add_side_constraint(constraint(cmp, first.m_dep, m)); } // Branch 3: u = wau' && v = wbv' && |u'| = |v'| && a != b { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, "diseq III", true); child->m_str_deq.pop_back(); child->add_str_eq(u_eq); child->add_str_eq(v_eq); child->add_constraint(constraint(m.mk_eq(up_len, vp_len), first.m_dep, m)); e->add_side_constraint(constraint(m.mk_not(m.mk_eq(a->get_expr(), b->get_expr())), first.m_dep, m)); } return true; } dep_tracker nielsen_graph::collect_conflict_deps() const { dep_tracker deps = nullptr; // todo: Add visit set if the graph could contain cycles in the future // enumerating all nodes would not work due to hot-restarts having created // children that are currently not relevant vector to_visit; to_visit.push_back(m_root); while (!to_visit.empty()) { nielsen_node const* n = to_visit.back(); to_visit.pop_back(); if (n->reason() == backtrack_reason::children_failed) { SASSERT(n->m_conflict_external_literal == sat::null_literal); SASSERT(!n->m_conflict_internal); for (unsigned i = n->outgoing().size(); i > 0; i--) { nielsen_edge const* e = n->outgoing()[i - 1]; to_visit.push_back(e->tgt()); } continue; } // not true anymore since we might have done a hot-restart where we previously created the child: //SASSERT(n->outgoing().empty()); SASSERT(n->is_currently_conflict()); if (n->m_conflict_external_literal != sat::null_literal) // We know from the outer solver that this literal is assigned true and contradicts node constraint deps = m_dep_mgr.mk_join(deps, m_dep_mgr.mk_leaf(n->m_conflict_external_literal)); if (n->m_conflict_internal) deps = m_dep_mgr.mk_join(deps, n->m_conflict_internal); } return deps; } // NSB review: this is one of several methods exposed for testing void nielsen_graph::test_aux_explain_conflict(svector& eqs, svector& mem_literals) const { SASSERT(m_root); const auto deps = collect_conflict_deps(); vector vs; m_dep_mgr.linearize(deps, vs); for (dep_source const& d : vs) { if (std::holds_alternative(d)) eqs.push_back(std::get(d)); else if (std::holds_alternative(d)) mem_literals.push_back(std::get(d)); else UNREACHABLE(); } } // ----------------------------------------------------------------------- // nielsen_graph: length constraint generation // ----------------------------------------------------------------------- expr_ref nielsen_graph::compute_length_expr(euf::snode const* n) { if (n->is_empty()) return expr_ref(a.mk_int(0), m); if (n->is_char_or_unit()) return expr_ref(a.mk_int(1), m); if (n->is_concat()) { const expr_ref left = compute_length_expr(n->arg0()); const expr_ref right = compute_length_expr(n->arg(1)); return expr_ref(a.mk_add(left, right), m); } //euf::snode const* length_term = nullptr; //if (m_length_info.find(n->id(), length_term) && length_term) // return expr_ref(length_term->get_expr(), m); return expr_ref(m_seq.str.mk_length(n->get_expr()), m); } void nielsen_graph::generate_length_constraints(vector& constraints) { if (!m_root) return; uint_set seen_vars; TRACE(seq, display(tout, m_root)); const seq_util & seq = m_sg.get_seq_util(); for (str_eq const& eq : m_root->str_eqs()) { if (eq.is_trivial()) continue; expr_ref len_lhs = compute_length_expr(eq.m_lhs); expr_ref len_rhs = compute_length_expr(eq.m_rhs); TRACE(seq, tout << "Length constraint from LHS " << snode_label_html(eq.m_lhs, m, true) << " to " << len_lhs << ":\n"; tout << "Length constraint from RHS " << snode_label_html(eq.m_rhs, m, true) << " to " << len_rhs << "\n"); expr_ref len_eq(m.mk_eq(len_lhs, len_rhs), m); constraints.push_back(length_constraint(len_eq, eq.m_dep, length_kind::eq, true, m)); // collect variables for non-negativity constraints euf::snode_vector tokens; eq.m_lhs->collect_tokens(tokens); eq.m_rhs->collect_tokens(tokens); for (euf::snode const* tok : tokens) { if (tok->is_var() && !seen_vars.contains(tok->id())) { seen_vars.insert(tok->id()); expr_ref len_var(seq.str.mk_length(tok->get_expr()), m); expr_ref ge_zero(a.mk_ge(len_var, a.mk_int(0)), m); TRACE(seq, tout << "non-negative length " << ge_zero << "\n"); constraints.push_back(length_constraint(ge_zero, eq.m_dep, length_kind::nonneg, true, m)); } } } // Parikh interval reasoning for regex memberships for (str_mem const& mem : m_root->str_mems()) { expr* re_expr = mem.m_regex->get_expr(); SASSERT(re_expr && seq.is_re(re_expr)); unsigned min_len = 0, max_len = UINT_MAX; compute_regex_length_interval(mem.m_regex, min_len, max_len); expr_ref len_str = compute_length_expr(mem.m_str); // generate len(str) >= min_len when min_len > 0 if (min_len > 0) { expr_ref bound(a.mk_ge(len_str, a.mk_int(min_len)), m); TRACE(seq, tout << "Parikh " << mk_pp(re_expr, m) << " bound: " << bound << "\n"); constraints.push_back(length_constraint(bound, mem.m_dep, length_kind::bound, false, m)); } // generate len(str) <= max_len when bounded if (max_len < UINT_MAX) { expr_ref bound(a.mk_le(len_str, a.mk_int(max_len)), m); TRACE(seq, tout << "Parikh " << mk_pp(re_expr, m) << " bound: " << bound << "\n"); constraints.push_back(length_constraint(bound, mem.m_dep, length_kind::bound, false, m)); } } } void nielsen_graph::compute_regex_length_interval(euf::snode const* regex, unsigned& min_len, unsigned& max_len) const { const seq_util & seq = m_sg.get_seq_util(); expr* e = regex->get_expr(); SASSERT(e && seq.is_re(e)); min_len = seq.re.min_length(e); max_len = seq.re.max_length(e); // for empty language, min_length may be UINT_MAX (vacuously true); // normalize to avoid generating bogus constraints if (min_len > max_len) { min_len = 0; max_len = 0; } } // ----------------------------------------------------------------------- // int_constraint display // ----------------------------------------------------------------------- std::ostream& constraint::display(std::ostream& out) const { return out << fml; } // ----------------------------------------------------------------------- // Integer feasibility subsolver implementation // Uses the injected simple_solver (default: lp_simple_solver). // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- // Modification counter: substitution length tracking // mirrors ZIPT's LocalInfo.CurrentModificationCnt + NielsenEdge.IncModCount/DecModCount // + NielsenNode constructor length assertion logic // ----------------------------------------------------------------------- expr_ref nielsen_graph::get_or_create_char_var(euf::snode const* var) { SASSERT(var && var->is_var()); const expr_ref idx(a.mk_sub(seq().str.mk_length(var->get_expr()), compute_length_expr(var)), m); const auto e = seq().str.mk_nth_u(var->get_expr(), idx); return expr_ref(m_seq.str.mk_unit(expr_ref(e, m)), m); } expr_ref nielsen_graph::get_or_create_gpower_n_var(euf::snode const* var) { SASSERT(var && var->is_var()); //unsigned mc = 0; //m_mod_cnt.find(var->id(), mc); th_rewriter rw(m); return m_sk.mk("gpn!", var->get_expr()/*, a.mk_int(mc)*/, a.mk_int()); } expr_ref nielsen_graph::get_or_create_gpower_m_var(euf::snode const* var) { SASSERT(var && var->is_var()); //unsigned mc = 0; //m_mod_cnt.find(var->id(), mc); th_rewriter rw(m); return m_sk.mk("gpm!", var->get_expr()/*, a.mk_int(mc)*/, a.mk_int()); } void nielsen_graph::add_subst_length_constraints(nielsen_edge* e) { auto const& substs = e->subst(); // Compute LHS (|x|) for each non-eliminating substitution vector> lhs_exprs; for (unsigned i = 0; i < substs.size(); ++i) { auto const& s = substs[i]; if (!s.m_var->is_var()) continue; if (!m_seq.is_seq(s.m_var->get_expr())) continue; expr_ref lhs = compute_length_expr(s.m_var); expr_ref rhs = compute_length_expr(s.m_replacement); e->add_side_constraint(mk_constraint(a.mk_eq(lhs, rhs), s.m_dep)); } } void nielsen_graph::assert_to_subsolver(const constraint& c) const { m_length_solver.assert_expr(c.fml, c.dep); } void nielsen_graph::assert_to_subsolver(expr* e) const { m_length_solver.assert_expr(e); } void nielsen_graph::assert_node_side_constraints(nielsen_node* node) const { // Assert only the constraints that are new to this node (beyond those // inherited from its parent via clone_from). The parent's constraints are // already present in the enclosing solver scope; asserting them again would // be redundant (though harmless). This is called by search_dfs right after // simplify_and_init, which is where new constraints are produced. for (unsigned i = node->m_parent_ic_count; i < node->constraints().size(); ++i) { auto& c = node->constraints()[i]; auto lit = m_context_solver.literal_if_false(c.fml); // std::cout << "Internalizing literal " << mk_pp(c.fml, m) << " [" << (lit == sat::null_literal) << "]" << // std::endl; if (lit != sat::null_literal) { node->set_external_conflict(lit, c.dep); return; } assert_to_subsolver(c); } } void nielsen_graph::generate_node_length_constraints(nielsen_node* node) { if (node->m_node_len_constraints_generated) return; node->m_node_len_constraints_generated = true; // Skip the root node — its length constraints are already asserted // at the base solver level by assert_root_constraints_to_solver(). if (node == m_root) return; // TODO: Do we really need this? uint_set seen_vars; for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; expr_ref len_lhs = compute_length_expr(eq.m_lhs); expr_ref len_rhs = compute_length_expr(eq.m_rhs); //node->add_constraint(mk_constraint(m.mk_eq(len_lhs, len_rhs), eq.m_dep)); node->add_constraint(mk_constraint(a.mk_eq(len_lhs, len_rhs), eq.m_dep)); // non-negativity for each variable (mod-count-aware) euf::snode_vector tokens; eq.m_lhs->collect_tokens(tokens); eq.m_rhs->collect_tokens(tokens); for (euf::snode const* tok : tokens) { if (tok->is_var() && !seen_vars.contains(tok->id())) { seen_vars.insert(tok->id()); expr_ref len_var = compute_length_expr(tok); node->add_constraint(mk_constraint(a.mk_ge(len_var, a.mk_int(0)), eq.m_dep)); } } } // Parikh interval bounds for regex memberships at this node for (str_mem const& mem : node->str_mems()) { expr* re_expr = mem.m_regex->get_expr(); SASSERT(re_expr && m_seq.is_re(re_expr)); unsigned min_len = 0, max_len = UINT_MAX; compute_regex_length_interval(mem.m_regex, min_len, max_len); expr_ref len_str = compute_length_expr(mem.m_str); if (min_len > 0) node->add_constraint(mk_constraint(a.mk_ge(len_str, a.mk_int(min_len)), mem.m_dep)); if (max_len < UINT_MAX) node->add_constraint(mk_constraint(a.mk_le(len_str, a.mk_int(max_len)), mem.m_dep)); } } bool nielsen_graph::check_int_feasibility() const { // In incremental mode the solver already holds all path constraints // (root length constraints at the base level, edge side_constraints and node // constraints pushed/popped as the DFS descends and backtracks). // A plain check() is therefore sufficient. return m_length_solver.check() != l_false; } dep_tracker nielsen_graph::get_subsolver_dependency(nielsen_node* /*n*/) const { // check_int_feasibility() already called m_solver.check() which computed // the UNSAT core in terms of tracked assumption literals and their deps. return m_length_solver.core(); } bool nielsen_graph::check_lp_le(expr* lhs, expr* rhs, nielsen_node* n, dep_tracker& dep) { dep = nullptr; rational lhs_lo, rhs_up; literal_vector lits; enode_pair_vector eqs; if (m_context_solver.lower_bound(lhs, lhs_lo, lits, eqs) && m_context_solver.upper_bound(rhs, rhs_up, lits, eqs) && lhs_lo > rhs_up) return false; // lhs <= lhs_up <= rhs_lo <= rhs // => lhs <= rhs is entailed lits.reset(); eqs.reset(); rational rhs_lo, lhs_up; if (m_context_solver.upper_bound(lhs, lhs_up, lits, eqs) && m_context_solver.lower_bound(rhs, rhs_lo, lits, eqs) && lhs_up <= rhs_lo) { for (auto lit : lits) dep = m_dep_mgr.mk_join(dep, m_dep_mgr.mk_leaf(lit)); for (enode_pair eq : eqs) dep = m_dep_mgr.mk_join(dep, m_dep_mgr.mk_leaf(eq)); return true; } // fall through - ask the solver [expensive] // TODO: Maybe cache the result? // The solver already holds all path constraints incrementally. // Temporarily add NOT(lhs <= rhs), i.e. lhs >= rhs + 1. // If that is unsat, then lhs <= rhs is entailed. const expr_ref one(a.mk_int(1), m); const expr_ref rhs_plus_one(a.mk_add(rhs, one), m); m_length_solver.push(); assert_to_subsolver(a.mk_ge(lhs, rhs_plus_one)); const lbool result = m_length_solver.check(); if (result == l_false) dep = m_length_solver.core(); m_length_solver.pop(1); if (result == l_false) { n->add_constraint(constraint(a.mk_le(lhs, rhs), dep, m)); return true; } return false; } constraint nielsen_graph::mk_constraint(expr *fml, dep_tracker const &dep) const { return constraint(fml, dep, m); } expr* nielsen_graph::get_power_exponent(euf::snode const* power) { SASSERT(power); if (!power->is_power()) return nullptr; SASSERT(power->num_args() == 2); euf::snode const* exp_snode = power->arg(1); return exp_snode ? exp_snode->get_expr() : nullptr; } // ----------------------------------------------------------------------- // Regex widening: overapproximate string and check intersection emptiness // Mirrors ZIPT NielsenNode.CheckRegexWidening (NielsenNode.cs:1350-1380) // ----------------------------------------------------------------------- bool nielsen_graph::check_regex_widening(nielsen_node const& node, str_mem const& mem, dep_tracker& dep) { const auto str = mem.m_str; const auto regex = mem.m_regex; SASSERT(m_seq_regex); // Only apply to ground regexes with non-trivial strings if (!regex->is_ground()) return false; // Build the overapproximation regex for the string. // Variables → intersection of their primitive regex constraints (or Σ*) // Symbolic chars → re.range from char_ranges (or full_char) // Concrete chars → to_re(unit(c)) euf::snode_vector tokens; str->collect_tokens(tokens); if (tokens.empty()) return false; SASSERT(dep); euf::snode const* approx = nullptr; for (euf::snode const* tok : tokens) { euf::snode const* tok_re = nullptr; if (tok->is_char()) { // Concrete character → to_re(unit(c)) expr* te = tok->get_expr(); SASSERT(te); tok_re = m_sg.mk(m_seq.re.mk_to_re(te)); } else if (tok->is_var()) { // Variable → intersection of primitive regex constraints, or Σ* euf::snode const* x_range = m_seq_regex->collect_primitive_regex_intersection(tok, node, m_dep_mgr, dep); if (x_range) tok_re = x_range; else { // Unconstrained variable: approximate as Σ* sort* str_sort = m_seq.str.mk_string_sort(); expr_ref all_re(m_seq.re.mk_full_seq(m_seq.re.mk_re(str_sort)), m); tok_re = m_sg.mk(all_re); } TRACE(seq, tout << "intersection-collection\n" << spp(tok, m) << "\n" << spp(tok_re, m) << "\n"); } else if (tok->is_unit()) { // Symbolic char — try to get char_range if (node.char_ranges().contains(tok->id())) { auto& cs = node.char_ranges()[tok->id()]; if (!cs.first.is_empty()) { // Build union of re.range for each interval euf::snode const* range_re = nullptr; for (auto const& r : cs.first.ranges()) { expr_ref rng(m_seq.re.mk_range( m_seq.str.mk_string(zstring(r.m_lo)), m_seq.str.mk_string(zstring(r.m_hi - 1))), m); euf::snode const* rng_sn = m_sg.mk(rng); if (!range_re) range_re = rng_sn; else { expr_ref u(m_seq.re.mk_union(range_re->get_expr(), rng_sn->get_expr()), m); range_re = m_sg.mk(u); } } dep = dep_mgr().mk_join(dep, cs.second); tok_re = range_re; } } if (!tok_re) { // Unconstrained symbolic char: approximate as full_char (single char, any value) sort* str_sort = m_seq.str.mk_string_sort(); expr_ref fc(m_seq.re.mk_full_char(m_seq.re.mk_re(str_sort)), m); tok_re = m_sg.mk(fc); } } else { // Unknown token type — approximate as Σ* sort* str_sort = m_seq.str.mk_string_sort(); expr_ref all_re(m_seq.re.mk_full_seq(m_seq.re.mk_re(str_sort)), m); tok_re = m_sg.mk(all_re); } SASSERT(tok_re); if (!approx) approx = tok_re; else { expr* ae = approx->get_expr(); expr* te = tok_re->get_expr(); SASSERT(ae && te); approx = m_sg.mk(m_seq.re.mk_concat(ae, te)); } } if (!approx) return false; // Check intersection(approx, regex) for emptiness // Build intersection regex expr* ae = approx->get_expr(); expr* re = regex->get_expr(); SASSERT(ae && re); const expr_ref inter(m_seq.re.mk_inter(ae, re), m); euf::snode const* inter_sn = m_sg.mk(inter); SASSERT(inter_sn); // TODO: Minimize the conflict here const lbool result = m_seq_regex->is_empty_bfs(inter_sn, 5000); TRACE(seq, tout << "widen empty-intersect: " << result << " " << mk_pp(re, m) << " <= " << mk_pp(ae, m) << "\n" << mem_pp(mem, m) << "\n"; display(tout, &node) << "\n"); return result == l_true; } // ----------------------------------------------------------------------- // Leaf-node regex intersection emptiness check // Mirrors ZIPT NielsenNode.CheckRegex (NielsenNode.cs:1311-1329) // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- // Synchronous product over plain / view / guard / co-view components. // ----------------------------------------------------------------------- lbool nielsen_graph::comp_accepting(prod_comp const& c) const { if (c.m_dead) return l_false; switch (c.m_kind) { case mem_kind::plain: return m_sg.re_nullable(c.m_state); case mem_kind::stab_view: if (c.m_complemented) return (c.m_sink || c.m_state != c.m_root) ? l_true : l_false; return (c.m_state == c.m_root) ? l_true : l_false; case mem_kind::no_loop: return l_true; // guard accepts on every prefix it has not failed on } return l_undef; } nielsen_graph::prod_comp nielsen_graph::comp_step(prod_comp const& c, euf::snode const* mt) { prod_comp r = c; if (c.m_dead) return r; switch (c.m_kind) { case mem_kind::plain: { euf::snode const* d = m_sg.brzozowski_deriv(c.m_state, mt); if (!d || d->is_fail()) r.m_dead = true; else r.m_state = d; return r; } case mem_kind::stab_view: { if (c.m_complemented) { if (c.m_sink) return r; // Σ* if (!projection_state_in_Q(c.m_state->get_expr(), c.m_nu)) { r.m_sink = true; return r; } euf::snode const* d = m_sg.brzozowski_deriv(c.m_state, mt); if (!d || d->is_fail()) { r.m_sink = true; return r; } // ~∅ = Σ* r.m_state = d; return r; } if (!projection_state_in_Q(c.m_state->get_expr(), c.m_nu)) { r.m_dead = true; return r; } euf::snode const* d = m_sg.brzozowski_deriv(c.m_state, mt); if (!d || d->is_fail()) { r.m_dead = true; return r; } r.m_state = d; return r; } case mem_kind::no_loop: { if (c.m_sink) return r; // discharged: Σ* if (!projection_state_in_Q(c.m_state->get_expr(), c.m_nu)) { r.m_sink = true; return r; } euf::snode const* d = m_sg.brzozowski_deriv(c.m_state, mt); if (!d || d->is_fail()) { r.m_sink = true; return r; } // lap cannot close through ∅ if (d == c.m_root) { r.m_dead = true; return r; } // lap completed → forbidden r.m_state = d; return r; } } return r; } lbool nielsen_graph::check_product_emptiness(vector const& comps0, unsigned max_states) { if (comps0.empty()) return l_false; // empty intersection = Σ* (non-empty) auto encode = [](vector const& cs) { std::vector key; key.reserve(cs.size() * 5); for (auto const& c : cs) { key.push_back(static_cast(c.m_kind)); key.push_back((c.m_complemented ? 1u : 0u) | (c.m_sink ? 2u : 0u) | (c.m_dead ? 4u : 0u)); key.push_back(c.m_state ? c.m_state->id() : UINT_MAX); } return key; }; std::set> visited; vector> work; work.push_back(comps0); visited.insert(encode(comps0)); unsigned explored = 0; while (!work.empty()) { if (!m.inc()) return l_undef; if (explored >= max_states) return l_undef; vector cur = work.back(); work.pop_back(); ++explored; bool any_dead = false; for (auto const& c : cur) if (c.m_dead) { any_dead = true; break; } if (any_dead) continue; // simultaneously accepting? bool all_acc = true, any_undef = false; for (auto const& c : cur) { const lbool a = comp_accepting(c); if (a == l_false) { all_acc = false; break; } if (a == l_undef) any_undef = true; } if (all_acc && !any_undef) return l_false; // found a common word // joint first-character partition = minterms of the intersection of // all still-discriminating component states. expr* combined = nullptr; for (auto const& c : cur) { if (c.m_sink || c.m_dead) continue; combined = combined ? m_seq.re.mk_inter(combined, c.m_state->get_expr()) : c.m_state->get_expr(); } if (!combined) continue; // no discriminating state and not accepting: dead end euf::snode_vector mts; m_sg.compute_minterms(m_sg.mk(combined), mts); for (euf::snode const* mt : mts) { vector nxt; bool dead = false; for (auto const& c : cur) { prod_comp d = comp_step(c, mt); if (d.m_dead) { dead = true; break; } nxt.push_back(d); } if (dead) continue; if (visited.insert(encode(nxt)).second) work.push_back(nxt); } } return l_true; // exhausted with no accepting tuple → empty } bool nielsen_graph::collect_var_components(euf::snode const* var, nielsen_node const& node, vector& out, dep_tracker& dep) { bool found = false; for (auto const& mem : node.str_mems()) { if (!mem.is_primitive()) continue; if (mem.m_str->first() != var) continue; switch (mem.m_kind) { case mem_kind::plain: out.push_back(prod_comp::mk_plain(mem.m_regex)); break; case mem_kind::stab_view: out.push_back(prod_comp::mk_view(mem.m_regex, mem.m_root, mem.m_nu, false)); break; case mem_kind::no_loop: out.push_back(prod_comp::mk_guard(mem.m_regex, mem.m_root, mem.m_nu, mem.m_discharged)); break; } dep = m_dep_mgr.mk_join(dep, mem.m_dep); found = true; } return found; } bool nielsen_graph::check_leaf_regex(nielsen_node const& node, dep_tracker& dep) { SASSERT(m_seq_regex); // distinct variables carrying a primitive constraint uint_set seen; for (auto const& mem : node.str_mems()) { SASSERT(mem.is_primitive()); euf::snode const* const first = mem.m_str->first(); SASSERT(first && first->is_var()); if (seen.contains(first->id())) continue; seen.insert(first->id()); vector comps; dep_tracker d = nullptr; collect_var_components(first, node, comps, d); const lbool result = check_product_emptiness(comps, 5000); if (result == l_true) { TRACE(seq, tout << "empty intersection\n"); dep = d; return false; } } return true; } // ----------------------------------------------------------------------- // Statistics collection // ----------------------------------------------------------------------- void nielsen_graph::collect_statistics(::statistics& st) const { st.update("nseq solve calls", m_stats.m_num_solve_calls); st.update("nseq dfs nodes", m_stats.m_num_dfs_nodes); st.update("nseq sat", m_stats.m_num_sat); st.update("nseq unsat", m_stats.m_num_unsat); st.update("nseq unknown", m_stats.m_num_unknown); st.update("nseq simplify clash", m_stats.m_num_simplify_conflict); st.update("nseq extensions", m_stats.m_num_extensions); st.update("nseq fresh vars", m_stats.m_num_fresh_vars); st.update("nseq max depth", m_stats.m_max_depth); // modifier breakdown st.update("nseq mod det", m_stats.m_mod_det); st.update("nseq mod power epsilon", m_stats.m_mod_power_epsilon); st.update("nseq mod num cmp", m_stats.m_mod_num_cmp); st.update("nseq mod const num unwind", m_stats.m_mod_const_num_unwinding); st.update("nseq mod eq split", m_stats.m_mod_eq_split); st.update("nseq mod star intr", m_stats.m_mod_star_intr); st.update("nseq mod cycle subsump", m_stats.m_mod_cycle_subsumption); st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr); st.update("nseq mod regex fact", m_stats.m_mod_regex_factorization); st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen); st.update("nseq mod signature split", m_stats.m_mod_signature_split); st.update("nseq mod regex var", m_stats.m_mod_regex_var_split); st.update("nseq mod regex if", m_stats.m_mod_regex_if_split); st.update("nseq mod power split", m_stats.m_mod_power_split); st.update("nseq mod var nielsen", m_stats.m_mod_var_nielsen); st.update("nseq mod var num unwind (eq)", m_stats.m_mod_var_num_unwinding_eq); st.update("nseq mod var num unwind (mem)", m_stats.m_mod_var_num_unwinding_mem); st.update("nseq mod axiomatized disequalities", m_stats.m_ax_diseq); st.update("nseq unsat-cache size", (unsigned) m_unsat_node_cache.size()); st.update("nseq unsat-cache hits", m_num_cache_hits); } }