/*++ Copyright (c) 2026 Microsoft Corporation Module Name: seq_nielsen.cpp Abstract: Nielsen graph implementation for string constraint solving. Ports the constraint types and Nielsen graph structures from ZIPT (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) Author: Nikolaj Bjorner (nbjorner) 2026-03-02 Clemens Eisenhofer 2026-03-02 --*/ #include "smt/seq/seq_nielsen.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "math/lp/lar_solver.h" #include "util/bit_util.h" #include "util/hashtable.h" #include #include #include namespace seq { // ----------------------------------------------- // dep_tracker // ----------------------------------------------- dep_tracker::dep_tracker(unsigned num_bits) { unsigned words = (num_bits + 31) / 32; m_bits.resize(words, 0); } dep_tracker::dep_tracker(unsigned num_bits, unsigned set_bit) : dep_tracker(num_bits) { if (set_bit < num_bits) { unsigned word_idx = set_bit / 32; unsigned bit_idx = set_bit % 32; m_bits[word_idx] = 1u << bit_idx; } } void dep_tracker::merge(dep_tracker const& other) { if (other.m_bits.empty()) return; if (m_bits.size() < other.m_bits.size()) m_bits.resize(other.m_bits.size(), 0); for (unsigned i = 0; i < other.m_bits.size(); ++i) m_bits[i] |= other.m_bits[i]; } bool dep_tracker::is_superset(dep_tracker const& other) const { for (unsigned i = 0; i < other.m_bits.size(); ++i) { unsigned my_bits = (i < m_bits.size()) ? m_bits[i] : 0; if ((my_bits & other.m_bits[i]) != other.m_bits[i]) return false; } return true; } bool dep_tracker::empty() const { for (unsigned b : m_bits) if (b != 0) return false; return true; } void dep_tracker::get_set_bits(unsigned_vector& indices) const { for (unsigned i = 0; i < m_bits.size(); ++i) { unsigned word = m_bits[i]; while (word != 0) { unsigned bit = i * 32 + ntz_core(word); indices.push_back(bit); word &= word - 1; // clear lowest set bit } } } // ----------------------------------------------- // str_eq // ----------------------------------------------- void str_eq::sort() { if (m_lhs && m_rhs && m_lhs->id() > m_rhs->id()) std::swap(m_lhs, m_rhs); } bool str_eq::is_trivial() const { return m_lhs == m_rhs || (m_lhs && m_rhs && m_lhs->is_empty() && m_rhs->is_empty()); } bool str_eq::contains_var(euf::snode* var) const { if (!var) return false; // check if var appears in the token list of lhs or rhs if (m_lhs) { euf::snode_vector tokens; m_lhs->collect_tokens(tokens); for (euf::snode* t : tokens) if (t == var) return true; } if (m_rhs) { euf::snode_vector tokens; m_rhs->collect_tokens(tokens); for (euf::snode* t : tokens) if (t == var) return true; } return false; } // ----------------------------------------------- // str_mem // ----------------------------------------------- bool str_mem::is_primitive() const { return m_str && m_str->length() == 1 && m_str->is_var(); } bool str_mem::contains_var(euf::snode* var) const { if (!var) return false; if (m_str) { euf::snode_vector tokens; m_str->collect_tokens(tokens); for (euf::snode* t : tokens) if (t == var) return true; } return false; } // ----------------------------------------------- // nielsen_subst // ----------------------------------------------- bool nielsen_subst::is_eliminating() const { if (!m_var || !m_replacement) return true; // check if var appears in replacement euf::snode_vector tokens; m_replacement->collect_tokens(tokens); for (euf::snode* t : tokens) if (t == m_var) return false; return true; } // ----------------------------------------------- // char_set // ----------------------------------------------- unsigned char_set::char_count() const { unsigned count = 0; for (auto const& r : m_ranges) count += r.length(); return count; } bool char_set::contains(unsigned c) const { // binary search over sorted non-overlapping ranges int lo = 0, hi = static_cast(m_ranges.size()) - 1; while (lo <= hi) { int mid = lo + (hi - lo) / 2; if (c < m_ranges[mid].m_lo) hi = mid - 1; else if (c >= m_ranges[mid].m_hi) lo = mid + 1; else return true; } return false; } void char_set::add(unsigned c) { if (m_ranges.empty()) { m_ranges.push_back(char_range(c)); return; } // binary search for insertion point int lo = 0, hi = static_cast(m_ranges.size()) - 1; while (lo <= hi) { int mid = lo + (hi - lo) / 2; if (c < m_ranges[mid].m_lo) hi = mid - 1; else if (c >= m_ranges[mid].m_hi) lo = mid + 1; else return; // already contained } // lo is the insertion point unsigned idx = static_cast(lo); bool merge_left = idx > 0 && m_ranges[idx - 1].m_hi == c; bool merge_right = idx < m_ranges.size() && m_ranges[idx].m_lo == c + 1; if (merge_left && merge_right) { m_ranges[idx - 1].m_hi = m_ranges[idx].m_hi; m_ranges.erase(m_ranges.begin() + idx); } else if (merge_left) { m_ranges[idx - 1].m_hi = c + 1; } else if (merge_right) { m_ranges[idx].m_lo = c; } else { // positional insert: shift elements right and place new element m_ranges.push_back(char_range()); for (unsigned k = m_ranges.size() - 1; k > idx; --k) m_ranges[k] = m_ranges[k - 1]; m_ranges[idx] = char_range(c); } } void char_set::add(char_set const& other) { for (auto const& r : other.m_ranges) { for (unsigned c = r.m_lo; c < r.m_hi; ++c) add(c); } } char_set char_set::intersect_with(char_set const& other) const { char_set result; unsigned i = 0, j = 0; while (i < m_ranges.size() && j < other.m_ranges.size()) { unsigned lo = std::max(m_ranges[i].m_lo, other.m_ranges[j].m_lo); unsigned hi = std::min(m_ranges[i].m_hi, other.m_ranges[j].m_hi); if (lo < hi) result.m_ranges.push_back(char_range(lo, hi)); if (m_ranges[i].m_hi < other.m_ranges[j].m_hi) ++i; else ++j; } return result; } char_set char_set::complement(unsigned max_char) const { char_set result; if (m_ranges.empty()) { result.m_ranges.push_back(char_range(0, max_char + 1)); return result; } unsigned from = 0; for (auto const& r : m_ranges) { if (from < r.m_lo) result.m_ranges.push_back(char_range(from, r.m_lo)); from = r.m_hi; } if (from <= max_char) result.m_ranges.push_back(char_range(from, max_char + 1)); return result; } bool char_set::is_disjoint(char_set const& other) const { unsigned i = 0, j = 0; while (i < m_ranges.size() && j < other.m_ranges.size()) { if (m_ranges[i].m_hi <= other.m_ranges[j].m_lo) ++i; else if (other.m_ranges[j].m_hi <= m_ranges[i].m_lo) ++j; else return false; } return true; } std::ostream& char_set::display(std::ostream& out) const { if (m_ranges.empty()) { out << "{}"; return out; } out << "{ "; bool first = true; for (auto const& r : m_ranges) { if (!first) out << ", "; first = false; if (r.is_unit()) { unsigned c = r.m_lo; if (c >= 'a' && c <= 'z') out << (char)c; else if (c >= 'A' && c <= 'Z') out << (char)c; else if (c >= '0' && c <= '9') out << (char)c; else out << "#[" << c << "]"; } else { out << "[" << r.m_lo << "-" << (r.m_hi - 1) << "]"; } } out << " }"; return out; } // ----------------------------------------------- // nielsen_edge // ----------------------------------------------- nielsen_edge::nielsen_edge(nielsen_node* src, nielsen_node* tgt, bool is_progress): m_src(src), m_tgt(tgt), m_is_progress(is_progress) { } // ----------------------------------------------- // nielsen_node // ----------------------------------------------- nielsen_node::nielsen_node(nielsen_graph* graph, unsigned id): m_id(id), m_graph(graph), m_is_progress(true) { } void nielsen_node::clone_from(nielsen_node const& parent) { m_str_eq.reset(); m_str_mem.reset(); m_int_constraints.reset(); m_char_diseqs.reset(); m_char_ranges.reset(); for (auto const& eq : parent.m_str_eq) m_str_eq.push_back(str_eq(eq.m_lhs, eq.m_rhs, eq.m_dep)); for (auto const& mem : parent.m_str_mem) m_str_mem.push_back(str_mem(mem.m_str, mem.m_regex, mem.m_history, mem.m_id, mem.m_dep)); for (auto const& ic : parent.m_int_constraints) m_int_constraints.push_back(ic); // clone character disequalities for (auto const& kv : parent.m_char_diseqs) { ptr_vector diseqs; for (euf::snode* s : kv.m_value) diseqs.push_back(s); m_char_diseqs.insert(kv.m_key, diseqs); } // clone character ranges for (auto const& kv : parent.m_char_ranges) { m_char_ranges.insert(kv.m_key, kv.m_value.clone()); } } void nielsen_node::apply_subst(euf::sgraph& sg, nielsen_subst const& s) { if (!s.m_var) return; for (unsigned i = 0; i < m_str_eq.size(); ++i) { str_eq& eq = m_str_eq[i]; eq.m_lhs = sg.subst(eq.m_lhs, s.m_var, s.m_replacement); eq.m_rhs = sg.subst(eq.m_rhs, s.m_var, s.m_replacement); eq.m_dep.merge(s.m_dep); eq.sort(); } for (unsigned i = 0; i < m_str_mem.size(); ++i) { str_mem& mem = m_str_mem[i]; mem.m_str = sg.subst(mem.m_str, s.m_var, s.m_replacement); // regex is typically ground, but apply subst for generality mem.m_regex = sg.subst(mem.m_regex, s.m_var, s.m_replacement); mem.m_dep.merge(s.m_dep); } } void nielsen_node::add_char_range(euf::snode* sym_char, char_set const& range) { SASSERT(sym_char && sym_char->is_unit()); unsigned id = sym_char->id(); if (m_char_ranges.contains(id)) { char_set& existing = m_char_ranges.find(id); char_set inter = existing.intersect_with(range); existing = inter; if (inter.is_empty()) { m_is_general_conflict = true; m_reason = backtrack_reason::character_range; } } else { m_char_ranges.insert(id, range.clone()); } } void nielsen_node::add_char_diseq(euf::snode* sym_char, euf::snode* other) { SASSERT(sym_char && sym_char->is_unit()); SASSERT(other && other->is_unit()); unsigned id = sym_char->id(); if (!m_char_diseqs.contains(id)) m_char_diseqs.insert(id, ptr_vector()); ptr_vector& existing = m_char_diseqs.find(id); // check for duplicates for (euf::snode* s : existing) if (s == other) return; existing.push_back(other); } void nielsen_node::apply_char_subst(euf::sgraph& sg, char_subst const& s) { if (!s.m_var) return; // replace occurrences of s.m_var with s.m_val in all string constraints for (unsigned i = 0; i < m_str_eq.size(); ++i) { str_eq& eq = m_str_eq[i]; eq.m_lhs = sg.subst(eq.m_lhs, s.m_var, s.m_val); eq.m_rhs = sg.subst(eq.m_rhs, s.m_var, s.m_val); eq.sort(); } for (unsigned i = 0; i < m_str_mem.size(); ++i) { str_mem& mem = m_str_mem[i]; mem.m_str = sg.subst(mem.m_str, s.m_var, s.m_val); mem.m_regex = sg.subst(mem.m_regex, s.m_var, s.m_val); } unsigned var_id = s.m_var->id(); if (s.m_val->is_unit()) { // symbolic char → symbolic char: check disequalities if (m_char_diseqs.contains(var_id)) { ptr_vector& diseqs = m_char_diseqs.find(var_id); for (euf::snode* d : diseqs) { if (d == s.m_val) { m_is_general_conflict = true; m_reason = backtrack_reason::character_range; return; } } m_char_diseqs.remove(var_id); m_char_ranges.remove(var_id); } } else { SASSERT(s.m_val->is_char()); // symbolic char → concrete char: check range constraints if (m_char_ranges.contains(var_id)) { char_set& range = m_char_ranges.find(var_id); // extract the concrete char value from the s_char snode unsigned ch_val = 0; seq_util& seq = sg.get_seq_util(); expr* unit_expr = s.m_val->get_expr(); expr* ch_expr = nullptr; if (unit_expr && seq.str.is_unit(unit_expr, ch_expr)) seq.is_const_char(ch_expr, ch_val); if (!range.contains(ch_val)) { m_is_general_conflict = true; m_reason = backtrack_reason::character_range; return; } m_char_diseqs.remove(var_id); m_char_ranges.remove(var_id); } } } // ----------------------------------------------- // nielsen_graph // ----------------------------------------------- nielsen_graph::nielsen_graph(euf::sgraph& sg): m_sg(sg) { } nielsen_graph::~nielsen_graph() { reset(); } nielsen_node* nielsen_graph::mk_node() { unsigned id = m_nodes.size(); nielsen_node* n = alloc(nielsen_node, this, id); m_nodes.push_back(n); return n; } nielsen_node* nielsen_graph::mk_child(nielsen_node* parent) { nielsen_node* child = mk_node(); child->clone_from(*parent); return child; } nielsen_edge* nielsen_graph::mk_edge(nielsen_node* src, nielsen_node* tgt, bool is_progress) { nielsen_edge* e = alloc(nielsen_edge, src, tgt, is_progress); m_edges.push_back(e); src->add_outgoing(e); return e; } void nielsen_graph::add_str_eq(euf::snode* lhs, euf::snode* rhs) { if (!m_root) m_root = mk_node(); dep_tracker dep(m_root->str_eqs().size() + m_root->str_mems().size() + 1, m_root->str_eqs().size()); str_eq eq(lhs, rhs, dep); eq.sort(); m_root->add_str_eq(eq); ++m_num_input_eqs; } void nielsen_graph::add_str_mem(euf::snode* str, euf::snode* regex) { if (!m_root) m_root = mk_node(); dep_tracker dep(m_root->str_eqs().size() + m_root->str_mems().size() + 1, m_root->str_eqs().size() + m_root->str_mems().size()); euf::snode* history = m_sg.mk_empty(); unsigned id = next_mem_id(); m_root->add_str_mem(str_mem(str, regex, history, id, dep)); ++m_num_input_mems; } void nielsen_graph::inc_run_idx() { if (m_run_idx == UINT_MAX) { for (nielsen_node* n : m_nodes) n->reset_counter(); m_run_idx = 1; } else ++m_run_idx; } void nielsen_graph::reset() { for (nielsen_node* n : m_nodes) dealloc(n); for (nielsen_edge* e : m_edges) dealloc(e); m_nodes.reset(); m_edges.reset(); m_root = nullptr; m_sat_node = nullptr; m_sat_path.reset(); m_run_idx = 0; m_depth_bound = 0; m_next_mem_id = 0; m_fresh_cnt = 0; m_num_input_eqs = 0; m_num_input_mems = 0; } std::ostream& nielsen_graph::display(std::ostream& out) const { out << "nielsen_graph with " << m_nodes.size() << " nodes, " << m_edges.size() << " edges\n"; for (nielsen_node const* n : m_nodes) { out << " node[" << n->id() << "]"; if (n == m_root) out << " (root)"; if (n->is_general_conflict()) out << " CONFLICT"; if (n->is_extended()) out << " EXTENDED"; out << "\n"; // display string equalities for (auto const& eq : n->str_eqs()) { out << " str_eq: "; if (eq.m_lhs) out << "lhs[id=" << eq.m_lhs->id() << ",len=" << eq.m_lhs->length() << "]"; else out << "null"; out << " = "; if (eq.m_rhs) out << "rhs[id=" << eq.m_rhs->id() << ",len=" << eq.m_rhs->length() << "]"; else out << "null"; out << "\n"; } // display regex memberships for (auto const& mem : n->str_mems()) { out << " str_mem[" << mem.m_id << "]: "; if (mem.m_str) out << "str[id=" << mem.m_str->id() << ",len=" << mem.m_str->length() << "]"; else out << "null"; out << " in "; if (mem.m_regex) out << "re[id=" << mem.m_regex->id() << "]"; else out << "null"; out << "\n"; } // display outgoing edges for (nielsen_edge const* e : n->outgoing()) { out << " -> node[" << e->tgt()->id() << "]"; if (e->is_progress()) out << " (progress)"; for (auto const& s : e->subst()) { out << " {"; if (s.m_var) out << "var[" << s.m_var->id() << "]"; out << " -> "; if (s.m_replacement) out << "repl[" << s.m_replacement->id() << ",len=" << s.m_replacement->length() << "]"; else out << "eps"; out << "}"; } out << "\n"; } if (n->backedge()) out << " backedge -> node[" << n->backedge()->id() << "]\n"; } return out; } // ----------------------------------------------------------------------- // nielsen_node: display_html // Render constraint set as an HTML fragment for DOT labels. // Mirrors ZIPT's NielsenNode.ToHtmlString(). // ----------------------------------------------------------------------- // Helper: HTML-escape a string and replace literal \n with
. static std::string dot_html_escape(std::string const& s) { std::string r; r.reserve(s.size()); for (char c : s) { switch (c) { case '&': r += "&"; break; case '<': r += "<"; break; case '>': r += ">"; break; default: r += c; break; } } // replace literal "\n" two-char sequence with
std::string result; result.reserve(r.size()); for (std::size_t i = 0; i < r.size(); ) { if (i + 1 < r.size() && r[i] == '\\' && r[i+1] == 'n') { result += "
"; i += 2; } else { result += r[i++]; } } return result; } // Helper: render an snode as a human-readable label. // Groups consecutive s_char tokens into quoted strings, renders s_var by name, // and falls back to mk_pp for other token kinds. static std::string snode_label(euf::snode const* n, ast_manager& m) { if (!n) return "null"; seq_util seq(m); // collect all leaf tokens left-to-right euf::snode_vector tokens; n->collect_tokens(tokens); if (tokens.empty()) return "\"\""; // empty string std::string result; std::string char_acc; // accumulator for consecutive printable chars bool first = true; // flush accumulated chars as a quoted string auto flush_chars = [&]() { if (char_acc.empty()) return; if (!first) result += " + "; result += "\"" + char_acc + "\""; first = false; char_acc.clear(); }; for (euf::snode const* tok : tokens) { expr* e = tok->get_expr(); // s_char: seq.unit(const_char) – extract char code and accumulate if (tok->is_char() && e) { expr* ch_expr = to_app(e)->get_arg(0); unsigned code = 0; if (seq.is_const_char(ch_expr, code)) { if (code >= 32 && code < 127 && code != '"' && code != '\\') { char_acc += static_cast(code); } else { flush_chars(); char buf[16]; snprintf(buf, sizeof(buf), "'\\x%x'", code); if (!first) result += " + "; result += buf; first = false; } continue; } } flush_chars(); if (!first) result += " + "; first = false; if (!e) { result += "#" + std::to_string(tok->id()); } else if (tok->is_var()) { result += to_app(e)->get_decl()->get_name().str(); } else { std::ostringstream os; os << mk_pp(e, m); result += os.str(); } } flush_chars(); return result; } std::ostream& nielsen_node::display_html(std::ostream& out, ast_manager& m) const { bool any = false; // string equalities for (auto const& eq : m_str_eq) { if (!any) { out << "Cnstr:
"; any = true; } out << dot_html_escape(snode_label(eq.m_lhs, m)) << " = " << dot_html_escape(snode_label(eq.m_rhs, m)) << "
"; } // regex memberships for (auto const& mem : m_str_mem) { if (!any) { out << "Cnstr:
"; any = true; } out << dot_html_escape(snode_label(mem.m_str, m)) << " ∈ " << dot_html_escape(snode_label(mem.m_regex, m)) << "
"; } // character ranges for (auto const& kv : m_char_ranges) { if (!any) { out << "Cnstr:
"; any = true; } out << "?" << kv.m_key << " ∈ "; kv.m_value.display(out); out << "
"; } // character disequalities for (auto const& kv : m_char_diseqs) { if (!any) { out << "Cnstr:
"; any = true; } for (euf::snode* d : kv.m_value) { out << "?" << kv.m_key << " ≠ ?" << d->id() << "
"; } } if (!any) out << "⊤"; // ⊤ (trivially satisfied) return out; } // ----------------------------------------------------------------------- // nielsen_graph: to_dot // Output the graph in graphviz DOT format, optionally colour-highlighting // the satisfying path. Mirrors ZIPT's NielsenGraph.ToDot(). // ----------------------------------------------------------------------- // Convert a backtrack_reason to a short display string. static char const* reason_to_str(backtrack_reason r) { switch (r) { case backtrack_reason::unevaluated: return "Unevaluated"; case backtrack_reason::extended: return "Extended"; case backtrack_reason::symbol_clash: return "Symbol Clash"; case backtrack_reason::parikh_image: return "Parikh Image"; case backtrack_reason::subsumption: return "Subsumption"; case backtrack_reason::arithmetic: return "Arithmetic"; case backtrack_reason::regex: return "Regex"; case backtrack_reason::regex_widening: return "RegexWidening"; case backtrack_reason::character_range: return "Character Range"; case backtrack_reason::smt: return "SMT"; case backtrack_reason::children_failed: return "Children Failed"; default: return "?"; } } // Returns true when the reason is a direct (non-propagated) conflict. // Mirrors ZIPT's NielsenNode.IsActualConflict(): all conflicts except ChildrenFailed. static bool is_actual_conflict(backtrack_reason r) { return r == backtrack_reason::symbol_clash || r == backtrack_reason::parikh_image || r == backtrack_reason::subsumption || r == backtrack_reason::arithmetic || r == backtrack_reason::regex || r == backtrack_reason::regex_widening || r == backtrack_reason::character_range || r == backtrack_reason::smt; } // gives a graphviz graph representation of the Nielsen graph (for debugging) std::ostream& nielsen_graph::to_dot(std::ostream& out) const { ast_manager& m = m_sg.get_manager(); // collect sat-path nodes and edges for green highlighting ptr_addr_hashtable sat_nodes; ptr_addr_hashtable sat_edges; for (nielsen_edge* e : m_sat_path) { if (e->src()) sat_nodes.insert(e->src()); if (e->tgt()) sat_nodes.insert(e->tgt()); sat_edges.insert(e); } out << "digraph G {\n"; // --- nodes --- for (nielsen_node const* n : m_nodes) { out << "\t" << n->id() << " [label=<" << n->id() << ": "; n->display_html(out, m); // append conflict reason if this is a direct conflict if (is_actual_conflict(n->reason())) out << "
" << reason_to_str(n->reason()); out << ">"; // colour if (sat_nodes.contains(const_cast(n))) out << ", color=green"; else if (n->is_general_conflict()) out << ", color=darkred"; else if (n->eval_idx() != m_run_idx) // inactive (not visited this run) out << ", color=blue"; else if (n->is_currently_conflict()) out << ", color=red"; out << "];\n"; } // --- edges --- for (nielsen_node const* n : m_nodes) { for (nielsen_edge const* e : n->outgoing()) { out << "\t" << n->id() << " -> " << e->tgt()->id() << " [label=<"; // edge label: substitutions joined by
bool first = true; for (auto const& s : e->subst()) { if (!first) out << "
"; first = false; out << dot_html_escape(snode_label(s.m_var, m)) << " → " // mapping arrow << dot_html_escape(snode_label(s.m_replacement, m)); } for (auto const& cs : e->char_substs()) { if (!first) out << "
"; first = false; out << "?" << cs.m_var->id() << " → ?" << cs.m_val->id(); } out << ">"; // colour nielsen_edge* ep = const_cast(e); if (sat_edges.contains(ep)) out << ", color=green"; else if (e->tgt()->eval_idx() != m_run_idx) out << ", color=blue"; else if (e->tgt()->is_currently_conflict()) out << ", color=red"; out << "];\n"; } // backedge as dotted arrow if (n->backedge()) out << "\t" << n->id() << " -> " << n->backedge()->id() << " [style=dotted];\n"; } out << "}\n"; return out; } // ----------------------------------------------------------------------- // nielsen_node: simplify_and_init // ----------------------------------------------------------------------- bool nielsen_node::handle_empty_side(euf::sgraph& sg, euf::snode* non_empty_side, dep_tracker const& dep, bool& changed) { euf::snode_vector tokens; non_empty_side->collect_tokens(tokens); bool all_vars_or_opaque = true; bool has_char = false; for (euf::snode* t : tokens) { if (t->is_char()) has_char = true; else if (!t->is_var() && t->kind() != euf::snode_kind::s_other) { all_vars_or_opaque = false; break; } } if (has_char || !all_vars_or_opaque) { m_is_general_conflict = true; m_reason = backtrack_reason::symbol_clash; return true; } for (euf::snode* t : tokens) { if (t->is_var()) { nielsen_subst s(t, sg.mk_empty(), dep); apply_subst(sg, s); changed = true; } } return false; } simplify_result nielsen_node::simplify_and_init(nielsen_graph& g) { euf::sgraph& sg = g.sg(); bool changed = true; while (changed) { changed = false; // pass 1: remove trivially satisfied equalities unsigned wi = 0; for (unsigned i = 0; i < m_str_eq.size(); ++i) { str_eq& eq = m_str_eq[i]; if (eq.is_trivial()) continue; m_str_eq[wi++] = eq; } if (wi < m_str_eq.size()) { m_str_eq.shrink(wi); changed = true; } // pass 2: detect symbol clashes, empty-propagation, and prefix cancellation for (str_eq& eq : m_str_eq) { if (!eq.m_lhs || !eq.m_rhs) continue; // one side empty, the other not empty => conflict or substitution if (eq.m_lhs->is_empty() && !eq.m_rhs->is_empty()) { if (handle_empty_side(sg, eq.m_rhs, eq.m_dep, changed)) return simplify_result::conflict; continue; } if (eq.m_rhs->is_empty() && !eq.m_lhs->is_empty()) { if (handle_empty_side(sg, eq.m_lhs, eq.m_dep, changed)) return simplify_result::conflict; continue; } // prefix/suffix cancellation: strip common leading and trailing tokens. // Same char or same variable on both sides can always be cancelled. // Two different concrete characters is a symbol clash. { euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); // --- prefix --- unsigned prefix = 0; while (prefix < lhs_toks.size() && prefix < rhs_toks.size()) { euf::snode* lt = lhs_toks[prefix]; euf::snode* rt = rhs_toks[prefix]; if (lt->id() == rt->id() && (lt->is_char() || lt->is_var())) { ++prefix; } else if (lt->is_char() && rt->is_char()) { m_is_general_conflict = true; m_reason = backtrack_reason::symbol_clash; return simplify_result::conflict; } else { break; } } // --- suffix (only among the tokens not already consumed by prefix) --- unsigned lsz = lhs_toks.size(), rsz = rhs_toks.size(); unsigned suffix = 0; while (suffix < lsz - prefix && suffix < rsz - prefix) { euf::snode* lt = lhs_toks[lsz - 1 - suffix]; euf::snode* rt = rhs_toks[rsz - 1 - suffix]; if (lt->id() == rt->id() && (lt->is_char() || lt->is_var())) { ++suffix; } else if (lt->is_char() && rt->is_char()) { m_is_general_conflict = true; m_reason = backtrack_reason::symbol_clash; return simplify_result::conflict; } else { break; } } if (prefix > 0 || suffix > 0) { eq.m_lhs = sg.drop_left(sg.drop_right(eq.m_lhs, suffix), prefix); eq.m_rhs = sg.drop_left(sg.drop_right(eq.m_rhs, suffix), prefix); changed = true; } } } } // consume leading concrete characters from str_mem via Brzozowski derivatives for (str_mem& mem : m_str_mem) { if (!mem.m_str || !mem.m_regex) continue; while (mem.m_str && !mem.m_str->is_empty()) { euf::snode* first = mem.m_str->first(); if (!first || !first->is_char()) break; euf::snode* deriv = sg.brzozowski_deriv(mem.m_regex, first); if (!deriv) break; if (deriv->is_fail()) { m_is_general_conflict = true; m_reason = backtrack_reason::regex; return simplify_result::conflict; } mem.m_str = sg.drop_first(mem.m_str); mem.m_regex = deriv; mem.m_history = sg.mk_concat(mem.m_history, first); } } // check for regex memberships that are immediately infeasible for (str_mem& mem : m_str_mem) { if (!mem.m_str || !mem.m_regex) continue; if (mem.m_str->is_empty() && !mem.m_regex->is_nullable()) { m_is_general_conflict = true; m_reason = backtrack_reason::regex; return simplify_result::conflict; } } // remove satisfied str_mem constraints (ε ∈ nullable regex) unsigned wi = 0; for (unsigned i = 0; i < m_str_mem.size(); ++i) { str_mem& mem = m_str_mem[i]; if (mem.m_str && mem.m_str->is_empty() && mem.m_regex && mem.m_regex->is_nullable()) continue; // satisfied, drop m_str_mem[wi++] = mem; } if (wi < m_str_mem.size()) m_str_mem.shrink(wi); if (is_satisfied()) return simplify_result::satisfied; return simplify_result::proceed; } bool nielsen_node::is_satisfied() const { for (str_eq const& eq : m_str_eq) { if (!eq.is_trivial()) return false; } return m_str_mem.empty(); } bool nielsen_node::has_opaque_terms() const { auto is_opaque = [](euf::snode* n) { return n && n->kind() == euf::snode_kind::s_other; }; for (str_eq const& eq : m_str_eq) { if (eq.is_trivial()) continue; if (is_opaque(eq.m_lhs) || is_opaque(eq.m_rhs)) return true; euf::snode_vector toks; if (eq.m_lhs) { eq.m_lhs->collect_tokens(toks); for (auto* t : toks) if (is_opaque(t)) return true; toks.reset(); } if (eq.m_rhs) { eq.m_rhs->collect_tokens(toks); for (auto* t : toks) if (is_opaque(t)) return true; } } for (str_mem const& mem : m_str_mem) { if (!mem.m_str) continue; if (is_opaque(mem.m_str)) return true; euf::snode_vector toks; mem.m_str->collect_tokens(toks); for (auto* t : toks) if (is_opaque(t)) return true; } return false; } // ----------------------------------------------------------------------- // nielsen_graph: search // ----------------------------------------------------------------------- nielsen_graph::search_result nielsen_graph::solve() { if (!m_root) return search_result::sat; ++m_stats.m_num_solve_calls; m_sat_node = nullptr; m_sat_path.reset(); // Iterative deepening: start at depth 3, increment by 1 on each failure. // m_max_search_depth == 0 means unlimited; otherwise stop when bound exceeds it. m_depth_bound = 3; while (true) { if (m_cancel_fn && m_cancel_fn()) break; if (m_max_search_depth > 0 && m_depth_bound > m_max_search_depth) break; inc_run_idx(); svector cur_path; search_result r = search_dfs(m_root, 0, cur_path); if (r == search_result::sat) { ++m_stats.m_num_sat; return r; } if (r == search_result::unsat) { ++m_stats.m_num_unsat; return r; } // depth limit hit – increment bound by 1 and retry if (m_max_search_depth > 0 && m_depth_bound >= m_max_search_depth) break; ++m_depth_bound; } ++m_stats.m_num_unknown; return search_result::unknown; } nielsen_graph::search_result nielsen_graph::search_dfs(nielsen_node* node, unsigned depth, svector& cur_path) { ++m_stats.m_num_dfs_nodes; m_stats.m_max_depth = std::max(m_stats.m_max_depth, depth); // check for external cancellation (timeout, user interrupt) if (m_cancel_fn && m_cancel_fn()) return search_result::unknown; // revisit detection: if already visited this run, return cached status. // mirrors ZIPT's NielsenNode.GraphExpansion() evalIdx check. if (node->eval_idx() == m_run_idx) { if (node->is_satisfied()) { m_sat_node = node; m_sat_path = cur_path; return search_result::sat; } if (node->is_currently_conflict()) return search_result::unsat; return search_result::unknown; } node->set_eval_idx(m_run_idx); // simplify constraints (idempotent after first call) simplify_result sr = node->simplify_and_init(*this); if (sr == simplify_result::conflict) { ++m_stats.m_num_simplify_conflict; node->set_general_conflict(true); return search_result::unsat; } if (sr == simplify_result::satisfied || node->is_satisfied()) { m_sat_node = node; m_sat_path = cur_path; return search_result::sat; } // integer feasibility check: collect side constraints along the path // and verify they are jointly satisfiable using the LP solver if (!cur_path.empty() && !check_int_feasibility(node, cur_path)) { node->set_reason(backtrack_reason::arithmetic); return search_result::unsat; } // depth bound check if (depth >= m_depth_bound) return search_result::unknown; // generate extensions only once per node; children persist across runs if (!node->is_extended()) { bool ext = generate_extensions(node); if (!ext) { node->set_extended(true); // No extensions could be generated. If the node still has // unsatisfied constraints with opaque (s_other) terms that // we cannot decompose, report unknown rather than unsat // to avoid unsoundness. if (node->has_opaque_terms()) return search_result::unknown; node->set_reason(backtrack_reason::children_failed); return search_result::unsat; } node->set_extended(true); ++m_stats.m_num_extensions; } // explore children bool any_unknown = false; for (nielsen_edge* e : node->outgoing()) { cur_path.push_back(e); search_result r = search_dfs(e->tgt(), depth + 1, cur_path); if (r == search_result::sat) return search_result::sat; cur_path.pop_back(); if (r == search_result::unknown) any_unknown = true; } // If no children exist and the node has opaque terms, report unknown if (node->outgoing().empty() && node->has_opaque_terms()) return search_result::unknown; if (!any_unknown) { node->set_reason(backtrack_reason::children_failed); return search_result::unsat; } return search_result::unknown; } // Returns true if variable snode `var` appears anywhere in the token list of `n`. static bool snode_contains_var(euf::snode const* n, euf::snode const* var) { if (!n || !var) return false; euf::snode_vector tokens; n->collect_tokens(tokens); for (const euf::snode* t : tokens) { if (t == var) return true; } return false; } bool nielsen_graph::apply_det_modifier(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { SASSERT(!eq.is_trivial()); // We should have simplified it away before if (!eq.m_lhs || !eq.m_rhs) continue; // variable definition: x = t where x is a single var and x ∉ vars(t) // → deterministically substitute x → t throughout the node euf::snode* var = nullptr; euf::snode* def; if (eq.m_lhs->is_var() && !snode_contains_var(eq.m_rhs, eq.m_lhs)) { var = eq.m_lhs; def = eq.m_rhs; } else if (eq.m_rhs->is_var() && !snode_contains_var(eq.m_lhs, eq.m_rhs)) { var = eq.m_rhs; def = eq.m_lhs; } if (var) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(var, def, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); return true; } } return false; } bool nielsen_graph::apply_const_nielsen(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); if (lhs_toks.empty() || rhs_toks.empty()) continue; euf::snode* lhead = lhs_toks[0]; euf::snode* rhead = rhs_toks[0]; // char·A = y·B → branch 1: y→ε, branch 2: y→char·fresh if (lhead->is_char() && rhead->is_var()) { // branch 1: y → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(rhead, m_sg.mk_empty(), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // branch 2: y → char·fresh (progress) { euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(lhead, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(rhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } return true; } // x·A = char·B → branch 1: x→ε, branch 2: x→char·fresh if (rhead->is_char() && lhead->is_var()) { // branch 1: x → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(lhead, m_sg.mk_empty(), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // branch 2: x → char·fresh (progress) { euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(rhead, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(lhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } return true; } } return false; } bool nielsen_graph::apply_var_nielsen(nielsen_node* node) { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); if (lhs_toks.empty() || rhs_toks.empty()) continue; euf::snode* lhead = lhs_toks[0]; euf::snode* rhead = rhs_toks[0]; if (!lhead->is_var() || !rhead->is_var()) continue; if (lhead->id() == rhead->id()) continue; // x·A = y·B where x,y are distinct variables (classic Nielsen) // child 1: x → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(lhead, m_sg.mk_empty(), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // child 2: x → y·x' (progress) { euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(rhead, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(lhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } // child 3: y → x·y' (progress) { euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(lhead, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(rhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); } return true; } return false; } // ----------------------------------------------------------------------- // EqSplit helpers: token length classification // ----------------------------------------------------------------------- bool nielsen_graph::token_has_variable_length(euf::snode* tok) const { // Chars and units have known constant length 1. if (tok->is_char() || tok->is_unit()) return false; // Variables and powers have symbolic/unknown length. if (tok->is_var() || tok->is_power()) return true; // For s_other: check if it's a string literal (known constant length). if (tok->get_expr()) { seq_util& seq = m_sg.get_seq_util(); zstring s; if (seq.str.is_string(tok->get_expr(), s)) return false; } // Everything else is treated as variable length. return true; } unsigned nielsen_graph::token_const_length(euf::snode* tok) const { if (tok->is_char() || tok->is_unit()) return 1; if (tok->get_expr()) { seq_util& seq = m_sg.get_seq_util(); zstring s; if (seq.str.is_string(tok->get_expr(), s)) return s.length(); } return 0; } // ----------------------------------------------------------------------- // EqSplit: find_eq_split_point // Port of ZIPT's StrEq.SplitEq algorithm. // // Walks tokens from each side tracking two accumulators: // - lhs_has_symbolic / rhs_has_symbolic : whether a variable-length token // has been consumed on that side since the last reset // - const_diff : net constant-length difference (LHS constants − RHS constants) // // A potential split point arises when both accumulators are "zero" // (no outstanding symbolic contributions on either side), meaning // the prefix lengths are determined up to a constant offset (const_diff). // // Among all such split points, we pick the one minimising |const_diff| // (the padding amount). We also require having seen at least one variable- // length token before accepting a split, so that the split is non-trivial. // ----------------------------------------------------------------------- bool nielsen_graph::find_eq_split_point( euf::snode_vector const& lhs_toks, euf::snode_vector const& rhs_toks, unsigned& out_lhs_idx, unsigned& out_rhs_idx, int& out_padding) const { unsigned lhs_len = lhs_toks.size(); unsigned rhs_len = rhs_toks.size(); if (lhs_len <= 1 || rhs_len <= 1) return false; // Initialize accumulators with the first token on each side (index 0). bool lhs_has_symbolic = token_has_variable_length(lhs_toks[0]); bool rhs_has_symbolic = token_has_variable_length(rhs_toks[0]); int const_diff = 0; if (!lhs_has_symbolic) const_diff += (int)token_const_length(lhs_toks[0]); if (!rhs_has_symbolic) const_diff -= (int)token_const_length(rhs_toks[0]); bool seen_variable = lhs_has_symbolic || rhs_has_symbolic; // Best confirmed split point. bool has_best = false; unsigned best_lhs = 0, best_rhs = 0; int best_padding = 0; // Pending split (needs confirmation by a subsequent variable token). bool has_pending = false; unsigned pending_lhs = 0, pending_rhs = 0; int pending_padding = 0; unsigned li = 1, ri = 1; while (li < lhs_len || ri < rhs_len) { bool lhs_zero = !lhs_has_symbolic; bool rhs_zero = !rhs_has_symbolic; // Potential split point: both symbolic accumulators are zero. if (seen_variable && lhs_zero && rhs_zero) { if (!has_pending || std::abs(const_diff) < std::abs(pending_padding)) { has_pending = true; pending_padding = const_diff; pending_lhs = li; pending_rhs = ri; } } // Decide which side to consume from. // Prefer consuming from the side whose symbolic accumulator is zero // (i.e., that side has no outstanding variable contributions). bool consume_lhs; if (lhs_zero && !rhs_zero) consume_lhs = true; else if (!lhs_zero && rhs_zero) consume_lhs = false; else if (lhs_zero && rhs_zero) consume_lhs = (const_diff <= 0); // consume from side with less accumulated constant else consume_lhs = (li <= ri); // both non-zero: round-robin if (consume_lhs) { if (li >= lhs_len) break; euf::snode* tok = lhs_toks[li++]; bool is_var = token_has_variable_length(tok); if (is_var) { // Confirm any pending split point. if (has_pending && (!has_best || std::abs(pending_padding) < std::abs(best_padding))) { has_best = true; best_padding = pending_padding; best_lhs = pending_lhs; best_rhs = pending_rhs; has_pending = false; } seen_variable = true; lhs_has_symbolic = true; } else { const_diff += (int)token_const_length(tok); } } else { if (ri >= rhs_len) break; euf::snode* tok = rhs_toks[ri++]; bool is_var = token_has_variable_length(tok); if (is_var) { if (has_pending && (!has_best || std::abs(pending_padding) < std::abs(best_padding))) { has_best = true; best_padding = pending_padding; best_lhs = pending_lhs; best_rhs = pending_rhs; has_pending = false; } seen_variable = true; rhs_has_symbolic = true; } else { const_diff -= (int)token_const_length(tok); } } } if (!has_best) return false; // A split at the very start or very end is degenerate — skip. if ((best_lhs == 0 && best_rhs == 0) || (best_lhs == lhs_len && best_rhs == rhs_len)) return false; out_lhs_idx = best_lhs; out_rhs_idx = best_rhs; out_padding = best_padding; return true; } // ----------------------------------------------------------------------- // apply_eq_split — Port of ZIPT's EqSplitModifier.Apply // // For a regex-free equation LHS = RHS, finds a split point and decomposes // into two shorter equations with optional padding variable: // // eq1: LHS[0..lhsIdx] · [pad if padding<0] = [pad if padding>0] · RHS[0..rhsIdx] // eq2: LHS[lhsIdx..] · [pad if padding>0] = [pad if padding<0] · RHS[rhsIdx..] // // Creates a single progress child. // ----------------------------------------------------------------------- bool nielsen_graph::apply_eq_split(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); seq_util& seq = m_sg.get_seq_util(); arith_util arith(m); for (unsigned eq_idx = 0; eq_idx < node->str_eqs().size(); ++eq_idx) { str_eq const& eq = node->str_eqs()[eq_idx]; if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; // EqSplit only applies to regex-free equations. if (!eq.m_lhs->is_regex_free() || !eq.m_rhs->is_regex_free()) continue; euf::snode_vector lhs_toks, rhs_toks; eq.m_lhs->collect_tokens(lhs_toks); eq.m_rhs->collect_tokens(rhs_toks); if (lhs_toks.empty() || rhs_toks.empty()) continue; unsigned split_lhs = 0, split_rhs = 0; int padding = 0; if (!find_eq_split_point(lhs_toks, rhs_toks, split_lhs, split_rhs, padding)) continue; // Split the equation sides into prefix / suffix. euf::snode* lhs_prefix = m_sg.drop_right(eq.m_lhs, lhs_toks.size() - split_lhs); euf::snode* lhs_suffix = m_sg.drop_left(eq.m_lhs, split_lhs); euf::snode* rhs_prefix = m_sg.drop_right(eq.m_rhs, rhs_toks.size() - split_rhs); euf::snode* rhs_suffix = m_sg.drop_left(eq.m_rhs, split_rhs); // Build the two new equations, incorporating a padding variable if needed. euf::snode* eq1_lhs = lhs_prefix; euf::snode* eq1_rhs = rhs_prefix; euf::snode* eq2_lhs = lhs_suffix; euf::snode* eq2_rhs = rhs_suffix; euf::snode* pad = nullptr; if (padding != 0) { pad = mk_fresh_var(); if (padding > 0) { // LHS prefix is longer by |padding| constants. // Prepend pad to RHS prefix, append pad to LHS suffix. eq1_rhs = m_sg.mk_concat(pad, rhs_prefix); eq2_lhs = m_sg.mk_concat(lhs_suffix, pad); } else { // RHS prefix is longer by |padding| constants. // Append pad to LHS prefix, prepend pad to RHS suffix. eq1_lhs = m_sg.mk_concat(lhs_prefix, pad); eq2_rhs = m_sg.mk_concat(pad, rhs_suffix); } } // Create single progress child. nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); // Remove the original equation and add the two new ones. auto& eqs = child->str_eqs(); eqs[eq_idx] = eqs.back(); eqs.pop_back(); eqs.push_back(str_eq(eq1_lhs, eq1_rhs, eq.m_dep)); eqs.push_back(str_eq(eq2_lhs, eq2_rhs, eq.m_dep)); // Int constraints on the edge. // 1) len(pad) = |padding| (if padding variable was created) if (pad && pad->get_expr()) { expr_ref len_pad(seq.str.mk_length(pad->get_expr()), m); expr_ref abs_pad(arith.mk_int(std::abs(padding)), m); e->add_side_int(mk_int_constraint(len_pad, abs_pad, int_constraint_kind::eq, eq.m_dep)); } // 2) len(eq1_lhs) = len(eq1_rhs) expr_ref l1 = compute_length_expr(eq1_lhs); expr_ref r1 = compute_length_expr(eq1_rhs); e->add_side_int(mk_int_constraint(l1, r1, int_constraint_kind::eq, eq.m_dep)); // 3) len(eq2_lhs) = len(eq2_rhs) expr_ref l2 = compute_length_expr(eq2_lhs); expr_ref r2 = compute_length_expr(eq2_rhs); e->add_side_int(mk_int_constraint(l2, r2, int_constraint_kind::eq, eq.m_dep)); return true; } return false; } // ----------------------------------------------------------------------- // nielsen_graph: mk_fresh_var // ----------------------------------------------------------------------- euf::snode* nielsen_graph::mk_fresh_var() { ++m_stats.m_num_fresh_vars; std::string name = "v!" + std::to_string(m_fresh_cnt++); return m_sg.mk_var(symbol(name.c_str())); } euf::snode* nielsen_graph::mk_fresh_char_var() { ++m_stats.m_num_fresh_vars; std::string name = "?c!" + std::to_string(m_fresh_cnt++); seq_util& seq = m_sg.get_seq_util(); ast_manager& m = m_sg.get_manager(); sort* char_sort = seq.mk_char_sort(); expr_ref fresh_const(m.mk_fresh_const(name.c_str(), char_sort), m); expr_ref unit(seq.str.mk_unit(fresh_const), m); return m_sg.mk(unit); } // ----------------------------------------------------------------------- // nielsen_graph: apply_regex_char_split // ----------------------------------------------------------------------- void nielsen_graph::collect_first_chars(euf::snode* re, euf::snode_vector& chars) { if (!re) return; // to_re(s): the first character of the string s if (re->is_to_re()) { euf::snode* body = re->arg(0); if (body && !body->is_empty()) { euf::snode* first = body->first(); if (first && first->is_char()) { bool dup = false; for (euf::snode* c : chars) if (c == first) { dup = true; break; } if (!dup) chars.push_back(first); } // Handle string literals (classified as s_other in sgraph): // extract the first character from the zstring and create a char snode. else if (first && first->get_expr()) { seq_util& seq = m_sg.get_seq_util(); zstring s; if (seq.str.is_string(first->get_expr(), s) && s.length() > 0) { euf::snode* ch = m_sg.mk_char(s[0]); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); } } } return; } // full character set (.): use 'a' as representative if (re->is_full_char()) { euf::snode* ch = m_sg.mk_char('a'); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); return; } // re.range(lo, hi): use lo as representative if (re->get_expr()) { seq_util& seq = m_sg.get_seq_util(); expr* lo = nullptr, *hi = nullptr; if (seq.re.is_range(re->get_expr(), lo, hi) && lo) { unsigned ch_val = 'a'; if (seq.is_const_char(lo, ch_val)) { euf::snode* ch = m_sg.mk_char(ch_val); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); } return; } } // fail, full_seq: no concrete first chars to extract if (re->is_fail() || re->is_full_seq()) return; // recurse into children (handles union, concat, star, loop, etc.) for (unsigned i = 0; i < re->num_args(); ++i) collect_first_chars(re->arg(i), chars); } bool nielsen_graph::apply_regex_char_split(nielsen_node* node) { for (str_mem const& mem : node->str_mems()) { if (!mem.m_str || !mem.m_regex) continue; // find the first token of the string side euf::snode* first = mem.m_str->first(); if (!first || !first->is_var()) continue; // collect concrete characters from the regex euf::snode_vector chars; collect_first_chars(mem.m_regex, chars); // If no concrete characters found, fall through to apply_regex_var_split if (chars.empty()) continue; bool created = false; // for each character c with non-fail derivative: // child: x → c · fresh_var for (euf::snode* ch : chars) { euf::snode* deriv = m_sg.brzozowski_deriv(mem.m_regex, ch); if (!deriv || deriv->is_fail()) continue; euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(ch, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); created = true; } // x → ε branch (variable is empty) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(first, m_sg.mk_empty(), mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); created = true; } if (created) return true; } return false; } bool nielsen_graph::generate_extensions(nielsen_node *node) { // The first modifier that produces edges is used and returned immediately. // Priority 1: deterministic modifiers (single child, always progress) if (apply_det_modifier(node)) return ++m_stats.m_mod_det, true; // Priority 2: PowerEpsilon - power → ε via base=ε or n=0 if (apply_power_epsilon(node)) return ++m_stats.m_mod_power_epsilon, true; // Priority 3: NumCmp - length comparison branching for power tokens if (apply_num_cmp(node)) return ++m_stats.m_mod_num_cmp, true; // Priority 4: ConstNumUnwinding - power vs constant: n=0 or peel if (apply_const_num_unwinding(node)) return ++m_stats.m_mod_const_num_unwinding, true; // Priority 5: EqSplit - split equations into two (single progress child) if (apply_eq_split(node)) return ++m_stats.m_mod_eq_split, true; // Priority 6: StarIntr - stabilizer introduction for regex cycles if (apply_star_intr(node)) return ++m_stats.m_mod_star_intr, true; // Priority 7: GPowerIntr - generalized power introduction if (apply_gpower_intr(node)) return ++m_stats.m_mod_gpower_intr, true; // Priority 8: ConstNielsen - char vs var (2 children) if (apply_const_nielsen(node)) return ++m_stats.m_mod_const_nielsen, true; // Priority 9: RegexCharSplit - split str_mem by first-position chars if (apply_regex_char_split(node)) return ++m_stats.m_mod_regex_char_split, true; // Priority 10: RegexVarSplit - split str_mem by minterms if (apply_regex_var_split(node)) return ++m_stats.m_mod_regex_var_split, true; // Priority 11: PowerSplit - power unwinding with bounded prefix if (apply_power_split(node)) return ++m_stats.m_mod_power_split, true; // Priority 12: VarNielsen - var vs var, all progress (classic Nielsen) if (apply_var_nielsen(node)) return ++m_stats.m_mod_var_nielsen, true; // Priority 13: VarNumUnwinding - variable power unwinding if (apply_var_num_unwinding(node)) return ++m_stats.m_mod_var_num_unwinding, true; return false; } // ----------------------------------------------------------------------- // Helper: find a power token in any str_eq // ----------------------------------------------------------------------- euf::snode* nielsen_graph::find_power_token(nielsen_node* node) const { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode_vector toks; eq.m_lhs->collect_tokens(toks); for (euf::snode* t : toks) if (t->is_power()) return t; toks.reset(); eq.m_rhs->collect_tokens(toks); for (euf::snode* t : toks) if (t->is_power()) return t; } return nullptr; } // ----------------------------------------------------------------------- // Helper: find a power token facing a constant (char) head // Returns true if found, sets power, other_head, eq_out. // ----------------------------------------------------------------------- bool nielsen_graph::find_power_vs_const(nielsen_node* node, euf::snode*& power, euf::snode*& other_head, str_eq const*& eq_out) const { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode* lhead = eq.m_lhs->first(); euf::snode* rhead = eq.m_rhs->first(); if (lhead && lhead->is_power() && rhead && rhead->is_char()) { power = lhead; other_head = rhead; eq_out = &eq; return true; } if (rhead && rhead->is_power() && lhead && lhead->is_char()) { power = rhead; other_head = lhead; eq_out = &eq; return true; } } return false; } // ----------------------------------------------------------------------- // Helper: find a power token facing a variable head // ----------------------------------------------------------------------- bool nielsen_graph::find_power_vs_var(nielsen_node* node, euf::snode*& power, euf::snode*& var_head, str_eq const*& eq_out) const { for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode* lhead = eq.m_lhs->first(); euf::snode* rhead = eq.m_rhs->first(); if (lhead && lhead->is_power() && rhead && rhead->is_var()) { power = lhead; var_head = rhead; eq_out = &eq; return true; } if (rhead && rhead->is_power() && lhead && lhead->is_var()) { power = rhead; var_head = lhead; eq_out = &eq; return true; } } return false; } // ----------------------------------------------------------------------- // Modifier: apply_power_epsilon // For a power token u^n in an equation, branch: // (1) base u → ε (base is empty, so u^n = ε) // (2) u^n → ε (the power is zero, replace power with empty) // mirrors ZIPT's PowerEpsilonModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_power_epsilon(nielsen_node* node) { euf::snode *power = find_power_token(node); if (!power) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode *base = power->arg(0); dep_tracker dep; for (str_eq const &eq : node->str_eqs()) { if (!eq.is_trivial()) { dep = eq.m_dep; break; } } nielsen_node* child; nielsen_edge* e; // Branch 1: base → ε (if base is a variable, substitute it to empty) // This makes u^n = ε^n = ε for any n. if (base->is_var()) { child = mk_child(node); e = mk_edge(node, child, true); nielsen_subst s1(base, m_sg.mk_empty(), dep); e->add_subst(s1); child->apply_subst(m_sg, s1); } // Branch 2: replace the power token itself with ε (n = 0 semantics) child = mk_child(node); e = mk_edge(node, child, true); nielsen_subst s2(power, m_sg.mk_empty(), dep); e->add_subst(s2); child->apply_subst(m_sg, s2); return true; } // ----------------------------------------------------------------------- // Modifier: apply_num_cmp // For equations involving two power tokens u^m and u^n with the same base, // branch on the numeric relationship: m <= n vs n < m. // Generates proper integer side constraints for each branch. // mirrors ZIPT's NumCmpModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_num_cmp(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); arith_util arith(m); // Look for two power tokens with the same base on opposite sides for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode* lhead = eq.m_lhs->first(); euf::snode* rhead = eq.m_rhs->first(); if (!lhead || !rhead) continue; if (!lhead->is_power() || !rhead->is_power()) continue; if (lhead->num_args() < 1 || rhead->num_args() < 1) continue; // same base: compare the two powers if (lhead->arg(0)->id() != rhead->arg(0)->id()) continue; // Get exponent expressions (m and n from u^m and u^n) expr* exp_m = get_power_exponent(lhead); expr* exp_n = get_power_exponent(rhead); // Branch 1: m <= n → cancel u^m from both sides // Side constraint: m <= n // After cancellation: ε·A = u^(n-m)·B { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(lhead, m_sg.mk_empty(), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_m && exp_n) e->add_side_int(mk_int_constraint(exp_m, exp_n, int_constraint_kind::le, eq.m_dep)); } // Branch 2: n < m → cancel u^n from both sides // Side constraint: n < m, i.e., n + 1 <= m, i.e., m >= n + 1 // After cancellation: u^(m-n)·A = ε·B { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(rhead, m_sg.mk_empty(), eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_m && exp_n) { expr_ref n_plus_1(arith.mk_add(exp_n, arith.mk_int(1)), m); e->add_side_int(mk_int_constraint(exp_m, n_plus_1, int_constraint_kind::ge, eq.m_dep)); } } return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_const_num_unwinding // For a power token u^n facing a constant (char) head, // branch: (1) n = 0 → u^n = ε, (2) n >= 1 → peel one u from power. // Generates integer side constraints for each branch. // mirrors ZIPT's ConstNumUnwindingModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_const_num_unwinding(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); arith_util arith(m); euf::snode* power = nullptr; euf::snode* other_head = nullptr; str_eq const* eq = nullptr; if (!find_power_vs_const(node, power, other_head, eq)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode* base = power->arg(0); expr* exp_n = get_power_exponent(power); expr* zero = arith.mk_int(0); expr* one = arith.mk_int(1); // Branch 1: n = 0 → replace power with ε (progress) // Side constraint: n = 0 { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(power, m_sg.mk_empty(), eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_n) e->add_side_int(mk_int_constraint(exp_n, zero, int_constraint_kind::eq, eq->m_dep)); } // Branch 2: n >= 1 → peel one u: replace u^n with u · u^(n-1) // Side constraint: n >= 1 // Use fresh power variable for u^(n-1) { euf::snode* fresh_power = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(base, fresh_power); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, false); nielsen_subst s(power, replacement, eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_n) e->add_side_int(mk_int_constraint(exp_n, one, int_constraint_kind::ge, eq->m_dep)); } return true; } // ----------------------------------------------------------------------- // Modifier: apply_star_intr // Star introduction: for a str_mem x·s ∈ R where a cycle is detected // (backedge exists), introduce a stabilizer constraint x ∈ base*. // Creates a child that adds x ∈ base* membership and constrains // the remainder not to start with base. // mirrors ZIPT's StarIntrModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_star_intr(nielsen_node* node) { // Only fire if a backedge was set (cycle detected) if (!node->backedge()) return false; // Look for a str_mem with a variable-headed string for (unsigned mi = 0; mi < node->str_mems().size(); ++mi) { str_mem const& mem = node->str_mems()[mi]; if (!mem.m_str || !mem.m_regex) continue; euf::snode* first = mem.m_str->first(); if (!first || !first->is_var()) continue; // Introduce a fresh variable z constrained by z ∈ R*, // replacing x → z·fresh_post. This breaks the cycle because // z is a new variable that won't trigger star_intr again. euf::snode* x = first; euf::snode* z = mk_fresh_var(); euf::snode* fresh_post = mk_fresh_var(); euf::snode* str_tail = m_sg.drop_first(mem.m_str); // Build z ∈ R* membership: the star of the current regex seq_util& seq = m_sg.get_seq_util(); expr* re_expr = mem.m_regex->get_expr(); if (!re_expr) continue; expr_ref star_re(seq.re.mk_star(re_expr), seq.get_manager()); euf::snode* star_sn = m_sg.mk(star_re); if (!star_sn) continue; nielsen_node* child = mk_child(node); // Add membership: z ∈ R* (stabilizer constraint) child->add_str_mem(str_mem(z, star_sn, mem.m_history, next_mem_id(), mem.m_dep)); // Add remaining membership: fresh_post · tail ∈ R euf::snode* post_tail = str_tail->is_empty() ? fresh_post : m_sg.mk_concat(fresh_post, str_tail); child->add_str_mem(str_mem(post_tail, mem.m_regex, mem.m_history, next_mem_id(), mem.m_dep)); // Substitute x → z · fresh_post nielsen_edge* e = mk_edge(node, child, false); euf::snode* replacement = m_sg.mk_concat(z, fresh_post); nielsen_subst s(x, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); // Do not re-set backedge — z is fresh, so star_intr won't fire again return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_gpower_intr // Ground power introduction: for a variable x matched against a // ground repeated pattern, introduce x = base^n · prefix with fresh n. // Generates integer side constraint n >= 0 for the fresh exponent. // mirrors ZIPT's GPowerIntrModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_gpower_intr(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); arith_util arith(m); seq_util& seq = m_sg.get_seq_util(); for (str_eq const& eq : node->str_eqs()) { if (eq.is_trivial()) continue; if (!eq.m_lhs || !eq.m_rhs) continue; euf::snode* var_side = nullptr; euf::snode* ground_side = nullptr; // One side must be a single variable, other side must be ground euf::snode* lhead = eq.m_lhs->first(); euf::snode* rhead = eq.m_rhs->first(); if (lhead && lhead->is_var() && eq.m_lhs->length() == 1 && eq.m_rhs->is_ground()) { var_side = lhead; ground_side = eq.m_rhs; } else if (rhead && rhead->is_var() && eq.m_rhs->length() == 1 && eq.m_lhs->is_ground()) { var_side = rhead; ground_side = eq.m_lhs; } else continue; if (!ground_side || ground_side->is_empty()) continue; if (ground_side->length() < 2) continue; // need a repeated pattern // Extract the first token as the "base" for power introduction euf::snode* base_char = ground_side->first(); if (!base_char || !base_char->is_char()) continue; // Check if the ground side has a repeated leading character euf::snode_vector toks; ground_side->collect_tokens(toks); unsigned repeat_len = 0; for (unsigned i = 0; i < toks.size(); ++i) { if (toks[i]->id() == base_char->id()) ++repeat_len; else break; } if (repeat_len < 2) continue; // need at least 2 repetitions // Introduce: x = base^n · fresh_suffix // Create fresh integer variable n for the exponent expr_ref fresh_n = mk_fresh_int_var(); expr* base_expr = base_char->get_expr(); // Create the power snode: base^n euf::snode* fresh_suffix = mk_fresh_var(); euf::snode* fresh_power = nullptr; if (base_expr) { // Build the seq.power(base_str, n) expression and register it expr_ref base_str(seq.str.mk_unit(base_expr), m); expr_ref power_expr(seq.str.mk_power(base_str, fresh_n), m); fresh_power = m_sg.mk(power_expr); } if (!fresh_power) fresh_power = mk_fresh_var(); // fallback euf::snode* replacement = m_sg.mk_concat(fresh_power, fresh_suffix); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(var_side, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); // Side constraint: n >= 0 expr* zero = arith.mk_int(0); e->add_side_int(mk_int_constraint(fresh_n, zero, int_constraint_kind::ge, eq.m_dep)); // Side constraint: len(fresh_suffix) < len(base) = 1 // This ensures the remainder is shorter than the base character if (fresh_suffix->get_expr()) { expr_ref len_suffix(seq.str.mk_length(fresh_suffix->get_expr()), m); expr* one = arith.mk_int(1); e->add_side_int(mk_int_constraint(len_suffix, one, int_constraint_kind::le, eq.m_dep)); e->add_side_int(mk_int_constraint(len_suffix, zero, int_constraint_kind::ge, eq.m_dep)); } return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_regex_var_split // For str_mem x·s ∈ R where x is a variable, split using minterms: // (1) x → ε (empty) // (2) x → c · x' for each minterm character class c // More general than regex_char_split; uses minterm partitioning rather // than just extracting concrete characters. // mirrors ZIPT's RegexVarSplitModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_regex_var_split(nielsen_node* node) { for (str_mem const& mem : node->str_mems()) { if (!mem.m_str || !mem.m_regex) continue; euf::snode* first = mem.m_str->first(); if (!first || !first->is_var()) continue; // Compute minterms from the regex euf::snode_vector minterms; m_sg.compute_minterms(mem.m_regex, minterms); if (minterms.empty()) continue; bool created = false; // Branch 1: x → ε (progress) { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(first, m_sg.mk_empty(), mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); created = true; } // Branch 2+: for each minterm m_i, x → ?c · x' // where ?c is a symbolic char constrained by the minterm for (euf::snode* mt : minterms) { if (mt->is_fail()) continue; // Try Brzozowski derivative with respect to this minterm // If the derivative is fail (empty language), skip this minterm euf::snode* deriv = m_sg.brzozowski_deriv(mem.m_regex, mt); if (deriv && deriv->is_fail()) continue; euf::snode* fresh_var = mk_fresh_var(); euf::snode* fresh_char = mk_fresh_char_var(); euf::snode* replacement = m_sg.mk_concat(fresh_char, fresh_var); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); // TODO: derive char_set from minterm and add as range constraint // child->add_char_range(fresh_char, minterm_to_char_set(mt)); created = true; } if (created) return true; } return false; } // ----------------------------------------------------------------------- // Modifier: apply_power_split // For a variable x facing a power token u^n, branch: // (1) x = u^m · prefix(u) with m < n (bounded power prefix) // (2) x = u^n · x' (the variable extends past the full power) // Generates integer side constraints for the fresh exponent variables. // mirrors ZIPT's PowerSplitModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_power_split(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); arith_util arith(m); seq_util& seq = m_sg.get_seq_util(); euf::snode* power = nullptr; euf::snode* var_head = nullptr; str_eq const* eq = nullptr; if (!find_power_vs_var(node, power, var_head, eq)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode* base = power->arg(0); expr* exp_n = get_power_exponent(power); expr* zero = arith.mk_int(0); // Branch 1: x = base^m · prefix where 0 <= m < n // Side constraints: m >= 0, m < n (i.e., n >= m + 1) { expr_ref fresh_m = mk_fresh_int_var(); euf::snode* fresh_power = mk_fresh_var(); // represents base^m euf::snode* fresh_suffix = mk_fresh_var(); // represents prefix(base) euf::snode* replacement = m_sg.mk_concat(fresh_power, fresh_suffix); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(var_head, replacement, eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); // m >= 0 e->add_side_int(mk_int_constraint(fresh_m, zero, int_constraint_kind::ge, eq->m_dep)); // m < n ⟺ n >= m + 1 if (exp_n) { expr_ref m_plus_1(arith.mk_add(fresh_m, arith.mk_int(1)), m); e->add_side_int(mk_int_constraint(exp_n, m_plus_1, int_constraint_kind::ge, eq->m_dep)); } } // Branch 2: x = u^n · x' (variable extends past full power, non-progress) // Side constraint: n >= 0 { euf::snode* fresh_tail = mk_fresh_var(); // Peel one base unit (approximation of extending past the power) euf::snode* replacement = m_sg.mk_concat(base, fresh_tail); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, false); nielsen_subst s(var_head, replacement, eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_n) e->add_side_int(mk_int_constraint(exp_n, zero, int_constraint_kind::ge, eq->m_dep)); } return true; } // ----------------------------------------------------------------------- // Modifier: apply_var_num_unwinding // For a power token u^n facing a variable, branch: // (1) n = 0 → u^n = ε (replace power with empty) // (2) n >= 1 → peel one u from the power // Generates integer side constraints for each branch. // mirrors ZIPT's VarNumUnwindingModifier // ----------------------------------------------------------------------- bool nielsen_graph::apply_var_num_unwinding(nielsen_node* node) { ast_manager& m = m_sg.get_manager(); arith_util arith(m); euf::snode* power = nullptr; euf::snode* var_head = nullptr; str_eq const* eq = nullptr; if (!find_power_vs_var(node, power, var_head, eq)) return false; SASSERT(power->is_power() && power->num_args() >= 1); euf::snode* base = power->arg(0); expr* exp_n = get_power_exponent(power); expr* zero = arith.mk_int(0); expr* one = arith.mk_int(1); // Branch 1: n = 0 → replace u^n with ε (progress) // Side constraint: n = 0 { nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, true); nielsen_subst s(power, m_sg.mk_empty(), eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_n) e->add_side_int(mk_int_constraint(exp_n, zero, int_constraint_kind::eq, eq->m_dep)); } // Branch 2: n >= 1 → peel one u: u^n → u · u^(n-1) // Side constraint: n >= 1 { euf::snode* fresh = mk_fresh_var(); euf::snode* replacement = m_sg.mk_concat(base, fresh); nielsen_node* child = mk_child(node); nielsen_edge* e = mk_edge(node, child, false); nielsen_subst s(power, replacement, eq->m_dep); e->add_subst(s); child->apply_subst(m_sg, s); if (exp_n) e->add_side_int(mk_int_constraint(exp_n, one, int_constraint_kind::ge, eq->m_dep)); } return true; } void nielsen_graph::collect_conflict_deps(dep_tracker& deps) const { for (nielsen_node const* n : m_nodes) { if (n->eval_idx() != m_run_idx) continue; if (!n->is_currently_conflict()) continue; for (str_eq const& eq : n->str_eqs()) deps.merge(eq.m_dep); for (str_mem const& mem : n->str_mems()) deps.merge(mem.m_dep); } } void nielsen_graph::explain_conflict(unsigned_vector& eq_indices, unsigned_vector& mem_indices) const { SASSERT(m_root); dep_tracker deps; collect_conflict_deps(deps); unsigned_vector bits; deps.get_set_bits(bits); for (unsigned b : bits) { if (b < m_num_input_eqs) eq_indices.push_back(b); else mem_indices.push_back(b - m_num_input_eqs); } } // ----------------------------------------------------------------------- // nielsen_graph: length constraint generation // ----------------------------------------------------------------------- expr_ref nielsen_graph::compute_length_expr(euf::snode* n) { ast_manager& m = m_sg.get_manager(); seq_util& seq = m_sg.get_seq_util(); arith_util arith(m); if (n->is_empty()) return expr_ref(arith.mk_int(0), m); if (n->is_char() || n->is_unit()) return expr_ref(arith.mk_int(1), m); if (n->is_concat()) { expr_ref left = compute_length_expr(n->arg(0)); expr_ref right = compute_length_expr(n->arg(1)); return expr_ref(arith.mk_add(left, right), m); } // for variables and other terms, use symbolic (str.len expr) return expr_ref(seq.str.mk_length(n->get_expr()), m); } void nielsen_graph::generate_length_constraints(vector& constraints) { if (!m_root) return; ast_manager& m = m_sg.get_manager(); uint_set seen_vars; seq_util& seq = m_sg.get_seq_util(); arith_util arith(m); for (str_eq const& eq : m_root->str_eqs()) { if (eq.is_trivial()) continue; expr_ref len_lhs = compute_length_expr(eq.m_lhs); expr_ref len_rhs = compute_length_expr(eq.m_rhs); expr_ref len_eq(m.mk_eq(len_lhs, len_rhs), m); constraints.push_back(length_constraint(len_eq, eq.m_dep, length_kind::eq, m)); // collect variables for non-negativity constraints euf::snode_vector tokens; eq.m_lhs->collect_tokens(tokens); eq.m_rhs->collect_tokens(tokens); for (euf::snode* tok : tokens) { if (tok->is_var() && !seen_vars.contains(tok->id())) { seen_vars.insert(tok->id()); expr_ref len_var(seq.str.mk_length(tok->get_expr()), m); expr_ref ge_zero(arith.mk_ge(len_var, arith.mk_int(0)), m); constraints.push_back(length_constraint(ge_zero, eq.m_dep, length_kind::nonneg, m)); } } } // Parikh interval reasoning for regex memberships: // for each str_mem constraint str in regex, compute length bounds // from the regex structure and generate arithmetic constraints. for (str_mem const& mem : m_root->str_mems()) { expr* re_expr = mem.m_regex->get_expr(); if (!re_expr || !seq.is_re(re_expr)) continue; unsigned min_len = 0, max_len = UINT_MAX; compute_regex_length_interval(mem.m_regex, min_len, max_len); expr_ref len_str = compute_length_expr(mem.m_str); // generate len(str) >= min_len when min_len > 0 if (min_len > 0) { expr_ref bound(arith.mk_ge(len_str, arith.mk_int(min_len)), m); constraints.push_back(length_constraint(bound, mem.m_dep, length_kind::bound, m)); } // generate len(str) <= max_len when bounded if (max_len < UINT_MAX) { expr_ref bound(arith.mk_le(len_str, arith.mk_int(max_len)), m); constraints.push_back(length_constraint(bound, mem.m_dep, length_kind::bound, m)); } // generate len(x) >= 0 for variables in the string side euf::snode_vector tokens; mem.m_str->collect_tokens(tokens); for (euf::snode* tok : tokens) { if (tok->is_var() && !seen_vars.contains(tok->id())) { seen_vars.insert(tok->id()); expr_ref len_var(seq.str.mk_length(tok->get_expr()), m); expr_ref ge_zero(arith.mk_ge(len_var, arith.mk_int(0)), m); constraints.push_back(length_constraint(ge_zero, mem.m_dep, length_kind::nonneg, m)); } } } } void nielsen_graph::compute_regex_length_interval(euf::snode* regex, unsigned& min_len, unsigned& max_len) { seq_util& seq = m_sg.get_seq_util(); expr* e = regex->get_expr(); if (!e || !seq.is_re(e)) { min_len = 0; max_len = UINT_MAX; return; } min_len = seq.re.min_length(e); max_len = seq.re.max_length(e); // for empty language, min_length may be UINT_MAX (vacuously true); // normalize to avoid generating bogus constraints if (min_len > max_len) { min_len = 0; max_len = 0; } } // ----------------------------------------------------------------------- // int_constraint display // ----------------------------------------------------------------------- std::ostream& int_constraint::display(std::ostream& out) const { if (m_lhs) out << mk_pp(m_lhs, m_lhs.get_manager()); else out << "null"; switch (m_kind) { case int_constraint_kind::eq: out << " = "; break; case int_constraint_kind::le: out << " <= "; break; case int_constraint_kind::ge: out << " >= "; break; } if (m_rhs) out << mk_pp(m_rhs, m_rhs.get_manager()); else out << "null"; return out; } // ----------------------------------------------------------------------- // LP integer subsolver implementation // Replaces ZIPT's SubSolver with Z3's native lp::lar_solver. // ----------------------------------------------------------------------- void nielsen_graph::lp_solver_reset() { m_lp_solver = alloc(lp::lar_solver); m_expr2lpvar.reset(); m_lp_ext_cnt = 0; } lp::lpvar nielsen_graph::lp_ensure_var(expr* e) { if (!e) return lp::null_lpvar; ast_manager& m = m_sg.get_manager(); arith_util arith(m); // If already mapped, return cached unsigned eid = e->get_id(); lp::lpvar j; if (m_expr2lpvar.find(eid, j)) return j; // Check if this is a numeral — shouldn't be called with a numeral directly, // but handle it gracefully by creating a fixed variable rational val; if (arith.is_numeral(e, val)) { j = m_lp_solver->add_var(m_lp_ext_cnt++, true); m_lp_solver->add_var_bound(j, lp::EQ, lp::mpq(val)); m_expr2lpvar.insert(eid, j); return j; } // Decompose linear arithmetic: a + b → create LP term (1*a + 1*b) expr* e1 = nullptr; expr* e2 = nullptr; if (arith.is_add(e, e1, e2)) { lp::lpvar j1 = lp_ensure_var(e1); lp::lpvar j2 = lp_ensure_var(e2); if (j1 == lp::null_lpvar || j2 == lp::null_lpvar) return lp::null_lpvar; vector> coeffs; coeffs.push_back({lp::mpq(1), j1}); coeffs.push_back({lp::mpq(1), j2}); j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); m_expr2lpvar.insert(eid, j); return j; } // Decompose: a - b → create LP term (1*a + (-1)*b) if (arith.is_sub(e, e1, e2)) { lp::lpvar j1 = lp_ensure_var(e1); lp::lpvar j2 = lp_ensure_var(e2); if (j1 == lp::null_lpvar || j2 == lp::null_lpvar) return lp::null_lpvar; vector> coeffs; coeffs.push_back({lp::mpq(1), j1}); coeffs.push_back({lp::mpq(-1), j2}); j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); m_expr2lpvar.insert(eid, j); return j; } // Decompose: c * x where c is numeral → create LP term (c*x) if (arith.is_mul(e, e1, e2)) { rational coeff; if (arith.is_numeral(e1, coeff)) { lp::lpvar jx = lp_ensure_var(e2); if (jx == lp::null_lpvar) return lp::null_lpvar; vector> coeffs; coeffs.push_back({lp::mpq(coeff), jx}); j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); m_expr2lpvar.insert(eid, j); return j; } if (arith.is_numeral(e2, coeff)) { lp::lpvar jx = lp_ensure_var(e1); if (jx == lp::null_lpvar) return lp::null_lpvar; vector> coeffs; coeffs.push_back({lp::mpq(coeff), jx}); j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); m_expr2lpvar.insert(eid, j); return j; } } // Decompose: -x → create LP term (-1*x) if (arith.is_uminus(e, e1)) { lp::lpvar jx = lp_ensure_var(e1); if (jx == lp::null_lpvar) return lp::null_lpvar; vector> coeffs; coeffs.push_back({lp::mpq(-1), jx}); j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); m_expr2lpvar.insert(eid, j); return j; } // Leaf expression (variable, length, or other opaque term): map to fresh LP var j = m_lp_solver->add_var(m_lp_ext_cnt++, true /* is_integer */); m_expr2lpvar.insert(eid, j); return j; } void nielsen_graph::lp_add_constraint(int_constraint const& ic) { if (!m_lp_solver || !ic.m_lhs || !ic.m_rhs) return; ast_manager& m = m_sg.get_manager(); arith_util arith(m); // We handle simple patterns: // expr numeral → add_var_bound // expr expr → create term (lhs - rhs) and bound to 0 rational val; bool is_num_rhs = arith.is_numeral(ic.m_rhs, val); if (is_num_rhs && (is_app(ic.m_lhs))) { // Simple case: single variable or len(x) vs numeral lp::lpvar j = lp_ensure_var(ic.m_lhs); if (j == lp::null_lpvar) return; lp::mpq rhs_val(val); switch (ic.m_kind) { case int_constraint_kind::eq: m_lp_solver->add_var_bound(j, lp::EQ, rhs_val); break; case int_constraint_kind::le: m_lp_solver->add_var_bound(j, lp::LE, rhs_val); break; case int_constraint_kind::ge: m_lp_solver->add_var_bound(j, lp::GE, rhs_val); break; } return; } // General case: create term lhs - rhs and bound it // For now, handle single variable on LHS vs single variable on RHS // (covers the common case of len(x) = len(y), n >= 0, etc.) bool is_num_lhs = arith.is_numeral(ic.m_lhs, val); if (is_num_lhs) { // numeral expr → reverse and flip kind lp::lpvar j = lp_ensure_var(ic.m_rhs); if (j == lp::null_lpvar) return; lp::mpq lhs_val(val); switch (ic.m_kind) { case int_constraint_kind::eq: m_lp_solver->add_var_bound(j, lp::EQ, lhs_val); break; case int_constraint_kind::le: // val <= rhs ⇔ rhs >= val m_lp_solver->add_var_bound(j, lp::GE, lhs_val); break; case int_constraint_kind::ge: // val >= rhs ⇔ rhs <= val m_lp_solver->add_var_bound(j, lp::LE, lhs_val); break; } return; } // Both sides are expressions: create term (lhs - rhs) with bound 0 lp::lpvar j_lhs = lp_ensure_var(ic.m_lhs); lp::lpvar j_rhs = lp_ensure_var(ic.m_rhs); if (j_lhs == lp::null_lpvar || j_rhs == lp::null_lpvar) return; // Create a term: 1*lhs + (-1)*rhs vector> coeffs; coeffs.push_back({lp::mpq(1), j_lhs}); coeffs.push_back({lp::mpq(-1), j_rhs}); lp::lpvar term_j = m_lp_solver->add_term(coeffs, m_lp_ext_cnt++); lp::mpq zero(0); switch (ic.m_kind) { case int_constraint_kind::eq: m_lp_solver->add_var_bound(term_j, lp::EQ, zero); break; case int_constraint_kind::le: m_lp_solver->add_var_bound(term_j, lp::LE, zero); break; case int_constraint_kind::ge: m_lp_solver->add_var_bound(term_j, lp::GE, zero); break; } } void nielsen_graph::collect_path_int_constraints(nielsen_node* node, svector const& cur_path, vector& out) { // collect from root node if (m_root) { for (auto const& ic : m_root->int_constraints()) out.push_back(ic); } // collect from edges along the path and their target nodes for (nielsen_edge* e : cur_path) { for (auto const& ic : e->side_int()) out.push_back(ic); if (e->tgt()) { for (auto const& ic : e->tgt()->int_constraints()) out.push_back(ic); } } } bool nielsen_graph::check_int_feasibility(nielsen_node* node, svector const& cur_path) { vector constraints; collect_path_int_constraints(node, cur_path, constraints); if (constraints.empty()) return true; // no integer constraints, trivially feasible lp_solver_reset(); for (auto const& ic : constraints) lp_add_constraint(ic); lp::lp_status status = m_lp_solver->find_feasible_solution(); return status != lp::lp_status::INFEASIBLE; } int_constraint nielsen_graph::mk_int_constraint(expr* lhs, expr* rhs, int_constraint_kind kind, dep_tracker const& dep) { return int_constraint(lhs, rhs, kind, dep, m_sg.get_manager()); } expr* nielsen_graph::get_power_exponent(euf::snode* power) { if (!power || !power->is_power()) return nullptr; if (power->num_args() < 2) return nullptr; euf::snode* exp_snode = power->arg(1); return exp_snode ? exp_snode->get_expr() : nullptr; } expr_ref nielsen_graph::mk_fresh_int_var() { ast_manager& m = m_sg.get_manager(); arith_util arith(m); std::string name = "n!" + std::to_string(m_fresh_cnt++); return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m); } }