3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-03-22 12:38:52 +00:00

Implement ZIPT code review improvements: add euf_sgraph/seq_plugin files and apply 3 fixes

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-03-03 03:41:13 +00:00
parent 50ace4471f
commit 008ef90d4d
18 changed files with 3624 additions and 2 deletions

View file

@ -9,6 +9,8 @@ z3_add_component(euf
euf_justification.cpp
euf_mam.cpp
euf_plugin.cpp
euf_seq_plugin.cpp
euf_sgraph.cpp
euf_specrel_plugin.cpp
ho_matcher.cpp
COMPONENT_DEPENDENCIES

View file

@ -0,0 +1,296 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
euf_seq_plugin.cpp
Abstract:
Plugin structure for sequences/strings.
Merges equivalence classes taking into account associativity
of concatenation and algebraic properties of strings and
regular expressions.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-01
Clemens Eisenhofer 2026-03-01
--*/
#include "ast/euf/euf_seq_plugin.h"
#include "ast/euf/euf_egraph.h"
#include "ast/euf/euf_sgraph.h"
#include "ast/ast_pp.h"
namespace euf {
// Check if enode is any kind of concat (str.++ or re.++)
static bool is_any_concat(enode* n, seq_util const& seq) {
expr* a = nullptr, *b = nullptr;
return seq.str.is_concat(n->get_expr(), a, b) || seq.re.is_concat(n->get_expr(), a, b);
}
// Collect leaves of a concat tree in left-to-right order.
// For non-concat nodes, the node itself is a leaf.
// Handles both str.++ and re.++.
static void collect_enode_leaves(enode* n, seq_util const& seq, enode_vector& leaves) {
if (is_any_concat(n, seq)) {
collect_enode_leaves(n->get_arg(0), seq, leaves);
collect_enode_leaves(n->get_arg(1), seq, leaves);
}
else {
leaves.push_back(n);
}
}
unsigned enode_concat_hash::operator()(enode* n) const {
snode* sn = sg.find(n->get_expr());
if (sn && sn->has_cached_hash())
return sn->assoc_hash();
if (!is_any_concat(n, seq))
return n->get_id();
enode_vector leaves;
collect_enode_leaves(n, seq, leaves);
unsigned h = 0;
for (enode* l : leaves)
h = combine_hash(h, l->get_id());
return h;
}
bool enode_concat_eq::operator()(enode* a, enode* b) const {
if (a == b) return true;
if (!is_any_concat(a, seq) || !is_any_concat(b, seq))
return false;
enode_vector la, lb;
collect_enode_leaves(a, seq, la);
collect_enode_leaves(b, seq, lb);
if (la.size() != lb.size())
return false;
for (unsigned i = 0; i < la.size(); ++i)
if (la[i] != lb[i])
return false;
return true;
}
seq_plugin::seq_plugin(egraph& g, sgraph* sg):
plugin(g),
m_seq(g.get_manager()),
m_rewriter(g.get_manager()),
m_sg(sg ? *sg : *alloc(sgraph, g.get_manager(), g, false)),
m_sg_owned(sg == nullptr),
m_concat_hash(m_seq, m_sg),
m_concat_eq(m_seq),
m_concat_table(DEFAULT_HASHTABLE_INITIAL_CAPACITY, m_concat_hash, m_concat_eq) {
}
seq_plugin::~seq_plugin() {
if (m_sg_owned)
dealloc(&m_sg);
}
void seq_plugin::register_node(enode* n) {
m_queue.push_back(n);
push_undo(undo_kind::undo_add_concat);
}
void seq_plugin::merge_eh(enode* n1, enode* n2) {
m_queue.push_back(enode_pair(n1, n2));
push_undo(undo_kind::undo_add_concat);
}
void seq_plugin::push_undo(undo_kind k) {
m_undo.push_back(k);
push_plugin_undo(get_id());
}
void seq_plugin::propagate() {
if (m_qhead == m_queue.size())
return;
for (; m_qhead < m_queue.size(); ++m_qhead) {
if (g.inconsistent())
break;
if (std::holds_alternative<enode*>(m_queue[m_qhead])) {
auto n = std::get<enode*>(m_queue[m_qhead]);
propagate_register_node(n);
}
else {
auto [a, b] = std::get<enode_pair>(m_queue[m_qhead]);
propagate_merge(a, b);
}
}
}
void seq_plugin::propagate_register_node(enode* n) {
if (!m_seq.is_seq(n->get_expr()) && !m_seq.is_re(n->get_expr()))
return;
TRACE(seq, tout << "seq register " << g.bpp(n) << "\n");
if (is_concat(n)) {
propagate_assoc(n);
propagate_simplify(n);
}
// str.++ identity: concat(a, ε) = a, concat(ε, b) = b
enode* a, *b;
if (is_str_concat(n, a, b)) {
if (is_str_empty(a))
push_merge(n, b);
else if (is_str_empty(b))
push_merge(n, a);
}
// re.++ identity: concat(a, epsilon) = a, concat(epsilon, b) = b
// re.++ absorption: concat(a, ∅) = ∅, concat(∅, b) = ∅
if (is_re_concat(n, a, b)) {
if (is_re_epsilon(a))
push_merge(n, b);
else if (is_re_epsilon(b))
push_merge(n, a);
else if (is_re_empty(a))
push_merge(n, a);
else if (is_re_empty(b))
push_merge(n, b);
}
}
void seq_plugin::propagate_merge(enode* a, enode* b) {
if (!m_seq.is_seq(a->get_expr()) && !m_seq.is_re(a->get_expr()))
return;
TRACE(seq, tout << "seq merge " << g.bpp(a) << " == " << g.bpp(b) << "\n");
// when equivalence classes merge, re-check concat simplifications
for (enode* n : enode_class(a)) {
if (is_concat(n))
propagate_simplify(n);
}
}
//
// Concat associativity:
// Instead of creating new expressions, maintain a hash table
// that respects associativity. When a concat is registered,
// look up existing concats with the same leaf sequence.
// If found, merge the existing node with the new one.
//
void seq_plugin::propagate_assoc(enode* n) {
if (!is_concat(n))
return;
enode* existing = nullptr;
if (m_concat_table.find(n, existing)) {
if (existing != n)
push_merge(n, existing);
}
else {
m_concat_table.insert(n);
m_concats.push_back(n);
push_undo(undo_kind::undo_add_to_table);
}
}
//
// Concat simplification rules from ZIPT:
//
// 1. Kleene star merging: concat(u, v*, v*, w) = concat(u, v*, w)
// when adjacent children in a concat chain have congruent star bodies.
//
// 2. Nullable absorption: concat(u, .*, v, w) = concat(u, .*, w)
// when v is nullable and adjacent to full_seq (.*).
//
void seq_plugin::propagate_simplify(enode* n) {
enode* a, *b;
if (!is_concat(n, a, b))
return;
// Rule 1: Kleene star merging
// concat(v*, v*) = v*
if (same_star_body(a, b))
push_merge(n, a);
// Rule 1 extended (right): concat(v*, concat(v*, c)) = concat(v*, c)
enode* b1, *b2;
if (is_concat(b, b1, b2) && same_star_body(a, b1))
push_merge(n, b);
// Rule 1 extended (left): concat(concat(c, v*), v*) = concat(c, v*)
enode* a1, *a2;
if (is_concat(a, a1, a2) && same_star_body(a2, b))
push_merge(n, a);
// Rule 2: Nullable absorption by .*
// concat(.*, v) = .* when v is nullable
if (is_full_seq(a) && is_nullable(b))
push_merge(n, a);
// concat(v, .*) = .* when v is nullable
if (is_nullable(a) && is_full_seq(b))
push_merge(n, b);
// concat(.*, concat(v, w)) = concat(.*, w) when v nullable
// handled by associativity + nullable absorption on sub-concats
// concat(concat(u, v), .*) = concat(u, .*) when v nullable
// handled by associativity + nullable absorption on sub-concats
}
bool seq_plugin::is_nullable(expr* e) {
expr_ref result = m_rewriter.is_nullable(e);
return g.get_manager().is_true(result);
}
bool seq_plugin::same_star_body(enode* a, enode* b) {
if (!is_star(a) || !is_star(b))
return false;
// re.star(x) and re.star(y) have congruent bodies if x ~ y
return a->get_arg(0)->get_root() == b->get_arg(0)->get_root();
}
bool seq_plugin::same_loop_body(enode* a, enode* b,
unsigned& lo1, unsigned& hi1,
unsigned& lo2, unsigned& hi2) {
if (!is_loop(a) || !is_loop(b))
return false;
expr* body_a, *body_b;
if (!m_seq.re.is_loop(a->get_expr(), body_a, lo1, hi1))
return false;
if (!m_seq.re.is_loop(b->get_expr(), body_b, lo2, hi2))
return false;
enode* na = g.find(body_a);
enode* nb = g.find(body_b);
if (!na || !nb)
return false;
return na->get_root() == nb->get_root();
}
void seq_plugin::undo() {
auto k = m_undo.back();
m_undo.pop_back();
switch (k) {
case undo_kind::undo_add_concat:
SASSERT(!m_queue.empty());
m_queue.pop_back();
if (m_qhead > m_queue.size())
m_qhead = m_queue.size();
break;
case undo_kind::undo_add_to_table:
SASSERT(!m_concats.empty());
m_concat_table.remove(m_concats.back());
m_concats.pop_back();
break;
}
}
std::ostream& seq_plugin::display(std::ostream& out) const {
out << "seq-plugin\n";
return out;
}
void seq_plugin::collect_statistics(statistics& st) const {
// statistics are collected by sgraph which owns us
}
}

View file

@ -0,0 +1,175 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
euf_seq_plugin.h
Abstract:
Plugin structure for sequences/strings.
Merges equivalence classes taking into account associativity
of concatenation and algebraic properties of strings and
regular expressions. Implements features from ZIPT:
-- Concat associativity: str.++ is associative, so
concat(a, concat(b, c)) = concat(concat(a, b), c).
Handled via an AC-style plugin for the concat operator.
-- Kleene star merging: adjacent identical Kleene stars
in a concatenation are collapsed, u.v*.v*.w = u.v*.w
-- Loop merging: adjacent loops over the same body are
merged, v{l1,h1}.v{l2,h2} = v{l1+l2,h1+h2}
-- Nullable absorption: a nullable token adjacent to .*
is absorbed, u.*.v.w = u.*.w when v is nullable.
The plugin integrates with euf_egraph for congruence closure.
Node registration in sgraph is handled by sgraph itself via
the egraph's on_make callback, not by the plugin.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-01
Clemens Eisenhofer 2026-03-01
--*/
#pragma once
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/euf/euf_plugin.h"
#include "util/hashtable.h"
namespace euf {
class egraph;
class sgraph;
// Associativity-respecting hash for enode concat trees.
// Uses cached snode hash matrices from the sgraph for O(1) hashing.
// Handles both str.++ (OP_SEQ_CONCAT) and re.++ (OP_RE_CONCAT).
struct enode_concat_hash {
seq_util const& seq;
sgraph& sg;
enode_concat_hash(seq_util const& s, sgraph& sg) : seq(s), sg(sg) {}
unsigned operator()(enode* n) const;
};
// Associativity-respecting equality for enode concat trees.
// Handles both str.++ (OP_SEQ_CONCAT) and re.++ (OP_RE_CONCAT).
struct enode_concat_eq {
seq_util const& seq;
enode_concat_eq(seq_util const& s) : seq(s) {}
bool operator()(enode* a, enode* b) const;
};
class seq_plugin : public plugin {
enum class undo_kind {
undo_add_concat,
undo_add_to_table,
};
seq_util m_seq;
seq_rewriter m_rewriter;
sgraph& m_sg;
bool m_sg_owned = false; // whether we own the sgraph
svector<undo_kind> m_undo;
// queue of merges and registrations to process
vector<std::variant<enode*, enode_pair>> m_queue;
unsigned m_qhead = 0;
// track registered concat nodes for simplification
enode_vector m_concats;
// associativity-respecting hash table for concat nodes
enode_concat_hash m_concat_hash;
enode_concat_eq m_concat_eq;
hashtable<enode*, enode_concat_hash, enode_concat_eq> m_concat_table;
// string concat predicates
bool is_str_concat(enode* n) const { return m_seq.str.is_concat(n->get_expr()); }
bool is_str_concat(enode* n, enode*& a, enode*& b) {
expr* ea = nullptr, *eb = nullptr;
return m_seq.str.is_concat(n->get_expr(), ea, eb) &&
n->num_args() == 2 &&
(a = n->get_arg(0), b = n->get_arg(1), true);
}
// regex concat predicates
bool is_re_concat(enode* n) const { return m_seq.re.is_concat(n->get_expr()); }
bool is_re_concat(enode* n, enode*& a, enode*& b) {
expr* ea = nullptr, *eb = nullptr;
return m_seq.re.is_concat(n->get_expr(), ea, eb) &&
n->num_args() == 2 &&
(a = n->get_arg(0), b = n->get_arg(1), true);
}
// any concat, string or regex
bool is_concat(enode* n) const { return is_str_concat(n) || is_re_concat(n); }
bool is_concat(enode* n, enode*& a, enode*& b) {
return is_str_concat(n, a, b) || is_re_concat(n, a, b);
}
bool is_star(enode* n) const { return m_seq.re.is_star(n->get_expr()); }
bool is_loop(enode* n) const { return m_seq.re.is_loop(n->get_expr()); }
// string empty: ε for str.++
bool is_str_empty(enode* n) const { return m_seq.str.is_empty(n->get_expr()); }
// regex empty set: ∅ for re.++ (absorbing element)
bool is_re_empty(enode* n) const { return m_seq.re.is_empty(n->get_expr()); }
// regex epsilon: to_re("") for re.++ (identity element)
bool is_re_epsilon(enode* n) const { return m_seq.re.is_epsilon(n->get_expr()); }
bool is_to_re(enode* n) const { return m_seq.re.is_to_re(n->get_expr()); }
bool is_full_seq(enode* n) const { return m_seq.re.is_full_seq(n->get_expr()); }
void push_undo(undo_kind k);
void propagate_register_node(enode* n);
void propagate_merge(enode* a, enode* b);
// concat associativity: maintain hash table of concat nodes,
// merge nodes that are equal modulo associativity
void propagate_assoc(enode* n);
// concat simplification:
// merging Kleene stars, merging loops, absorbing nullables
void propagate_simplify(enode* n);
// check if expression is nullable using existing seq_rewriter
bool is_nullable(expr* e);
bool is_nullable(enode* n) { return is_nullable(n->get_expr()); }
// check if two enodes have congruent star bodies
bool same_star_body(enode* a, enode* b);
// check if two enodes have congruent loop bodies and extract bounds
bool same_loop_body(enode* a, enode* b, unsigned& lo1, unsigned& hi1, unsigned& lo2, unsigned& hi2);
public:
seq_plugin(egraph& g, sgraph* sg = nullptr);
~seq_plugin() override;
theory_id get_id() const override { return m_seq.get_family_id(); }
void register_node(enode* n) override;
void merge_eh(enode* n1, enode* n2) override;
void diseq_eh(enode*) override {}
void propagate() override;
void undo() override;
std::ostream& display(std::ostream& out) const override;
void collect_statistics(statistics& st) const override;
};
}

637
src/ast/euf/euf_sgraph.cpp Normal file
View file

@ -0,0 +1,637 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
euf_sgraph.cpp
Abstract:
Sequence/string graph implementation
Author:
Nikolaj Bjorner (nbjorner) 2026-03-01
Clemens Eisenhofer 2026-03-01
--*/
#include "ast/euf/euf_sgraph.h"
#include "ast/euf/euf_seq_plugin.h"
#include "ast/arith_decl_plugin.h"
#include "ast/ast_pp.h"
namespace euf {
// substitution cache stored on snode for ZIPT-style optimization
struct snode_subst_cache {
struct entry {
unsigned var_id;
unsigned repl_id;
snode* result;
};
svector<entry> m_entries;
snode* find(unsigned var_id, unsigned repl_id) const {
for (auto const& e : m_entries)
if (e.var_id == var_id && e.repl_id == repl_id)
return e.result;
return nullptr;
}
void insert(unsigned var_id, unsigned repl_id, snode* result) {
m_entries.push_back({var_id, repl_id, result});
}
};
sgraph::sgraph(ast_manager& m, egraph& eg, bool add_plugin):
m(m),
m_seq(m),
m_rewriter(m),
m_egraph(eg),
m_str_sort(m_seq.str.mk_string_sort(), m),
m_add_plugin(add_plugin) {
// create seq_plugin and register it with the egraph
if (add_plugin)
m_egraph.add_plugin(alloc(seq_plugin, m_egraph, this));
// register on_make callback so sgraph creates snodes for new enodes
std::function<void(enode*)> on_make = [this](enode* n) {
expr* e = n->get_expr();
if (m_seq.is_seq(e) || m_seq.is_re(e))
mk(e);
};
m_egraph.set_on_make(on_make);
}
sgraph::~sgraph() {
for (auto* c : m_subst_caches)
dealloc(c);
}
snode_kind sgraph::classify(expr* e) const {
if (!is_app(e))
return snode_kind::s_other;
if (m_seq.str.is_empty(e))
return snode_kind::s_empty;
if (m_seq.str.is_string(e)) {
zstring s;
if (m_seq.str.is_string(e, s) && s.empty())
return snode_kind::s_empty;
return snode_kind::s_other;
}
if (m_seq.str.is_concat(e))
return snode_kind::s_concat;
if (m_seq.str.is_unit(e)) {
expr* ch = to_app(e)->get_arg(0);
if (m_seq.is_const_char(ch))
return snode_kind::s_char;
return snode_kind::s_unit;
}
if (m_seq.str.is_power(e))
return snode_kind::s_power;
if (m_seq.re.is_star(e))
return snode_kind::s_star;
if (m_seq.re.is_loop(e))
return snode_kind::s_loop;
if (m_seq.re.is_union(e))
return snode_kind::s_union;
if (m_seq.re.is_intersection(e))
return snode_kind::s_intersect;
if (m_seq.re.is_complement(e))
return snode_kind::s_complement;
if (m_seq.re.is_empty(e))
return snode_kind::s_fail;
if (m_seq.re.is_full_char(e))
return snode_kind::s_full_char;
if (m_seq.re.is_full_seq(e))
return snode_kind::s_full_seq;
if (m_seq.re.is_to_re(e))
return snode_kind::s_to_re;
if (m_seq.str.is_in_re(e))
return snode_kind::s_in_re;
// uninterpreted constants of string sort are variables
if (is_uninterp_const(e) && m_seq.is_seq(e->get_sort()))
return snode_kind::s_var;
return snode_kind::s_other;
}
void sgraph::compute_metadata(snode* n) {
switch (n->m_kind) {
case snode_kind::s_empty:
n->m_ground = true;
n->m_regex_free = true;
n->m_nullable = true;
n->m_level = 0;
n->m_length = 0;
break;
case snode_kind::s_char:
n->m_ground = true;
n->m_regex_free = true;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_var:
n->m_ground = false;
n->m_regex_free = true;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_unit:
n->m_ground = n->num_args() > 0 ? n->arg(0)->is_ground() : true;
n->m_regex_free = true;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_concat: {
SASSERT(n->num_args() == 2);
snode* l = n->arg(0);
snode* r = n->arg(1);
n->m_ground = l->is_ground() && r->is_ground();
n->m_regex_free = l->is_regex_free() && r->is_regex_free();
n->m_nullable = l->is_nullable() && r->is_nullable();
n->m_level = std::max(l->level(), r->level()) + 1;
n->m_length = l->length() + r->length();
++m_stats.m_num_concat;
break;
}
case snode_kind::s_power: {
// s^n: nullable follows base, consistent with ZIPT's PowerToken
// the exponent n is assumed to be a symbolic integer, may or may not be zero
SASSERT(n->num_args() >= 1);
snode* base = n->arg(0);
n->m_ground = base->is_ground();
n->m_regex_free = base->is_regex_free();
n->m_nullable = base->is_nullable();
n->m_level = 1;
n->m_length = 1;
++m_stats.m_num_power;
break;
}
case snode_kind::s_star:
SASSERT(n->num_args() == 1);
n->m_ground = n->arg(0)->is_ground();
n->m_regex_free = false;
n->m_nullable = true;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_loop: {
bool base_nullable = n->num_args() > 0 && n->arg(0)->is_nullable();
unsigned lo = 0, hi = 0;
expr* body = nullptr;
bool lo_zero = n->get_expr() && m_seq.re.is_loop(n->get_expr(), body, lo, hi) && lo == 0;
n->m_ground = n->num_args() > 0 ? n->arg(0)->is_ground() : true;
n->m_regex_free = false;
n->m_nullable = lo_zero || base_nullable;
n->m_level = 1;
n->m_length = 1;
break;
}
case snode_kind::s_union:
SASSERT(n->num_args() == 2);
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
n->m_regex_free = false;
n->m_nullable = n->arg(0)->is_nullable() || n->arg(1)->is_nullable();
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_intersect:
SASSERT(n->num_args() == 2);
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
n->m_regex_free = false;
n->m_nullable = n->arg(0)->is_nullable() && n->arg(1)->is_nullable();
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_complement:
SASSERT(n->num_args() == 1);
n->m_ground = n->arg(0)->is_ground();
n->m_regex_free = false;
n->m_nullable = !n->arg(0)->is_nullable();
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_fail:
n->m_ground = true;
n->m_regex_free = false;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_full_char:
n->m_ground = true;
n->m_regex_free = false;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_full_seq:
n->m_ground = true;
n->m_regex_free = false;
n->m_nullable = true;
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_to_re:
SASSERT(n->num_args() == 1);
n->m_ground = n->arg(0)->is_ground();
n->m_regex_free = false;
n->m_nullable = n->arg(0)->is_nullable();
n->m_level = 1;
n->m_length = 1;
break;
case snode_kind::s_in_re:
SASSERT(n->num_args() == 2);
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
n->m_regex_free = false;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
default:
n->m_ground = true;
n->m_regex_free = true;
n->m_nullable = false;
n->m_level = 1;
n->m_length = 1;
break;
}
}
static const unsigned HASH_BASE = 31;
// Compute a 2x2 polynomial hash matrix for associativity-respecting hashing.
// Unsigned overflow is intentional and well-defined (mod 2^32).
// M[0][0] tracks HASH_BASE^(num_leaves), which is always nonzero since
// HASH_BASE is odd. M[0][1] is the actual hash value.
void sgraph::compute_hash_matrix(snode* n) {
if (n->is_empty()) {
// identity matrix: concat with empty is identity
n->m_hash_matrix[0][0] = 1;
n->m_hash_matrix[0][1] = 0;
n->m_hash_matrix[1][0] = 0;
n->m_hash_matrix[1][1] = 1;
}
else if (n->is_concat()) {
snode* l = n->arg(0);
snode* r = n->arg(1);
if (l->has_cached_hash() && r->has_cached_hash()) {
// 2x2 matrix multiplication: M(L) * M(R)
n->m_hash_matrix[0][0] = l->m_hash_matrix[0][0] * r->m_hash_matrix[0][0] + l->m_hash_matrix[0][1] * r->m_hash_matrix[1][0];
n->m_hash_matrix[0][1] = l->m_hash_matrix[0][0] * r->m_hash_matrix[0][1] + l->m_hash_matrix[0][1] * r->m_hash_matrix[1][1];
n->m_hash_matrix[1][0] = l->m_hash_matrix[1][0] * r->m_hash_matrix[0][0] + l->m_hash_matrix[1][1] * r->m_hash_matrix[1][0];
n->m_hash_matrix[1][1] = l->m_hash_matrix[1][0] * r->m_hash_matrix[0][1] + l->m_hash_matrix[1][1] * r->m_hash_matrix[1][1];
}
}
else {
// leaf/token: [[HASH_BASE, value], [0, 1]]
// +1 avoids zero hash values; wraps safely on unsigned overflow
unsigned v = n->get_expr() ? n->get_expr()->get_id() + 1 : n->id() + 1;
n->m_hash_matrix[0][0] = HASH_BASE;
n->m_hash_matrix[0][1] = v;
n->m_hash_matrix[1][0] = 0;
n->m_hash_matrix[1][1] = 1;
}
}
snode* sgraph::mk_snode(expr* e, snode_kind k, unsigned num_args, snode* const* args) {
unsigned id = m_nodes.size();
snode* n = snode::mk(m_region, e, k, id, num_args, args);
compute_metadata(n);
compute_hash_matrix(n);
m_nodes.push_back(n);
if (e) {
unsigned eid = e->get_id();
m_expr2snode.reserve(eid + 1, nullptr);
m_expr2snode[eid] = n;
// pin expression via egraph (the egraph has an expr trail)
mk_enode(e);
}
++m_stats.m_num_nodes;
return n;
}
snode* sgraph::mk(expr* e) {
SASSERT(e);
snode* n = find(e);
if (n)
return n;
snode_kind k = classify(e);
if (!is_app(e))
return mk_snode(e, k, 0, nullptr);
app* a = to_app(e);
unsigned arity = a->get_num_args();
// recursively register children
// for seq/re children, create classified snodes
// for other children (e.g. integer exponents), create s_other snodes
snode_vector child_nodes;
for (unsigned i = 0; i < arity; ++i) {
expr* ch = a->get_arg(i);
snode* cn = mk(ch);
child_nodes.push_back(cn);
}
return mk_snode(e, k, child_nodes.size(), child_nodes.data());
}
snode* sgraph::find(expr* e) const {
if (!e)
return nullptr;
unsigned eid = e->get_id();
if (eid < m_expr2snode.size())
return m_expr2snode[eid];
return nullptr;
}
enode* sgraph::mk_enode(expr* e) {
enode* n = m_egraph.find(e);
if (n) return n;
enode_vector args;
if (is_app(e)) {
for (expr* arg : *to_app(e)) {
enode* a = mk_enode(arg);
args.push_back(a);
}
}
return m_egraph.mk(e, 0, args.size(), args.data());
}
void sgraph::push() {
m_scopes.push_back(m_nodes.size());
++m_num_scopes;
m_egraph.push();
}
void sgraph::pop(unsigned num_scopes) {
if (num_scopes == 0)
return;
SASSERT(num_scopes <= m_num_scopes);
unsigned new_lvl = m_num_scopes - num_scopes;
unsigned old_sz = m_scopes[new_lvl];
for (unsigned i = m_nodes.size(); i-- > old_sz; ) {
snode* n = m_nodes[i];
if (n->get_expr()) {
unsigned eid = n->get_expr()->get_id();
if (eid < m_expr2snode.size())
m_expr2snode[eid] = nullptr;
}
}
m_nodes.shrink(old_sz);
m_scopes.shrink(new_lvl);
m_num_scopes = new_lvl;
m_egraph.pop(num_scopes);
}
snode* sgraph::mk_var(symbol const& name) {
expr_ref e(m.mk_const(name, m_str_sort), m);
return mk(e);
}
snode* sgraph::mk_char(unsigned ch) {
expr_ref c(m_seq.str.mk_char(ch), m);
expr_ref u(m_seq.str.mk_unit(c), m);
return mk(u);
}
snode* sgraph::mk_empty() {
expr_ref e(m_seq.str.mk_empty(m_str_sort), m);
return mk(e);
}
snode* sgraph::mk_concat(snode* a, snode* b) {
if (a->is_empty()) return b;
if (b->is_empty()) return a;
expr_ref e(m_seq.str.mk_concat(a->get_expr(), b->get_expr()), m);
return mk(e);
}
snode* sgraph::drop_first(snode* n) {
if (n->is_empty() || n->is_token())
return mk_empty();
SASSERT(n->is_concat());
snode* l = n->arg(0);
snode* r = n->arg(1);
if (l->is_token() || l->is_empty())
return r;
return mk_concat(drop_first(l), r);
}
snode* sgraph::drop_last(snode* n) {
if (n->is_empty() || n->is_token())
return mk_empty();
SASSERT(n->is_concat());
snode* l = n->arg(0);
snode* r = n->arg(1);
if (r->is_token() || r->is_empty())
return l;
return mk_concat(l, drop_last(r));
}
snode* sgraph::drop_left(snode* n, unsigned count) {
if (count == 0 || n->is_empty()) return n;
if (count >= n->length()) return mk_empty();
for (unsigned i = 0; i < count; ++i)
n = drop_first(n);
return n;
}
snode* sgraph::drop_right(snode* n, unsigned count) {
if (count == 0 || n->is_empty()) return n;
if (count >= n->length()) return mk_empty();
for (unsigned i = 0; i < count; ++i)
n = drop_last(n);
return n;
}
snode* sgraph::subst(snode* n, snode* var, snode* replacement) {
if (n == var)
return replacement;
if (n->is_empty() || n->is_char())
return n;
if (n->is_concat()) {
// check substitution cache (ZIPT-style optimization)
if (n->m_subst_cache) {
snode* cached = n->m_subst_cache->find(var->id(), replacement->id());
if (cached)
return cached;
}
snode* result = mk_concat(subst(n->arg(0), var, replacement),
subst(n->arg(1), var, replacement));
// cache the result
if (!n->m_subst_cache) {
n->m_subst_cache = alloc(snode_subst_cache);
m_subst_caches.push_back(n->m_subst_cache);
}
n->m_subst_cache->insert(var->id(), replacement->id(), result);
return result;
}
// for non-concat compound nodes (power, star, etc.), no substitution into children
return n;
}
snode* sgraph::brzozowski_deriv(snode* re, snode* elem) {
expr* re_expr = re->get_expr();
expr* elem_expr = elem->get_expr();
if (!re_expr || !elem_expr)
return nullptr;
// unwrap str.unit to get the character expression
expr* ch = nullptr;
if (m_seq.str.is_unit(elem_expr, ch))
elem_expr = ch;
expr_ref result = m_rewriter.mk_derivative(elem_expr, re_expr);
if (!result)
return nullptr;
return mk(result);
}
void sgraph::collect_re_predicates(snode* re, expr_ref_vector& preds) {
if (!re || !re->get_expr())
return;
expr* e = re->get_expr();
expr* ch = nullptr, *lo = nullptr, *hi = nullptr;
// leaf regex predicates: character ranges and single characters
if (m_seq.re.is_range(e, lo, hi)) {
preds.push_back(e);
return;
}
if (m_seq.re.is_to_re(e))
return;
if (m_seq.re.is_full_char(e))
return;
if (m_seq.re.is_full_seq(e))
return;
if (m_seq.re.is_empty(e))
return;
// recurse into compound regex operators
for (unsigned i = 0; i < re->num_args(); ++i)
collect_re_predicates(re->arg(i), preds);
}
void sgraph::compute_minterms(snode* re, snode_vector& minterms) {
// extract character predicates from the regex
expr_ref_vector preds(m);
collect_re_predicates(re, preds);
if (preds.empty()) {
// no predicates means the whole alphabet is one minterm
// represented by full_char
expr_ref fc(m_seq.re.mk_full_char(m_str_sort), m);
minterms.push_back(mk(fc));
return;
}
// generate minterms as conjunctions/negations of predicates
// for n predicates, there are up to 2^n minterms
unsigned n = preds.size();
// cap at reasonable size to prevent exponential blowup
if (n > 20)
n = 20;
for (unsigned mask = 0; mask < (1u << n); ++mask) {
expr_ref_vector conj(m);
for (unsigned i = 0; i < n; ++i) {
if (mask & (1u << i))
conj.push_back(preds.get(i));
else
conj.push_back(m_seq.re.mk_complement(preds.get(i)));
}
SASSERT(!conj.empty());
// intersect all terms
expr_ref mt(conj.get(0), m);
for (unsigned i = 1; i < conj.size(); ++i)
mt = m_seq.re.mk_inter(mt, conj.get(i));
minterms.push_back(mk(mt));
}
}
std::ostream& sgraph::display(std::ostream& out) const {
auto kind_str = [](snode_kind k) -> char const* {
switch (k) {
case snode_kind::s_empty: return "empty";
case snode_kind::s_char: return "char";
case snode_kind::s_var: return "var";
case snode_kind::s_unit: return "unit";
case snode_kind::s_concat: return "concat";
case snode_kind::s_power: return "power";
case snode_kind::s_star: return "star";
case snode_kind::s_loop: return "loop";
case snode_kind::s_union: return "union";
case snode_kind::s_intersect: return "intersect";
case snode_kind::s_complement: return "complement";
case snode_kind::s_fail: return "fail";
case snode_kind::s_full_char: return "full_char";
case snode_kind::s_full_seq: return "full_seq";
case snode_kind::s_to_re: return "to_re";
case snode_kind::s_in_re: return "in_re";
case snode_kind::s_other: return "other";
}
return "?";
};
for (snode* n : m_nodes) {
out << "snode[" << n->id() << "] "
<< kind_str(n->kind())
<< " level=" << n->level()
<< " len=" << n->length()
<< " ground=" << n->is_ground()
<< " rfree=" << n->is_regex_free()
<< " nullable=" << n->is_nullable();
if (n->num_args() > 0) {
out << " args=(";
for (unsigned i = 0; i < n->num_args(); ++i) {
if (i > 0) out << ", ";
out << n->arg(i)->id();
}
out << ")";
}
if (n->get_expr())
out << " expr=" << mk_pp(n->get_expr(), m);
out << "\n";
}
return out;
}
void sgraph::collect_statistics(statistics& st) const {
st.update("seq-graph-nodes", m_stats.m_num_nodes);
st.update("seq-graph-concat", m_stats.m_num_concat);
st.update("seq-graph-power", m_stats.m_num_power);
st.update("seq-graph-hash-hits", m_stats.m_num_hash_hits);
m_egraph.collect_statistics(st);
}
}

160
src/ast/euf/euf_sgraph.h Normal file
View file

@ -0,0 +1,160 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
euf_sgraph.h
Abstract:
Sequence/string graph layer
Encapsulates string and regex expressions for the string solver.
Implements the string graph layer from ZIPT (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT).
The sgraph maps Z3 sequence/regex AST expressions to snode structures
organized as binary concatenation trees with metadata, and owns an
egraph with a seq_plugin for congruence closure.
-- snode classification: empty, char, variable, unit, concat, power,
star, loop, union, intersection, complement, fail, full_char,
full_seq, to_re, in_re, other.
-- Metadata computation: ground, regex_free, nullable, level, length.
-- Expression registration via mk(expr*), lookup via find(expr*).
-- Scope management: push/pop with backtracking.
-- egraph ownership with seq_plugin for:
* concat associativity via associativity-respecting hash table,
* Kleene star merging (u.v*.v*.w = u.v*.w),
* nullable absorption next to .* (u.*.v.w = u.*.w when v nullable),
* str.++ identity elimination (concat(a, ε) = a),
* re.++ identity/absorption (concat(a, epsilon) = a, concat(a, ) = ).
-- enode registration via mk_enode(expr*).
ZIPT features not yet ported:
-- Str operations: normalisation with union-find representatives and
cache migration, balanced tree maintenance, drop left/right with
caching, substitution, indexed access, iteration, ToList caching,
simplification, derivative computation, structural equality with
associative hashing, rotation equality, expression reconstruction,
Graphviz export.
-- StrToken subclasses: SymCharToken, StrAtToken, SubStrToken,
SetToken, PostToken/PreToken.
-- StrToken features: Nielsen-style GetDecomposition with side
constraints, NamedStrToken extension tracking for variable
splitting with PowerExtension, CollectSymbols for Parikh analysis,
MinTerms for character class analysis, token ordering, Derivable
and BasicRegex flags.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-01
Clemens Eisenhofer 2026-03-01
--*/
#pragma once
#include "util/region.h"
#include "util/statistics.h"
#include "ast/ast.h"
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/euf/euf_snode.h"
#include "ast/euf/euf_egraph.h"
namespace euf {
class seq_plugin;
class sgraph {
struct stats {
unsigned m_num_nodes;
unsigned m_num_concat;
unsigned m_num_power;
unsigned m_num_hash_hits;
stats() { reset(); }
void reset() { memset(this, 0, sizeof(*this)); }
};
ast_manager& m;
seq_util m_seq;
seq_rewriter m_rewriter;
egraph& m_egraph;
region m_region;
snode_vector m_nodes;
sort_ref m_str_sort; // cached string sort
unsigned_vector m_scopes;
unsigned m_num_scopes = 0;
stats m_stats;
bool m_add_plugin; // whether sgraph created the seq_plugin
// tracks allocated subst caches for cleanup
ptr_vector<snode_subst_cache> m_subst_caches;
// maps expression id to snode
ptr_vector<snode> m_expr2snode;
snode* mk_snode(expr* e, snode_kind k, unsigned num_args, snode* const* args);
snode_kind classify(expr* e) const;
void compute_metadata(snode* n);
void compute_hash_matrix(snode* n);
void collect_re_predicates(snode* re, expr_ref_vector& preds);
public:
sgraph(ast_manager& m, egraph& eg, bool add_plugin = true);
~sgraph();
ast_manager& get_manager() const { return m; }
seq_util& get_seq_util() { return m_seq; }
egraph& get_egraph() { return m_egraph; }
egraph const& get_egraph() const { return m_egraph; }
// register an expression and return its snode
snode* mk(expr* e);
// lookup an already-registered expression
snode* find(expr* e) const;
// register expression in both sgraph and egraph
enode* mk_enode(expr* e);
// factory methods for creating snodes with corresponding expressions
snode* mk_var(symbol const& name);
snode* mk_char(unsigned ch);
snode* mk_empty();
snode* mk_concat(snode* a, snode* b);
// drop operations: remove tokens from the front/back of a concat tree
snode* drop_first(snode* n);
snode* drop_last(snode* n);
snode* drop_left(snode* n, unsigned count);
snode* drop_right(snode* n, unsigned count);
// substitution: replace all occurrences of var in n by replacement
snode* subst(snode* n, snode* var, snode* replacement);
// Brzozowski derivative of regex re with respect to element elem
snode* brzozowski_deriv(snode* re, snode* elem);
// compute minterms (character class partition) from a regex
void compute_minterms(snode* re, snode_vector& minterms);
// scope management for backtracking
void push();
void pop(unsigned num_scopes);
// access
snode_vector const& nodes() const { return m_nodes; }
unsigned num_nodes() const { return m_nodes.size(); }
// display
std::ostream& display(std::ostream& out) const;
void collect_statistics(statistics& st) const;
};
}

202
src/ast/euf/euf_snode.h Normal file
View file

@ -0,0 +1,202 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
euf_snode.h
Abstract:
snode layer for sequence/string graph
Encapsulates strings in the style of euf_enode.h.
Maps Z3 sequence expressions to a ZIPT-style representation where
strings are composed of tokens (characters, variables, powers, regex, etc.)
organized as a binary tree of concatenations.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-01
Clemens Eisenhofer 2026-03-01
--*/
#pragma once
#include "util/vector.h"
#include "util/region.h"
#include "ast/ast.h"
#include "ast/seq_decl_plugin.h"
namespace euf {
class sgraph;
class snode;
struct snode_subst_cache;
typedef ptr_vector<snode> snode_vector;
enum class snode_kind {
s_empty, // empty string (OP_SEQ_EMPTY or empty string constant)
s_char, // concrete character unit (OP_SEQ_UNIT wrapping a char literal)
s_var, // string variable (uninterpreted constant of string sort)
s_unit, // generic unit (OP_SEQ_UNIT with non-literal character)
s_concat, // concatenation of two snodes (OP_SEQ_CONCAT)
s_power, // string exponentiation s^n (OP_SEQ_POWER)
s_star, // Kleene star r* (OP_RE_STAR)
s_loop, // bounded loop r{lo,hi} (OP_RE_LOOP)
s_union, // union r1|r2 (OP_RE_UNION)
s_intersect, // intersection r1&r2 (OP_RE_INTERSECT)
s_complement, // complement ~r (OP_RE_COMPLEMENT)
s_fail, // empty language (OP_RE_EMPTY_SET)
s_full_char, // full character set (OP_RE_FULL_CHAR_SET)
s_full_seq, // full sequence set r=.* (OP_RE_FULL_SEQ_SET)
s_to_re, // string to regex (OP_SEQ_TO_RE)
s_in_re, // regex membership (OP_SEQ_IN_RE)
s_other, // other sequence expression not directly classified
};
class snode {
expr* m_expr = nullptr;
snode_kind m_kind = snode_kind::s_other;
unsigned m_id = UINT_MAX;
unsigned m_num_args = 0;
// metadata flags, analogous to ZIPT's Str/StrToken properties
bool m_ground = true; // no uninterpreted string variables
bool m_regex_free = true; // no regex constructs
bool m_nullable = false; // accepts the empty string
unsigned m_level = 0; // tree depth/level (0 for empty, 1 for singletons)
unsigned m_length = 0; // token count, number of leaf tokens in the tree
// hash matrix for associativity-respecting hashing (2x2 polynomial hash matrix)
// all zeros means not cached, non-zero means cached
unsigned m_hash_matrix[2][2] = {{0,0},{0,0}};
// substitution cache (lazy-initialized, owned by sgraph)
snode_subst_cache* m_subst_cache = nullptr;
snode* m_args[0]; // variable-length array, allocated via get_snode_size(num_args)
friend class sgraph;
static unsigned get_snode_size(unsigned num_args) {
return sizeof(snode) + num_args * sizeof(snode*);
}
static snode* mk(region& r, expr* e, snode_kind k, unsigned id, unsigned num_args, snode* const* args) {
void* mem = r.allocate(get_snode_size(num_args));
snode* n = new (mem) snode();
n->m_expr = e;
n->m_kind = k;
n->m_id = id;
n->m_num_args = num_args;
for (unsigned i = 0; i < num_args; ++i)
n->m_args[i] = args[i];
return n;
}
public:
expr* get_expr() const { return m_expr; }
snode_kind kind() const { return m_kind; }
unsigned id() const { return m_id; }
unsigned num_args() const { return m_num_args; }
snode* arg(unsigned i) const { SASSERT(i < m_num_args); return m_args[i]; }
bool is_ground() const { return m_ground; }
bool is_regex_free() const { return m_regex_free; }
bool is_nullable() const { return m_nullable; }
unsigned level() const { return m_level; }
unsigned length() const { return m_length; }
// associativity-respecting hash: cached if the 2x2 matrix is non-zero.
// M[0][0] = HASH_BASE^(num_leaves) which is always nonzero since HASH_BASE
// is odd and gcd(odd, 2^32) = 1, so the check is safe.
bool has_cached_hash() const { return m_hash_matrix[0][0] != 0; }
unsigned assoc_hash() const { return m_hash_matrix[0][1]; }
bool is_empty() const { return m_kind == snode_kind::s_empty; }
bool is_char() const { return m_kind == snode_kind::s_char; }
bool is_var() const { return m_kind == snode_kind::s_var; }
bool is_unit() const { return m_kind == snode_kind::s_unit; }
bool is_concat() const { return m_kind == snode_kind::s_concat; }
bool is_power() const { return m_kind == snode_kind::s_power; }
bool is_star() const { return m_kind == snode_kind::s_star; }
bool is_loop() const { return m_kind == snode_kind::s_loop; }
bool is_union() const { return m_kind == snode_kind::s_union; }
bool is_intersect() const { return m_kind == snode_kind::s_intersect; }
bool is_complement() const { return m_kind == snode_kind::s_complement; }
bool is_fail() const { return m_kind == snode_kind::s_fail; }
bool is_full_char() const { return m_kind == snode_kind::s_full_char; }
bool is_full_seq() const { return m_kind == snode_kind::s_full_seq; }
bool is_to_re() const { return m_kind == snode_kind::s_to_re; }
bool is_in_re() const { return m_kind == snode_kind::s_in_re; }
// is this a leaf token (analogous to ZIPT's StrToken as opposed to Str)
bool is_token() const {
switch (m_kind) {
case snode_kind::s_empty:
case snode_kind::s_concat:
return false;
default:
return true;
}
}
// analogous to ZIPT's Str.First / Str.Last
snode const* first() const {
snode const* s = this;
while (s->is_concat())
s = s->arg(0);
return s;
}
snode const* last() const {
snode const* s = this;
while (s->is_concat())
s = s->arg(1);
return s;
}
snode* first() {
snode* s = this;
while (s->is_concat())
s = s->arg(0);
return s;
}
snode* last() {
snode* s = this;
while (s->is_concat())
s = s->arg(1);
return s;
}
// collect all leaf tokens in left-to-right order
void collect_tokens(snode_vector& tokens) const {
if (is_concat()) {
arg(0)->collect_tokens(tokens);
arg(1)->collect_tokens(tokens);
}
else if (!is_empty()) {
tokens.push_back(const_cast<snode*>(this));
}
}
// access the i-th token (0-based, left-to-right order)
// returns nullptr if i >= length()
snode* at(unsigned i) const {
if (is_concat()) {
unsigned left_len = arg(0)->length();
if (i < left_len)
return arg(0)->at(i);
return arg(1)->at(i - left_len);
}
if (is_empty())
return nullptr;
return i == 0 ? const_cast<snode*>(this) : nullptr;
}
};
}