Revert is_concat changes, add ZIPT URL, implement snode/sgraph operations and tests

- Revert is_str_concat/is_re_concat to original form (PR #8820 review)
- Add ZIPT URL (https://github.com/CEisenhofer/ZIPT) to euf_sgraph.h
- Add snode::at() for token indexing and collect_tokens() for enumeration
- Add sgraph factory methods: mk_var, mk_char, mk_empty, mk_concat
- Add sgraph drop operations: drop_first, drop_last, drop_left, drop_right
- Add sgraph substitution: subst(snode*, snode*, snode*)
- Add Brzozowski derivative via seq_rewriter::mk_derivative
- Add minterm computation from regex predicates
- Add 7 new unit tests covering all new operations with complex concats

Co-authored-by: NikolajBjorner <56730610+NikolajBjorner@users.noreply.github.com>
2026-06-27 19:08:49 +00:00 · 2026-03-02 19:28:32 +00:00 · 2026-03-02 19:28:32 +00:00 · 150f1fe2ea
commit 150f1fe2ea
parent 40b99311e3
5 changed files with 501 additions and 11 deletions
--- a/src/ast/euf/euf_seq_plugin.h
+++ b/src/ast/euf/euf_seq_plugin.h
@ -93,22 +93,18 @@ namespace euf {
        bool is_str_concat(enode* n) const { return m_seq.str.is_concat(n->get_expr()); }
        bool is_str_concat(enode* n, enode*& a, enode*& b) {
            expr* ea = nullptr, *eb = nullptr;
-            if (!m_seq.str.is_concat(n->get_expr(), ea, eb))
-                return false;
-            a = n->get_arg(0);
-            b = n->get_arg(1);
-            return true;
+            return m_seq.str.is_concat(n->get_expr(), ea, eb) &&
+                   n->num_args() == 2 &&  // only read args 0 and 1 when the enode has exactly two
+                   (a = n->get_arg(0), b = n->get_arg(1), true);  // comma expression: set outputs, then yield true
        }

        // regex concat predicates
        bool is_re_concat(enode* n) const { return m_seq.re.is_concat(n->get_expr()); }
        bool is_re_concat(enode* n, enode*& a, enode*& b) {
            expr* ea = nullptr, *eb = nullptr;
-            if (!m_seq.re.is_concat(n->get_expr(), ea, eb))
-                return false;
-            a = n->get_arg(0);
-            b = n->get_arg(1);
-            return true;
+            return m_seq.re.is_concat(n->get_expr(), ea, eb) &&
+                   n->num_args() == 2 &&  // only read args 0 and 1 when the enode has exactly two
+                   (a = n->get_arg(0), b = n->get_arg(1), true);  // comma expression: set outputs, then yield true
        }

        // any concat, string or regex
--- a/src/ast/euf/euf_sgraph.cpp
+++ b/src/ast/euf/euf_sgraph.cpp
@ -26,8 +26,10 @@ namespace euf {
    sgraph::sgraph(ast_manager& m):
        m(m),
        m_seq(m),
+        m_rewriter(m),
        m_egraph(m),
-        m_exprs(m) {
+        m_exprs(m),
+        m_str_sort(m_seq.str.mk_string_sort(), m) {
        // create seq_plugin and register it with the egraph
        m_egraph.add_plugin(alloc(seq_plugin, m_egraph));
        // register on_make callback so sgraph creates snodes for new enodes
@ -354,6 +356,143 @@ namespace euf {
        m_egraph.pop(num_scopes);
    }

+    // Build a constant named `name` of the cached string sort and return
+    // the snode that mk() yields for it.
+    snode* sgraph::mk_var(symbol const& name) {
+        app* var_const = m.mk_const(name, m_str_sort);
+        expr_ref pinned(var_const, m);
+        return mk(pinned);
+    }
+
+    // Wrap the character with code `ch` in a unit sequence and return the
+    // snode that mk() yields for that unit.
+    snode* sgraph::mk_char(unsigned ch) {
+        expr_ref char_lit(m_seq.str.mk_char(ch), m);
+        expr_ref unit_seq(m_seq.str.mk_unit(char_lit), m);
+        return mk(unit_seq);
+    }
+
+    // Return the snode for the empty sequence of the cached string sort.
+    snode* sgraph::mk_empty() {
+        expr_ref eps(m_seq.str.mk_empty(m_str_sort), m);
+        return mk(eps);
+    }
+
+    // Concatenate a and b. If either operand is empty the other operand is
+    // returned as is and no concat expression is built.
+    snode* sgraph::mk_concat(snode* a, snode* b) {
+        if (a->is_empty())
+            return b;
+        if (b->is_empty())
+            return a;
+        expr* lhs = a->get_expr();
+        expr* rhs = b->get_expr();
+        expr_ref cat(m_seq.str.mk_concat(lhs, rhs), m);
+        return mk(cat);
+    }
+
+    // Remove the leftmost token of n.
+    // An empty node or a single token yields the empty sequence. For a
+    // concat the token is taken from the left child; an empty left child
+    // holds no token, so the removal continues in the right child
+    // (returning the right child unchanged would drop nothing).
+    snode* sgraph::drop_first(snode* n) {
+        if (n->is_empty() || n->is_token())
+            return mk_empty();
+        SASSERT(n->is_concat());
+        snode* l = n->arg(0);
+        snode* r = n->arg(1);
+        if (l->is_empty())
+            return drop_first(r);
+        if (l->is_token())
+            return r;
+        return mk_concat(drop_first(l), r);
+    }
+
+    // Remove the rightmost token of n.
+    // An empty node or a single token yields the empty sequence. For a
+    // concat the token is taken from the right child; an empty right child
+    // holds no token, so the removal continues in the left child
+    // (returning the left child unchanged would drop nothing).
+    snode* sgraph::drop_last(snode* n) {
+        if (n->is_empty() || n->is_token())
+            return mk_empty();
+        SASSERT(n->is_concat());
+        snode* l = n->arg(0);
+        snode* r = n->arg(1);
+        if (r->is_empty())
+            return drop_last(l);
+        if (r->is_token())
+            return l;
+        return mk_concat(l, drop_last(r));
+    }
+
+    // Apply drop_first up to `count` times, stopping early once n is empty.
+    snode* sgraph::drop_left(snode* n, unsigned count) {
+        unsigned remaining = count;
+        while (remaining > 0 && !n->is_empty()) {
+            n = drop_first(n);
+            --remaining;
+        }
+        return n;
+    }
+
+    // Apply drop_last up to `count` times, stopping early once n is empty.
+    snode* sgraph::drop_right(snode* n, unsigned count) {
+        unsigned remaining = count;
+        while (remaining > 0 && !n->is_empty()) {
+            n = drop_last(n);
+            --remaining;
+        }
+        return n;
+    }
+
+    // Replace occurrences of `var` in `n` by `replacement`.
+    // Occurrences are found by snode pointer equality. Only concat nodes are
+    // descended into; every other node that is not `var` itself is returned
+    // unchanged.
+    snode* sgraph::subst(snode* n, snode* var, snode* replacement) {
+        if (n == var)
+            return replacement;
+        if (n->is_empty() || n->is_char())
+            return n;
+        if (n->is_concat()) {
+            // Sequence the recursive calls explicitly (left, then right):
+            // as function arguments their evaluation order is unspecified,
+            // which would let node creation order vary between compilers.
+            snode* l = subst(n->arg(0), var, replacement);
+            snode* r = subst(n->arg(1), var, replacement);
+            return mk_concat(l, r);
+        }
+        // for non-concat compound nodes (power, star, etc.), no substitution into children
+        return n;
+    }
+
+    // Derivative of regex `re` with respect to `elem`, computed by
+    // seq_rewriter::mk_derivative.
+    // If `elem` is a unit sequence, the wrapped character expression is
+    // passed to the rewriter instead of the unit itself.
+    // Returns nullptr when either node (or its expression) is missing, or
+    // when the rewriter produces no result.
+    snode* sgraph::brzozowski_deriv(snode* re, snode* elem) {
+        // a null node is reported like a node without an expression
+        if (!re || !elem)
+            return nullptr;
+        expr* re_expr = re->get_expr();
+        expr* elem_expr = elem->get_expr();
+        if (!re_expr || !elem_expr)
+            return nullptr;
+        // unwrap str.unit to get the character expression
+        expr* ch = nullptr;
+        if (m_seq.str.is_unit(elem_expr, ch))
+            elem_expr = ch;
+        expr_ref result = m_rewriter.mk_derivative(elem_expr, re_expr);
+        if (!result)
+            return nullptr;
+        return mk(result);
+    }
+
+    // Append to `preds` every character-range sub-expression reachable from
+    // `re`, in traversal order. The same range may be appended more than once.
+    // A null node or a node without an expression contributes nothing.
+    void sgraph::collect_re_predicates(snode* re, expr_ref_vector& preds) {
+        if (!re || !re->get_expr())
+            return;
+        expr* e = re->get_expr();
+        expr* lo = nullptr, *hi = nullptr;
+        // a character range is a leaf predicate: record it and stop
+        if (m_seq.re.is_range(e, lo, hi)) {
+            preds.push_back(e);
+            return;
+        }
+        // these leaves contribute no predicate and are not descended into
+        // (note that to_re, i.e. literal strings, is skipped as well)
+        if (m_seq.re.is_to_re(e) ||
+            m_seq.re.is_full_char(e) ||
+            m_seq.re.is_full_seq(e) ||
+            m_seq.re.is_empty(e))
+            return;
+        // recurse into compound regex operators
+        for (unsigned i = 0; i < re->num_args(); ++i)
+            collect_re_predicates(re->arg(i), preds);
+    }
+
+    // Append to `minterms` one snode per sign assignment over the range
+    // predicates collected from `re`: for n predicates, 2^n intersections in
+    // which predicate i appears positively iff bit i of the mask is set and
+    // complemented otherwise. With no predicates a single full_char node is
+    // appended.
+    // Throws default_exception when there are too many predicates for the
+    // unsigned bit mask used by the enumeration.
+    // NOTE(review): complements are built with re.mk_complement; confirm
+    // this is the intended character-level negation.
+    void sgraph::compute_minterms(snode* re, snode_vector& minterms) {
+        // extract character predicates from the regex
+        expr_ref_vector preds(m);
+        collect_re_predicates(re, preds);
+        if (preds.empty()) {
+            // no predicates means the whole alphabet is one minterm
+            // represented by full_char; full_char is built over the regex
+            // sort derived from the string sort, not the string sort itself
+            sort_ref re_sort(m_seq.re.mk_re(m_str_sort), m);
+            expr_ref fc(m_seq.re.mk_full_char(re_sort), m);
+            minterms.push_back(mk(fc));
+            return;
+        }
+        unsigned n = preds.size();
+        // shifting 1u by the width of unsigned (or more) is undefined, so
+        // refuse inputs the mask cannot represent instead of miscounting
+        if (n >= 8 * sizeof(unsigned))
+            throw default_exception("too many regex predicates for minterm enumeration");
+        // the complement of each predicate does not depend on the mask
+        expr_ref_vector neg(m);
+        for (unsigned i = 0; i < n; ++i)
+            neg.push_back(m_seq.re.mk_complement(preds.get(i)));
+        // generate minterms as conjunctions/negations of predicates
+        // for n predicates, there are up to 2^n minterms
+        unsigned num_masks = 1u << n;
+        for (unsigned mask = 0; mask < num_masks; ++mask) {
+            // intersect all terms, left-nested: ((t0 & t1) & t2) ...
+            expr_ref mt(m);
+            for (unsigned i = 0; i < n; ++i) {
+                expr* term = (mask & (1u << i)) ? preds.get(i) : neg.get(i);
+                if (i == 0)
+                    mt = term;
+                else
+                    mt = m_seq.re.mk_inter(mt, term);
+            }
+            minterms.push_back(mk(mt));
+        }
+    }
+
    std::ostream& sgraph::display(std::ostream& out) const {
        auto kind_str = [](snode_kind k) -> char const* {
            switch (k) {
--- a/src/ast/euf/euf_sgraph.h
+++ b/src/ast/euf/euf_sgraph.h
@ -10,6 +10,7 @@ Abstract:
    Sequence/string graph layer

    Encapsulates string and regex expressions for the string solver.
+    Implements the string graph layer from ZIPT (https://github.com/CEisenhofer/ZIPT).
    The sgraph maps Z3 sequence/regex AST expressions to snode structures
    organized as binary concatenation trees with metadata, and owns an
    egraph with a seq_plugin for congruence closure.
@ -59,6 +60,7 @@ Author:
 #include "util/statistics.h"
 #include "ast/ast.h"
 #include "ast/seq_decl_plugin.h"
+#include "ast/rewriter/seq_rewriter.h"
 #include "ast/euf/euf_snode.h"
 #include "ast/euf/euf_egraph.h"

@ -79,10 +81,12 @@ namespace euf {

        ast_manager&     m;
        seq_util         m_seq;
+        seq_rewriter     m_rewriter;
        egraph           m_egraph;
        region           m_region;
        snode_vector     m_nodes;
        expr_ref_vector  m_exprs;       // pin expressions
+        sort_ref         m_str_sort;    // cached string sort
        unsigned_vector  m_scopes;
        unsigned         m_num_scopes = 0;
        stats            m_stats;
@ -93,6 +97,7 @@ namespace euf {
        snode* mk_snode(expr* e, snode_kind k, unsigned num_args, snode* const* args);
        snode_kind classify(expr* e) const;
        void compute_metadata(snode* n);
+        void collect_re_predicates(snode* re, expr_ref_vector& preds);

    public:
        sgraph(ast_manager& m);
@ -112,6 +117,27 @@ namespace euf {
        // register expression in both sgraph and egraph
        enode* mk_enode(expr* e);

+        // factory methods for creating snodes with corresponding expressions
+        snode* mk_var(symbol const& name);       // constant `name` of the cached string sort
+        snode* mk_char(unsigned ch);             // unit sequence wrapping character code ch
+        snode* mk_empty();                       // empty sequence of the cached string sort
+        snode* mk_concat(snode* a, snode* b);    // returns the other operand when one is empty
+
+        // drop operations: remove tokens from the front/back of a concat tree
+        snode* drop_first(snode* n);                 // empty or single token yields the empty sequence
+        snode* drop_last(snode* n);                  // empty or single token yields the empty sequence
+        snode* drop_left(snode* n, unsigned count);  // drop_first up to count times, stops once empty
+        snode* drop_right(snode* n, unsigned count); // drop_last up to count times, stops once empty
+
+        // substitution: replace all occurrences of var in n by replacement
+        snode* subst(snode* n, snode* var, snode* replacement);  // matches var by pointer; descends only into concats
+
+        // Brzozowski derivative of regex re with respect to element elem
+        snode* brzozowski_deriv(snode* re, snode* elem);  // nullptr when an expression is missing or no result is produced
+
+        // compute minterms (character class partition) from a regex
+        void compute_minterms(snode* re, snode_vector& minterms);  // appends to minterms; up to 2^n entries for n range predicates
+
+
        // scope management for backtracking
        void push();
        void pop(unsigned num_scopes);
--- a/src/ast/euf/euf_snode.h
+++ b/src/ast/euf/euf_snode.h
@ -157,6 +157,31 @@ namespace euf {
                s = s->arg(1);
            return s;
        }
+
+        // Append every non-empty leaf below this node to `tokens`,
+        // visiting the left child of each concat before its right child.
+        void collect_tokens(snode_vector& tokens) const {
+            if (!is_concat()) {
+                // leaf: empty nodes contribute no token
+                if (!is_empty())
+                    tokens.push_back(const_cast<snode*>(this));
+                return;
+            }
+            arg(0)->collect_tokens(tokens);
+            arg(1)->collect_tokens(tokens);
+        }
+
+        // Return the token at position i (0-based, counted left to right),
+        // or nullptr when no token sits at that position.
+        snode* at(unsigned i) const {
+            snode const* cur = this;
+            // walk down to the leaf covering position i; when stepping into
+            // a right child, rebase i by the length of the left sibling
+            while (cur->is_concat()) {
+                unsigned left_len = cur->arg(0)->length();
+                if (i < left_len) {
+                    cur = cur->arg(0);
+                }
+                else {
+                    i -= left_len;
+                    cur = cur->arg(1);
+                }
+            }
+            if (cur->is_empty() || i != 0)
+                return nullptr;
+            return const_cast<snode*>(cur);
+        }
    };

 }