Moved the regex splitting into rewriter

Added some simplifications
2026-07-11 17:46:20 +00:00 · 2026-06-10 15:00:42 +02:00 · 2026-06-10 15:00:42 +02:00 · dbb3f70873
commit dbb3f70873
parent 03a76c0309
7 changed files with 484 additions and 324 deletions
--- a/src/ast/rewriter/CMakeLists.txt
+++ b/src/ast/rewriter/CMakeLists.txt
@ -40,6 +40,7 @@ z3_add_component(rewriter
    seq_axioms.cpp
    seq_eq_solver.cpp
    seq_subset.cpp
+    seq_split.cpp
    seq_rewriter.cpp
    seq_skolem.cpp
    th_rewriter.cpp
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -24,6 +24,7 @@ Notes:
 #include "ast/rewriter/rewriter_types.h"
 #include "ast/rewriter/bool_rewriter.h"
 #include "ast/rewriter/seq_subset.h"
+#include "ast/rewriter/seq_split.h"
 #include "util/params.h"
 #include "util/lbool.h"
 #include "util/sign.h"
@ -130,6 +131,7 @@ class seq_rewriter {

    seq_util       m_util;
    seq_subset     m_subset;
+    seq_split      m_split;
    arith_util     m_autil;
    bool_rewriter  m_br;
    // re2automaton   m_re2aut;
@ -342,7 +344,7 @@ class seq_rewriter {

 public:
    seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
-        m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m), 
+        m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m),
        m_op_cache(m), m_es(m), 
        m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
    }
@ -381,6 +383,12 @@ public:
        return result;
    }

+    // Split decomposition (sigma) of a regex; see seq_split.h.
+    bool split(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong) {
+        return m_split.compute(r, out, threshold, mode);
+    }
+    void simplify_split(split_set& s) { m_split.simplify(s); }
+
    expr_ref mk_symmetric_diff(expr *r1, expr *r2);

    /**
--- a/src/ast/rewriter/seq_split.cpp
+++ b/src/ast/rewriter/seq_split.cpp
@ -0,0 +1,366 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.cpp
+
+Abstract:
+
+    Regex split decomposition (the split function sigma).  See seq_split.h.
+
+Author:
+
+    Nikolaj Bjorner (nbjorner) 2026-6-10
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+
+#include "ast/rewriter/seq_split.h"
+#include "ast/rewriter/seq_rewriter.h"
+#include "ast/ast_pp.h"
+#include "util/obj_hashtable.h"
+
+seq_split::seq_split(seq_rewriter& rw) :
+    m_rw(rw), m_subset(rw.u().re) {}
+
+ast_manager&   seq_split::m()   const { return m_rw.m(); }
+seq_util&      seq_split::seq() const { return m_rw.u(); }
+seq_util::rex& seq_split::re()  const { return m_rw.u().re; }
+
+// Cross-product intersection of two split-sets (split algebra):
+//   S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
+// Pairs where any component is bottom (the empty regex) are dropped.
+bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold) {
+    seq_util::rex& r = re();
+    for (auto const& p1 : s1) {
+        for (auto const& p2 : s2) {
+            if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
+                r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
+                continue;
+            result.push_back(split_pair(r.mk_inter(p1.m_d, p2.m_d),
+                                        r.mk_inter(p1.m_n, p2.m_n), m()));
+            if (result.size() > threshold)
+                return false;
+        }
+    }
+    return true;
+}
+
+// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
+//   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+// May produce up to 2^|sp| pairs (bounded by the threshold).  A threshold
+// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
+// split-set, since each ~sp[i] further constrains ~S.
+bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold) {
+    seq_util::rex& r = re();
+    sort* re_sort = r.mk_re(seq_sort);
+    const expr_ref full(r.mk_full_seq(re_sort), m());   // .*
+    if (sp.empty()) {                                   // ~{} = <.*, .*>
+        result.push_back(split_pair(full, full, m()));
+        return true;
+    }
+    split_set acc;
+    acc.push_back(split_pair(r.mk_complement(sp[0].m_d), full, m()));
+    acc.push_back(split_pair(full, r.mk_complement(sp[0].m_n), m()));
+    for (unsigned i = 1; i < sp.size(); ++i) {
+        split_set next;
+        next.push_back(split_pair(r.mk_complement(sp[i].m_d), full, m()));
+        next.push_back(split_pair(full, r.mk_complement(sp[i].m_n), m()));
+        split_set tmp;
+        if (!intersect(acc, next, tmp, threshold))
+            return false;
+        acc = std::move(tmp);
+        if (acc.empty())            // intersection empty => ~S is empty
+            break;
+        if (acc.size() > threshold)
+            return false;
+    }
+    result.append(acc);
+    return true;
+}
+
+bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode) {
+    SASSERT(r);
+    seq_util& sq = seq();
+    seq_util::rex& rex = re();
+    ast_manager& mm = m();
+
+    sort* seq_sort = nullptr;
+    if (!sq.is_re(r, seq_sort))
+        return false;
+
+    // bottom: sigma(empty) = {} (the empty split-set)
+    if (rex.is_empty(r))
+        return true;
+
+    // epsilon: sigma(eps) = { <eps, eps> }
+    if (rex.is_epsilon(r)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        result.push_back(split_pair(eps, eps, mm));
+        return true;
+    }
+
+    expr* a = nullptr, *b = nullptr;
+
+    // to_re(s): split the literal word s at every position.
+    expr* s = nullptr;
+    if (rex.is_to_re(r, s)) {
+        zstring str;
+        if (sq.str.is_string(s, str)) {
+            for (unsigned i = 0; i <= str.length(); ++i) {
+                const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm);
+                const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm);
+                result.push_back(split_pair(p, q, mm));
+            }
+            return true;
+        }
+        // a single symbolic unit behaves like one token: { <eps, u>, <u, eps> }
+        if (sq.str.is_unit(s)) {
+            const expr_ref ex(r, mm);
+            const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+            result.push_back(split_pair(eps, ex, mm));
+            result.push_back(split_pair(ex, eps, mm));
+            return true;
+        }
+        // to_re over a non-literal sequence: not handled.
+        return false;
+    }
+
+    // single-character class alpha (., [lo-hi], of_pred):
+    //   sigma(alpha) = { <eps, alpha>, <alpha, eps> }
+    if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
+        const expr_ref ex(r, mm);
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        result.push_back(split_pair(eps, ex, mm));
+        result.push_back(split_pair(ex, eps, mm));
+        return true;
+    }
+
+    // .* : sigma(.*) = { <.*, .*> }
+    if (rex.is_full_seq(r)) {
+        const expr_ref ex(r, mm);
+        result.push_back(split_pair(ex, ex, mm));
+        return true;
+    }
+
+    // union: sigma(r0 | ... | r_{n-1}) = U sigma(ri)   (re.union may be n-ary)
+    if (rex.is_union(r)) {
+        app* ap = to_app(r);
+        for (unsigned i = 0; i < ap->get_num_args(); ++i)
+            if (!compute(ap->get_arg(i), result, threshold, mode))
+                return false;
+        return true;
+    }
+
+    // concat: sigma(r0...r_{n-1}) = U_i  (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
+    // (re.++ may be n-ary)
+    if (rex.is_concat(r)) {
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        for (unsigned i = 0; i < n; ++i) {
+            split_set sigma_arg;
+            if (!compute(ap->get_arg(i), sigma_arg, threshold, mode))
+                return false;
+
+            expr_ref left(mm), right(mm);
+            if (i == 0)
+                left = rex.mk_epsilon(seq_sort);
+            else
+                for (unsigned j = 0; j < i; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm);
+                }
+            if (i == n - 1)
+                right = rex.mk_epsilon(seq_sort);
+            else
+                for (unsigned j = i + 1; j < n; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    right = right ? expr_ref(rex.mk_concat(right, arg), mm) : expr_ref(arg, mm);
+                }
+
+            for (auto const& [d, nn] : sigma_arg) {
+                const expr_ref p = m_rw.mk_re_append(left, d);
+                const expr_ref q = m_rw.mk_re_append(nn, right);
+                result.push_back(split_pair(p, q, mm));
+            }
+        }
+        return true;
+    }
+
+    // star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
+    if (rex.is_star(r, a)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        result.push_back(split_pair(eps, eps, mm));
+        split_set sa;
+        if (!compute(a, sa, threshold, mode))
+            return false;
+        for (auto const& [d, n] : sa) {
+            const expr_ref p = m_rw.mk_re_append(r, d);   // a*.D
+            const expr_ref q = m_rw.mk_re_append(n, r);   // N.a*
+            result.push_back(split_pair(p, q, mm));
+        }
+        return true;
+    }
+
+    // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a*  (star rule without <eps,eps>)
+    if (rex.is_plus(r, a)) {
+        const expr_ref star(rex.mk_star(a), mm);          // a*
+        split_set sa;
+        if (!compute(a, sa, threshold, mode))
+            return false;
+        for (auto const& [d, n] : sa) {
+            const expr_ref p = m_rw.mk_re_append(star, d);
+            const expr_ref q = m_rw.mk_re_append(n, star);
+            result.push_back(split_pair(p, q, mm));
+        }
+        return true;
+    }
+
+    // intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri)   (re.inter may be n-ary)
+    if (rex.is_intersection(r)) {
+        if (mode == split_mode::weak)
+            return false;
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        split_set current;
+        if (!compute(ap->get_arg(0), current, threshold, mode))
+            return false;
+        // A give-up on any conjunct must propagate as a give-up: silently treating
+        // it as the empty split-set would collapse the whole intersection to bottom
+        // and be misreported as an (unsound) conflict.
+        for (unsigned i = 1; i < n && !current.empty(); ++i) {
+            split_set arg_i, tmp;
+            if (!compute(ap->get_arg(i), arg_i, threshold, mode))
+                return false;
+            if (!intersect(current, arg_i, tmp, threshold))
+                return false;
+            current = std::move(tmp);
+        }
+        result.append(current);
+        return true;
+    }
+
+    // complement: sigma(~a) = ~sigma(a)
+    if (rex.is_complement(r, a)) {
+        if (mode == split_mode::weak)
+            return false;
+        split_set sa;
+        if (!compute(a, sa, threshold, mode))
+            return false;
+        return complement(seq_sort, sa, result, threshold);
+    }
+
+    // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b)
+    if (rex.is_diff(r, a, b)) {
+        if (mode == split_mode::weak)
+            return false;
+        split_set sa, sb, sb_compl, tmp;
+        if (!compute(a, sa, threshold, mode))
+            return false;
+        if (!compute(b, sb, threshold, mode))
+            return false;
+        if (!complement(seq_sort, sb, sb_compl, threshold))
+            return false;
+        if (!intersect(sa, sb_compl, tmp, threshold))
+            return false;
+        result.append(tmp);
+        return true;
+    }
+
+    // bounded loop / ite / other: not handled (paper "v1: bail").
+    TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";);
+    return false;
+}
+
+// same-D / same-N merge (paper eqs. 1 & 2):
+//   { <D,N>, <D,N'> } -> <D, N | N'>      (by_left = true,  group by D)
+//   { <D,N>, <D',N> } -> <D | D', N>      (by_left = false, group by N)
+// Only fires on syntactically-identical (perfectly-shared) key components, so
+// it is a conservative instance of the rule.
+void seq_split::merge_by(split_set& pairs, bool by_left) {
+    seq_util::rex& r = re();
+    ast_manager& mm = m();
+    obj_map<expr, unsigned> idx;   // key component -> position in `out`
+    split_set out;
+    for (auto const& p : pairs) {
+        expr* key   = by_left ? p.m_d.get() : p.m_n.get();
+        expr* other = by_left ? p.m_n.get() : p.m_d.get();
+        unsigned pos;
+        if (idx.find(key, pos)) {
+            expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
+            const expr_ref u(r.mk_union(prev, other), mm);
+            if (by_left)
+                out[pos].m_n = u;
+            else
+                out[pos].m_d = u;
+        }
+        else {
+            idx.insert(key, out.size());
+            out.push_back(p);
+        }
+    }
+    pairs.swap(out);
+}
+
+void seq_split::simplify(split_set& pairs) {
+    seq_util::rex& r = re();
+
+    // 1. drop pairs with a bottom (empty-language) component.
+    unsigned w = 0;
+    for (unsigned i = 0; i < pairs.size(); ++i) {
+        if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
+            continue;
+        if (w != i)
+            pairs[w] = pairs[i];
+        ++w;
+    }
+    pairs.shrink(w);
+    if (pairs.size() <= 1)
+        return;
+
+    // 2. same-D / same-N merge rules.
+    merge_by(pairs, true);
+    merge_by(pairs, false);
+    if (pairs.size() <= 1)
+        return;
+
+    // 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
+    //    L(N_i) subseteq L(N_j) for some kept j.  seq_subset is conservative
+    //    (returns true only for definite containment), so we never drop a
+    //    needed split.  Size-capped to bound the O(n^2) subset checks.
+    if (pairs.size() > 64)
+        return;
+
+    struct row { expr* d; expr* n; unsigned idx; };
+    vector<row> rows;
+    for (unsigned i = 0; i < pairs.size(); ++i)
+        rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
+
+    auto subsumes = [&](row const& a, row const& b) {
+        return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
+    };
+
+    vector<row> kept;
+    for (row const& row_r : rows) {
+        bool redundant = false;
+        for (row const& k : kept)
+            if (subsumes(k, row_r)) { redundant = true; break; }
+        if (redundant)
+            continue;
+        // drop already-kept rows strictly subsumed by row_r
+        unsigned kw = 0;
+        for (unsigned t = 0; t < kept.size(); ++t) {
+            if (subsumes(row_r, kept[t]))
+                continue;
+            kept[kw++] = kept[t];
+        }
+        kept.shrink(kw);
+        kept.push_back(row_r);
+    }
+
+    split_set result;
+    for (row const& k : kept)
+        result.push_back(pairs[k.idx]);
+    pairs.swap(result);
+}
--- a/src/ast/rewriter/seq_split.h
+++ b/src/ast/rewriter/seq_split.h
@ -0,0 +1,93 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.h
+
+Abstract:
+
+    Regex split decomposition: the split function sigma from the paper
+    "Solving by Splitting".  For a regular expression r, sigma(r) is a finite
+    "split-set" of pairs { <D_i, N_i> } such that
+
+        u.v in L(r)   iff   exists i:  u in L(D_i)  and  v in L(N_i).
+
+    This lifts the decomposition previously buried in the nseq solver
+    (seq::compute_sigma over euf::snode) into a self-contained, expr*-based
+    engine that can be reused anywhere a seq_rewriter is available (the legacy
+    seq solver, nseq, and the regex rewriter itself).
+
+    The split algebra (intersection, De Morgan complement, left/right
+    concatenation with a regex) and the cardinality-reducing simplification
+    heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
+    follow the paper.
+
+Author:
+
+    Nikolaj Bjorner (nbjorner) 2026-6-10
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+#pragma once
+
+#include "ast/seq_decl_plugin.h"
+#include "ast/rewriter/seq_subset.h"
+
+class seq_rewriter;
+
+// An individual split <D, N>: the left (prefix) regex D and right (suffix)
+// regex N.  u.v in L(r) for this split iff u in L(D) and v in L(N).
+struct split_pair {
+    expr_ref m_d;
+    expr_ref m_n;
+    split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
+        SASSERT(d && n);
+    }
+};
+
+// A split-set is a union of individual splits.
+typedef vector<split_pair> split_set;
+
+// Controls how aggressively sigma expands the Boolean-closure cases:
+//   strong - fully expand complement / intersection via the split algebra
+//            (De Morgan / cross product).  This is the behaviour the nseq
+//            solver relies on.
+//   weak   - do not perform the (potentially 2^k) Boolean-closure expansion;
+//            give up (return false) on complement / intersection instead.
+enum class split_mode { weak, strong };
+
+class seq_split {
+    seq_rewriter& m_rw;       // for mk_re_append + manager / seq_util access
+    seq_subset    m_subset;   // language-subset checks for subsumption
+
+    ast_manager&   m() const;
+    seq_util&      seq() const;
+    seq_util::rex& re() const;
+
+    // S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
+    // component.  Returns false on threshold overrun.
+    bool intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold);
+
+    // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
+    //   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+    bool complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold);
+
+    // same-D / same-N merge: groups pairs that share a (syntactically identical)
+    // left (resp. right) component and unions the other component.
+    void merge_by(split_set& pairs, bool by_left);
+
+public:
+    explicit seq_split(seq_rewriter& rw);
+
+    // Compute sigma(r), appending to `out` (does not clear it).  `threshold`
+    // bounds the number of produced splits; an overrun, an unsupported regex
+    // shape (bounded loop / ite), or a Boolean-closure case in weak mode makes
+    // it return false ("give up").
+    bool compute(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong);
+
+    // In-place simplification of a split-set: drop bottom components, apply the
+    // same-D / same-N merge rules, and drop splits subsumed by another (using
+    // seq_subset).  Size-capped to keep the O(n^2) subsumption affordable.
+    void simplify(split_set& s);
+};
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@ -3576,291 +3576,6 @@ namespace seq {
    // Modifier: apply_regex_factorization (Boolean Closure)
    // -----------------------------------------------------------------------

-    // Cross-product intersection of two split-sets (split algebra):
-    //   S1 ⊓ S2 = { ⟨Δ1⊓Δ2, ∇1⊓∇2⟩ | ⟨Δ1,∇1⟩∈S1, ⟨Δ2,∇2⟩∈S2 }
-    // Pairs where either component is the empty regex are dropped (∅⊓r ≡ ∅).
-    static bool intersect_sigma_pairs(ast_manager& m, seq_util& seq,
-            sigma_pairs const& s1, sigma_pairs const& s2, sigma_pairs& result, unsigned threshold) {
-        for (auto const& p1 : s1) {
-            for (auto const& p2 : s2) {
-                if (seq.re.is_empty(p1.m_p) || seq.re.is_empty(p2.m_p) ||
-                    seq.re.is_empty(p1.m_q) || seq.re.is_empty(p2.m_q))
-                    continue;
-                result.push_back(sigma_pair(seq.re.mk_inter(p1.m_p, p2.m_p),
-                                            seq.re.mk_inter(p1.m_q, p2.m_q), m));
-                if (result.size() > threshold)
-                    return false;
-            }
-        }
-        return true;
-    }
-
-    // Complement of a split-set via De Morgan: ~S = ⊓_{s∈S} ~s,
-    //   ~⟨Δ,∇⟩ = { ⟨~Δ, .*⟩, ⟨.*, ~∇⟩ }   and   ~{} = { ⟨.*, .*⟩ }.
-    // str_sort is the sequence-element sort; mk_full_seq needs the regex sort.
-    // May produce up to 2^|sp| pairs (bounded downstream by the factorization threshold).
-    static bool complement_sigma_pairs(ast_manager& m, seq_util& seq, sort* str_sort,
-            sigma_pairs const& sp, sigma_pairs& result, unsigned threshold) {
-
-        sort* re_sort = seq.re.mk_re(str_sort);
-        const expr_ref full(seq.re.mk_full_seq(re_sort), m);   // .*
-        if (sp.empty()) {                                      // ~{} = ⟨.*, .*⟩
-            result.push_back(sigma_pair(full, full, m));
-            return true;
-        }
-        sigma_pairs acc;
-        acc.push_back(sigma_pair(seq.re.mk_complement(sp[0].m_p), full, m));
-        acc.push_back(sigma_pair(full, seq.re.mk_complement(sp[0].m_q), m));
-        for (unsigned i = 1; i < sp.size(); ++i) {
-            sigma_pairs next;
-            next.push_back(sigma_pair(seq.re.mk_complement(sp[i].m_p), full, m));
-            next.push_back(sigma_pair(full, seq.re.mk_complement(sp[i].m_q), m));
-            sigma_pairs tmp;
-            // De Morgan fold: acc := acc ⊓ ~sp[i]. intersect_sigma_pairs returns
-            // false ONLY when it overruns the threshold; in that case we must give
-            // up entirely (a partial fold is a strictly weaker — hence unsound —
-            // split set, since each ~sp[i] further constrains ~S).
-            if (!intersect_sigma_pairs(m, seq, acc, next, tmp, threshold))
-                return false;
-            acc = std::move(tmp);
-            if (acc.empty())            // intersection empty ⇒ ~S is empty
-                break;
-            if (acc.size() > threshold)
-                return false;
-        }
-        result.append(acc);
-        return true;
-    }
-
-    bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold) {
-        SASSERT(r);
-        sort* str_sort = nullptr;
-        if (!seq.is_re(r->get_expr(), str_sort))
-            return false;
-        // std::cout << "Computing sigma of " << snode_label_html(r, m, false) << std::endl;
-
-        if (r->is_empty()) {
-            const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-            result.push_back(sigma_pair(eps, eps, m));
-            return true;
-        }
-        if (r->is_to_re()) {
-            const euf::snode* const c = r->arg0();
-            if (c->is_range()) {
-                const expr_ref ex(c->get_expr(), m);
-                const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-
-                result.push_back(sigma_pair(eps, ex, m));
-                result.push_back(sigma_pair(ex, eps, m));
-                return true;
-            }
-            if (c->is_empty()) {
-                const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-                result.push_back(sigma_pair(eps, eps, m));
-                return true;
-            }
-            if (c->is_char()) {
-                unsigned val;
-                VERIFY(seq.is_const_char(c->arg0()->get_expr(), val));
-                const expr_ref ex(seq.re.mk_to_re(seq.str.mk_string(zstring(val))), m);
-                const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-
-                result.push_back(sigma_pair(eps, ex, m));
-                result.push_back(sigma_pair(ex, eps, m));
-                return true;
-            }
-            zstring s;
-            if (c->is_string(s, seq)) {
-                for (unsigned i = 0; i <= s.length(); ++i) {
-                    expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m);
-                    expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m);
-                    result.push_back(sigma_pair(p, q, m));
-                }
-                return true;
-            }
-            std::cout << mk_pp(c->get_expr(), m) << std::endl;
-            UNREACHABLE();
-            return false;
-        }
-        if (r->is_full_char()) {
-            const expr_ref ex(r->get_expr(), m);
-            const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-
-            result.push_back(sigma_pair(eps, ex, m));
-            result.push_back(sigma_pair(ex, eps, m));
-            return true;
-        }
-        if (r->is_full_seq()) {
-            const expr_ref ex(r->get_expr(), m);
-            result.push_back(sigma_pair(ex, ex, m));
-            return true;
-        }
-        if (r->is_union()) {
-            // σ(r₁ ⊔ r₂) = σ(r₁) ∪ σ(r₂)
-            SASSERT(r->num_args() >= 2);
-            for (unsigned i = 0; i < r->num_args(); i++) {
-                if (!compute_sigma(m, seq, rw, r->arg(i), result, threshold))
-                    return false;
-            }
-            return true;
-        }
-        if (r->is_intersect()) {
-            // σ(r₁ ⊓ r₂ ⊓ …) = σ(r₁) ⊓ σ(r₂) ⊓ …; empty intersection (0 args) = {⟨.*,.*⟩}
-            const unsigned n = r->num_args();
-            SASSERT(n >= 2);
-            sigma_pairs current;
-            if (!compute_sigma(m, seq, rw, r->arg0(), current, threshold))
-                return false;
-            for (unsigned i = 1; i < n && !current.empty(); ++i) {
-                sigma_pairs arg_i;
-                // A give-up on any conjunct must propagate as a give-up: silently
-                // treating arg_i as the empty split-set would collapse the whole
-                // intersection to ∅ and be misreported as an (unsound) conflict.
-                if (!compute_sigma(m, seq, rw, r->arg(i), arg_i, threshold))
-                    return false;
-                sigma_pairs tmp;
-                if (!intersect_sigma_pairs(m, seq, current, arg_i, tmp, threshold))
-                    return false;
-                current = std::move(tmp);
-            }
-            result.append(current);
-            return true;
-        }
-        if (r->is_complement()) {
-            // σ(~r) = ~σ(r)
-            sigma_pairs body_pairs;
-            if (!compute_sigma(m, seq, rw, r->arg0(), body_pairs, threshold))
-                return false;
-            return complement_sigma_pairs(m, seq, str_sort, body_pairs, result, threshold);
-        }
-        if (r->is_concat()) {
-            const unsigned n = r->num_args();
-            SASSERT(n >= 2);
-            for (unsigned i = 0; i < n; ++i) {
-                sigma_pairs sigma_arg;
-                if (!compute_sigma(m, seq, rw, r->arg(i), sigma_arg, threshold))
-                    return false;
-
-                expr_ref left(m);
-                expr_ref right(m);
-
-                if (i == 0)
-                    left = seq.re.mk_epsilon(str_sort);
-                else {
-                    for (unsigned j = 0; j < i; ++j) {
-                        const euf::snode* arg = r->arg(j);
-                        left = left ? seq.re.mk_concat(left, arg->get_expr()) : arg->get_expr();
-                    }
-                }
-
-                if (i == n - 1)
-                    right = seq.re.mk_epsilon(str_sort);
-                else {
-                    for (unsigned j = i + 1; j < n; ++j) {
-                        const euf::snode* arg = r->arg(j);
-                        right = right ? seq.re.mk_concat(right, arg->get_expr()) : arg->get_expr();
-                    }
-                }
-
-                for (auto const &[tp, tq] : sigma_arg) {
-                    expr_ref p = rw.mk_re_append(left, tp);
-                    expr_ref q = rw.mk_re_append(tq, right);
-                    result.push_back(sigma_pair(p, q, m));
-                }
-            }
-            return true;
-        }
-        if (r->is_star()) {
-            const euf::snode* body = r->arg0();
-            const expr_ref eps(seq.re.mk_epsilon(str_sort), m);
-            result.push_back(sigma_pair(eps, eps, m));
-
-            sigma_pairs sigma_body;
-            if (!compute_sigma(m, seq, rw, body, sigma_body, threshold))
-                return false;
-            for (auto const &[tp, tq] : sigma_body) {
-                auto p = rw.mk_re_append(r->get_expr(), tp);
-                auto q = rw.mk_re_append(tq, r->get_expr());
-                result.push_back(sigma_pair(p, q, m));
-            }
-            return true;
-        }
-        if (r->is_plus()) {
-            // r⁺ = r·r* ; by Kleene factorization σ(r⁺) = r*·σ(r)·r*.
-            // Same shape as the star rule but with the surrounding factor being
-            // body* without the {⟨ε,ε⟩} pair
-            const euf::snode* body = r->arg0();
-            const expr_ref star(seq.re.mk_star(body->get_expr()), m);   // body*
-
-            sigma_pairs sigma_body;
-            if (!compute_sigma(m, seq, rw, body, sigma_body, threshold))
-                return false;
-            for (auto const &[tp, tq] : sigma_body) {
-                auto p = rw.mk_re_append(star, tp);   // body* · tp
-                auto q = rw.mk_re_append(tq, star);   // tq · body*
-                result.push_back(sigma_pair(p, q, m));
-            }
-            return true;
-        }
-        // the simplifier should have eliminated most of them already
-        // TODO: so far, we are, however, still missing bounded repetitions and ite
-        std::cout << "Unknown element " << mk_pp(r->get_expr(), m) << std::endl;
-        return false;
-    }
-
-    void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg) {
-        return; // For now
-        if (pairs.size() <= 1)
-            return;
-        // Guard against pathological cost: subsumption is O(n^2) language-subset
-        // BFS checks. Large split-sets are left to the factorization threshold.
-        if (pairs.size() > 64)
-            return;
-
-        struct row { euf::snode* p; euf::snode* q; unsigned idx; };
-
-        // Materialise snodes once; drop pairs with a structurally-empty component.
-        vector<row> rows;
-        for (unsigned i = 0; i < pairs.size(); ++i) {
-            euf::snode* p = sg.mk(pairs[i].m_p);
-            euf::snode* q = sg.mk(pairs[i].m_q);
-            if (sr.is_empty_regex(p) || sr.is_empty_regex(q))
-                continue;
-            rows.push_back({ p, q, i });
-        }
-
-        // a subsumes b iff L(b.p) ⊆ L(a.p) and L(b.q) ⊆ L(a.q).
-        // is_language_subset may return l_undef (inconclusive); only treat a
-        // definite l_true as subsumption, so we never drop a needed split.
-        auto subsumes = [&](row const& a, row const& b) {
-            return sr.is_language_subset(b.p, a.p) == l_true &&
-                   sr.is_language_subset(b.q, a.q) == l_true;
-        };
-
-        vector<row> kept;
-        for (row const& r : rows) {
-            bool redundant = false;
-            for (row const& k : kept)
-                if (subsumes(k, r)) { redundant = true; break; }
-            if (redundant)
-                continue;
-            // drop already-kept rows strictly subsumed by r
-            unsigned w = 0;
-            for (unsigned t = 0; t < kept.size(); ++t) {
-                if (subsumes(r, kept[t]))
-                    continue;
-                kept[w++] = kept[t];
-            }
-            kept.shrink(w);
-            kept.push_back(r);
-        }
-
-        sigma_pairs result;
-        for (row const& k : kept)
-            result.push_back(pairs[k.idx]);
-        pairs.swap(result);
-    }
-
    bool nielsen_graph::apply_regex_factorization(nielsen_node* node) {
        if (m_regex_factorization_threshold == 0)
            return false;
@ -3874,14 +3589,14 @@ namespace seq {
        for (str_mem const& mem : node->str_mems()) {
            SASSERT(mem.well_formed());

-            // compute_sigma handles all regex forms (incl. complement / intersection),
+            // split() handles all regex forms (incl. complement / intersection),
            // so the classical restriction is no longer needed.
            if (mem.m_str->is_empty() || mem.is_primitive())
                continue;

-            // compute_sigma / compute_tau do not understand the projection
-            // operator (re.proj) — they would recurse into it and hit an
-            // UNREACHABLE.  Projection-constrained memberships are handled by the
+            // The split engine works on plain regex AST and does not understand the
+            // projection operator (re.proj) — it would give up on it anyway.
+            // Projection-constrained memberships are handled by the
            // cycle-decomposition path, so skip them here.
            if (mem.m_regex->has_projection())
                continue;
@ -3892,13 +3607,14 @@ namespace seq {
            euf::snode* tail = m_sg.drop_first(mem.m_str);
            SASSERT(tail);

-            sigma_pairs pairs;
+            // Decompose the regex into a split-set via the shared seq_split engine
+            // (sigma from the paper): first ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩.
+            split_set pairs;
            seq_rewriter rw(m);
-            if (!compute_sigma(m, m_seq, rw, mem.m_regex, pairs, m_regex_factorization_threshold))
+            if (!rw.split(mem.m_regex->get_expr(), pairs, m_regex_factorization_threshold))
                continue;

-            if (m_seq_regex)
-                simplify_sigma_pairs(pairs, *m_seq_regex, m_sg);
+            rw.simplify_split(pairs);

            vector<rf_split> feasible;
            dep_tracker eliminated_dep = mem.m_dep;
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@ -262,30 +262,6 @@ namespace seq {

    std::string snode_label_html(euf::snode const* n, ast_manager& m, bool html_escape);

-    // Split-pair produced by compute_sigma: uv |= r  iff  exists i: u |= m_p[i] and v |= m_q[i]
-    struct sigma_pair {
-        expr_ref m_p;
-        expr_ref m_q;
-        sigma_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) {
-            SASSERT(p && q);
-        }
-    };
-    typedef vector<sigma_pair> sigma_pairs;
-
-    // Compute the split-set sigma(r) per the splitting rules of the paper
-    // "Extended Regular Expression Membership". Generalises the classical compute_tau
-    // with intersection and complement cases via the split-set algebra.
-    // `result` is appended to (not cleared).
-    bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold);
-
-    // Simplify a split-set in place using the split algebra's language-level rules
-    // (paper section "split-set simplification heuristics"): drop pairs with an
-    // empty-language component, and drop any split subsumed by another
-    // (<D_i,N_i> is subsumed by <D_j,N_j> iff L(D_i) subseteq L(D_j) and L(N_i) subseteq L(N_j)).
-    // Subsumption tames the 2^k blow-up of sigma(~r) (e.g. sigma(~a*): 8 -> 2).
-    // Requires the regex emptiness/subset checker and an sgraph to build snodes.
-    void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg);
-
    // simplification result for constraint processing
    // mirrors ZIPT's SimplifyResult enum
    enum class simplify_result {
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@ -555,7 +555,7 @@ namespace smt {

        // Eager sigma factorization (token-level): when enabled, split a non-primitive
        // membership s ∈ r at the boundary between the first concat argument (head) and
-        // the rest (tail), using compute_sigma. This mirrors the lazy Nielsen
+        // the rest (tail), using the shared seq_split engine. This mirrors the lazy Nielsen
        // apply_regex_factorization and the paper's Reduce rule for x·u'.
        //   (s ∈ r) → ⋁_{⟨Δ,∇⟩∈σ(r)} ( head ∈ Δ ∧ tail ∈ ∇ )
        // Only fires for a concatenation s (single-variable s is already primitive).
@ -570,12 +570,12 @@ namespace smt {
            const expr_ref tail(m_seq.str.mk_concat(na - 1, a->get_args() + 1, s->get_sort()), m);

            const unsigned threshold = get_fparams().m_nseq_regex_factorization_threshold;
-            seq::sigma_pairs pairs;
-            if (!seq::compute_sigma(m, m_seq, m_rewriter, mem.m_regex, pairs, threshold))
+            split_set pairs;
+            if (!m_rewriter.split(mem.m_regex->get_expr(), pairs, threshold))
                // we give up
                return;

-            seq::simplify_sigma_pairs(pairs, m_regex, m_sgraph);
+            m_rewriter.simplify_split(pairs);

            if (pairs.empty()) {
                // no viable splits
@ -598,8 +598,8 @@ namespace smt {
                lits.push_back(~mem.lit);
                //std::cout << "Decomposing into:\n";
                for (auto const& sp : pairs) {
-                    expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_p), m);
-                    expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_q), m);
+                    expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_d), m);
+                    expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_n), m);
                    expr_ref conj(m.mk_and(mem_head, mem_tail), m);
                    lits.push_back(mk_literal(conj));
                    //seq::dep_tracker dep = nullptr;