Porting seq_split to master (#9840)

Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
2026-07-21 22:45:51 +00:00 · 2026-06-30 19:18:28 +02:00 · 2026-06-30 19:18:28 +02:00 · b3143e759b
commit b3143e759b
parent c22a7bac7c
11 changed files with 1545 additions and 1 deletions
--- a/src/ast/rewriter/CMakeLists.txt
+++ b/src/ast/rewriter/CMakeLists.txt
@ -41,6 +41,7 @@ z3_add_component(rewriter
    seq_eq_solver.cpp
    seq_derive.cpp
    seq_subset.cpp
+    seq_split.cpp
    seq_derive.cpp
    seq_range_collapse.cpp
    seq_range_predicate.cpp
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -18,6 +18,7 @@ Notes:
 --*/
 #pragma once

+#include "seq_split.h"
 #include "ast/seq_decl_plugin.h"
 #include "ast/rewriter/seq_derive.h"
 #include "ast/ast_pp.h"
@ -133,6 +134,7 @@ class seq_rewriter {

    seq_util       m_util;
    seq_subset     m_subset;
+    seq_split      m_split;
    arith_util     m_autil;
    bool_rewriter  m_br;
    seq::derive    m_derive;
@ -332,7 +334,7 @@ class seq_rewriter {

 public:
    seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
-        m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this),
+        m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), m_derive(m, *this), // m_re2aut(m),
        m_op_cache(m), m_es(m), 
        m_lhs(m), m_rhs(m) {
    }
@ -411,6 +413,20 @@ public:
        return result;
    }

+    // Split decomposition (sigma) of a regex; see seq_split.h.  `oracle` (optional)
+    // prunes non-viable splits during generation.
+    bool split(expr* r, split_set& out, unsigned threshold,
+        const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
+        return m_split.compute(r, out, threshold, mode, oracle);
+    }
+
+    void simplify_split(split_set& s) { m_split.simplify(s); }
+
+    // decompose a membership constraint into a set of pairs of regex splits
+    std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
+        return m_split.split_membership(str, regex, threshold, result);
+    }
+
    /**
     * check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences
     */
--- a/src/ast/rewriter/seq_split.cpp
+++ b/src/ast/rewriter/seq_split.cpp
@ -0,0 +1,820 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.cpp
+
+Abstract:
+
+    Regex split decomposition (the split function sigma).  See seq_split.h.
+
+Author:
+
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+
+#include "ast/rewriter/seq_split.h"
+#include "ast/rewriter/seq_rewriter.h"
+#include "ast/ast_pp.h"
+#include "util/obj_hashtable.h"
+#include "util/stack.h"
+
+seq_split::seq_split(seq_rewriter& rw) :
+    m(rw.m()), m_rw(rw), m_subset(rw.u().re),
+    m_set_sort(m),
+    m_d_empty(m), m_d_single(m), m_d_fromre(m), m_d_union(m),
+    m_d_inter(m), m_d_compl(m), m_d_lcat(m), m_d_rcat(m),
+    m_empty_app(m) {}
+
+// ---------------------------------------------------------------------------
+// Suspended split-set representation (split algebra over `expr`).
+// ---------------------------------------------------------------------------
+
+void seq_split::ensure_decls(sort* seq_sort) {
+    SASSERT(seq_sort);
+    if (m_seq_sort == seq_sort)
+        return;
+    sort* re_sort = re().mk_re(seq_sort);
+    m_set_sort  = m.mk_uninterpreted_sort(symbol("seq.split.set"));
+    sort* ss    = m_set_sort;
+    m_d_empty   = m.mk_func_decl(symbol("seq.split.empty"),   0u, nullptr, ss);
+    m_d_single  = m.mk_func_decl(symbol("seq.split.single"),  re_sort, re_sort, ss);
+    m_d_fromre  = m.mk_func_decl(symbol("seq.split.from_re"), re_sort, ss);
+    m_d_union   = m.mk_func_decl(symbol("seq.split.union"),   ss, ss, ss);
+    m_d_inter   = m.mk_func_decl(symbol("seq.split.inter"),   ss, ss, ss);
+    m_d_compl   = m.mk_func_decl(symbol("seq.split.compl"),   ss, ss);
+    m_d_lcat    = m.mk_func_decl(symbol("seq.split.lcat"),    re_sort, ss, ss);
+    m_d_rcat    = m.mk_func_decl(symbol("seq.split.rcat"),    ss, re_sort, ss);
+    m_empty_app = m.mk_const(m_d_empty);
+    m_seq_sort  = seq_sort;
+}
+
+// --- smart constructors ----------------------------------------------------
+
+expr_ref seq_split::mk_empty() {
+    SASSERT(m_empty_app);
+    return m_empty_app;
+}
+
+expr_ref seq_split::mk_single(expr* d, expr* n) {
+    SASSERT(d && n);
+    if (re().is_empty(d) || re().is_empty(n))
+        return mk_empty();
+    return expr_ref(m.mk_app(m_d_single, d, n), m);
+}
+
+expr_ref seq_split::mk_fromre(expr* r) {
+    SASSERT(r);
+    sort* seq_sort = nullptr;
+    VERIFY(seq().is_re(r, seq_sort));
+    ensure_decls(seq_sort);
+    if (re().is_empty(r))
+        return mk_empty();
+    return expr_ref(m.mk_app(m_d_fromre, r), m);
+}
+
+expr_ref seq_split::mk_union(expr* a, expr* b) {
+    SASSERT(a && b);
+    if (is_empty_ss(a))
+        return expr_ref(b, m);
+    if (is_empty_ss(b))
+        return expr_ref(a, m);
+    return expr_ref(m.mk_app(m_d_union, a, b), m);
+}
+
+expr_ref seq_split::mk_inter(expr* a, expr* b) {
+    SASSERT(a && b);
+    if (is_empty_ss(a) || is_empty_ss(b))
+        return mk_empty();
+    return expr_ref(m.mk_app(m_d_inter, a, b), m);
+}
+
+expr_ref seq_split::mk_compl(expr* a) {
+    SASSERT(a);
+    return expr_ref(m.mk_app(m_d_compl, a), m);
+}
+
+expr_ref seq_split::mk_lcat(expr* r, expr* s) {
+    SASSERT(r && s);
+    if (is_empty_ss(s))
+        return mk_empty();
+    if (re().is_epsilon(r))            // eps . S = S
+        return expr_ref(s, m);
+    return expr_ref(m.mk_app(m_d_lcat, r, s), m);
+}
+
+expr_ref seq_split::mk_rcat(expr* s, expr* r) {
+    SASSERT(r && s);
+    if (is_empty_ss(s))
+        return mk_empty();
+    if (re().is_epsilon(r))            // S . eps = S
+        return expr_ref(s, m);
+    return expr_ref(m.mk_app(m_d_rcat, s, r), m);
+}
+
+// --- recognizers -----------------------------------------------------------
+
+bool seq_split::is_empty_ss(expr* e) const {
+    return is_app(e) && to_app(e)->get_decl() == m_d_empty;
+}
+bool seq_split::is_single(expr* e, expr*& d, expr*& n) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_single)
+        return false;
+    d = to_app(e)->get_arg(0);
+    n = to_app(e)->get_arg(1);
+    return true;
+}
+bool seq_split::is_fromre(expr* e, expr*& r) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_fromre)
+        return false;
+    r = to_app(e)->get_arg(0);
+    return true;
+}
+bool seq_split::is_union(expr* e, expr*& a, expr*& b) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_union)
+        return false;
+    a = to_app(e)->get_arg(0);
+    b = to_app(e)->get_arg(1);
+    return true;
+}
+bool seq_split::is_inter(expr* e, expr*& a, expr*& b) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_inter)
+        return false;
+    a = to_app(e)->get_arg(0);
+    b = to_app(e)->get_arg(1);
+    return true;
+}
+bool seq_split::is_compl(expr* e, expr*& a) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_compl)
+        return false;
+    a = to_app(e)->get_arg(0);
+    return true;
+}
+bool seq_split::is_lcat(expr* e, expr*& r, expr*& s) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_lcat)
+        return false;
+    r = to_app(e)->get_arg(0);
+    s = to_app(e)->get_arg(1);
+    return true;
+}
+bool seq_split::is_rcat(expr* e, expr*& s, expr*& r) const {
+    if (!is_app(e) || to_app(e)->get_decl() != m_d_rcat)
+        return false;
+    s = to_app(e)->get_arg(0);
+    r = to_app(e)->get_arg(1);
+    return true;
+}
+bool seq_split::is_frontier(expr* e) const {
+    expr *a = nullptr, *b = nullptr;
+    return is_empty_ss(e) || is_single(e, a, b) || is_union(e, a, b);
+}
+
+seq_util&      seq_split::seq() const { return m_rw.u(); }
+seq_util::rex& seq_split::re()  const { return m_rw.u().re; }
+
+// Add <d, n> unless the (optional) lookahead oracle prunes it.
+void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
+    if (!oracle || oracle(d, n))
+        out.push_back(split_pair(d, n, m));
+}
+
+// Cross-product intersection of two split-sets (split algebra):
+//   S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
+// Pairs where any component is bottom (the empty regex) are dropped.
+bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
+                          unsigned threshold, split_oracle const& oracle) const {
+    const seq_util::rex& r = re();
+    for (auto const& p1 : s1) {
+        for (auto const& p2 : s2) {
+            if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
+                r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
+                continue;
+            const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m);
+            const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m);
+            push(result, oracle, di, ni);
+            if (result.size() > threshold)
+                return false;
+        }
+    }
+    return true;
+}
+
+// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
+//   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+// May produce up to 2^|sp| pairs (bounded by the threshold).  A threshold
+// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
+// split-set, since each ~sp[i] further constrains ~S.
+bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result,
+    const unsigned threshold, split_oracle const& oracle) const {
+
+    seq_util::rex& r = re();
+    sort* re_sort = r.mk_re(seq_sort);
+    const expr_ref full(r.mk_full_seq(re_sort), m);   // .*
+    if (sp.empty()) {                                   // ~{} = <.*, .*>
+        push(result, oracle, full, full);
+        return true;
+    }
+    // The acc/next pairs carry genuine output-orientation N components (the De
+    // Morgan ~<D,N> = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and
+    // keeps the 2^|sp| fold from blowing up.
+    split_set acc;
+    push(acc, oracle, r.mk_complement(sp[0].m_d), full);
+    push(acc, oracle, full, r.mk_complement(sp[0].m_n));
+    for (unsigned i = 1; i < sp.size(); ++i) {
+        split_set next;
+        push(next, oracle, r.mk_complement(sp[i].m_d), full);
+        push(next, oracle, full, r.mk_complement(sp[i].m_n));
+        split_set tmp;
+        if (!intersect(acc, next, tmp, threshold, oracle))
+            return false;
+        acc = std::move(tmp);
+        if (acc.empty())            // intersection empty => ~S is empty
+            break;
+        if (acc.size() > threshold)
+            return false;
+    }
+    result.append(acc);
+    return true;
+}
+
+// One level of the sigma rules.  Mirrors the historic eager `compute`, except it
+// emits *suspended* split-algebra terms (from_re / lcat / rcat / inter / compl) for
+// the subterms instead of recursing.  `mode` is irrelevant here: weak vs. strong is
+// decided when `head_normalize` reaches an inter / compl node.
+expr_ref seq_split::expand_fromre(expr* r, bool& ok) {
+    ok = true;
+    seq_util& sq = seq();
+    seq_util::rex& rex = re();
+
+    sort* seq_sort = nullptr;
+    if (!sq.is_re(r, seq_sort)) {
+        ok = false;
+        return expr_ref(m);
+    }
+    ensure_decls(seq_sort);
+
+    // bottom: sigma(empty) = {}
+    if (rex.is_empty(r))
+        return mk_empty();
+
+    // epsilon: sigma(eps) = { <eps, eps> }
+    if (rex.is_epsilon(r)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), m);
+        return mk_single(eps, eps);
+    }
+
+    expr* a = nullptr, *b = nullptr;
+
+    // to_re(s): split the literal word s at every position.
+    expr* s = nullptr;
+    if (rex.is_to_re(r, s)) {
+        zstring str;
+        vector<expr*> stack;
+        stack.push_back(s);
+
+        while (!stack.empty()) {
+            expr* cur = stack.back();
+            stack.pop_back();
+            if (seq().str.is_concat(cur, a, b)) {
+                stack.push_back(b);
+                stack.push_back(a);
+            }
+            else {
+                expr* ch;
+                unsigned cv;
+                if (seq().str.is_unit(cur, ch) && seq().is_const_char(ch, cv)) {
+                    str += zstring(cv);
+                    continue;
+                }
+                zstring str2;
+                if (sq.str.is_string(s, str2)) {
+                    str = str2;
+                    continue;
+                }
+                // not a constant string; unsupported for now
+                ok = false;
+                return expr_ref(m);
+            }
+        }
+        expr_ref acc = mk_empty();
+        for (unsigned i = 0; i <= str.length(); ++i) {
+            const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), m);
+            const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), m);
+            acc = mk_union(acc, mk_single(p, q));
+        }
+        return acc;
+    }
+
+    // single-character class alpha (., [lo-hi], of_pred):
+    //   sigma(alpha) = { <eps, alpha>, <alpha, eps> }
+    if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
+        const expr_ref ex(r, m);
+        const expr_ref eps(rex.mk_epsilon(seq_sort), m);
+        return mk_union(mk_single(eps, ex), mk_single(ex, eps));
+    }
+
+    // .* : sigma(.*) = { <.*, .*> }
+    if (rex.is_full_seq(r)) {
+        const expr_ref ex(r, m);
+        return mk_single(ex, ex);
+    }
+
+    // union: sigma(r0 | ... | r_{n-1}) = U from_re(ri)   (re.union may be n-ary)
+    if (rex.is_union(r)) {
+        app* ap = to_app(r);
+        expr_ref acc = mk_empty();
+        for (expr* arg : *ap) {
+            acc = mk_union(acc, mk_fromre(arg));
+        }
+        return acc;
+    }
+
+    // concat: sigma(r0...r_{n-1}) = U_i  (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
+    // emitted as U_i  lcat(left, rcat(from_re(ri), right))   (re.++ may be n-ary)
+    if (rex.is_concat(r)) {
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        expr_ref acc = mk_empty();
+        for (unsigned i = 0; i < n; ++i) {
+            expr_ref left(m), right(m);
+            if (i == 0)
+                left = rex.mk_epsilon(seq_sort);
+            else {
+                for (unsigned j = 0; j < i; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    left = left ? expr_ref(rex.mk_concat(left, arg), m) : expr_ref(arg, m);
+                }
+            }
+            if (i == n - 1)
+                right = rex.mk_epsilon(seq_sort);
+            else {
+                right = ap->get_arg(i + 1);
+                for (unsigned j = i + 2; j < n; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    right = rex.mk_concat(right, arg);
+                }
+            }
+            expr_ref term = mk_lcat(left, mk_rcat(mk_fromre(ap->get_arg(i)), right));
+            acc = mk_union(acc, term);
+        }
+        return acc;
+    }
+
+    // star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
+    if (rex.is_star(r, a)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), m);
+        expr_ref body = mk_lcat(r, mk_rcat(mk_fromre(a), r));  // a*.from_re(a).a*
+        return mk_union(mk_single(eps, eps), body);
+    }
+
+    // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a*  (star rule without <eps,eps>)
+    if (rex.is_plus(r, a)) {
+        const expr_ref star(rex.mk_star(a), m);          // a*
+        return mk_lcat(star, mk_rcat(mk_fromre(a), star));
+    }
+
+    // intersection: sigma(r0 & ... & r_{n-1}) = cap from_re(ri)   (re.inter may be n-ary)
+    if (rex.is_intersection(r)) {
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        expr_ref acc = mk_fromre(ap->get_arg(0));
+        for (unsigned i = 1; i < n; ++i)
+            acc = mk_inter(acc, mk_fromre(ap->get_arg(i)));
+        return acc;
+    }
+
+    // complement: sigma(~a) = ~sigma(a).
+    if (rex.is_complement(r, a))
+        return mk_compl(mk_fromre(a));
+
+    // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b).
+    if (rex.is_diff(r, a, b))
+        return mk_inter(mk_fromre(a), mk_compl(mk_fromre(b)));
+
+    // bounded loop / ite / other: not handled (paper "v1: bail").
+    TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, m) << "\n";);
+    ok = false;
+    return expr_ref(m);
+}
+
+// r . hs : push the left regex onto the D component of a head-normal split-set.
+expr_ref seq_split::distribute_lcat(expr* r, expr* hs) {
+    expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
+    if (is_empty_ss(hs))
+        return mk_empty();
+    if (is_single(hs, d, n))
+        return mk_single(m_rw.mk_re_append(r, d), n);   // r.D
+    if (is_union(hs, a, b))
+        return mk_union(mk_lcat(r, a), mk_lcat(r, b));
+    UNREACHABLE();
+    return expr_ref(hs, m);
+}
+
+// hs . r : push the right regex onto the N component of a head-normal split-set.
+expr_ref seq_split::distribute_rcat(expr* hs, expr* r) {
+    expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
+    if (is_empty_ss(hs))
+        return mk_empty();
+    if (is_single(hs, d, n))
+        return mk_single(d, m_rw.mk_re_append(n, r));    // N.r
+    if (is_union(hs, a, b))
+        return mk_union(mk_rcat(a, r), mk_rcat(b, r));
+    UNREACHABLE();
+    return expr_ref(hs, m);
+}
+
+expr_ref seq_split::from_split_set(split_set const& s) {
+    expr_ref acc = mk_empty();
+    for (auto const& p : s)
+        acc = mk_union(acc, mk_single(p.m_d, p.m_n));
+    return acc;
+}
+
+expr_ref seq_split::head_normalize(expr* t, split_mode mode, unsigned threshold,
+                                   split_oracle const& oracle, bool& ok) {
+    ok = true;
+    expr *a = nullptr, *b = nullptr, *r = nullptr, *s = nullptr;
+
+    // already a frontier node
+    if (is_frontier(t))
+        return expr_ref(t, m);
+
+    // from_re(r): one level of sigma; recurse to settle a non-frontier head
+    // (plus / inter / compl / diff expand to lcat / inter / compl nodes).
+    if (is_fromre(t, r)) {
+        expr_ref e = expand_fromre(r, ok);
+        if (!ok)
+            return expr_ref(m);
+        if (is_frontier(e))
+            return e;
+        return head_normalize(e, mode, threshold, oracle, ok);
+    }
+
+    // r.S : head-normalize S, then distribute r over the frontier.
+    if (is_lcat(t, r, s)) {
+        expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
+        if (!ok)
+            return expr_ref(m);
+        return distribute_lcat(r, hs);
+    }
+    if (is_rcat(t, s, r)) {
+        expr_ref hs = head_normalize(s, mode, threshold, oracle, ok);
+        if (!ok)
+            return expr_ref(m);
+        return distribute_rcat(hs, r);
+    }
+
+    // inter / compl are eager by nature: a single split of S1 cap S2 (or ~S)
+    // cannot be produced without materializing the operand split-sets.
+    if (is_inter(t, a, b)) {
+        if (mode == split_mode::weak) {
+            ok = false;
+            return expr_ref(m);
+        }
+        split_set sa, sb, tmp;
+        if (!materialize(a, mode, threshold, oracle, sa) ||
+            !materialize(b, mode, threshold, oracle, sb) ||
+            !intersect(sa, sb, tmp, threshold, oracle)) {
+            ok = false;
+            return expr_ref(m);
+        }
+        return from_split_set(tmp);
+    }
+    if (is_compl(t, a)) {
+        if (mode == split_mode::weak) {
+            ok = false;
+            return expr_ref(m);
+        }
+        // The body is materialized WITHOUT the oracle (its pairs are inverted, so
+        // their N is unrelated to the output N); the oracle is re-applied in
+        // complement().
+        split_set sa, res;
+        if (!materialize(a, mode, threshold, split_oracle{}, sa) ||
+            !complement(m_seq_sort, sa, res, threshold, oracle)) {
+            ok = false;
+            return expr_ref(m);
+        }
+        return from_split_set(res);
+    }
+
+    UNREACHABLE();
+    ok = false;
+    return expr_ref(m);
+}
+
+bool seq_split::materialize(expr* node, split_mode mode, unsigned threshold,
+                            split_oracle const& oracle, split_set& out) {
+    iterator it(*this, node, mode, threshold, oracle);
+    expr_ref d(m), n(m);
+    while (it.next(d, n))
+        out.push_back(split_pair(d, n, m));
+    return !it.gave_up();
+}
+
+expr_ref seq_split::make(expr* r) {
+    SASSERT(r);
+    sort* seq_sort = nullptr;
+    if (!seq().is_re(r, seq_sort))
+        return expr_ref(m);
+    return mk_fromre(r);
+}
+
+// --- Lazy enumerator --------------------------------------------------------
+// The worklist holds suspended split-sets.  Each next() pops a node, head-
+// normalizes it to a frontier (empty | single | union), and either returns the
+// single split, pushes the two union branches back, or skips an empty.  All the
+// expansion work happens lazily, one split per next() call.
+
+seq_split::iterator::iterator(seq_split& engine, expr* node, split_mode mode,
+                              unsigned threshold, split_oracle oracle) :
+    m_engine(engine), m(engine.m), m_mode(mode), m_threshold(threshold),
+    m_oracle(std::move(oracle)), m_work(engine.m) {
+    SASSERT(node);
+    m_work.push_back(node);
+}
+
+bool seq_split::iterator::next(expr_ref& out_d, expr_ref& out_n) {
+    if (m_giveup)
+        return false;                  // a prior give-up is sticky
+    while (!m_work.empty()) {
+        expr_ref t(m_work.back(), m);
+        m_work.pop_back();
+
+        bool ok = true;
+        expr_ref hn = m_engine.head_normalize(t, m_mode, m_threshold, m_oracle, ok);
+        if (!ok) {
+            m_giveup = true;           // unsupported / weak Boolean / overrun
+            return false;
+        }
+
+        expr *a = nullptr, *b = nullptr, *d = nullptr, *n = nullptr;
+        if (m_engine.is_empty_ss(hn))
+            continue;
+        if (m_engine.is_single(hn, d, n)) {
+            if (m_oracle && !m_oracle(d, n))
+                continue;              // pruned by lookahead
+            if (++m_count > m_threshold) {
+                m_giveup = true;       // safety cap against space bloat
+                return false;
+            }
+            out_d = d;
+            out_n = n;
+            return true;
+        }
+        if (m_engine.is_union(hn, a, b)) {
+            m_work.push_back(a);
+            m_work.push_back(b);
+            continue;
+        }
+        UNREACHABLE();
+    }
+    return false;                      // exhausted (m_giveup stays false)
+}
+
+seq_split::iterator seq_split::iterate(expr* node, split_mode mode, unsigned threshold,
+                                       split_oracle const& oracle) {
+    return iterator(*this, node, mode, threshold, oracle);
+}
+
+// Eager wrapper: drain the lazy enumeration into `out`.  Semantics (give-up cases,
+// oracle discipline) match the historic engine.
+bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode,
+                        split_oracle const& oracle) {
+    SASSERT(r);
+    sort* seq_sort = nullptr;
+    if (!seq().is_re(r, seq_sort))
+        return false;
+    expr_ref node = mk_fromre(r);
+    return materialize(node, mode, threshold, oracle, result);
+}
+
+// same-D / same-N merge (paper eqs. 1 & 2):
+//   { <D,N>, <D,N'> } -> <D, N | N'>      (by_left = true,  group by D)
+//   { <D,N>, <D',N> } -> <D | D', N>      (by_left = false, group by N)
+// Only fires on syntactically-identical (perfectly-shared) key components, so
+// it is a conservative instance of the rule.
+void seq_split::merge_by(split_set& pairs, const bool by_left) const {
+    obj_map<expr, unsigned> idx;   // key component -> position in `out`
+    split_set out;
+    for (auto const& p : pairs) {
+        expr* key   = by_left ? p.m_d.get() : p.m_n.get();
+        expr* other = by_left ? p.m_n.get() : p.m_d.get();
+        unsigned pos;
+        if (idx.find(key, pos)) {
+            expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
+            const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), m);
+            if (by_left)
+                out[pos].m_n = u;
+            else
+                out[pos].m_d = u;
+        }
+        else {
+            idx.insert(key, out.size());
+            out.push_back(p);
+        }
+    }
+    pairs.swap(out);
+}
+
+void seq_split::simplify(split_set& pairs) const {
+    seq_util::rex& r = re();
+
+    // 1. drop pairs with a bottom (empty-language) component.
+    unsigned w = 0;
+    for (unsigned i = 0; i < pairs.size(); ++i) {
+        if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
+            continue;
+        if (w != i)
+            pairs[w] = pairs[i];
+        ++w;
+    }
+    pairs.shrink(w);
+    if (pairs.size() <= 1)
+        return;
+
+    // 2. same-D / same-N merge rules.
+    merge_by(pairs, true);
+    merge_by(pairs, false);
+    if (pairs.size() <= 1)
+        return;
+
+    // 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
+    //    L(N_i) subseteq L(N_j) for some kept j.  seq_subset is conservative
+    //    (returns true only for definite containment), so we never drop a
+    //    needed split.
+    //if (pairs.size() > 64)
+    //    return;
+
+    struct row { expr* d; expr* n; unsigned idx; };
+    vector<row> rows;
+    for (unsigned i = 0; i < pairs.size(); ++i)
+        rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
+
+    auto subsumes = [&](row const& a, row const& b) {
+        return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
+    };
+
+    vector<row> kept;
+    for (row const& row_r : rows) {
+        bool redundant = false;
+        for (row const& k : kept)
+            if (subsumes(k, row_r)) { redundant = true; break; }
+        if (redundant)
+            continue;
+        // drop already-kept rows strictly subsumed by row_r
+        unsigned kw = 0;
+        for (unsigned t = 0; t < kept.size(); ++t) {
+            if (subsumes(row_r, kept[t]))
+                continue;
+            kept[kw++] = kept[t];
+        }
+        kept.shrink(kw);
+        kept.push_back(row_r);
+    }
+
+    split_set result;
+    for (row const& k : kept)
+        result.push_back(pairs[k.idx]);
+    pairs.swap(result);
+}
+
+std::pair<expr_ref, expr_ref> seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
+    expr_ref_vector tokens(m);
+    vector<expr*> stack;
+    stack.push_back(str);
+
+    while (!stack.empty()) {
+        expr* cur = stack.back();
+        stack.pop_back();
+        expr* l, *r;
+        if (seq().str.is_concat(cur, l, r)) {
+            stack.push_back(r);
+            stack.push_back(l);
+        }
+        else
+            tokens.push_back(expr_ref(cur, m));
+    }
+
+    expr* ch;
+    unsigned i = 0;
+
+    while (i < tokens.size() && (seq().str.is_string(tokens.get(i)) || (seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch)))) {
+        zstring s;
+        if (seq().str.is_string(tokens.get(i), s)) {
+            if (s.empty()) {
+                i++;
+                continue;
+            }
+            ch = seq().mk_char(s[0]);
+            tokens[i] = seq().str.mk_string(s.extract(1, s.length() - 1));
+        }
+        else
+            i++;
+        regex = m_rw.mk_derivative(ch, regex);
+    }
+
+    if (i > 0) {
+        unsigned j = 0;
+        for (; i < tokens.size(); i++, j++) {
+            tokens[j] = tokens.get(i);
+        }
+        tokens.shrink(j);
+    }
+
+    // TODO: Do this for the back as well (also, why did no rule before do that?)
+
+    if (tokens.empty())
+        return { expr_ref(m), expr_ref(m) };
+
+    // Choose the factorization boundary so the tail starts with the
+    // longest run of concrete characters c.
+    // This gives the split-engine lookahead oracle the most pruning information.
+    // head = u' (tokens before the run), tail = c · u''' (tokens from the run onward).
+    const unsigned total = tokens.size();
+    unsigned run_start = 0, run_len = 0;
+    for (i = 1; i < total; ) {
+        if (!(seq().str.is_unit(tokens.get(i), ch) && seq().is_const_char(ch))) {
+            i++;
+            continue;
+        }
+        unsigned j = i;
+        while (j < total && seq().str.is_unit(tokens.get(j), ch) && seq().is_const_char(ch)) {
+            j++;
+        }
+        if (j - i > run_len) {
+            run_len = j - i;
+            run_start = i;
+        }
+        i = j;
+    }
+    // No constant run => fall back to splitting off the first token.
+    const unsigned p = run_len == 0 ? 1 : run_start;
+    SASSERT(p >= 1);
+    expr* head = tokens.get(0);
+    for (i = 1; i < p; i++) {
+        head = seq().str.mk_concat(head, tokens.get(i));
+    }
+    expr* tail = seq().str.mk_empty(head->get_sort());
+    if (tokens.size() > p + run_len) {
+        tail = tokens.get(p + run_len);
+        for (i = p + run_len + 1; i < tokens.size(); i++) {
+            tail = seq().str.mk_concat(tail, tokens.get(i));
+        }
+    }
+    SASSERT(head && tail);
+
+    // Build the constant lookahead c and (if non-empty) an oracle that
+    // prunes splits whose postfix cannot match c.
+    zstring c;
+    for (i = 0; i < run_len; ++i) {
+        unsigned cv;
+        VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch));
+        VERIFY(seq().is_const_char(ch, cv));
+        c = c + zstring(cv);
+    }
+    split_oracle oracle;
+    if (!c.empty())
+        oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); };
+
+    // Decompose the regex into a split-set via the shared seq_split engine
+    if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) {
+        result.clear();
+        return { expr_ref(m), expr_ref(m) };
+    }
+
+    simplify(result);
+
+    // Eagerly consume the constant run c from the tail by taking the c-derivative
+    // of each postfix
+    if (!c.empty()) {
+        unsigned w = 0;
+        for (i = 0; i < result.size(); ++i) {
+            expr* d = result[i].m_n;
+            for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) {
+                d = m_rw.mk_derivative(seq().mk_char(c[k]), d);
+            }
+            SASSERT(d);
+            if (re().is_empty(d))
+                continue;   // postfix can't start with c => infeasible split, drop
+            result[w++] = split_pair(result[i].m_d, d, m);
+        }
+        result.shrink(w);
+    }
+
+    return { expr_ref(head, m), expr_ref(tail, m) };
+}
+
+bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const {
+    SASSERT(regex);
+    for (unsigned i = 0; i < c.length(); i++) {
+        if (m.is_true(m_rw.is_nullable(regex)))
+            return true;            // N accepts the prefix c[0..i) => a suffix completes it
+        regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex);
+        SASSERT(regex);
+        if (re().is_empty(regex))
+            return false;           // N went (syntactically) dead before reaching c
+    }
+    return !re().is_empty(regex);
+}
--- a/src/ast/rewriter/seq_split.h
+++ b/src/ast/rewriter/seq_split.h
@ -0,0 +1,225 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.h
+
+Abstract:
+
+    Regex split decomposition: the split function sigma from the paper
+    "Solving by Splitting".  For a regular expression r, sigma(r) is a finite
+    "split-set" of pairs { <D_i, N_i> } such that
+
+        u.v in L(r)   iff   exists i:  u in L(D_i)  and  v in L(N_i).
+
+    The split algebra (intersection, De Morgan complement, left/right
+    concatenation with a regex) and the cardinality-reducing simplification
+    heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
+    follow the paper.
+
+Author:
+
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+#pragma once
+
+#include "ast/seq_decl_plugin.h"
+#include "ast/rewriter/seq_subset.h"
+#include <functional>
+
+class seq_rewriter;
+
+// An individual split <D, N>: the left (prefix) regex D and right (suffix)
+// regex N.  u.v in L(r) for this split iff u in L(D) and v in L(N).
+struct split_pair {
+    expr_ref m_d;
+    expr_ref m_n;
+    split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
+        SASSERT(d && n);
+    }
+};
+
+// A split-set is a union of individual splits.
+typedef vector<split_pair> split_set;
+
+// Controls how aggressively sigma expands the Boolean-closure cases:
+//   strong - fully expand complement / intersection via the split algebra
+//            (De Morgan / cross product).  This is the behaviour the nseq
+//            solver relies on.
+//   weak   - do not perform the (potentially 2^k) Boolean-closure expansion;
+//            give up (return false) on complement / intersection instead.
+enum class split_mode { weak, strong };
+
+// Optional lookahead oracle.  Called for each candidate split <D, N> as it is
+// generated; returns true to keep it, false to prune it.  An empty oracle (the
+// default) keeps everything, so sigma is unchanged.  See seq_split::compute.
+typedef std::function<bool(expr* D, expr* N)> split_oracle;
+
+class seq_split {
+    ast_manager& m;
+    seq_rewriter& m_rw;       // for mk_re_append + manager / seq_util access
+    seq_subset    m_subset;   // language-subset checks for subsumption
+
+    // --- Suspended split-set representation -------------------------------
+    // A split-set computation is kept as an `expr` term over a small family of
+    // locally-declared, uninterpreted function symbols (the split algebra of the
+    // paper / split-algebra.md).  Nothing here is ever asserted to the solver;
+    // the terms are only used as scratch structure to drive lazy expansion.
+    //
+    //   empty    : SplitSet                 -- {} (bottom)
+    //   single   : Re x Re -> SplitSet       -- a single split <D, N>
+    //   from_re  : Re -> SplitSet            -- the *suspended* sigma(r)
+    //   union    : SplitSet x SplitSet -> SplitSet
+    //   inter    : SplitSet x SplitSet -> SplitSet
+    //   compl    : SplitSet -> SplitSet
+    //   lcat     : Re x SplitSet -> SplitSet -- r . S (left-concat onto D)
+    //   rcat     : SplitSet x Re -> SplitSet -- S . r (right-concat onto N)
+    sort*         m_seq_sort = nullptr;   // sequence sort the decls are built for
+    sort_ref      m_set_sort;             // the uninterpreted SplitSet sort
+    func_decl_ref m_d_empty, m_d_single, m_d_fromre, m_d_union,
+                  m_d_inter, m_d_compl, m_d_lcat, m_d_rcat;
+    expr_ref      m_empty_app;            // cached nullary `empty` term
+
+    seq_util&      seq() const;
+    seq_util::rex& re() const;
+
+    // (Re)build the local declarations for `seq_sort` if not already current.
+    void ensure_decls(sort* seq_sort);
+
+    // Smart constructors: apply the cheap normalizations the eager engine relies
+    // on (drop-bottom, eps cancellation, union absorption of empty).
+    expr_ref mk_empty();
+    expr_ref mk_single(expr* d, expr* n);
+    expr_ref mk_fromre(expr* r);
+    expr_ref mk_union(expr* a, expr* b);
+    expr_ref mk_inter(expr* a, expr* b);
+    expr_ref mk_compl(expr* a);
+    expr_ref mk_lcat(expr* r, expr* s);
+    expr_ref mk_rcat(expr* s, expr* r);
+
+    // Recognizers over the local decls.
+    bool is_empty_ss(expr* e) const;
+    bool is_single(expr* e, expr*& d, expr*& n) const;
+    bool is_fromre(expr* e, expr*& r) const;
+    bool is_union (expr* e, expr*& a, expr*& b) const;
+    bool is_inter (expr* e, expr*& a, expr*& b) const;
+    bool is_compl (expr* e, expr*& a) const;
+    bool is_lcat  (expr* e, expr*& r, expr*& s) const;
+    bool is_rcat  (expr* e, expr*& s, expr*& r) const;
+    // A term whose head is empty | single | union (ready for the worklist loop).
+    bool is_frontier(expr* e) const;
+
+    // One level of the sigma rules: from_re(r) -> a SplitSet term built from the
+    // immediate subterms.  `ok` is set false on an unsupported shape.
+    expr_ref expand_fromre(expr* r, bool& ok);
+    // Distribute a left/right concatenation over a head-normal split-set.
+    expr_ref distribute_lcat(expr* r, expr* hs);
+    expr_ref distribute_rcat(expr* hs, expr* r);
+    // Materialized split-set -> a `union` of `single`s.
+    expr_ref from_split_set(split_set const& s);
+    // Reduce `t` until its head is empty | single | union (one outermost level
+    // for the lazy nodes; inter/compl are expanded eagerly via `materialize`,
+    // since the paper's De Morgan / cross-product cannot yield a split lazily).
+    // `ok` is set false on a give-up (unsupported shape, weak-mode Boolean, or
+    // threshold overrun).
+    expr_ref head_normalize(expr* t, split_mode mode, unsigned threshold,
+                            split_oracle const& oracle, bool& ok);
+    // Fully drain a suspended split-set into `out` (used for inter/compl bodies).
+    // Runs an `iterator` to exhaustion; returns false on a give-up.
+    bool materialize(expr* node, split_mode mode, unsigned threshold,
+                     split_oracle const& oracle, split_set& out);
+
+    // Push <d, n> onto `out`, unless `oracle` rejects it.
+    void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const;
+
+    // S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
+    // component (and any rejected by `oracle`).  Returns false on threshold overrun.
+    bool intersect(split_set const& s1, split_set const& s2, split_set& result,
+                   unsigned threshold, split_oracle const& oracle) const;
+
+    // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
+    //   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+    bool complement(sort* seq_sort, split_set const& sp, split_set& result,
+                    unsigned threshold, split_oracle const& oracle) const;
+
+    // same-D / same-N merge: groups pairs that share a (syntactically identical)
+    // left (resp. right) component and unions the other component.
+    void merge_by(split_set& pairs, bool by_left) const;
+
+public:
+    explicit seq_split(seq_rewriter& rw);
+
+    // Lazy split enumerator.  Holds the suspended split-set worklist and produces
+    // the concrete splits <D, N> one at a time, on demand, instead of computing
+    // them all up front.  Obtain one from seq_split::iterate (or construct it
+    // directly) and pull splits with next() until it returns false; gave_up() then
+    // tells a normal exhaustion (false) apart from a give-up (true).
+    //
+    // The threshold is supplied by the caller and serves only as a safety cap
+    // against space bloat (lazy expansion still has to materialize the operands of
+    // intersection / complement).  A threshold overrun, an unsupported regex shape,
+    // or a Boolean-closure case in weak mode aborts the enumeration: next() returns
+    // false and gave_up() returns true.  To stop early, simply stop calling next().
+    //
+    // `oracle` (optional) prunes non-viable splits as they are produced.  It must
+    // be sound to apply per split: a candidate N can still gain a prefix from a
+    // factor appended to its right later (concat/star), so the oracle must use a
+    // "prefix-compatible" test (prune only when N can never match the lookahead,
+    // even partially), NOT a strict "starts-with" test.  The complement body is
+    // expanded WITHOUT the oracle (inverted orientation); the oracle is re-applied
+    // to the complement's output fold.
+    class iterator {
+        seq_split&      m_engine;
+        ast_manager&    m;
+        split_mode      m_mode;
+        unsigned        m_threshold;
+        split_oracle    m_oracle;
+        expr_ref_vector m_work;        // GC-safe worklist of suspended split-sets
+        unsigned        m_count = 0;   // splits produced so far (vs. threshold)
+        bool            m_giveup = false;
+    public:
+        iterator(seq_split& engine, expr* node, split_mode mode,
+                 unsigned threshold, split_oracle oracle);
+        // Compute the next split.  On success returns true and sets <d, n>; on
+        // exhaustion or give-up returns false (see gave_up()).  Calling next()
+        // again after it has returned false keeps returning false.
+        bool next(expr_ref& d, expr_ref& n);
+        // Valid after next() has returned false: true iff the enumeration aborted
+        // (unsupported regex / weak-mode Boolean / threshold overrun) rather than
+        // running out of splits.
+        bool gave_up() const { return m_giveup; }
+    };
+
+    // Build the *suspended* sigma(r) as a split-algebra term (no expansion).
+    // Returns null on a non-regex argument.  Drive it with `iterate`.
+    expr_ref make(expr* r);
+
+    // Create a lazy enumerator over a suspended split-set `node` (typically the
+    // result of make()).  See `iterator` for the meaning of the arguments.
+    iterator iterate(expr* node, split_mode mode, unsigned threshold,
+                     split_oracle const& oracle = {});
+
+    // Compute sigma(r), appending to `out` (does not clear it).  Thin eager
+    // wrapper that drains an `iterator` to exhaustion; semantics match the historic
+    // engine.  See `iterator` for the meaning of `threshold`, `mode`, and `oracle`.
+    bool compute(expr* r, split_set& out, unsigned threshold,
+                 split_mode mode = split_mode::strong, split_oracle const& oracle = {});
+
+    // In-place simplification of a split-set: drop bottom components, apply the
+    // same-D / same-N merge rules, and drop splits subsumed by another (using
+    // seq_subset).  Size-capped to keep the O(n^2) subsumption affordable.
+    void simplify(split_set& s) const;
+
+    // decompose a membership constraint into a set of pairs of regex splits
+    std::pair<expr_ref, expr_ref> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const;
+
+    // Lookahead oracle for the split engine: is the split's right component
+    // `n_regex` prefix-compatible with the constant character sequence `c`?
+    // This is sound to apply during split generation — it never drops a viable split.
+    // Thus, it might not eliminate all cases in order to stay sound
+    bool split_lookahead_viable(expr* regex, zstring const& c) const;
+
+
+};
--- a/src/params/smt_params_helper.pyg
+++ b/src/params/smt_params_helper.pyg
@ -138,6 +138,8 @@ def_module_params(module_name='smt',
                          ('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'),
                          ('seq.max_unfolding', UINT, 1000000000, 'maximal unfolding depth for checking string equations and regular expressions'),
                          ('seq.min_unfolding', UINT, 1, 'initial bound for strings whose lengths are bounded by iterative deepening. Set this to a higher value if there are only models with larger string lengths'),
+                          ('seq.regex_factorization_threshold', UINT, 10, 'maximum number of cases to factor a regex into in a single step'),
+                          ('seq.regex_factorization_enabled', BOOL, False, 'apply regex factorization (sigma splitting)'),
                          ('theory_aware_branching', BOOL, False, 'Allow the context to use extra information from theory solvers regarding literal branching prioritization.'),
                          ('sls.enable', BOOL, False, 'enable sls co-processor with SMT engine'),
                          ('sls.parallel', BOOL, True, 'use sls co-processor in parallel or sequential with SMT engine'),
--- a/src/params/theory_seq_params.cpp
+++ b/src/params/theory_seq_params.cpp
@ -23,4 +23,6 @@ void theory_seq_params::updt_params(params_ref const & _p) {
    m_seq_validate = p.seq_validate();
    m_seq_max_unfolding = p.seq_max_unfolding();
    m_seq_min_unfolding = p.seq_min_unfolding();
+    m_seq_regex_factorization_enabled = p.seq_regex_factorization_enabled();
+    m_seq_regex_factorization_threshold = p.seq_regex_factorization_threshold();
 }
--- a/src/params/theory_seq_params.h
+++ b/src/params/theory_seq_params.h
@ -26,6 +26,8 @@ struct theory_seq_params {
    bool m_seq_validate = false;
    unsigned m_seq_max_unfolding = UINT_MAX/4;
    unsigned m_seq_min_unfolding = 1;
+    bool m_seq_regex_factorization_enabled = false;
+    unsigned m_seq_regex_factorization_threshold = 1;

    theory_seq_params(params_ref const & p = params_ref()) {
        updt_params(p);
--- a/src/smt/seq_regex.cpp
+++ b/src/smt/seq_regex.cpp
@ -128,6 +128,30 @@ namespace smt {
            return;
        }

+        if (th.get_fparams().m_seq_regex_factorization_enabled) {
+            unsigned threshold = th.get_fparams().m_seq_regex_factorization_threshold;
+            if (threshold == 0)
+                threshold = UINT_MAX;
+            split_set result;
+            auto [head, tail] = seq_rw().split_membership(s, r, threshold, result);
+            if (head) {
+                SASSERT(tail);
+                // propagate all cases
+                expr_ref_vector cases(m);
+                expr_ref_vector branches(m);
+                for (auto [pre, post] : result) {
+                    expr_ref mem_head(re().mk_in_re(head, pre), m);
+                    expr_ref mem_tail(re().mk_in_re(tail, post), m);
+                    cases.push_back(m.mk_and(mem_head, mem_tail));
+                }
+                const expr_ref cases_expr(m.mk_or(cases), m);
+                ctx.internalize(cases_expr, false);
+                th.propagate_lit(nullptr, 1, &lit, ctx.get_literal(cases_expr));
+                return;
+            }
+            // fallthrough; decomposition failed
+        }
+
        // Convert a non-ground sequence into an additional regex and
        // strengthen the original regex constraint into an intersection
        // for example:
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@ -138,6 +138,7 @@ add_executable(test-z3
  simplifier.cpp
  sls_test.cpp
  sls_seq_plugin.cpp
+  seq_split.cpp
  small_object_allocator.cpp
  smt2print_parse.cpp
  smt_context.cpp
--- a/src/test/main.cpp
+++ b/src/test/main.cpp
@ -197,6 +197,7 @@
    X(ho_matcher) \
    X(finite_set) \
    X(finite_set_rewriter) \
+    X(seq_split) \
    X(fpa) \
    X(seq_regex_bisim) \
    X(term_enumeration) \
--- a/src/test/seq_split.cpp
+++ b/src/test/seq_split.cpp
@ -0,0 +1,450 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.cpp
+
+Abstract:
+
+    Unit tests for the regex split engine (the split function sigma) in ast/rewriter/seq_split.cpp.
+
+Author:
+
+    Clemens Eisenhofer 2026-6-22
+
+--*/
+
+#include "ast/ast.h"
+#include "ast/reg_decl_plugins.h"
+#include "ast/seq_decl_plugin.h"
+#include "ast/rewriter/seq_rewriter.h"
+#include "ast/rewriter/seq_split.h"
+#include <set>
+#include <utility>
+
+
+struct plugin_registrar {
+    plugin_registrar(ast_manager& m) { reg_decl_plugins(m); }
+};
+
+class seq_split_test {
+    ast_manager     m;
+    plugin_registrar m_reg;
+    seq_rewriter    m_rw;
+    seq_split       m_split;
+    seq_util        u;
+    sort_ref        m_str;   // the sequence (String) sort
+    sort_ref        m_re;    // the RegEx sort over m_str
+
+    seq_util::rex& re() { return u.re; }
+
+    expr_ref eps()        { return expr_ref(re().mk_epsilon(m_str), m); }   // mk_epsilon takes the seq sort
+    expr_ref dot()        { return expr_ref(re().mk_full_char(m_re), m); }  // mk_full_char takes the RegEx sort
+    expr_ref dotstar()    { return expr_ref(re().mk_full_seq(m_re), m); }   // .*
+    expr_ref empty_re()   { return expr_ref(re().mk_empty(m_re), m); }      // the bottom regex
+    expr_ref rappend(expr* a, expr* b) { return m_rw.mk_re_append(a, b); }  // the engine's regex concat
+    expr_ref word(char const* s) { return expr_ref(re().mk_to_re(u.str.mk_string(zstring(s))), m); }
+    expr_ref rng(char lo, char hi) {
+        return expr_ref(re().mk_range(u.str.mk_string(zstring(std::string(1, lo).c_str())),
+                                      u.str.mk_string(zstring(std::string(1, hi).c_str()))), m);
+    }
+
+    typedef std::set<std::pair<expr*, expr*>> pair_set;
+
+    pair_set as_set(split_set const& s) {
+        pair_set out;
+        for (auto const& p : s)
+            out.insert({ p.m_d.get(), p.m_n.get() });
+        return out;
+    }
+
+    bool eager(expr* r, split_set& out, unsigned threshold = UINT_MAX,
+               split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
+        return m_split.compute(r, out, threshold, mode, oracle);
+    }
+
+    bool lazy(expr* r, split_set& out, unsigned threshold = UINT_MAX,
+              split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
+        expr_ref node = m_split.make(r);
+        ENSURE(node);
+        seq_split::iterator it = m_split.iterate(node, mode, threshold, oracle);
+        expr_ref d(m), n(m);
+        while (it.next(d, n))
+            out.push_back(split_pair(d, n, m));
+        return !it.gave_up();
+    }
+
+    // assert that the eager and lazy engines agree on sigma(r) as a *set* of
+    // splits, and report the common cardinality.
+    unsigned check_agree(expr* r) {
+        split_set se, sl;
+        bool oke = eager(r, se);
+        bool okl = lazy(r, sl);
+        ENSURE(oke == okl);
+        if (!oke)
+            return 0;
+        ENSURE(as_set(se) == as_set(sl));
+        return (unsigned)as_set(se).size();
+    }
+
+public:
+    seq_split_test() : m_reg(m), m_rw(m), m_split(m_rw), u(m), m_str(m), m_re(m) {
+        m_str = u.str.mk_string_sort();
+        m_re  = re().mk_re(m_str);
+    }
+
+    void test_eager_epsilon() {
+        split_set s;
+        ENSURE(eager(eps(), s));
+        ENSURE(as_set(s) == pair_set({ { eps().get(), eps().get() } }));
+    }
+
+    void test_eager_char() {
+        // sigma(.) = { <eps, .>, <., eps> }
+        expr_ref a = dot();
+        split_set s;
+        ENSURE(eager(a, s));
+        pair_set expected({ { eps().get(), a.get() }, { a.get(), eps().get() } });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_eager_word() {
+        // sigma("ab") = { <"", "ab">, <"a","b">, <"ab",""> }
+        split_set s;
+        ENSURE(eager(word("ab"), s));
+        pair_set expected({
+            { word("").get(),  word("ab").get() },
+            { word("a").get(), word("b").get()  },
+            { word("ab").get(), word("").get()  },
+        });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_eager_union() {
+        // sigma(a | b) = sigma(a) cup sigma(b)
+        expr_ref a = rng('a', 'a'), b = rng('b', 'b');
+        expr_ref u_re(re().mk_union(a, b), m);
+        split_set s;
+        ENSURE(eager(u_re, s));
+        pair_set expected({
+            { eps().get(), a.get() }, { a.get(), eps().get() },
+            { eps().get(), b.get() }, { b.get(), eps().get() },
+        });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_agree_all() {
+        expr_ref a = rng('a', 'a'), b = rng('b', 'b');
+        expr_ref star(re().mk_star(a), m);
+        expr_ref plus(re().mk_plus(a), m);
+        expr_ref concat(re().mk_concat(a, b), m);
+        expr_ref uni(re().mk_union(a, b), m);
+        expr_ref inter(re().mk_inter(re().mk_star(a), re().mk_star(b)), m);
+        expr_ref compl_(re().mk_complement(re().mk_star(a)), m);
+        expr_ref diff(re().mk_diff(re().mk_star(a), re().mk_star(b)), m);
+
+        ENSURE(check_agree(eps())  == 1);
+        ENSURE(check_agree(a)      == 2);
+        ENSURE(check_agree(word("ab")) == 3);
+        ENSURE(check_agree(uni)    == 4);
+        ENSURE(check_agree(star)   == 3);   // { <eps,eps>, <a*, a.a*>, <a*.a, a*> }
+        (void)check_agree(plus);
+        (void)check_agree(concat);
+        (void)check_agree(inter);            // strong-mode intersection
+        (void)check_agree(compl_);           // strong-mode De Morgan complement
+        (void)check_agree(diff);
+    }
+
+    void test_lazy_early_stop() {
+        // a* has 3 splits; pull just the first one and then stop.  (Note .* is the
+        // full_seq special case with a single split, so use a proper char-class body.)
+        expr_ref star(re().mk_star(rng('a', 'a')), m);
+        expr_ref node = m_split.make(star);
+        ENSURE(node);
+        seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
+        expr_ref d(m), n(m);
+        unsigned seen = 0;
+        if (it.next(d, n))       // pull exactly one split, then walk away
+            ++seen;
+        ENSURE(!it.gave_up());   // stopping early is not a give-up
+        ENSURE(seen == 1);
+    }
+
+    void test_threshold_giveup() {
+        expr_ref star(re().mk_star(rng('a', 'a')), m);   // 3 splits
+        split_set s;
+        ENSURE(!lazy(star, s, /*threshold*/ 1));
+        // the eager wrapper honours the same cap
+        split_set s2;
+        ENSURE(!eager(star, s2, /*threshold*/ 1));
+    }
+
+    void test_weak_vs_strong() {
+        expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
+        expr_ref compl_(re().mk_complement(re().mk_star(dot())), m);
+
+        split_set s;
+        ENSURE(!eager(inter, s, UINT_MAX, split_mode::weak));
+        s.reset();
+        ENSURE(!lazy(inter, s, UINT_MAX, split_mode::weak));
+        s.reset();
+        ENSURE(!eager(compl_, s, UINT_MAX, split_mode::weak));
+        s.reset();
+        ENSURE(!lazy(compl_, s, UINT_MAX, split_mode::weak));
+
+        // strong mode succeeds for both
+        s.reset();
+        ENSURE(eager(inter, s, UINT_MAX, split_mode::strong));
+        s.reset();
+        ENSURE(eager(compl_, s, UINT_MAX, split_mode::strong));
+    }
+
+    void test_make_non_regex() {
+        expr_ref not_a_regex(u.str.mk_string(zstring("a")), m);   // String, not RegEx
+        expr_ref node = m_split.make(not_a_regex);
+        ENSURE(!node);
+    }
+
+    void test_oracle_prunes() {
+        // sigma(.) without an oracle = { <eps,.>, <.,eps> }; an oracle that keeps
+        // only splits whose suffix is epsilon must drop one of the two.
+        expr_ref a = dot();
+        expr_ref e = eps();
+        split_oracle keep_eps_suffix = [&](expr*, expr* n) { return n == e.get(); };
+
+        split_set se, sl;
+        ENSURE(eager(a, se, UINT_MAX, split_mode::strong, keep_eps_suffix));
+        ENSURE(lazy(a, sl, UINT_MAX, split_mode::strong, keep_eps_suffix));
+        pair_set expected({ { a.get(), e.get() } });
+        ENSURE(as_set(se) == expected);
+        ENSURE(as_set(sl) == expected);
+    }
+
+    void test_eager_full_seq() {
+        // sigma(.*) = { <.*, .*> }
+        expr_ref ds = dotstar();
+        split_set s;
+        ENSURE(eager(ds, s));
+        ENSURE(as_set(s) == pair_set({ { ds.get(), ds.get() } }));
+    }
+
+    void test_eager_bottom() {
+        // sigma(empty) = {}
+        split_set s;
+        ENSURE(eager(empty_re(), s));
+        ENSURE(s.empty());
+
+        split_set sl;
+        ENSURE(lazy(empty_re(), sl));
+        ENSURE(sl.empty());
+    }
+
+    void test_eager_empty_word() {
+        // sigma(to_re("")) = { <"", ""> }  (a single, trivial split)
+        split_set s;
+        ENSURE(eager(word(""), s));
+        ENSURE(as_set(s) == pair_set({ { word("").get(), word("").get() } }));
+    }
+
+    void test_eager_star_content() {
+        // sigma(a*) = { <eps,eps>, <a*.eps, a.a*>, <a*.a, eps.a*> }
+        expr_ref a = rng('a', 'a');
+        expr_ref as(re().mk_star(a), m);
+        split_set s;
+        ENSURE(eager(as, s));
+        pair_set expected({
+            { eps().get(), eps().get() },
+            { rappend(as, eps()).get(), rappend(a, as).get() },
+            { rappend(as, a).get(),     rappend(eps(), as).get() },
+        });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_eager_plus_content() {
+        // sigma(a+) = a*.sigma(a).a*  (the star rule without <eps,eps>)
+        expr_ref a = rng('a', 'a');
+        expr_ref as(re().mk_star(a), m);
+        expr_ref ap(re().mk_plus(a), m);
+        split_set s;
+        ENSURE(eager(ap, s));
+        pair_set expected({
+            { rappend(as, eps()).get(), rappend(a, as).get() },
+            { rappend(as, a).get(),     rappend(eps(), as).get() },
+        });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_eager_concat_content() {
+        // sigma(a.b) = sigma(a).b cup a.sigma(b)
+        expr_ref a = rng('a', 'a'), b = rng('b', 'b');
+        expr_ref ab(re().mk_concat(a, b), m);
+        split_set s;
+        ENSURE(eager(ab, s));
+        pair_set expected({
+            { eps().get(),           rappend(a, b).get()   },   // <eps, a.b>
+            { a.get(),               rappend(eps(), b).get() }, // <a, eps.b>
+            { rappend(a, eps()).get(), b.get()             },   // <a.eps, b>
+            { rappend(a, b).get(),   eps().get()           },   // <a.b, eps>
+        });
+        ENSURE(as_set(s) == expected);
+    }
+
+    void test_nary_union() {
+        // sigma(a|b|c) has 2 splits per char-class
+        expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
+        expr_ref u3(re().mk_union(a, re().mk_union(b, c)), m);
+        ENSURE(check_agree(u3) == 6);
+    }
+
+    void test_nary_concat() {
+        // sigma(a.b.c)
+        expr_ref a = rng('a', 'a'), b = rng('b', 'b'), c = rng('c', 'c');
+        expr_ref c3(re().mk_concat(a, re().mk_concat(b, c)), m);
+        ENSURE(check_agree(c3) >= 4);
+    }
+
+    void test_nested_complement() {
+        // sigma(~~(a*))
+        expr_ref cc(re().mk_complement(re().mk_complement(re().mk_star(rng('a', 'a')))), m);
+        (void)check_agree(cc);
+    }
+
+    void test_determinism() {
+        expr_ref r(re().mk_concat(rng('a', 'a'), re().mk_star(rng('b', 'b'))), m);
+        split_set s1, s2;
+        ENSURE(lazy(r, s1));
+        ENSURE(lazy(r, s2));
+        ENSURE(as_set(s1) == as_set(s2));
+    }
+
+    void test_threshold_boundary() {
+        expr_ref as(re().mk_star(rng('a', 'a')), m);   // exactly 3 splits
+        split_set s;
+        ENSURE(eager(as, s));
+        unsigned k = (unsigned)as_set(s).size();
+        ENSURE(k == 3);
+
+        split_set ok_e, ok_l, bad_e, bad_l;
+        ENSURE(eager(as, ok_e, k));
+        ENSURE(lazy(as, ok_l, k));
+        ENSURE(!eager(as, bad_e, k - 1));   // one below threshold; give up
+        ENSURE(!lazy(as, bad_l, k - 1));
+    }
+
+    void test_early_stop_after_two() {
+        expr_ref as(re().mk_star(rng('a', 'a')), m);   // 3 splits
+        expr_ref node = m_split.make(as);
+        ENSURE(node);
+        seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
+        expr_ref d(m), n(m);
+        unsigned seen = 0;
+        while (seen < 2 && it.next(d, n))   // pull two splits on demand, then stop
+            ++seen;
+        ENSURE(!it.gave_up());
+        ENSURE(seen == 2);
+    }
+
+    void test_iterator_exhaustion() {
+        // Pull every split on demand; gave_up() must stay false on a clean
+        // exhaustion, and next() must keep returning false once drained.
+        expr_ref as(re().mk_star(rng('a', 'a')), m);   // 3 splits
+        expr_ref node = m_split.make(as);
+        ENSURE(node);
+        seq_split::iterator it = m_split.iterate(node, split_mode::strong, UINT_MAX, {});
+        expr_ref d(m), n(m);
+        unsigned seen = 0;
+        while (it.next(d, n))
+            ++seen;
+        ENSURE(seen == 3);
+        ENSURE(!it.gave_up());
+        // idempotent past the end
+        ENSURE(!it.next(d, n));
+        ENSURE(!it.gave_up());
+    }
+
+    void test_iterator_giveup() {
+        // A threshold overrun aborts: next() returns false and gave_up() is true.
+        expr_ref as(re().mk_star(rng('a', 'a')), m);   // 3 splits, cap at 1
+        expr_ref node = m_split.make(as);
+        ENSURE(node);
+        seq_split::iterator it = m_split.iterate(node, split_mode::strong, /*threshold*/ 1, {});
+        expr_ref d(m), n(m);
+        unsigned seen = 0;
+        while (it.next(d, n))
+            ++seen;
+        ENSURE(it.gave_up());            // aborted, not a clean exhaustion
+        ENSURE(seen <= 1);               // produced at most the capped number
+
+        // A weak-mode Boolean closure is likewise a give-up.
+        expr_ref inter(re().mk_inter(re().mk_star(rng('a', 'a')), re().mk_star(rng('b', 'b'))), m);
+        expr_ref inode = m_split.make(inter);
+        ENSURE(inode);
+        seq_split::iterator wit = m_split.iterate(inode, split_mode::weak, UINT_MAX, {});
+        ENSURE(!wit.next(d, n));
+        ENSURE(wit.gave_up());
+    }
+
+    void test_simplify() {
+        expr_ref regs[] = {
+            expr_ref(re().mk_star(rng('a', 'a')), m),
+            expr_ref(re().mk_complement(re().mk_star(rng('a', 'a'))), m),
+            expr_ref(re().mk_concat(rng('a', 'a'), rng('b', 'b')), m),
+        };
+        for (auto& r : regs) {
+            split_set s;
+            ENSURE(eager(r, s));
+            unsigned before = (unsigned)s.size();
+            m_split.simplify(s);
+            ENSURE(s.size() <= before);
+            ENSURE(!s.empty());
+            // idempotent
+            split_set s2(s);
+            m_split.simplify(s2);
+            ENSURE(as_set(s) == as_set(s2));
+        }
+    }
+
+    void test_trivial_oracle() {
+        expr_ref r(re().mk_star(rng('a', 'a')), m);
+        split_oracle keep_all = [](expr*, expr*) { return true; };
+        split_set s_no, s_yes;
+        ENSURE(eager(r, s_no));
+        ENSURE(eager(r, s_yes, UINT_MAX, split_mode::strong, keep_all));
+        ENSURE(as_set(s_no) == as_set(s_yes));
+    }
+
+    void run() {
+        test_eager_epsilon();
+        test_eager_char();
+        test_eager_word();
+        test_eager_union();
+        test_agree_all();
+        test_lazy_early_stop();
+        test_threshold_giveup();
+        test_weak_vs_strong();
+        test_make_non_regex();
+        test_oracle_prunes();
+        test_eager_full_seq();
+        test_eager_bottom();
+        test_eager_empty_word();
+        test_eager_star_content();
+        test_eager_plus_content();
+        test_eager_concat_content();
+        test_nary_union();
+        test_nary_concat();
+        test_nested_complement();
+        test_determinism();
+        test_threshold_boundary();
+        test_early_stop_after_two();
+        test_iterator_exhaustion();
+        test_iterator_giveup();
+        test_simplify();
+        test_trivial_oracle();
+    }
+};
+
+void tst_seq_split() {
+    seq_split_test t;
+    t.run();
+}