Porting seq_split to master

2026-07-11 17:46:20 +00:00 · 2026-06-12 15:18:14 +02:00 · 2026-06-12 15:18:14 +02:00 · 871a2b3bc8
commit 871a2b3bc8
parent e05ebe8bef
5 changed files with 682 additions and 64 deletions
--- a/src/ast/rewriter/CMakeLists.txt
+++ b/src/ast/rewriter/CMakeLists.txt
@ -40,6 +40,7 @@ z3_add_component(rewriter
    seq_axioms.cpp
    seq_eq_solver.cpp
    seq_subset.cpp
+    seq_split.cpp
    seq_rewriter.cpp
    seq_regex_bisim.cpp
    seq_skolem.cpp
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -3999,58 +3999,54 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
            //do not concatenate [], it is a deade-end 
            return result;
        }
-        else {
-            // Classical Brzozowski union: keep the derivative tree free of
-            // antimirov-union nodes so the bisimulation procedure sees a
-            // single regex tree whose leaves are XOR pairs.
-            return mk_der_union(result, mk_der_concat(is_n, dr2));
-        }
+        // Classical Brzozowski union: keep the derivative tree free of
+        // antimirov-union nodes so the bisimulation procedure sees a
+        // single regex tree whose leaves are XOR pairs.
+        return mk_der_union(result, mk_der_concat(is_n, dr2));
    }
-    else if (re().is_star(r, r1)) {
+    if (re().is_star(r, r1)) {
        return mk_der_concat(mk_derivative_rec(ele, r1), r);
    }
-    else if (re().is_plus(r, r1)) {
+    if (re().is_plus(r, r1)) {
        expr_ref star(re().mk_star(r1), m());
        return mk_derivative_rec(ele, star);
    }
-    else if (re().is_union(r, r1, r2)) {
+    if (re().is_union(r, r1, r2)) {
        return mk_der_union(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
    }
-    else if (re().is_intersection(r, r1, r2)) {
+    if (re().is_intersection(r, r1, r2)) {
        return mk_der_inter(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
    }
-    else if (re().is_diff(r, r1, r2)) {
+    if (re().is_diff(r, r1, r2)) {
        return mk_der_inter(mk_derivative_rec(ele, r1), mk_der_compl(mk_derivative_rec(ele, r2)));
    }
-    else if (re().is_xor(r, r1, r2)) {
+    if (re().is_xor(r, r1, r2)) {
        return mk_der_xor(mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
    }
-    else if (m().is_ite(r, p, r1, r2)) {
+    if (m().is_ite(r, p, r1, r2)) {
        // there is no BDD normalization here
        result = m().mk_ite(p, mk_derivative_rec(ele, r1), mk_derivative_rec(ele, r2));
        return result;
    }
-    else if (re().is_opt(r, r1)) {
+    if (re().is_opt(r, r1)) {
        return mk_derivative_rec(ele, r1);
    }
-    else if (re().is_complement(r, r1)) {
+    if (re().is_complement(r, r1)) {
        return mk_der_compl(mk_derivative_rec(ele, r1));
    }
-    else if (re().is_loop(r, r1, lo)) {
+    if (re().is_loop(r, r1, lo)) {
        if (lo > 0) {
            lo--;
        }
        result = mk_derivative_rec(ele, r1);
-        //do not concatenate with [] (emptyset)
+        // do not concatenate with [] (emptyset)
        if (re().is_empty(result)) {
            return result;
        }
-        else {
-            //do not create loop r1{0,}, instead create r1*
-            return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo)));
-        }
+        // do not create loop r1{0,}, instead create r1*
+        return mk_der_concat(result, (lo == 0 ? re().mk_star(r1) : re().mk_loop(r1, lo)));
    }
-    else if (re().is_loop(r, r1, lo, hi)) {
+    if (re().is_loop(r, r1, lo, hi)) {
        if (hi == 0) {
            return mk_empty();
        }
@ -4059,19 +4055,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
            lo--;
        }
        result = mk_derivative_rec(ele, r1);
-        //do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain
+        // do not concatenate with [] (emptyset) or handle the rest of the loop if no more iterations remain
        if (re().is_empty(result) || hi == 0) {
            return result;
        }
-        else {
-            return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi));
-        }
+        return mk_der_concat(result, re().mk_loop_proper(r1, lo, hi));
    }
-    else if (re().is_full_seq(r) ||
-             re().is_empty(r)) {
+    if (re().is_full_seq(r) || re().is_empty(r)) {
        return expr_ref(r, m());
    }
-    else if (re().is_to_re(r, r1)) {
+    if (re().is_to_re(r, r1)) {
        // r1 is a string here (not a regexp)
        expr_ref hd(m()), tl(m());
        if (get_head_tail(r1, hd, tl)) {
@ -4084,16 +4077,16 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
            result = mk_der_concat(result, r1);
            return result;
        }
-        else if (str().is_empty(r1)) {
-            //observe: str().is_empty(r1) checks that r = () = epsilon
-            //while mk_empty() = [], because deriv(epsilon) = [] = nothing
+        if (str().is_empty(r1)) {
+            // observe: str().is_empty(r1) checks that r = () = epsilon
+            // while mk_empty() = [], because deriv(epsilon) = [] = nothing
            return mk_empty();
        }
-        else if (str().is_itos(r1)) {
+        if (str().is_itos(r1)) {
            //
-            // here r1 = (str.from_int r2) and r2 is non-ground 
+            // here r1 = (str.from_int r2) and r2 is non-ground
            // or else the expression would have been simplified earlier
-            // so r1 must be nonempty and must consists of decimal digits 
+            // so r1 must be nonempty and must consists of decimal digits
            // '0' <= elem <= '9'
            // if ((isdigit ele) and (ele = (hd r1))) then (to_re (tl r1)) else []
            //
@ -4105,19 +4098,17 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
            auto a3 = m().mk_eq(hd, ele);
            auto inner = m().mk_and(a2, a3);
            m_br.mk_and(a0, a1, inner, result);
-            tl = re().mk_to_re(mk_seq_rest(r1));            
-            return re_and(result, tl);
-        }
-        else {
-            // recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence
-            // construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else []))
-            hd = mk_seq_first(r1);
-            m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result);
            tl = re().mk_to_re(mk_seq_rest(r1));
            return re_and(result, tl);
        }
+        // recall: [] denotes the empty language (nothing) regex, () denotes epsilon or empty sequence
+        // construct the term (if (r1 != () and (ele = (first r1)) then (to_re (rest r1)) else []))
+        hd = mk_seq_first(r1);
+        m_br.mk_and(m().mk_not(m().mk_eq(r1, str().mk_empty(seq_sort))), m().mk_eq(hd, ele), result);
+        tl = re().mk_to_re(mk_seq_rest(r1));
+        return re_and(result, tl);
    }
-    else if (re().is_reverse(r, r1)) {
+    if (re().is_reverse(r, r1)) {
        if (re().is_to_re(r1, r2)) {
            // First try to extract hd and tl such that r = hd ++ tl and |tl|=1
            expr_ref hd(m()), tl(m());
@ -4129,22 +4120,20 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
                result = mk_der_concat(result, re().mk_reverse(re().mk_to_re(hd)));
                return result;
            }
-            else if (str().is_empty(r2)) {
+            if (str().is_empty(r2)) {
                return mk_empty();
            }
-            else {
-                // construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else []))
-                // hd = first of reverse(r2) i.e. last of r2
-                // tl = rest of reverse(r2) i.e. butlast of r2
-                //hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one()));
-                hd = mk_seq_last(r2);
-                // factor nested constructor calls to enforce deterministic argument evaluation order
-                auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort)));
-                auto a_eq        = m().mk_eq(hd, ele);
-                m_br.mk_and(a_non_empty, a_eq, result);
-                tl = re().mk_to_re(mk_seq_butlast(r2));
-                return re_and(result, re().mk_reverse(tl));
-            }
+            // construct the term (if (r2 != () and (ele = (last r2)) then reverse(to_re (butlast r2)) else []))
+            // hd = first of reverse(r2) i.e. last of r2
+            // tl = rest of reverse(r2) i.e. butlast of r2
+            // hd = str().mk_nth_i(r2, m_autil.mk_sub(str().mk_length(r2), one()));
+            hd = mk_seq_last(r2);
+            // factor nested constructor calls to enforce deterministic argument evaluation order
+            auto a_non_empty = m().mk_not(m().mk_eq(r2, str().mk_empty(seq_sort)));
+            auto a_eq = m().mk_eq(hd, ele);
+            m_br.mk_and(a_non_empty, a_eq, result);
+            tl = re().mk_to_re(mk_seq_butlast(r2));
+            return re_and(result, re().mk_reverse(tl));
        }
    }
    else if (re().is_range(r, r1, r2)) {
@ -4163,11 +4152,9 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
                result = mk_der_inter(p1, p2);
                return result;
            }
-            else {
-                return mk_empty();
-            }
+            return mk_empty();
        }
-        expr* e1 = nullptr, * e2 = nullptr;
+        expr *e1 = nullptr, *e2 = nullptr;
        if (str().is_unit(r1, e1) && str().is_unit(r2, e2)) {
            SASSERT(u().is_char(e1));
            // Use mk_der_cond to normalize
@ -4185,7 +4172,7 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
    }
    else if (re().is_of_pred(r, p)) {
        array_util array(m());
-        expr* args[2] = { p, ele };
+        expr *args[2] = {p, ele};
        result = array.mk_select(2, args);
        // Use mk_der_cond to normalize
        STRACE(seq_verbose, tout << "deriv of_pred" << std::endl;);
--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -18,6 +18,7 @@ Notes:
 --*/
 #pragma once

+#include "seq_split.h"
 #include "ast/seq_decl_plugin.h"
 #include "ast/ast_pp.h"
 #include "ast/arith_decl_plugin.h"
@ -130,6 +131,7 @@ class seq_rewriter {

    seq_util       m_util;
    seq_subset     m_subset;
+    seq_split      m_split;
    arith_util     m_autil;
    bool_rewriter  m_br;
    // re2automaton   m_re2aut;
@ -346,7 +348,7 @@ class seq_rewriter {

 public:
    seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
-        m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m), 
+        m_util(m), m_subset(m_util.re), m_split(*this), m_autil(m), m_br(m, p), // m_re2aut(m),
        m_op_cache(m), m_es(m), 
        m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
    }
@ -397,6 +399,15 @@ public:
        return result;
    }

+    // Split decomposition (sigma) of a regex; see seq_split.h.  `oracle` (optional)
+    // prunes non-viable splits during generation.
+    bool split(expr* r, split_set& out, unsigned threshold,
+        const split_mode mode = split_mode::strong, split_oracle const& oracle = {}) {
+        return m_split.compute(r, out, threshold, mode, oracle);
+    }
+
+    void simplify_split(split_set& s) { m_split.simplify(s); }
+
    /**
     * check if regular expression is of the form all ++ s ++ all ++ t + u ++ all, where, s, t, u are sequences
     */
--- a/src/ast/rewriter/seq_split.cpp
+++ b/src/ast/rewriter/seq_split.cpp
@ -0,0 +1,501 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.cpp
+
+Abstract:
+
+    Regex split decomposition (the split function sigma).  See seq_split.h.
+
+Author:
+
+    Nikolaj Bjorner (nbjorner) 2026-6-10
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+
+#include "ast/rewriter/seq_split.h"
+#include "ast/rewriter/seq_rewriter.h"
+#include "ast/ast_pp.h"
+#include "util/obj_hashtable.h"
+#include "util/stack.h"
+
+seq_split::seq_split(seq_rewriter& rw) :
+    m_rw(rw), m_subset(rw.u().re) {}
+
+ast_manager&   seq_split::m()   const { return m_rw.m(); }
+seq_util&      seq_split::seq() const { return m_rw.u(); }
+seq_util::rex& seq_split::re()  const { return m_rw.u().re; }
+
+// Add <d, n> unless the (optional) lookahead oracle prunes it.
+void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const {
+    if (!oracle || oracle(d, n))
+        out.push_back(split_pair(d, n, m()));
+}
+
+// Cross-product intersection of two split-sets (split algebra):
+//   S1 cap S2 = { <D1 cap D2, N1 cap N2> | <D1,N1> in S1, <D2,N2> in S2 }.
+// Pairs where any component is bottom (the empty regex) are dropped.
+bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result,
+                          unsigned threshold, split_oracle const& oracle) const {
+    const seq_util::rex& r = re();
+    for (auto const& p1 : s1) {
+        for (auto const& p2 : s2) {
+            if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) ||
+                r.is_empty(p1.m_n) || r.is_empty(p2.m_n))
+                continue;
+            const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m());
+            const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m());
+            push(result, oracle, di, ni);
+            if (result.size() > threshold)
+                return false;
+        }
+    }
+    return true;
+}
+
+// Complement of a split-set via De Morgan: ~S = cap_{s in S} ~s with
+//   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+// May produce up to 2^|sp| pairs (bounded by the threshold).  A threshold
+// overrun must abort entirely: a partial fold is a strictly weaker (unsound)
+// split-set, since each ~sp[i] further constrains ~S.
+bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result,
+    const unsigned threshold, split_oracle const& oracle) const {
+
+    seq_util::rex& r = re();
+    sort* re_sort = r.mk_re(seq_sort);
+    const expr_ref full(r.mk_full_seq(re_sort), m());   // .*
+    if (sp.empty()) {                                   // ~{} = <.*, .*>
+        push(result, oracle, full, full);
+        return true;
+    }
+    // The acc/next pairs carry genuine output-orientation N components (the De
+    // Morgan ~<D,N> = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and
+    // keeps the 2^|sp| fold from blowing up.
+    split_set acc;
+    push(acc, oracle, r.mk_complement(sp[0].m_d), full);
+    push(acc, oracle, full, r.mk_complement(sp[0].m_n));
+    for (unsigned i = 1; i < sp.size(); ++i) {
+        split_set next;
+        push(next, oracle, r.mk_complement(sp[i].m_d), full);
+        push(next, oracle, full, r.mk_complement(sp[i].m_n));
+        split_set tmp;
+        if (!intersect(acc, next, tmp, threshold, oracle))
+            return false;
+        acc = std::move(tmp);
+        if (acc.empty())            // intersection empty => ~S is empty
+            break;
+        if (acc.size() > threshold)
+            return false;
+    }
+    result.append(acc);
+    return true;
+}
+
+bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode,
+                        split_oracle const& oracle) {
+    SASSERT(r);
+    seq_util& sq = seq();
+    seq_util::rex& rex = re();
+    ast_manager& mm = m();
+
+    sort* seq_sort = nullptr;
+    if (!sq.is_re(r, seq_sort))
+        return false;
+
+    // bottom: sigma(empty) = {} (the empty split-set)
+    if (rex.is_empty(r))
+        return true;
+
+    // epsilon: sigma(eps) = { <eps, eps> }
+    if (rex.is_epsilon(r)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        push(result, oracle, eps, eps);
+        return true;
+    }
+
+    expr* a = nullptr, *b = nullptr;
+
+    // to_re(s): split the literal word s at every position.
+    expr* s = nullptr;
+    if (rex.is_to_re(r, s)) {
+        zstring str;
+        if (sq.str.is_string(s, str)) {
+            for (unsigned i = 0; i <= str.length(); ++i) {
+                const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm);
+                const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm);
+                push(result, oracle, p, q);
+            }
+            return true;
+        }
+        // a single symbolic unit behaves like one token: { <eps, u>, <u, eps> }
+        if (sq.str.is_unit(s)) {
+            const expr_ref ex(r, mm);
+            const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+            push(result, oracle, eps, ex);
+            push(result, oracle, ex, eps);
+            return true;
+        }
+        // to_re over a non-literal sequence: not handled.
+        return false;
+    }
+
+    // single-character class alpha (., [lo-hi], of_pred):
+    //   sigma(alpha) = { <eps, alpha>, <alpha, eps> }
+    if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) {
+        const expr_ref ex(r, mm);
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        push(result, oracle, eps, ex);
+        push(result, oracle, ex, eps);
+        return true;
+    }
+
+    // .* : sigma(.*) = { <.*, .*> }
+    if (rex.is_full_seq(r)) {
+        const expr_ref ex(r, mm);
+        push(result, oracle, ex, ex);
+        return true;
+    }
+
+    // union: sigma(r0 | ... | r_{n-1}) = U sigma(ri)   (re.union may be n-ary)
+    if (rex.is_union(r)) {
+        app* ap = to_app(r);
+        for (unsigned i = 0; i < ap->get_num_args(); ++i) {
+            if (!compute(ap->get_arg(i), result, threshold, mode, oracle))
+                return false;
+        }
+        return true;
+    }
+
+    // concat: sigma(r0...r_{n-1}) = U_i  (r0...r_{i-1}) . sigma(ri) . (r_{i+1}...r_{n-1})
+    // (re.++ may be n-ary)
+    if (rex.is_concat(r)) {
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        for (unsigned i = 0; i < n; ++i) {
+            // Sound to pass the oracle into the sub-computation: N_inner.Sigma*
+            // over-approximates the final N_inner.right, so a prune here is a
+            // prune of the final pair too (prefix-compatible test).
+            split_set sigma_arg;
+            if (!compute(ap->get_arg(i), sigma_arg, threshold, mode, oracle))
+                return false;
+
+            expr_ref left(mm), right(mm);
+            if (i == 0)
+                left = rex.mk_epsilon(seq_sort);
+            else {
+                for (unsigned j = 0; j < i; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    left = left ? expr_ref(rex.mk_concat(left, arg), mm) : expr_ref(arg, mm);
+                }
+            }
+            if (i == n - 1)
+                right = rex.mk_epsilon(seq_sort);
+            else {
+                right = ap->get_arg(i + 1);
+                for (unsigned j = i + 2; j < n; ++j) {
+                    expr* arg = ap->get_arg(j);
+                    right = rex.mk_concat(right, arg);
+                }
+            }
+
+            for (auto const& [d, nn] : sigma_arg) {
+                const expr_ref p = m_rw.mk_re_append(left, d);
+                const expr_ref q = m_rw.mk_re_append(nn, right);
+                push(result, oracle, p, q);
+            }
+        }
+        return true;
+    }
+
+    // star: sigma(a*) = { <eps, eps> } cup a*.sigma(a).a*
+    if (rex.is_star(r, a)) {
+        const expr_ref eps(rex.mk_epsilon(seq_sort), mm);
+        push(result, oracle, eps, eps);
+        split_set sa;
+        if (!compute(a, sa, threshold, mode, oracle))
+            return false;
+        for (auto const& [d, n] : sa) {
+            const expr_ref p = m_rw.mk_re_append(r, d);   // a*.D
+            const expr_ref q = m_rw.mk_re_append(n, r);   // N.a*
+            push(result, oracle, p, q);
+        }
+        return true;
+    }
+
+    // plus: a+ = a.a* ; sigma(a+) = a*.sigma(a).a*  (star rule without <eps,eps>)
+    if (rex.is_plus(r, a)) {
+        const expr_ref star(rex.mk_star(a), mm);          // a*
+        split_set sa;
+        if (!compute(a, sa, threshold, mode, oracle))
+            return false;
+        for (auto const& [d, n] : sa) {
+            const expr_ref p = m_rw.mk_re_append(star, d);
+            const expr_ref q = m_rw.mk_re_append(n, star);
+            push(result, oracle, p, q);
+        }
+        return true;
+    }
+
+    // intersection: sigma(r0 & ... & r_{n-1}) = cap sigma(ri)   (re.inter may be n-ary)
+    if (rex.is_intersection(r)) {
+        if (mode == split_mode::weak)
+            return false;
+        app* ap = to_app(r);
+        const unsigned n = ap->get_num_args();
+        split_set current;
+        if (!compute(ap->get_arg(0), current, threshold, mode, oracle))
+            return false;
+        // A give-up on any conjunct must propagate as a give-up: silently treating
+        // it as the empty split-set would collapse the whole intersection to bottom
+        // and be misreported as an (unsound) conflict.
+        for (unsigned i = 1; i < n && !current.empty(); ++i) {
+            split_set arg_i, tmp;
+            if (!compute(ap->get_arg(i), arg_i, threshold, mode, oracle))
+                return false;
+            if (!intersect(current, arg_i, tmp, threshold, oracle))
+                return false;
+            current = std::move(tmp);
+        }
+        result.append(current);
+        return true;
+    }
+
+    // complement: sigma(~a) = ~sigma(a).
+    // The body is computed WITHOUT the oracle (the body's pairs are inverted, so
+    // their N is unrelated to the output N); the oracle is re-applied in complement().
+    if (rex.is_complement(r, a)) {
+        if (mode == split_mode::weak)
+            return false;
+        split_set sa;
+        if (!compute(a, sa, threshold, mode))
+            return false;
+        return complement(seq_sort, sa, result, threshold, oracle);
+    }
+
+    // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b).
+    // sigma(b) (used only inside the complement) is computed WITHOUT the oracle.
+    if (rex.is_diff(r, a, b)) {
+        if (mode == split_mode::weak)
+            return false;
+        split_set sa, sb, sb_compl, tmp;
+        if (!compute(a, sa, threshold, mode, oracle))
+            return false;
+        if (!compute(b, sb, threshold, mode))
+            return false;
+        if (!complement(seq_sort, sb, sb_compl, threshold, oracle))
+            return false;
+        if (!intersect(sa, sb_compl, tmp, threshold, oracle))
+            return false;
+        result.append(tmp);
+        return true;
+    }
+
+    // bounded loop / ite / other: not handled (paper "v1: bail").
+    TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";);
+    return false;
+}
+
+// same-D / same-N merge (paper eqs. 1 & 2):
+//   { <D,N>, <D,N'> } -> <D, N | N'>      (by_left = true,  group by D)
+//   { <D,N>, <D',N> } -> <D | D', N>      (by_left = false, group by N)
+// Only fires on syntactically-identical (perfectly-shared) key components, so
+// it is a conservative instance of the rule.
+void seq_split::merge_by(split_set& pairs, const bool by_left) const {
+    ast_manager& mm = m();
+    obj_map<expr, unsigned> idx;   // key component -> position in `out`
+    split_set out;
+    for (auto const& p : pairs) {
+        expr* key   = by_left ? p.m_d.get() : p.m_n.get();
+        expr* other = by_left ? p.m_n.get() : p.m_d.get();
+        unsigned pos;
+        if (idx.find(key, pos)) {
+            expr* prev = by_left ? out[pos].m_n.get() : out[pos].m_d.get();
+            const expr_ref u(m_rw.mk_regex_union_normalize(prev, other), mm);
+            if (by_left)
+                out[pos].m_n = u;
+            else
+                out[pos].m_d = u;
+        }
+        else {
+            idx.insert(key, out.size());
+            out.push_back(p);
+        }
+    }
+    pairs.swap(out);
+}
+
+void seq_split::simplify(split_set& pairs) const {
+    seq_util::rex& r = re();
+
+    // 1. drop pairs with a bottom (empty-language) component.
+    unsigned w = 0;
+    for (unsigned i = 0; i < pairs.size(); ++i) {
+        if (r.is_empty(pairs[i].m_d) || r.is_empty(pairs[i].m_n))
+            continue;
+        if (w != i)
+            pairs[w] = pairs[i];
+        ++w;
+    }
+    pairs.shrink(w);
+    if (pairs.size() <= 1)
+        return;
+
+    // 2. same-D / same-N merge rules.
+    merge_by(pairs, true);
+    merge_by(pairs, false);
+    if (pairs.size() <= 1)
+        return;
+
+    // 3. subsumption: drop <D_i,N_i> when L(D_i) subseteq L(D_j) and
+    //    L(N_i) subseteq L(N_j) for some kept j.  seq_subset is conservative
+    //    (returns true only for definite containment), so we never drop a
+    //    needed split.
+    //if (pairs.size() > 64)
+    //    return;
+
+    struct row { expr* d; expr* n; unsigned idx; };
+    vector<row> rows;
+    for (unsigned i = 0; i < pairs.size(); ++i)
+        rows.push_back({ pairs[i].m_d.get(), pairs[i].m_n.get(), i });
+
+    auto subsumes = [&](row const& a, row const& b) {
+        return m_subset.is_subset(b.d, a.d) && m_subset.is_subset(b.n, a.n);
+    };
+
+    vector<row> kept;
+    for (row const& row_r : rows) {
+        bool redundant = false;
+        for (row const& k : kept)
+            if (subsumes(k, row_r)) { redundant = true; break; }
+        if (redundant)
+            continue;
+        // drop already-kept rows strictly subsumed by row_r
+        unsigned kw = 0;
+        for (unsigned t = 0; t < kept.size(); ++t) {
+            if (subsumes(row_r, kept[t]))
+                continue;
+            kept[kw++] = kept[t];
+        }
+        kept.shrink(kw);
+        kept.push_back(row_r);
+    }
+
+    split_set result;
+    for (row const& k : kept)
+        result.push_back(pairs[k.idx]);
+    pairs.swap(result);
+}
+
+std::pair<expr*, expr*> seq_split::split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const {
+    ast_manager& m = this->m();
+    expr_ref_vector tokens(m);
+
+    vector<expr*> stack;
+    stack.push_back(str);
+
+    while (!stack.empty()) {
+        expr* cur = stack.back();
+        stack.pop_back();
+        expr* l, *r;
+        if (seq().str.is_concat(cur, l, r)) {
+            stack.push_back(r);
+            stack.push_back(l);
+        }
+        else
+            tokens.push_back(expr_ref(cur, m));
+    }
+
+    SASSERT(!tokens.empty());
+    expr* first = tokens.get(0);
+    SASSERT(seq().is_char(first)); // constants are consumed earlier
+
+    // Choose the factorization boundary so the tail starts with the
+    // longest run of concrete characters c.
+    // This gives the split-engine lookahead oracle the most pruning information.
+    // head = u' (tokens before the run), tail = c · u''' (tokens from the run onward).
+    const unsigned total = tokens.size();
+    unsigned run_start = 0, run_len = 0;
+    for (unsigned i = 0; i < total; ) {
+        if (!seq().is_char(tokens.get(i))) {
+            i++;
+            continue;
+        }
+        unsigned j = i;
+        while (j < total && seq().is_char(tokens.get(j))) {
+            j++;
+        }
+        if (j - i > run_len) {
+            run_len = j - i;
+            run_start = i;
+        }
+        i = j;
+    }
+    // No constant run => fall back to splitting off the first token.
+    const unsigned p = run_len == 0 ? 1 : run_start;
+    SASSERT(p >= 1);
+    expr* head = first;
+    for (unsigned i = 1; i < p; i++) {
+        head = seq().str.mk_concat(head, tokens.get(i));
+    }
+    expr* tail = tokens.get(p);
+    for (unsigned i = p + run_len; i < tokens.size(); i++) {
+        tail = seq().str.mk_concat(tail, tokens.get(i));
+    }
+    SASSERT(head && tail);
+
+    // Build the constant lookahead c and (if non-empty) an oracle that
+    // prunes splits whose postfix cannot match c.
+    zstring c;
+    for (unsigned i = 0; i < run_len; ++i) {
+        expr* ch; unsigned cv;
+        VERIFY(seq().str.is_unit(tokens.get(run_start + i), ch));
+        VERIFY(seq().is_const_char(ch, cv));
+        c = c + zstring(cv);
+    }
+    split_oracle oracle;
+    if (!c.empty())
+        oracle = [this, &c](expr*, expr* n) { return split_lookahead_viable(n, c); };
+
+    // Decompose the regex into a split-set via the shared seq_split engine
+    if (!m_rw.split(regex, result, threshold, split_mode::strong, oracle)) {
+        result.clear();
+        return { nullptr, nullptr };
+    }
+
+    m_rw.simplify_split(result);
+
+    // Eagerly consume the constant run c from the tail by taking the c-derivative
+    // of each postfix
+    if (!c.empty()) {
+        unsigned w = 0;
+        for (unsigned i = 0; i < result.size(); ++i) {
+            expr* d = result[i].m_n;
+            for (unsigned k = 0; d && !seq().re.is_empty(d) && k < c.length(); ++k) {
+                d = m_rw.mk_derivative(seq().mk_char(c[k]), d);
+            }
+            SASSERT(d);
+            if (re().is_empty(d))
+                continue;   // postfix can't start with c => infeasible split, drop
+            result[w++] = split_pair(result[i].m_d, d, m);
+        }
+        result.shrink(w);
+    }
+
+    return { head, tail };
+}
+
+bool seq_split::split_lookahead_viable(expr* regex, zstring const& c) const {
+    SASSERT(regex);
+    for (unsigned i = 0; i < c.length(); i++) {
+        if (m().is_true(m_rw.is_nullable(regex)))
+            return true;            // N accepts the prefix c[0..i) => a suffix completes it
+        regex = m_rw.mk_derivative(seq().mk_char(c[i]), regex);
+        SASSERT(regex);
+        if (re().is_empty(regex))
+            return false;           // N went (syntactically) dead before reaching c
+    }
+    return !re().is_empty(regex);
+}
--- a/src/ast/rewriter/seq_split.h
+++ b/src/ast/rewriter/seq_split.h
@ -0,0 +1,118 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    seq_split.h
+
+Abstract:
+
+    Regex split decomposition: the split function sigma from the paper
+    "Solving by Splitting".  For a regular expression r, sigma(r) is a finite
+    "split-set" of pairs { <D_i, N_i> } such that
+
+        u.v in L(r)   iff   exists i:  u in L(D_i)  and  v in L(N_i).
+
+    The split algebra (intersection, De Morgan complement, left/right
+    concatenation with a regex) and the cardinality-reducing simplification
+    heuristics (drop bottom, same-D/same-N merge, subsumption via seq_subset)
+    follow the paper.
+
+Author:
+
+    Clemens Eisenhofer 2026-6-10
+
+--*/
+#pragma once
+
+#include "ast/seq_decl_plugin.h"
+#include "ast/rewriter/seq_subset.h"
+#include <functional>
+
+class seq_rewriter;
+
+// An individual split <D, N>: the left (prefix) regex D and right (suffix)
+// regex N.  u.v in L(r) for this split iff u in L(D) and v in L(N).
+struct split_pair {
+    expr_ref m_d;
+    expr_ref m_n;
+    split_pair(expr* d, expr* n, ast_manager& m) : m_d(d, m), m_n(n, m) {
+        SASSERT(d && n);
+    }
+};
+
+// A split-set is a union of individual splits.
+typedef vector<split_pair> split_set;
+
+// Controls how aggressively sigma expands the Boolean-closure cases:
+//   strong - fully expand complement / intersection via the split algebra
+//            (De Morgan / cross product).  This is the behaviour the nseq
+//            solver relies on.
+//   weak   - do not perform the (potentially 2^k) Boolean-closure expansion;
+//            give up (return false) on complement / intersection instead.
+enum class split_mode { weak, strong };
+
+// Optional lookahead oracle.  Called for each candidate split <D, N> as it is
+// generated; returns true to keep it, false to prune it.  An empty oracle (the
+// default) keeps everything, so sigma is unchanged.  See seq_split::compute.
+typedef std::function<bool(expr* D, expr* N)> split_oracle;
+
+class seq_split {
+    seq_rewriter& m_rw;       // for mk_re_append + manager / seq_util access
+    seq_subset    m_subset;   // language-subset checks for subsumption
+
+    ast_manager&   m() const;
+    seq_util&      seq() const;
+    seq_util::rex& re() const;
+
+    // Push <d, n> onto `out`, unless `oracle` rejects it.
+    void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const;
+
+    // S1 cap S2 = { <D1 cap D2, N1 cap N2> } dropping any pair with a bottom
+    // component (and any rejected by `oracle`).  Returns false on threshold overrun.
+    bool intersect(split_set const& s1, split_set const& s2, split_set& result,
+                   unsigned threshold, split_oracle const& oracle) const;
+
+    // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with
+    //   ~<D,N> = { <~D, .*>, <.*, ~N> }  and  ~{} = { <.*, .*> }.
+    bool complement(sort* seq_sort, split_set const& sp, split_set& result,
+                    unsigned threshold, split_oracle const& oracle) const;
+
+    // same-D / same-N merge: groups pairs that share a (syntactically identical)
+    // left (resp. right) component and unions the other component.
+    void merge_by(split_set& pairs, bool by_left) const;
+
+public:
+    explicit seq_split(seq_rewriter& rw);
+
+    // Compute sigma(r), appending to `out` (does not clear it).  `threshold`
+    // bounds the number of produced splits; an overrun, an unsupported regex
+    // shape (bounded loop / ite), or a Boolean-closure case in weak mode makes
+    // it return false ("give up").
+    //
+    // `oracle` (optional) prunes non-viable splits *during* generation.  It must
+    // be sound to apply at every generation step: a candidate N can still gain a
+    // prefix from a factor appended to its right later (concat/star), so the
+    // oracle must use a "prefix-compatible" test (prune only when N can never
+    // match the lookahead, even partially), NOT a strict "starts-with" test.
+    // The complement body is computed WITHOUT the oracle (inverted orientation);
+    // the oracle is re-applied to the complement's output fold.
+    bool compute(expr* r, split_set& out, unsigned threshold,
+                 split_mode mode = split_mode::strong, split_oracle const& oracle = {});
+
+    // In-place simplification of a split-set: drop bottom components, apply the
+    // same-D / same-N merge rules, and drop splits subsumed by another (using
+    // seq_subset).  Size-capped to keep the O(n^2) subsumption affordable.
+    void simplify(split_set& s) const;
+
+    // decompose a membership constraint into a set of pairs of regex splits
+    std::pair<expr*, expr*> split_membership(expr* str, expr* regex, unsigned threshold, split_set& result) const;
+
+    // Lookahead oracle for the split engine: is the split's right component
+    // `n_regex` prefix-compatible with the constant character sequence `c`?
+    // This is sound to apply during split generation — it never drops a viable split.
+    // Thus, it might not eliminate all cases in order to stay sound
+    bool split_lookahead_viable(expr* regex, zstring const& c) const;
+
+
+};