From ddd8bf84d7a866df44fef389ed198c3aeff0491e Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:27:06 -0700
Subject: [PATCH] Replace apply_regex_unit_split with apply_regex_if_split
 (#9210)

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/7df36402-f2c7-4de5-b626-3df14d4eee64

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
---
 src/smt/seq/seq_nielsen.cpp | 79 +++++++++++++------------------------
 src/smt/seq/seq_nielsen.h   |  9 ++---
 2 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp
index 04660cdd8..36d711399 100644
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@@ -2228,9 +2228,9 @@ namespace seq {
         if (apply_const_nielsen(node))
             return ++m_stats.m_mod_const_nielsen, true;
         
-        // Priority 9: RegexUnitSplit - split str_mem c·s ∈ R by minterms of R
-        if (apply_regex_unit_split(node))
-            return ++m_stats.m_mod_regex_unit_split, true;
+        // Priority 9: RegexIfSplit - split str_mem s ∈ ite(c,th,el) by branching on c
+        if (apply_regex_if_split(node))
+            return ++m_stats.m_mod_regex_if_split, true;
 
         // Priority 9b: SignatureSplit - heuristic string equation splitting
         if (m_signature_split && apply_signature_split(node))
@@ -3299,40 +3299,22 @@ namespace seq {
         return false;
     }
     
-    bool nielsen_graph::apply_regex_unit_split(nielsen_node *node) {
+    bool nielsen_graph::apply_regex_if_split(nielsen_node *node) {
         for (str_mem const &mem : node->str_mems()) {
             SASSERT(mem.m_str && mem.m_regex);
-            if (mem.is_primitive())
+
+            expr *r_expr = mem.m_regex->get_expr();
+            expr_ref c(m), th(m), el(m);
+            if (!bool_rewriter(m).decompose_ite(r_expr, c, th, el))
                 continue;
 
-            euf::snode *first = mem.m_str->first();
-            if (!first || !first->is_char_or_unit())
-                continue;
-
-            // Take derivative of R w.r.t. :var 0 (canonical, cached), then
-            // substitute inner_char for :var 0. Resulting ite conditions are
-            // on inner_char.
-            seq_rewriter rw(m);
-            expr_ref d(rw.mk_derivative(mem.m_regex->get_expr()), m);
-            if (!d)
-                continue;
-
-            // Extract the inner char expression from seq.unit(?inner)
-            expr *unit_expr = first->get_expr(), *inner_char;
-            VERIFY(m_seq.str.is_unit(unit_expr, inner_char));
-
-            var_subst vs(m);
-            d = vs(d, inner_char);
-
-
-            euf::snode *rest = m_sg.drop_first(mem.m_str);
             bool created = false;
 
-            // Worklist: (regex_expr, accumulated_char_set).
+            // Worklist: (regex_expr, accumulated_conditions).
             // Call decompose_ite in a loop until no more ite sub-expressions,
-            // branching on c=true and c=false and accumulating char constraints.
+            // branching on c=true and c=false and accumulating conditions.
             vector<std::pair<expr_ref, expr_ref_vector>> worklist;
-            worklist.push_back({d, expr_ref_vector(m)});
+            worklist.push_back({expr_ref(r_expr, m), expr_ref_vector(m)});
 
             while (!worklist.empty()) {
                 auto [r, cs] = std::move(worklist.back());
@@ -3341,50 +3323,45 @@ namespace seq {
                 if (m_seq.re.is_empty(r))
                     continue;
 
-                expr_ref c(m), th(m), el(m);
-                if (!bool_rewriter(m).decompose_ite(r, c, th, el)) {
-                    // No ite remaining: leaf → create child node
-                    euf::snode *deriv_snode = m_sg.mk(r);
+                expr_ref c2(m), th2(m), el2(m);
+                if (!bool_rewriter(m).decompose_ite(r, c2, th2, el2)) {
+                    // No ite remaining: leaf → create child node with regex updated to r
+                    euf::snode *new_regex_snode = m_sg.mk(r);
                     nielsen_node *child = mk_child(node);
                     for (auto f : cs)
                         child->add_constraint(constraint(f, mem.m_dep, m));
                     mk_edge(node, child, true);
                     for (str_mem &cm : child->str_mems())
                         if (cm.m_id == mem.m_id) {
-                            cm.m_str = rest;
-                            cm.m_regex = deriv_snode;
+                            cm.m_regex = new_regex_snode;
                             break;
                         }
                     created = true;
                     continue;
                 }
 
-                // Substitute inner_char into condition and simplify
                 th_rewriter tr(m);
-                expr_ref c_simp(c, m);
+                expr_ref c_simp(c2, m);
                 tr(c_simp);
 
                 if (m.is_true(c_simp)) {
-                    // Condition is always satisfied: only then-branch
-                    if (!m_seq.re.is_empty(th))
-                        worklist.push_back({th, std::move(cs)});
+                    if (!m_seq.re.is_empty(th2))
+                        worklist.push_back({th2, std::move(cs)});
                 }
                 else if (m.is_false(c_simp)) {
-                    // Condition is never satisfied: only else-branch
-                    if (!m_seq.re.is_empty(el))
-                        worklist.push_back({el, std::move(cs)});
+                    if (!m_seq.re.is_empty(el2))
+                        worklist.push_back({el2, std::move(cs)});
                 }
                 else {
-                    // Branch on c=true and c=false, accumulate char constraints
-                    if (!m_seq.re.is_empty(th)) {
+                    if (!m_seq.re.is_empty(th2)) {
                         expr_ref_vector cs_true(cs);
-                        cs_true.push_back(c);
-                        worklist.push_back({th, std::move(cs_true)});
+                        cs_true.push_back(c2);
+                        worklist.push_back({th2, std::move(cs_true)});
                     }
-                    if (!m_seq.re.is_empty(el)) {
+                    if (!m_seq.re.is_empty(el2)) {
                         expr_ref_vector cs_false(cs);
-                        cs_false.push_back(mk_not(c));
-                        worklist.push_back({el, std::move(cs_false)});
+                        cs_false.push_back(mk_not(c2));
+                        worklist.push_back({el2, std::move(cs_false)});
                     }
                 }
             }
@@ -4342,7 +4319,7 @@ namespace seq {
         st.update("nseq mod const nielsen",    m_stats.m_mod_const_nielsen);
         st.update("nseq mod signature split",  m_stats.m_mod_signature_split);
         st.update("nseq mod regex var",        m_stats.m_mod_regex_var_split);
-        st.update("nseq mod regex unit",       m_stats.m_mod_regex_unit_split);
+        st.update("nseq mod regex if",         m_stats.m_mod_regex_if_split);
         st.update("nseq mod power split",      m_stats.m_mod_power_split);
         st.update("nseq mod var nielsen",      m_stats.m_mod_var_nielsen);
         st.update("nseq mod var num unwind (eq)",   m_stats.m_mod_var_num_unwinding_eq);
diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h
index c04345be9..2af940d7a 100644
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@@ -730,7 +730,7 @@ namespace seq {
         unsigned m_mod_num_cmp         = 0;
         unsigned m_mod_split_power_elim = 0;
         unsigned m_mod_const_num_unwinding = 0;
-        unsigned m_mod_regex_unit_split = 0;
+        unsigned m_mod_regex_if_split = 0;
         unsigned m_mod_eq_split        = 0;
         unsigned m_mod_star_intr       = 0;
         unsigned m_mod_gpower_intr     = 0;
@@ -1060,10 +1060,9 @@ namespace seq {
         // mirrors ZIPT's ConstNumUnwindingModifier
         bool apply_const_num_unwinding(nielsen_node* node);
 
-        // regex unit split: for str_mem c·s ∈ R where c is a symbolic unit,
-        // branch over regex minterms and constrain c via char_range.
-        // Unlike apply_regex_var_split, no substitution and no epsilon branch.
-        bool apply_regex_unit_split(nielsen_node* node);
+        // regex if split: for str_mem s ∈ R where R decomposes as ite(c, th, el),
+        // branch into s ∈ th under condition c, and s ∈ el under condition ¬c.
+        bool apply_regex_if_split(nielsen_node* node);
 
         // star introduction: for a str_mem x·s ∈ R where a cycle is detected
         // (backedge exists), introduce stabilizer: x ∈ base* with x split.