From 4b08c629c8598805b54f8f2cffb22a291ec88544 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 31 Mar 2026 02:51:21 +0000
Subject: [PATCH] port RegexCharSplitModifier: implement apply_regex_unit_split
 in seq_nielsen

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/5d8898c4-5b66-42b6-a05a-03de8d1f0a94

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
---
 src/smt/seq/seq_nielsen.cpp | 67 ++++++++++++++++++++++++++++++++++++-
 src/smt/seq/seq_nielsen.h   |  1 +
 src/test/nseq_zipt.cpp      | 64 +++++++++++++++++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp
index 113b0c093..c51febed1 100644
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@@ -2234,7 +2234,11 @@ namespace seq {
         if (apply_const_nielsen(node))
             return ++m_stats.m_mod_const_nielsen, true;
 
-        // Priority 9: SignatureSplit - heuristic string equation splitting
+        // Priority 9: RegexUnitSplit - split str_mem c·s ∈ R by minterms of R
+        if (apply_regex_unit_split(node))
+            return ++m_stats.m_mod_regex_unit_split, true;
+
+        // Priority 9b: SignatureSplit - heuristic string equation splitting
         if (m_signature_split && apply_signature_split(node))
             return ++m_stats.m_mod_signature_split, true;
 
@@ -3112,6 +3116,66 @@ namespace seq {
         }
         return false;
     }
+    // -----------------------------------------------------------------------
+    // Modifier: apply_regex_unit_split (RegexCharSplitModifier)
+    // For str_mem c·s ∈ R where c is a symbolic unit token (seq.unit(?c)),
+    // branch over minterms of R: for each minterm m_i with non-fail derivative,
+    // create a child that constrains ?c to the character class of m_i.
+    // Unlike apply_regex_var_split, no substitution and no epsilon branch.
+    // After the constraint is added, simplify_and_init will consume c
+    // deterministically via the uniform derivative check.
+    // mirrors ZIPT's RegexCharSplitModifier
+    // -----------------------------------------------------------------------
+
+    bool nielsen_graph::apply_regex_unit_split(nielsen_node* node) {
+        for (str_mem const& mem : node->str_mems()) {
+            SASSERT(mem.m_str && mem.m_regex);
+            if (mem.is_primitive())
+                continue;
+            euf::snode* first = mem.m_str->first();
+            if (!first || !first->is_unit())
+                continue;
+
+            // Compute minterms of the regex
+            euf::snode_vector minterms;
+            m_sg.compute_minterms(mem.m_regex, minterms);
+            VERIFY(!minterms.empty());
+
+            // Get the current char_range for this token, fall back to full
+            char_set const& existing =
+                node->char_ranges().contains(first->id())
+                ? node->char_ranges()[first->id()]
+                : char_set::full(zstring::max_char());
+
+            bool created = false;
+            for (euf::snode* mt : minterms) {
+                SASSERT(mt && mt->get_expr());
+                SASSERT(!mt->is_fail());
+
+                char_set mt_cs = m_seq_regex->minterm_to_char_set(mt->get_expr());
+                // skip minterm if it doesn't overlap with the existing range
+                if (existing.intersect_with(mt_cs).is_empty())
+                    continue;
+
+                // skip if the regex derivative is empty for this minterm
+                euf::snode* deriv = m_sg.brzozowski_deriv(mem.m_regex, mt);
+                SASSERT(deriv);
+                if (deriv->is_fail())
+                    continue;
+
+                // create a child and narrow the char_range for this token
+                nielsen_node* child = mk_child(node);
+                mk_edge(node, child, false);
+                child->add_char_range(first, mt_cs);
+                created = true;
+            }
+
+            if (created)
+                return true;
+        }
+        return false;
+    }
+
     // -----------------------------------------------------------------------
     // Modifier: apply_regex_var_split
     // For str_mem x·s ∈ R where x is a variable, split using minterms:
@@ -4050,6 +4114,7 @@ namespace seq {
         st.update("nseq mod star intr",        m_stats.m_mod_star_intr);
         st.update("nseq mod gpower intr",      m_stats.m_mod_gpower_intr);
         st.update("nseq mod const nielsen",    m_stats.m_mod_const_nielsen);
+        st.update("nseq mod regex unit",       m_stats.m_mod_regex_unit_split);
         st.update("nseq mod signature split",  m_stats.m_mod_signature_split);
         st.update("nseq mod regex var",        m_stats.m_mod_regex_var_split);
         st.update("nseq mod power split",      m_stats.m_mod_power_split);
diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h
index cd53d4f2e..172fb00ee 100644
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@@ -186,6 +186,7 @@ Abstract:
       GPowerIntrModifier(7), ConstNielsenModifier(8), RegexCharSplitModifier(9),
       RegexVarSplitModifier(10), PowerSplitModifier(11), VarNielsenModifier(12),
       VarNumUnwindingModifier(13).
+    - Z3-specific heuristic inserted between (9) and (10): SignatureSplit.
     - NOT PORTED: DirectedNielsenModifier, DecomposeModifier, CombinedModifier.
     - NumCmp, ConstNumUnwinding, VarNumUnwinding are approximated (no PDD
       integer polynomial infrastructure; power tokens are replaced with ε
diff --git a/src/test/nseq_zipt.cpp b/src/test/nseq_zipt.cpp
index 6a80b9e8c..fca578dd5 100644
--- a/src/test/nseq_zipt.cpp
+++ b/src/test/nseq_zipt.cpp
@@ -684,11 +684,75 @@ static void test_tricky_str_equations() {
     std::cout << "  ok\n";
 }
 
+// -----------------------------------------------------------------------
+// Symbolic unit token in regex membership (RegexCharSplitModifier)
+// Verifies apply_regex_unit_split: when the first token of a str_mem is
+// a seq.unit(?c) with no initial char_range, the modifier must branch
+// over regex minterms and constrain ?c.
+// -----------------------------------------------------------------------
+static void test_regex_unit_split() {
+    std::cout << "test_regex_unit_split\n";
+
+    // ?c · "hhh" · X ∈ hhhbbb|bhhh — SAT (?c='b', X="")
+    {
+        nseq_fixture f;
+        sort* cs = f.su.mk_char_sort();
+        expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m);
+        expr_ref unit_e(f.su.str.mk_unit(fc), f.m);
+        euf::snode* u = f.sg.mk(unit_e);
+        euf::snode* X = f.sb.var('X');
+        euf::snode* str = f.sg.mk_concat(u,
+                          f.sg.mk_concat(f.sg.mk_char('h'),
+                          f.sg.mk_concat(f.sg.mk_char('h'),
+                          f.sg.mk_concat(f.sg.mk_char('h'), X))));
+        euf::snode* re = f.rb.parse("hhhbbb|bhhh");
+        f.ng.add_str_mem(str, re);
+        VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::sat);
+    }
+
+    // ?c · "hhh" · X ∉ h(aa)* — UNSAT
+    // 'h'-branch: "hhh" · X ∈ (aa)*, but 'h' ≠ 'a'
+    // all other branches: D(h(aa)*, non-h) = fail
+    {
+        nseq_fixture f;
+        sort* cs = f.su.mk_char_sort();
+        expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m);
+        expr_ref unit_e(f.su.str.mk_unit(fc), f.m);
+        euf::snode* u = f.sg.mk(unit_e);
+        euf::snode* X = f.sb.var('X');
+        euf::snode* str = f.sg.mk_concat(u,
+                          f.sg.mk_concat(f.sg.mk_char('h'),
+                          f.sg.mk_concat(f.sg.mk_char('h'),
+                          f.sg.mk_concat(f.sg.mk_char('h'), X))));
+        euf::snode* re = f.rb.parse("h(aa)*");
+        f.ng.add_str_mem(str, re);
+        VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::unsat);
+    }
+
+    // ?c · X ∈ (a|b)* — SAT (unit split branches on {a,b,others};
+    // 'a'- and 'b'-branches give X ∈ (a|b)*, trivially SAT)
+    {
+        nseq_fixture f;
+        sort* cs = f.su.mk_char_sort();
+        expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m);
+        expr_ref unit_e(f.su.str.mk_unit(fc), f.m);
+        euf::snode* u = f.sg.mk(unit_e);
+        euf::snode* X = f.sb.var('X');
+        euf::snode* str = f.sg.mk_concat(u, X);
+        euf::snode* re = f.rb.parse("(a|b)*");
+        f.ng.add_str_mem(str, re);
+        VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::sat);
+    }
+
+    std::cout << "  ok\n";
+}
+
 void tst_nseq_zipt() {
     test_zipt_str_equations();
     test_tricky_str_equations();
     test_zipt_regex_ground();
     test_zipt_str_membership();
     test_zipt_parikh();
+    test_regex_unit_split();
     std::cout << "nseq_zipt: all tests passed\n";
 }