From 4b08c629c8598805b54f8f2cffb22a291ec88544 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 31 Mar 2026 02:51:21 +0000 Subject: [PATCH] port RegexCharSplitModifier: implement apply_regex_unit_split in seq_nielsen Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/5d8898c4-5b66-42b6-a05a-03de8d1f0a94 Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/seq_nielsen.cpp | 67 ++++++++++++++++++++++++++++++++++++- src/smt/seq/seq_nielsen.h | 1 + src/test/nseq_zipt.cpp | 64 +++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 113b0c093..c51febed1 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -2234,7 +2234,11 @@ namespace seq { if (apply_const_nielsen(node)) return ++m_stats.m_mod_const_nielsen, true; - // Priority 9: SignatureSplit - heuristic string equation splitting + // Priority 9: RegexUnitSplit - split str_mem c·s ∈ R by minterms of R + if (apply_regex_unit_split(node)) + return ++m_stats.m_mod_regex_unit_split, true; + + // Priority 9b: SignatureSplit - heuristic string equation splitting if (m_signature_split && apply_signature_split(node)) return ++m_stats.m_mod_signature_split, true; @@ -3112,6 +3116,66 @@ namespace seq { } return false; } + // ----------------------------------------------------------------------- + // Modifier: apply_regex_unit_split (RegexCharSplitModifier) + // For str_mem c·s ∈ R where c is a symbolic unit token (seq.unit(?c)), + // branch over minterms of R: for each minterm m_i with non-fail derivative, + // create a child that constrains ?c to the character class of m_i. + // Unlike apply_regex_var_split, no substitution and no epsilon branch. + // After the constraint is added, simplify_and_init will consume c + // deterministically via the uniform derivative check. + // mirrors ZIPT's RegexCharSplitModifier + // ----------------------------------------------------------------------- + + bool nielsen_graph::apply_regex_unit_split(nielsen_node* node) { + for (str_mem const& mem : node->str_mems()) { + SASSERT(mem.m_str && mem.m_regex); + if (mem.is_primitive()) + continue; + euf::snode* first = mem.m_str->first(); + if (!first || !first->is_unit()) + continue; + + // Compute minterms of the regex + euf::snode_vector minterms; + m_sg.compute_minterms(mem.m_regex, minterms); + VERIFY(!minterms.empty()); + + // Get the current char_range for this token, fall back to full + char_set const& existing = + node->char_ranges().contains(first->id()) + ? node->char_ranges()[first->id()] + : char_set::full(zstring::max_char()); + + bool created = false; + for (euf::snode* mt : minterms) { + SASSERT(mt && mt->get_expr()); + SASSERT(!mt->is_fail()); + + char_set mt_cs = m_seq_regex->minterm_to_char_set(mt->get_expr()); + // skip minterm if it doesn't overlap with the existing range + if (existing.intersect_with(mt_cs).is_empty()) + continue; + + // skip if the regex derivative is empty for this minterm + euf::snode* deriv = m_sg.brzozowski_deriv(mem.m_regex, mt); + SASSERT(deriv); + if (deriv->is_fail()) + continue; + + // create a child and narrow the char_range for this token + nielsen_node* child = mk_child(node); + mk_edge(node, child, false); + child->add_char_range(first, mt_cs); + created = true; + } + + if (created) + return true; + } + return false; + } + // ----------------------------------------------------------------------- // Modifier: apply_regex_var_split // For str_mem x·s ∈ R where x is a variable, split using minterms: @@ -4050,6 +4114,7 @@ namespace seq { st.update("nseq mod star intr", m_stats.m_mod_star_intr); st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr); st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen); + st.update("nseq mod regex unit", m_stats.m_mod_regex_unit_split); st.update("nseq mod signature split", m_stats.m_mod_signature_split); st.update("nseq mod regex var", m_stats.m_mod_regex_var_split); st.update("nseq mod power split", m_stats.m_mod_power_split); diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index cd53d4f2e..172fb00ee 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -186,6 +186,7 @@ Abstract: GPowerIntrModifier(7), ConstNielsenModifier(8), RegexCharSplitModifier(9), RegexVarSplitModifier(10), PowerSplitModifier(11), VarNielsenModifier(12), VarNumUnwindingModifier(13). + - Z3-specific heuristic inserted between (9) and (10): SignatureSplit. - NOT PORTED: DirectedNielsenModifier, DecomposeModifier, CombinedModifier. - NumCmp, ConstNumUnwinding, VarNumUnwinding are approximated (no PDD integer polynomial infrastructure; power tokens are replaced with ε diff --git a/src/test/nseq_zipt.cpp b/src/test/nseq_zipt.cpp index 6a80b9e8c..fca578dd5 100644 --- a/src/test/nseq_zipt.cpp +++ b/src/test/nseq_zipt.cpp @@ -684,11 +684,75 @@ static void test_tricky_str_equations() { std::cout << " ok\n"; } +// ----------------------------------------------------------------------- +// Symbolic unit token in regex membership (RegexCharSplitModifier) +// Verifies apply_regex_unit_split: when the first token of a str_mem is +// a seq.unit(?c) with no initial char_range, the modifier must branch +// over regex minterms and constrain ?c. +// ----------------------------------------------------------------------- +static void test_regex_unit_split() { + std::cout << "test_regex_unit_split\n"; + + // ?c · "hhh" · X ∈ hhhbbb|bhhh — SAT (?c='b', X="") + { + nseq_fixture f; + sort* cs = f.su.mk_char_sort(); + expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m); + expr_ref unit_e(f.su.str.mk_unit(fc), f.m); + euf::snode* u = f.sg.mk(unit_e); + euf::snode* X = f.sb.var('X'); + euf::snode* str = f.sg.mk_concat(u, + f.sg.mk_concat(f.sg.mk_char('h'), + f.sg.mk_concat(f.sg.mk_char('h'), + f.sg.mk_concat(f.sg.mk_char('h'), X)))); + euf::snode* re = f.rb.parse("hhhbbb|bhhh"); + f.ng.add_str_mem(str, re); + VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::sat); + } + + // ?c · "hhh" · X ∉ h(aa)* — UNSAT + // 'h'-branch: "hhh" · X ∈ (aa)*, but 'h' ≠ 'a' + // all other branches: D(h(aa)*, non-h) = fail + { + nseq_fixture f; + sort* cs = f.su.mk_char_sort(); + expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m); + expr_ref unit_e(f.su.str.mk_unit(fc), f.m); + euf::snode* u = f.sg.mk(unit_e); + euf::snode* X = f.sb.var('X'); + euf::snode* str = f.sg.mk_concat(u, + f.sg.mk_concat(f.sg.mk_char('h'), + f.sg.mk_concat(f.sg.mk_char('h'), + f.sg.mk_concat(f.sg.mk_char('h'), X)))); + euf::snode* re = f.rb.parse("h(aa)*"); + f.ng.add_str_mem(str, re); + VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::unsat); + } + + // ?c · X ∈ (a|b)* — SAT (unit split branches on {a,b,others}; + // 'a'- and 'b'-branches give X ∈ (a|b)*, trivially SAT) + { + nseq_fixture f; + sort* cs = f.su.mk_char_sort(); + expr_ref fc(f.m.mk_fresh_const("?c", cs), f.m); + expr_ref unit_e(f.su.str.mk_unit(fc), f.m); + euf::snode* u = f.sg.mk(unit_e); + euf::snode* X = f.sb.var('X'); + euf::snode* str = f.sg.mk_concat(u, X); + euf::snode* re = f.rb.parse("(a|b)*"); + f.ng.add_str_mem(str, re); + VERIFY(f.ng.solve() == seq::nielsen_graph::search_result::sat); + } + + std::cout << " ok\n"; +} + void tst_nseq_zipt() { test_zipt_str_equations(); test_tricky_str_equations(); test_zipt_regex_ground(); test_zipt_str_membership(); test_zipt_parikh(); + test_regex_unit_split(); std::cout << "nseq_zipt: all tests passed\n"; }