From a81ce477f5e38c0f9ca603b28b2d633ea47c0455 Mon Sep 17 00:00:00 2001 From: CEisenhofer Date: Thu, 2 Apr 2026 20:03:22 +0200 Subject: [PATCH] Added classical regex factorization --- src/ast/euf/euf_sgraph.cpp | 12 +- src/ast/euf/euf_snode.h | 10 +- src/params/smt_params.cpp | 4 + src/params/smt_params.h | 1 + src/params/smt_params_helper.pyg | 1 + src/smt/seq/seq_nielsen.cpp | 195 +++++++++++++++++++++++++++++++ src/smt/seq/seq_nielsen.h | 7 ++ src/smt/theory_nseq.cpp | 34 ++++++ 8 files changed, 258 insertions(+), 6 deletions(-) diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp index 63a59ea1b..36eb25dcb 100644 --- a/src/ast/euf/euf_sgraph.cpp +++ b/src/ast/euf/euf_sgraph.cpp @@ -154,6 +154,7 @@ namespace euf { n->m_ground = l->is_ground() && r->is_ground(); n->m_regex_free = l->is_regex_free() && r->is_regex_free(); n->m_nullable = l->is_nullable() && r->is_nullable(); + n->m_is_classical = l->is_classical() && r->is_classical(); n->m_level = std::max(l->level(), r->level()) + 1; n->m_length = l->length() + r->length(); ++m_stats.m_num_concat; @@ -163,13 +164,14 @@ namespace euf { case snode_kind::s_power: { // s^n: nullable follows base, consistent with ZIPT's PowerToken // the exponent n is assumed to be a symbolic integer, may or may not be zero - // NSB review: SASSERT(n->num_args() == 2); and simplify code + // NSB review: SASSERT(n->num_args() == 2); and simplify code // NSB review: is this the correct definition of ground what about the exponent? SASSERT(n->num_args() >= 1); snode* base = n->arg(0); n->m_ground = base->is_ground(); n->m_regex_free = base->is_regex_free(); n->m_nullable = base->is_nullable(); + n->m_is_classical = base->is_classical(); n->m_level = 1; n->m_length = 1; ++m_stats.m_num_power; @@ -181,6 +183,7 @@ namespace euf { n->m_ground = n->arg(0)->is_ground(); n->m_regex_free = false; n->m_nullable = true; + n->m_is_classical = n->arg(0)->is_classical(); n->m_level = 1; n->m_length = 1; break; @@ -189,6 +192,7 @@ namespace euf { n->m_ground = n->num_args() > 0 ? n->arg(0)->is_ground() : true; n->m_regex_free = false; // nullable iff lower bound is 0: r{0,n} accepts the empty string + n->m_is_classical = n->arg(0)->is_classical(); // default lo=1 (non-nullable) in case extraction fails unsigned lo = 1, hi = 1; expr* loop_body = nullptr; @@ -207,15 +211,17 @@ namespace euf { n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground(); n->m_regex_free = false; n->m_nullable = n->arg(0)->is_nullable() || n->arg(1)->is_nullable(); + n->m_is_classical = n->arg(0)->is_classical() && n->arg(1)->is_classical(); n->m_level = 1; n->m_length = 1; break; case snode_kind::s_intersect: SASSERT(n->num_args() == 2); - n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground(); + n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground(); n->m_regex_free = false; n->m_nullable = n->arg(0)->is_nullable() && n->arg(1)->is_nullable(); + n->m_is_classical = false; n->m_level = 1; n->m_length = 1; break; @@ -225,6 +231,7 @@ namespace euf { n->m_ground = n->arg(0)->is_ground(); n->m_regex_free = false; n->m_nullable = !n->arg(0)->is_nullable(); + n->m_is_classical = false; n->m_level = 1; n->m_length = 1; break; @@ -233,6 +240,7 @@ namespace euf { n->m_ground = true; n->m_regex_free = false; n->m_nullable = false; + n->m_is_classical = false; n->m_level = 1; n->m_length = 1; break; diff --git a/src/ast/euf/euf_snode.h b/src/ast/euf/euf_snode.h index b5f5b6a38..85bbbc4fe 100644 --- a/src/ast/euf/euf_snode.h +++ b/src/ast/euf/euf_snode.h @@ -63,10 +63,11 @@ namespace euf { unsigned m_num_args = 0; // metadata flags, analogous to ZIPT's Str/StrToken properties - bool m_ground = true; // no uninterpreted string variables - bool m_regex_free = true; // no regex constructs - bool m_nullable = false; // accepts the empty string - unsigned m_level = 0; // tree depth/level (0 for empty, 1 for singletons) + bool m_ground = true; // no uninterpreted string variables + bool m_regex_free = true; // no regex constructs + bool m_nullable = false; // accepts the empty string + bool m_is_classical = true; // classical regular expression + unsigned m_level = 0; // tree depth/level (0 for empty, 1 for singletons) unsigned m_length = 0; // token count, number of leaf tokens in the tree // hash matrix for associativity-respecting hashing (2x2 polynomial hash matrix) @@ -104,6 +105,7 @@ namespace euf { bool is_ground() const { return m_ground; } bool is_regex_free() const { return m_regex_free; } bool is_nullable() const { return m_nullable; } + bool is_classical() const { return m_is_classical; } unsigned level() const { return m_level; } unsigned length() const { return m_length; } diff --git a/src/params/smt_params.cpp b/src/params/smt_params.cpp index 7e148a061..a6a61c3c0 100644 --- a/src/params/smt_params.cpp +++ b/src/params/smt_params.cpp @@ -57,6 +57,7 @@ void smt_params::updt_local_params(params_ref const & _p) { m_nseq_max_nodes = p.nseq_max_nodes(); m_nseq_parikh = p.nseq_parikh(); m_nseq_regex_precheck = p.nseq_regex_precheck(); + m_nseq_regex_factorization = p.nseq_regex_factorization(); m_nseq_signature = p.nseq_signature(); m_up_persist_clauses = p.up_persist_clauses(); validate_string_solver(m_string_solver); @@ -169,6 +170,9 @@ void smt_params::display(std::ostream & out) const { DISPLAY_PARAM(m_lemmas2console); DISPLAY_PARAM(m_logic); DISPLAY_PARAM(m_string_solver); + DISPLAY_PARAM(m_nseq_parikh); + DISPLAY_PARAM(m_nseq_regex_precheck); + DISPLAY_PARAM(m_nseq_regex_factorization); DISPLAY_PARAM(m_profile_res_sub); DISPLAY_PARAM(m_display_bool_var2expr); diff --git a/src/params/smt_params.h b/src/params/smt_params.h index 14e03a114..da8daa405 100644 --- a/src/params/smt_params.h +++ b/src/params/smt_params.h @@ -252,6 +252,7 @@ struct smt_params : public preprocessor_params, unsigned m_nseq_max_nodes = 0; bool m_nseq_parikh = false; bool m_nseq_regex_precheck = true; + bool m_nseq_regex_factorization = true; bool m_nseq_signature = false; smt_params(params_ref const & p = params_ref()): diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg index 7cc9e679b..5b71e76d7 100644 --- a/src/params/smt_params_helper.pyg +++ b/src/params/smt_params_helper.pyg @@ -129,6 +129,7 @@ def_module_params(module_name='smt', ('nseq.max_nodes', UINT, 0, 'maximum number of DFS nodes explored by theory_nseq per solve() call (0 = unlimited)'), ('nseq.parikh', BOOL, False, 'enable Parikh image checks in nseq solver'), ('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'), + ('nseq.regex_factorization', BOOL, True, 'enable syntactic regex factorization in theory_nseq: decomposes Boolean closure of regular expressions into primitive membership constraints'), ('nseq.signature', BOOL, False, 'enable heuristic signature-based string equation splitting in Nielsen solver'), ('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'), ('seq.split_w_len', BOOL, True, 'enable splitting guided by length constraints'), diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 218c3789f..d1f1a27f0 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -2218,6 +2218,10 @@ namespace seq { if (apply_gpower_intr(node)) return ++m_stats.m_mod_gpower_intr, true; + // Priority 7b: Regex Factorization (Boolean Closure) + if (apply_regex_factorization(node)) + return ++m_stats.m_mod_regex_factorization, true; + // Priority 8: ConstNielsen - char vs var (2 children) if (apply_const_nielsen(node)) return ++m_stats.m_mod_const_nielsen, true; @@ -2810,6 +2814,196 @@ namespace seq { return false; } + // ----------------------------------------------------------------------- + // Modifier: apply_regex_factorization (Boolean Closure) + // ----------------------------------------------------------------------- + + struct tau_pair { + expr_ref m_p; + expr_ref m_q; + tau_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) { + SASSERT(p); + SASSERT(q); + } + }; + typedef vector tau_pairs; + + static void compute_tau(ast_manager& m, seq_util& seq, euf::sgraph& sg, expr* r, tau_pairs& result) { + SASSERT(r); + sort* str_sort = nullptr; + if (!seq.is_re(r, str_sort)) return; + expr *body = nullptr; + + if (seq.re.is_epsilon(r)) { + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, eps, m)); + } + else if (seq.str.is_unit(r) || seq.str.is_string(r) || seq.re.is_range(r) || seq.re.is_full_char(r) || + (seq.re.is_to_re(r) && seq.str.is_string(to_app(r)->get_arg(0)))) { + if (seq.re.is_to_re(r)) { + expr* arg = to_app(r)->get_arg(0); + zstring s; + if (seq.str.is_string(arg, s) && s.length() > 1) { + for (unsigned i = 0; i <= s.length(); ++i) { + expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m); + expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m); + result.push_back(tau_pair(p, q, m)); + } + return; + } + } + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, r, m)); + result.push_back(tau_pair(r, eps, m)); + } + else if (seq.re.is_empty(r)) { + // empty set has no splits + } + else if (seq.re.is_union(r)) { + for (expr* arg : *to_app(r)) { + compute_tau(m, seq, sg, arg, result); + } + } + else if (seq.re.is_concat(r)) { + unsigned num_args = to_app(r)->get_num_args(); + if (num_args == 0) { + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, eps, m)); + return; + } + for (unsigned i = 0; i < num_args; ++i) { + tau_pairs tau_arg; + compute_tau(m, seq, sg, to_app(r)->get_arg(i), tau_arg); + + expr_ref left(m); + expr_ref right(m); + + if (i == 0) left = seq.re.mk_epsilon(str_sort); + else { + expr_ref_vector left_args(m); + for (unsigned j = 0; j < i; ++j) left_args.push_back(to_app(r)->get_arg(j)); + if (left_args.size() == 1) left = left_args.get(0); + else left = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, left_args.size(), left_args.data()); + } + + if (i == num_args - 1) right = seq.re.mk_epsilon(str_sort); + else { + expr_ref_vector right_args(m); + for (unsigned j = i + 1; j < num_args; ++j) right_args.push_back(to_app(r)->get_arg(j)); + if (right_args.size() == 1) right = right_args.get(0); + else right = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, right_args.size(), right_args.data()); + } + + for (auto const& pair : tau_arg) { + expr_ref p(m), q(m); + if (seq.re.is_epsilon(left)) p = pair.m_p; + else if (seq.re.is_epsilon(pair.m_p)) p = left; + + if (seq.re.is_epsilon(right)) q = pair.m_q; + else if (seq.re.is_epsilon(pair.m_q)) q = right; + else q = seq.re.mk_concat(pair.m_q, right); + + result.push_back(tau_pair(p, q, m)); + } + } + } + else if (seq.re.is_star(r, body) || seq.re.is_plus(r, body)) { + if (seq.re.is_plus(r)) { + expr_ref star(seq.re.mk_star(body), m); + expr_ref concat(seq.re.mk_concat(body, star), m); + compute_tau(m, seq, sg, concat, result); + return; + } + + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, eps, m)); + + tau_pairs tau_body; + compute_tau(m, seq, sg, body, tau_body); + for (auto const& pair : tau_body) { + expr_ref p(m), q(m); + if (seq.re.is_epsilon(pair.m_p)) p = r; + else p = seq.re.mk_concat(r, pair.m_p); + + if (seq.re.is_epsilon(pair.m_q)) q = r; + else q = seq.re.mk_concat(pair.m_q, r); + + result.push_back(tau_pair(p, q, m)); + } + } + else if (seq.re.is_opt(r, body)) { + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, eps, m)); + compute_tau(m, seq, sg, body, result); + } + else { + expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(tau_pair(eps, r, m)); + result.push_back(tau_pair(r, eps, m)); + } + } + + bool nielsen_graph::apply_regex_factorization(nielsen_node* node) { + if (!m_regex_factorization) + return false; + + for (str_mem const& mem : node->str_mems()) { + SASSERT(mem.m_str && mem.m_regex); + + if (mem.is_primitive() || !mem.m_regex->is_classical()) + continue; + + euf::snode* first = mem.m_str->first(); + SASSERT(first); + euf::snode* tail = m_sg.drop_first(mem.m_str); + SASSERT(tail); + + tau_pairs pairs; + compute_tau(m, m_seq, m_sg, mem.m_regex->get_expr(), pairs); + + for (auto const& pair : pairs) { + euf::snode* sn_p = m_sg.mk(pair.m_p); + euf::snode* sn_q = m_sg.mk(pair.m_q); + + // Eagerly eliminate contradictory cases + // e.g. check intersection emptiness with max_states = 100 + if (m_seq_regex->is_empty_bfs(sn_p, 100) == l_true) + continue; + if (m_seq_regex->is_empty_bfs(sn_q, 100) == l_true) + continue; + + // Also check intersection with other primitive constraints on `first` + ptr_vector regexes_p; + regexes_p.push_back(sn_p); + for (auto const& prev_mem : node->str_mems()) { + if (prev_mem.m_str == first) + regexes_p.push_back(prev_mem.m_regex); + } + if (regexes_p.size() > 1 && m_seq_regex->check_intersection_emptiness(regexes_p, 100) == l_true) + continue; + + nielsen_node* child = mk_child(node); + mk_edge(node, child, true); + + // remove the original mem from child + auto& child_mems = child->str_mems(); + for (unsigned k = 0; k < child_mems.size(); ++k) { + if (child_mems[k].m_id == mem.m_id) { + child_mems[k] = child_mems.back(); + child_mems.pop_back(); + break; + } + } + + child->add_str_mem(str_mem(first, sn_p, mem.m_history, next_mem_id(), mem.m_dep)); + child->add_str_mem(str_mem(tail, sn_q, mem.m_history, next_mem_id(), mem.m_dep)); + } + + return true; + } + return false; + } + bool nielsen_graph::fire_gpower_intro( nielsen_node* node, str_eq const& eq, euf::snode* var, euf::snode_vector const& ground_prefix_orig, bool fwd) { @@ -4141,6 +4335,7 @@ namespace seq { st.update("nseq mod eq split", m_stats.m_mod_eq_split); st.update("nseq mod star intr", m_stats.m_mod_star_intr); st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr); + st.update("nseq mod regex fact", m_stats.m_mod_regex_factorization); st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen); st.update("nseq mod signature split", m_stats.m_mod_signature_split); st.update("nseq mod regex var", m_stats.m_mod_regex_var_split); diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index a17912f7c..c04345be9 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -734,6 +734,7 @@ namespace seq { unsigned m_mod_eq_split = 0; unsigned m_mod_star_intr = 0; unsigned m_mod_gpower_intr = 0; + unsigned m_mod_regex_factorization = 0; unsigned m_mod_const_nielsen = 0; unsigned m_mod_regex_var_split = 0; unsigned m_mod_signature_split = 0; @@ -763,6 +764,7 @@ namespace seq { unsigned m_max_nodes = 0; // 0 = unlimited bool m_parikh_enabled = true; bool m_signature_split = false; + bool m_regex_factorization = true; unsigned m_next_mem_id = 0; unsigned m_fresh_cnt = 0; nielsen_stats m_stats; @@ -886,6 +888,8 @@ namespace seq { void set_parikh_enabled(bool e) { m_parikh_enabled = e; } void set_signature_split(bool e) { m_signature_split = e; } + + void set_regex_factorization(bool e) { m_regex_factorization = e; } // generate next unique regex membership id unsigned next_mem_id() { return m_next_mem_id++; } @@ -1072,6 +1076,9 @@ namespace seq { // mirrors ZIPT's GPowerIntrModifier bool apply_gpower_intr(nielsen_node* node); + // generalized regex factorization (Boolean closure derivation rule) + bool apply_regex_factorization(nielsen_node* node); + // helper for apply_gpower_intr: fires the substitution. // `fwd=true` uses left-to-right decomposition; `fwd=false` mirrors ZIPT's // backward (right-to-left) direction. diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index e24554cf0..3fed67df9 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -390,6 +390,38 @@ namespace smt { expr* s_expr = mem.m_str->get_expr(); if (s_expr) ensure_length_var(s_expr); + + if (!get_fparams().m_nseq_regex_factorization) + return; + + // Boolean Closure Propagations + expr* re_expr = mem.m_regex->get_expr(); + if (m_seq.re.is_intersection(re_expr)) { + for (expr* arg : *to_app(re_expr)) { + expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m); + literal_vector lits; + lits.push_back(~mem.lit); + lits.push_back(mk_literal(in_r)); + ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); + } + } + else if (m_seq.re.is_union(re_expr)) { + literal_vector lits; + lits.push_back(~mem.lit); + for (expr* arg : *to_app(re_expr)) { + expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m); + lits.push_back(mk_literal(in_r)); + } + ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); + } + else if (m_seq.re.is_complement(re_expr)) { + expr* arg = to_app(re_expr)->get_arg(0); + expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m); + literal_vector lits; + lits.push_back(~mem.lit); + lits.push_back(~mk_literal(in_r)); + ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); + } } void theory_nseq::ensure_length_var(expr* e) { @@ -578,6 +610,7 @@ namespace smt { m_nielsen.set_max_nodes(get_fparams().m_nseq_max_nodes); m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh); m_nielsen.set_signature_split(get_fparams().m_nseq_signature); + m_nielsen.set_regex_factorization(get_fparams().m_nseq_regex_factorization); // Regex membership pre-check: before running DFS, check intersection // emptiness for each variable's regex constraints. This handles @@ -1221,6 +1254,7 @@ namespace smt { lbool result = m_regex.check_intersection_emptiness(regexes); if (result == l_true) { + // TODO: Incorporate that we might know the maximum length generated by a regex [in those cases, the gradients will never work] // It is empty. Try gradient. regexes.pop_back(); // Remove loop_l