From a81ce477f5e38c0f9ca603b28b2d633ea47c0455 Mon Sep 17 00:00:00 2001
From: CEisenhofer <clemens.eisenhofer@tuwien.ac.at>
Date: Thu, 2 Apr 2026 20:03:22 +0200
Subject: [PATCH] Added classical regex factorization

---
 src/ast/euf/euf_sgraph.cpp       |  12 +-
 src/ast/euf/euf_snode.h          |  10 +-
 src/params/smt_params.cpp        |   4 +
 src/params/smt_params.h          |   1 +
 src/params/smt_params_helper.pyg |   1 +
 src/smt/seq/seq_nielsen.cpp      | 195 +++++++++++++++++++++++++++++++
 src/smt/seq/seq_nielsen.h        |   7 ++
 src/smt/theory_nseq.cpp          |  34 ++++++
 8 files changed, 258 insertions(+), 6 deletions(-)

diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp
index 63a59ea1b..36eb25dcb 100644
--- a/src/ast/euf/euf_sgraph.cpp
+++ b/src/ast/euf/euf_sgraph.cpp
@@ -154,6 +154,7 @@ namespace euf {
             n->m_ground = l->is_ground() && r->is_ground();
             n->m_regex_free = l->is_regex_free() && r->is_regex_free();
             n->m_nullable = l->is_nullable() && r->is_nullable();
+            n->m_is_classical = l->is_classical() && r->is_classical();
             n->m_level = std::max(l->level(), r->level()) + 1;
             n->m_length = l->length() + r->length();
             ++m_stats.m_num_concat;
@@ -163,13 +164,14 @@ namespace euf {
         case snode_kind::s_power: {
             // s^n: nullable follows base, consistent with ZIPT's PowerToken
             // the exponent n is assumed to be a symbolic integer, may or may not be zero
-            // NSB review: SASSERT(n->num_args() == 2); and simplify code
+            // NSB review: SASSERT(n->num_args() == 2); and simplify code       
             // NSB review: is this the correct definition of ground what about the exponent?
             SASSERT(n->num_args() >= 1);
             snode* base = n->arg(0);
             n->m_ground = base->is_ground();
             n->m_regex_free = base->is_regex_free();
             n->m_nullable = base->is_nullable();
+            n->m_is_classical = base->is_classical();
             n->m_level = 1;
             n->m_length = 1;
             ++m_stats.m_num_power;
@@ -181,6 +183,7 @@ namespace euf {
             n->m_ground = n->arg(0)->is_ground();
             n->m_regex_free = false;
             n->m_nullable = true;
+            n->m_is_classical = n->arg(0)->is_classical();
             n->m_level = 1;
             n->m_length = 1;
             break;
@@ -189,6 +192,7 @@ namespace euf {
             n->m_ground = n->num_args() > 0 ? n->arg(0)->is_ground() : true;
             n->m_regex_free = false;
             // nullable iff lower bound is 0: r{0,n} accepts the empty string
+            n->m_is_classical = n->arg(0)->is_classical();
             // default lo=1 (non-nullable) in case extraction fails
             unsigned lo = 1, hi = 1;
             expr* loop_body = nullptr;
@@ -207,15 +211,17 @@ namespace euf {
             n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
             n->m_regex_free = false;
             n->m_nullable = n->arg(0)->is_nullable() || n->arg(1)->is_nullable();
+            n->m_is_classical = n->arg(0)->is_classical() && n->arg(1)->is_classical();
             n->m_level = 1;
             n->m_length = 1;
             break;
 
         case snode_kind::s_intersect:
             SASSERT(n->num_args() == 2);
-            n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
+            n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();     
             n->m_regex_free = false;
             n->m_nullable = n->arg(0)->is_nullable() && n->arg(1)->is_nullable();
+            n->m_is_classical = false;
             n->m_level = 1;
             n->m_length = 1;
             break;
@@ -225,6 +231,7 @@ namespace euf {
             n->m_ground = n->arg(0)->is_ground();
             n->m_regex_free = false;
             n->m_nullable = !n->arg(0)->is_nullable();
+            n->m_is_classical = false;
             n->m_level = 1;
             n->m_length = 1;
             break;
@@ -233,6 +240,7 @@ namespace euf {
             n->m_ground = true;
             n->m_regex_free = false;
             n->m_nullable = false;
+            n->m_is_classical = false;
             n->m_level = 1;
             n->m_length = 1;
             break;
diff --git a/src/ast/euf/euf_snode.h b/src/ast/euf/euf_snode.h
index b5f5b6a38..85bbbc4fe 100644
--- a/src/ast/euf/euf_snode.h
+++ b/src/ast/euf/euf_snode.h
@@ -63,10 +63,11 @@ namespace euf {
         unsigned    m_num_args  = 0;
 
         // metadata flags, analogous to ZIPT's Str/StrToken properties
-        bool        m_ground     = true;   // no uninterpreted string variables
-        bool        m_regex_free = true;   // no regex constructs
-        bool        m_nullable   = false;  // accepts the empty string
-        unsigned    m_level      = 0;      // tree depth/level (0 for empty, 1 for singletons)
+        bool        m_ground       = true;   // no uninterpreted string variables
+        bool        m_regex_free   = true;   // no regex constructs
+        bool        m_nullable     = false;  // accepts the empty string
+        bool        m_is_classical = true;   // classical regular expression
+        unsigned    m_level        = 0;      // tree depth/level (0 for empty, 1 for singletons)
         unsigned    m_length     = 0;      // token count, number of leaf tokens in the tree
 
         // hash matrix for associativity-respecting hashing (2x2 polynomial hash matrix)
@@ -104,6 +105,7 @@ namespace euf {
         bool is_ground()     const { return m_ground; }
         bool is_regex_free() const { return m_regex_free; }
         bool is_nullable()   const { return m_nullable; }
+        bool is_classical()  const { return m_is_classical; }
         unsigned level()     const { return m_level; }
         unsigned length()    const { return m_length; }
 
diff --git a/src/params/smt_params.cpp b/src/params/smt_params.cpp
index 7e148a061..a6a61c3c0 100644
--- a/src/params/smt_params.cpp
+++ b/src/params/smt_params.cpp
@@ -57,6 +57,7 @@ void smt_params::updt_local_params(params_ref const & _p) {
     m_nseq_max_nodes = p.nseq_max_nodes();
     m_nseq_parikh = p.nseq_parikh();
     m_nseq_regex_precheck = p.nseq_regex_precheck();
+    m_nseq_regex_factorization = p.nseq_regex_factorization();
     m_nseq_signature = p.nseq_signature();
     m_up_persist_clauses = p.up_persist_clauses();
     validate_string_solver(m_string_solver);
@@ -169,6 +170,9 @@ void smt_params::display(std::ostream & out) const {
     DISPLAY_PARAM(m_lemmas2console);
     DISPLAY_PARAM(m_logic);
     DISPLAY_PARAM(m_string_solver);
+    DISPLAY_PARAM(m_nseq_parikh);
+    DISPLAY_PARAM(m_nseq_regex_precheck);
+    DISPLAY_PARAM(m_nseq_regex_factorization);
 
     DISPLAY_PARAM(m_profile_res_sub);
     DISPLAY_PARAM(m_display_bool_var2expr);
diff --git a/src/params/smt_params.h b/src/params/smt_params.h
index 14e03a114..da8daa405 100644
--- a/src/params/smt_params.h
+++ b/src/params/smt_params.h
@@ -252,6 +252,7 @@ struct smt_params : public preprocessor_params,
     unsigned m_nseq_max_nodes = 0;
     bool m_nseq_parikh = false;
     bool m_nseq_regex_precheck = true;
+    bool m_nseq_regex_factorization = true;
     bool m_nseq_signature = false;
 
     smt_params(params_ref const & p = params_ref()):
diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg
index 7cc9e679b..5b71e76d7 100644
--- a/src/params/smt_params_helper.pyg
+++ b/src/params/smt_params_helper.pyg
@@ -129,6 +129,7 @@ def_module_params(module_name='smt',
                           ('nseq.max_nodes', UINT, 0, 'maximum number of DFS nodes explored by theory_nseq per solve() call (0 = unlimited)'),
                           ('nseq.parikh', BOOL, False, 'enable Parikh image checks in nseq solver'),
                           ('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'),
+                          ('nseq.regex_factorization', BOOL, True, 'enable syntactic regex factorization in theory_nseq: decomposes Boolean closure of regular expressions into primitive membership constraints'),
                           ('nseq.signature', BOOL, False, 'enable heuristic signature-based string equation splitting in Nielsen solver'),
                           ('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'),
                           ('seq.split_w_len', BOOL, True, 'enable splitting guided by length constraints'),
diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp
index 218c3789f..d1f1a27f0 100644
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@@ -2218,6 +2218,10 @@ namespace seq {
         if (apply_gpower_intr(node))
             return ++m_stats.m_mod_gpower_intr, true;
 
+        // Priority 7b: Regex Factorization (Boolean Closure)
+        if (apply_regex_factorization(node))
+            return ++m_stats.m_mod_regex_factorization, true;
+
         // Priority 8: ConstNielsen - char vs var (2 children)
         if (apply_const_nielsen(node))
             return ++m_stats.m_mod_const_nielsen, true;
@@ -2810,6 +2814,196 @@ namespace seq {
         return false;
     }
 
+    // -----------------------------------------------------------------------
+    // Modifier: apply_regex_factorization (Boolean Closure)
+    // -----------------------------------------------------------------------
+
+    struct tau_pair {
+        expr_ref m_p;
+        expr_ref m_q;
+        tau_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) {
+            SASSERT(p);
+            SASSERT(q);
+        }
+    };
+    typedef vector<tau_pair> tau_pairs;
+
+    static void compute_tau(ast_manager& m, seq_util& seq, euf::sgraph& sg, expr* r, tau_pairs& result) {
+        SASSERT(r);
+        sort* str_sort = nullptr;
+        if (!seq.is_re(r, str_sort)) return;
+        expr *body = nullptr;
+
+        if (seq.re.is_epsilon(r)) {
+            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+            result.push_back(tau_pair(eps, eps, m));
+        }
+        else if (seq.str.is_unit(r) || seq.str.is_string(r) || seq.re.is_range(r) || seq.re.is_full_char(r) ||
+                 (seq.re.is_to_re(r) && seq.str.is_string(to_app(r)->get_arg(0)))) {
+            if (seq.re.is_to_re(r)) {
+                expr* arg = to_app(r)->get_arg(0);
+                zstring s;
+                if (seq.str.is_string(arg, s) && s.length() > 1) {
+                    for (unsigned i = 0; i <= s.length(); ++i) {
+                        expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m);
+                        expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m);
+                        result.push_back(tau_pair(p, q, m));
+                    }
+                    return;
+                }
+            }
+            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+            result.push_back(tau_pair(eps, r, m));
+            result.push_back(tau_pair(r, eps, m));
+        }
+        else if (seq.re.is_empty(r)) {
+            // empty set has no splits
+        }
+        else if (seq.re.is_union(r)) {
+            for (expr* arg : *to_app(r)) {
+                compute_tau(m, seq, sg, arg, result);
+            }
+        }
+        else if (seq.re.is_concat(r)) {
+            unsigned num_args = to_app(r)->get_num_args();
+            if (num_args == 0) {
+                expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+                result.push_back(tau_pair(eps, eps, m));
+                return;
+            }
+            for (unsigned i = 0; i < num_args; ++i) {
+                tau_pairs tau_arg;
+                compute_tau(m, seq, sg, to_app(r)->get_arg(i), tau_arg);
+                
+                expr_ref left(m);
+                expr_ref right(m);
+                
+                if (i == 0) left = seq.re.mk_epsilon(str_sort);
+                else {
+                    expr_ref_vector left_args(m);
+                    for (unsigned j = 0; j < i; ++j) left_args.push_back(to_app(r)->get_arg(j));
+                    if (left_args.size() == 1) left = left_args.get(0);
+                    else left = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, left_args.size(), left_args.data());
+                }
+
+                if (i == num_args - 1) right = seq.re.mk_epsilon(str_sort);     
+                else {
+                    expr_ref_vector right_args(m);
+                    for (unsigned j = i + 1; j < num_args; ++j) right_args.push_back(to_app(r)->get_arg(j));
+                    if (right_args.size() == 1) right = right_args.get(0);
+                    else right = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, right_args.size(), right_args.data());
+                }
+
+                for (auto const& pair : tau_arg) {
+                    expr_ref p(m), q(m);
+                    if (seq.re.is_epsilon(left)) p = pair.m_p;
+                    else if (seq.re.is_epsilon(pair.m_p)) p = left;
+                    
+                    if (seq.re.is_epsilon(right)) q = pair.m_q;
+                    else if (seq.re.is_epsilon(pair.m_q)) q = right;
+                    else q = seq.re.mk_concat(pair.m_q, right);
+                    
+                    result.push_back(tau_pair(p, q, m));
+                }
+            }
+        }
+        else if (seq.re.is_star(r, body) || seq.re.is_plus(r, body)) {
+            if (seq.re.is_plus(r)) {
+                expr_ref star(seq.re.mk_star(body), m);
+                expr_ref concat(seq.re.mk_concat(body, star), m);
+                compute_tau(m, seq, sg, concat, result);
+                return;
+            }
+            
+            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+            result.push_back(tau_pair(eps, eps, m));
+            
+            tau_pairs tau_body;
+            compute_tau(m, seq, sg, body, tau_body);
+            for (auto const& pair : tau_body) {
+                expr_ref p(m), q(m);
+                if (seq.re.is_epsilon(pair.m_p)) p = r;
+                else p = seq.re.mk_concat(r, pair.m_p); 
+                
+                if (seq.re.is_epsilon(pair.m_q)) q = r;
+                else q = seq.re.mk_concat(pair.m_q, r); 
+                
+                result.push_back(tau_pair(p, q, m));
+            }
+        }
+        else if (seq.re.is_opt(r, body)) {
+            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+            result.push_back(tau_pair(eps, eps, m));
+            compute_tau(m, seq, sg, body, result);
+        }
+        else {
+            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
+            result.push_back(tau_pair(eps, r, m));
+            result.push_back(tau_pair(r, eps, m));
+        }
+    }
+
+    bool nielsen_graph::apply_regex_factorization(nielsen_node* node) {
+        if (!m_regex_factorization)
+            return false;
+
+        for (str_mem const& mem : node->str_mems()) {
+            SASSERT(mem.m_str && mem.m_regex);
+            
+            if (mem.is_primitive() || !mem.m_regex->is_classical())
+                continue;
+
+            euf::snode* first = mem.m_str->first();
+            SASSERT(first);
+            euf::snode* tail = m_sg.drop_first(mem.m_str);
+            SASSERT(tail);
+
+            tau_pairs pairs;
+            compute_tau(m, m_seq, m_sg, mem.m_regex->get_expr(), pairs);
+
+            for (auto const& pair : pairs) {
+                euf::snode* sn_p = m_sg.mk(pair.m_p);
+                euf::snode* sn_q = m_sg.mk(pair.m_q);
+                
+                // Eagerly eliminate contradictory cases
+                // e.g. check intersection emptiness with max_states = 100
+                if (m_seq_regex->is_empty_bfs(sn_p, 100) == l_true)
+                    continue;
+                if (m_seq_regex->is_empty_bfs(sn_q, 100) == l_true)
+                    continue;
+                
+                // Also check intersection with other primitive constraints on `first`
+                ptr_vector<euf::snode> regexes_p;
+                regexes_p.push_back(sn_p);
+                for (auto const& prev_mem : node->str_mems()) {
+                    if (prev_mem.m_str == first)
+                        regexes_p.push_back(prev_mem.m_regex);
+                }
+                if (regexes_p.size() > 1 && m_seq_regex->check_intersection_emptiness(regexes_p, 100) == l_true)
+                    continue;
+
+                nielsen_node* child = mk_child(node);
+                mk_edge(node, child, true);
+                
+                // remove the original mem from child
+                auto& child_mems = child->str_mems();
+                for (unsigned k = 0; k < child_mems.size(); ++k) {
+                    if (child_mems[k].m_id == mem.m_id) {
+                        child_mems[k] = child_mems.back();
+                        child_mems.pop_back();
+                        break;
+                    }
+                }
+                
+                child->add_str_mem(str_mem(first, sn_p, mem.m_history, next_mem_id(), mem.m_dep));
+                child->add_str_mem(str_mem(tail, sn_q, mem.m_history, next_mem_id(), mem.m_dep));
+            }
+            
+            return true;
+        }
+        return false;
+    }
+
     bool nielsen_graph::fire_gpower_intro(
             nielsen_node* node, str_eq const& eq,
             euf::snode* var, euf::snode_vector const& ground_prefix_orig, bool fwd) {
@@ -4141,6 +4335,7 @@ namespace seq {
         st.update("nseq mod eq split",         m_stats.m_mod_eq_split);
         st.update("nseq mod star intr",        m_stats.m_mod_star_intr);
         st.update("nseq mod gpower intr",      m_stats.m_mod_gpower_intr);
+        st.update("nseq mod regex fact",       m_stats.m_mod_regex_factorization);
         st.update("nseq mod const nielsen",    m_stats.m_mod_const_nielsen);
         st.update("nseq mod signature split",  m_stats.m_mod_signature_split);
         st.update("nseq mod regex var",        m_stats.m_mod_regex_var_split);
diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h
index a17912f7c..c04345be9 100644
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@@ -734,6 +734,7 @@ namespace seq {
         unsigned m_mod_eq_split        = 0;
         unsigned m_mod_star_intr       = 0;
         unsigned m_mod_gpower_intr     = 0;
+        unsigned m_mod_regex_factorization = 0;
         unsigned m_mod_const_nielsen   = 0;
         unsigned m_mod_regex_var_split = 0;
         unsigned m_mod_signature_split = 0;
@@ -763,6 +764,7 @@ namespace seq {
         unsigned                      m_max_nodes = 0;          // 0 = unlimited
         bool                          m_parikh_enabled = true;
         bool                          m_signature_split = false;
+        bool                          m_regex_factorization = true;
         unsigned                      m_next_mem_id = 0;
         unsigned                      m_fresh_cnt = 0;
         nielsen_stats                 m_stats;
@@ -886,6 +888,8 @@ namespace seq {
         void set_parikh_enabled(bool e) { m_parikh_enabled = e; }
 
         void set_signature_split(bool e) { m_signature_split = e; }
+        
+        void set_regex_factorization(bool e) { m_regex_factorization = e; }
 
         // generate next unique regex membership id
         unsigned next_mem_id() { return m_next_mem_id++; }
@@ -1072,6 +1076,9 @@ namespace seq {
         // mirrors ZIPT's GPowerIntrModifier
         bool apply_gpower_intr(nielsen_node* node);
 
+        // generalized regex factorization (Boolean closure derivation rule)
+        bool apply_regex_factorization(nielsen_node* node);
+
         // helper for apply_gpower_intr: fires the substitution.
         // `fwd=true` uses left-to-right decomposition; `fwd=false` mirrors ZIPT's
         // backward (right-to-left) direction.
diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp
index e24554cf0..3fed67df9 100644
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@@ -390,6 +390,38 @@ namespace smt {
         expr* s_expr = mem.m_str->get_expr();
         if (s_expr)
             ensure_length_var(s_expr);
+
+        if (!get_fparams().m_nseq_regex_factorization)
+            return;
+
+        // Boolean Closure Propagations
+        expr* re_expr = mem.m_regex->get_expr();
+        if (m_seq.re.is_intersection(re_expr)) {
+            for (expr* arg : *to_app(re_expr)) {
+                expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
+                literal_vector lits;
+                lits.push_back(~mem.lit);
+                lits.push_back(mk_literal(in_r));
+                ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
+            }
+        }
+        else if (m_seq.re.is_union(re_expr)) {
+            literal_vector lits;
+            lits.push_back(~mem.lit);
+            for (expr* arg : *to_app(re_expr)) {
+                expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
+                lits.push_back(mk_literal(in_r));
+            }
+            ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
+        }
+        else if (m_seq.re.is_complement(re_expr)) {
+            expr* arg = to_app(re_expr)->get_arg(0);
+            expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
+            literal_vector lits;
+            lits.push_back(~mem.lit);
+            lits.push_back(~mk_literal(in_r));
+            ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
+        }
     }
 
     void theory_nseq::ensure_length_var(expr* e) {
@@ -578,6 +610,7 @@ namespace smt {
         m_nielsen.set_max_nodes(get_fparams().m_nseq_max_nodes);
         m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh);
         m_nielsen.set_signature_split(get_fparams().m_nseq_signature);
+        m_nielsen.set_regex_factorization(get_fparams().m_nseq_regex_factorization);
 
         // Regex membership pre-check: before running DFS, check intersection
         // emptiness for each variable's regex constraints.  This handles     
@@ -1221,6 +1254,7 @@ namespace smt {
             lbool result = m_regex.check_intersection_emptiness(regexes);
 
             if (result == l_true) {
+                // TODO: Incorporate that we might know the maximum length generated by a regex [in those cases, the gradients will never work]
                 // It is empty. Try gradient.
                 regexes.pop_back(); // Remove loop_l