nseq: port ZIPT regex pre-check to fix benchmark discrepancy on regex-only problems (#8994)

* Initial plan * Port ZIPT regex pre-check and DFS node budget to address nseq benchmark discrepancy Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
2026-07-19 05:25:48 +00:00 · 2026-03-15 10:10:53 -07:00 · 2026-03-15 10:10:53 -07:00 · d53846d501
commit d53846d501
parent 2212f59704
8 changed files with 140 additions and 0 deletions
--- a/src/params/smt_params.cpp
+++ b/src/params/smt_params.cpp
@ -54,7 +54,9 @@ void smt_params::updt_local_params(params_ref const & _p) {
    m_logic = _p.get_sym("logic", m_logic);
    m_string_solver = p.string_solver();
    m_nseq_max_depth = p.nseq_max_depth();
+    m_nseq_max_nodes = p.nseq_max_nodes();
    m_nseq_parikh = p.nseq_parikh();
+    m_nseq_regex_precheck = p.nseq_regex_precheck();
    m_up_persist_clauses = p.up_persist_clauses();
    validate_string_solver(m_string_solver);
    if (_p.get_bool("arith.greatest_error_pivot", false))
--- a/src/params/smt_params.h
+++ b/src/params/smt_params.h
@ -249,7 +249,9 @@ struct smt_params : public preprocessor_params,
    // -----------------------------------
    symbol m_string_solver;
    unsigned m_nseq_max_depth = 0;
+    unsigned m_nseq_max_nodes = 0;
    bool m_nseq_parikh = true;
+    bool m_nseq_regex_precheck = true;

    smt_params(params_ref const & p = params_ref()):
        m_string_solver(symbol("auto")){
--- a/src/params/smt_params_helper.pyg
+++ b/src/params/smt_params_helper.pyg
@ -124,7 +124,9 @@ def_module_params(module_name='smt',
                          ('theory_case_split', BOOL, False, 'Allow the context to use heuristics involving theory case splits, which are a set of literals of which exactly one can be assigned True. If this option is false, the context will generate extra axioms to enforce this instead.'),
                          ('string_solver', SYMBOL, 'seq', 'solver for string/sequence theories. options are: \'z3str3\' (specialized string solver), \'seq\' (sequence solver), \'auto\' (use static features to choose best solver), \'empty\' (a no-op solver that forces an answer unknown if strings were used), \'none\' (no solver), \'nseq\' (Nielsen-based string solver)'),
                          ('nseq.max_depth', UINT, 0, 'maximum Nielsen search depth for theory_nseq (0 = unlimited)'),
+                          ('nseq.max_nodes', UINT, 0, 'maximum number of DFS nodes explored by theory_nseq per solve() call (0 = unlimited)'),
                          ('nseq.parikh', BOOL, True, 'enable Parikh image checks in nseq solver'),
+                          ('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'),
                          ('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'),
                          ('seq.split_w_len', BOOL, True, 'enable splitting guided by length constraints'),
                          ('seq.validate', BOOL, False, 'enable self-validation of theory axioms created by seq theory'),
--- a/src/smt/seq/seq_nielsen.cpp
+++ b/src/smt/seq/seq_nielsen.cpp
@ -2172,6 +2172,10 @@ namespace seq {
        if (m_cancel_fn && m_cancel_fn())
            return search_result::unknown;

+        // check DFS node budget (0 = unlimited)
+        if (m_max_nodes > 0 && m_stats.m_num_dfs_nodes > m_max_nodes)
+            return search_result::unknown;
+
        // revisit detection: if already visited this run, return cached status.
        // mirrors ZIPT's NielsenNode.GraphExpansion() evalIdx check.
        if (node->eval_idx() == m_run_idx) {
--- a/src/smt/seq/seq_nielsen.h
+++ b/src/smt/seq/seq_nielsen.h
@ -734,6 +734,7 @@ namespace seq {
        unsigned                      m_run_idx = 0;
        unsigned                      m_depth_bound = 0;
        unsigned                      m_max_search_depth = 0;
+        unsigned                      m_max_nodes = 0;          // 0 = unlimited
        bool                          m_parikh_enabled = true;
        unsigned                      m_next_mem_id = 0;
        unsigned                      m_fresh_cnt = 0;
@ -821,6 +822,9 @@ namespace seq {

        // maximum overall search depth (0 = unlimited)
        void set_max_search_depth(unsigned d) { m_max_search_depth = d; }
+
+        // maximum total DFS nodes per solve() call (0 = unlimited)
+        void set_max_nodes(unsigned n) { m_max_nodes = n; }
        
        // enable/disable Parikh image verification constraints
        void set_parikh_enabled(bool e) { m_parikh_enabled = e; }
--- a/src/smt/seq/seq_regex.cpp
+++ b/src/smt/seq/seq_regex.cpp
@ -438,6 +438,12 @@ namespace seq {
        // Only handle ground regexes; non-ground can't be fully explored
        if (!re->is_ground())
            return l_undef;
+        // s_other snodes (unrecognized regex kinds, e.g. re.+) cannot be
+        // efficiently explored: the alphabet partition is trivially {∅} and
+        // derivative computations may be slow.  Report l_undef and let the
+        // caller fall back to a more capable procedure.
+        if (re->kind() == euf::snode_kind::s_other)
+            return l_undef;

        // BFS over the Brzozowski derivative automaton.
        // Each state is a derivative regex snode identified by its id.
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@ -416,7 +416,27 @@ namespace smt {
        ++m_num_final_checks;

        m_nielsen.set_max_search_depth(get_fparams().m_nseq_max_depth);
+        m_nielsen.set_max_nodes(get_fparams().m_nseq_max_nodes);
        m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh);
+
+        // Regex membership pre-check: before running DFS, check intersection
+        // emptiness for each variable's regex constraints.  This handles
+        // regex-only problems that the DFS cannot efficiently solve.
+        if (get_fparams().m_nseq_regex_precheck) {
+            lbool precheck = check_regex_memberships_precheck();
+            if (precheck == l_true) {
+                // conflict was asserted inside check_regex_memberships_precheck
+                IF_VERBOSE(1, verbose_stream() << "nseq final_check: regex precheck UNSAT\n";);
+                return FC_CONTINUE;
+            }
+            if (precheck == l_false) {
+                // all regex constraints satisfiable, no word eqs/diseqs → SAT
+                IF_VERBOSE(1, verbose_stream() << "nseq final_check: regex precheck SAT\n";);
+                return FC_DONE;
+            }
+            // l_undef: inconclusive, fall through to DFS
+        }
+
        IF_VERBOSE(1, verbose_stream() << "nseq final_check: calling solve()\n";);

        // here the actual Nielsen solving happens
@ -829,4 +849,96 @@ namespace smt {
        return new_axiom;
    }

+    // -----------------------------------------------------------------------
+    // Regex membership pre-check
+    // For each variable with regex membership constraints, check intersection
+    // emptiness before DFS.  Mirrors ZIPT's per-variable regex evaluation.
+    //
+    // Returns:
+    //   l_true  — conflict asserted (empty intersection for some variable)
+    //   l_false — all variables satisfiable and no word eqs/diseqs → SAT
+    //   l_undef — inconclusive, proceed to DFS
+    // -----------------------------------------------------------------------
+
+    lbool theory_nseq::check_regex_memberships_precheck() {
+        auto const& mems = m_state.str_mems();
+        if (mems.empty())
+            return l_undef;
+
+        // Group membership indices by variable snode id.
+        // Only consider memberships whose string snode is a plain variable (s_var).
+        u_map<unsigned_vector> var_to_mems;
+        bool all_var_str = true;
+
+        for (unsigned i = 0; i < mems.size(); ++i) {
+            auto const& mem = mems[i];
+            if (!mem.m_str || !mem.m_regex)
+                continue;
+            if (mem.m_str->is_var()) {
+                auto& vec = var_to_mems.insert_if_not_there(mem.m_str->id(), unsigned_vector());
+                vec.push_back(i);
+            }
+            else {
+                all_var_str = false;
+            }
+        }
+
+        if (var_to_mems.empty())
+            return l_undef;
+
+        bool any_undef = false;
+
+        // Check intersection emptiness for each variable.
+        for (auto& kv : var_to_mems) {
+            unsigned var_id = kv.m_key;
+            unsigned_vector const& mem_indices = kv.m_value;
+            ptr_vector<euf::snode> regexes;
+            for (unsigned i : mem_indices) {
+                euf::snode* re = mems[i].m_regex;
+                if (re)
+                    regexes.push_back(re);
+            }
+            if (regexes.empty())
+                continue;
+
+            // Use a bounded BFS (50 states) for the pre-check to keep it fast.
+            // If the BFS times out (l_undef), we fall through to DFS.
+            lbool result = m_regex.check_intersection_emptiness(regexes, 50);
+
+            if (result == l_true) {
+                // Intersection is empty → the memberships on this variable are
+                // jointly unsatisfiable.  Assert a conflict from all their literals.
+                enode_pair_vector eqs;
+                literal_vector lits;
+                for (unsigned i : mem_indices) {
+                    mem_source const& src = m_state.get_mem_source(i);
+                    if (get_context().get_assignment(src.m_lit) == l_true)
+                        lits.push_back(src.m_lit);
+                }
+                TRACE(seq, tout << "nseq regex precheck: empty intersection for var "
+                                << var_id << ", conflict with " << lits.size() << " lits\n";);
+                set_conflict(eqs, lits);
+                return l_true;   // conflict asserted
+            }
+            else if (result == l_undef) {
+                any_undef = true;
+            }
+            // l_false = non-empty intersection, this variable's constraints are satisfiable
+        }
+
+        if (any_undef)
+            return l_undef;  // cannot fully determine; let DFS decide
+
+        // All variables' regex intersections are non-empty.
+        // If there are no word equations and no disequalities, variables are
+        // independent and each can be assigned a witness string → SAT.
+        if (all_var_str && m_state.str_eqs().empty() && m_state.diseqs().empty()) {
+            TRACE(seq, tout << "nseq regex precheck: all intersections non-empty, "
+                            << "no word eqs/diseqs → SAT\n";);
+            return l_false;  // signals SAT (non-empty / satisfiable)
+        }
+
+        return l_undef;  // mixed constraints; let DFS decide
+    }
+
 }
--- a/src/smt/theory_nseq.h
+++ b/src/smt/theory_nseq.h
@ -131,6 +131,14 @@ namespace smt {
        bool assert_nonneg_for_all_vars();
        bool assert_length_constraints();

+        // Regex membership pre-check: for each variable with regex constraints,
+        // check intersection emptiness before running DFS.
+        //   l_true  → found empty intersection, conflict asserted, return FC_CONTINUE
+        //   l_false → all variables' regex constraints satisfiable and no word
+        //             equations / disequalities, return FC_DONE (SAT)
+        //   l_undef → inconclusive, proceed to DFS
+        lbool check_regex_memberships_precheck();
+
    public:
        theory_nseq(context& ctx);
    };