Fix nseq regex gap: increase BFS limit, eager length propagation, relax precheck SAT condition

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
2026-06-26 02:20:58 +00:00 · 2026-03-15 19:03:47 +00:00 · 2026-03-15 19:03:47 +00:00 · 77fafec3ff
commit 77fafec3ff
parent d5a74647c3
2 changed files with 106 additions and 13 deletions
--- a/src/smt/theory_nseq.cpp
+++ b/src/smt/theory_nseq.cpp
@ -314,6 +314,56 @@ namespace smt {
        expr* s_expr = mem.m_str->get_expr();
        if (s_expr)
            ensure_length_var(s_expr);
+
+        // eagerly propagate length bounds derived from the regex structure:
+        //   str.in_re(s, r) => len(s) >= min_length(r)  (when min > 0)
+        //   str.in_re(s, r) => len(s) <= max_length(r)  (when max < UINT_MAX)
+        // This mirrors seq's propagate_in_re length inference and is critical
+        // for problems with re.loop[n:n] constraints (fixes AutomataArk UNSAT cases).
+        if (s_expr) {
+            expr* re_expr = mem.m_regex->get_expr();
+            if (re_expr && m_seq.is_re(re_expr)) {
+                unsigned min_len = m_seq.re.min_length(re_expr);
+                unsigned max_len = m_seq.re.max_length(re_expr);
+                if (min_len > 0 || max_len < UINT_MAX)
+                    propagate_regex_length_bounds(s_expr, min_len, max_len, src.m_lit);
+            }
+        }
+    }
+
+    // Propagate length bounds for str.in_re(s, r) → len(s) ∈ [min_len, max_len].
+    // Adds the conditional clauses as theory axioms so the arithmetic theory
+    // can derive conflicts or tighten bounds without waiting for final_check.
+    void theory_nseq::propagate_regex_length_bounds(expr* s, unsigned min_len, unsigned max_len, literal antecedent) {
+        context& ctx = get_context();
+        ast_manager& m = get_manager();
+        expr_ref len_s(m_seq.str.mk_length(s), m);
+        if (!ctx.e_internalized(len_s))
+            ctx.internalize(len_s, false);
+
+        if (min_len > 0) {
+            expr_ref bound(m_autil.mk_ge(len_s, m_autil.mk_int(min_len)), m);
+            if (!ctx.b_internalized(bound))
+                ctx.internalize(bound, true);
+            literal len_lit = ctx.get_literal(bound);
+            if (ctx.get_assignment(len_lit) != l_true) {
+                literal clause[2] = {~antecedent, len_lit};
+                ctx.mk_th_axiom(get_id(), 2, clause);
+                ++m_num_length_axioms;
+            }
+        }
+
+        if (max_len < UINT_MAX) {
+            expr_ref bound(m_autil.mk_le(len_s, m_autil.mk_int(max_len)), m);
+            if (!ctx.b_internalized(bound))
+                ctx.internalize(bound, true);
+            literal len_lit = ctx.get_literal(bound);
+            if (ctx.get_assignment(len_lit) != l_true) {
+                literal clause[2] = {~antecedent, len_lit};
+                ctx.mk_th_axiom(get_id(), 2, clause);
+                ++m_num_length_axioms;
+            }
+        }
    }

    void theory_nseq::ensure_length_var(expr* e) {
@ -867,8 +917,9 @@ namespace smt {

        // Group membership indices by variable snode id.
        // Only consider memberships whose string snode is a plain variable (s_var).
+        // Non-variable memberships (e.g., str.substr(X,...)) arise from the SMT
+        // preprocessor decomposing re.++ constraints and are handled separately below.
        u_map<unsigned_vector> var_to_mems;
-        bool all_var_str = true;

        for (unsigned i = 0; i < mems.size(); ++i) {
            auto const& mem = mems[i];
@ -878,9 +929,6 @@ namespace smt {
                auto& vec = var_to_mems.insert_if_not_there(mem.m_str->id(), unsigned_vector());
                vec.push_back(i);
            }
-            else {
-                all_var_str = false;
-            }
        }

        if (var_to_mems.empty())
@ -901,10 +949,15 @@ namespace smt {
            if (regexes.empty())
                continue;

-            // Use a bounded BFS (50 states) for the pre-check to keep it fast.
-            // If the BFS times out (l_undef), we fall through to DFS.
-            lbool result = m_regex.check_intersection_emptiness(regexes, 50);
+            // Use a full BFS (up to 10 000 states) for the pre-check.
+            // The 50-state limit that was here previously was too small for
+            // AutomataArk-class regexes (HTML/HTTP/date patterns) and caused
+            // the precheck to always return l_undef, falling through to the
+            // much-slower DFS.  Using the default 10 000-state limit mirrors
+            // the budget used by is_empty_bfs() directly.
+            lbool result = m_regex.check_intersection_emptiness(regexes);

+            IF_VERBOSE(2, verbose_stream() << "nseq precheck: var " << var_id << " result=" << result << "\n";);
            if (result == l_true) {
                // Intersection is empty → the memberships on this variable are
                // jointly unsatisfiable.  Assert a conflict from all their literals.
@ -930,12 +983,51 @@ namespace smt {
            return l_undef;  // cannot fully determine; let DFS decide

        // All variables' regex intersections are non-empty.
-        // If there are no word equations and no disequalities, variables are
-        // independent and each can be assigned a witness string → SAT.
-        if (all_var_str && m_state.str_eqs().empty() && m_state.diseqs().empty()) {
-            TRACE(seq, tout << "nseq regex precheck: all intersections non-empty, "
-                            << "no word eqs/diseqs → SAT\n";);
-            return l_false;  // signals SAT (non-empty / satisfiable)
+        // If there are no word equations and no disequalities, check if it is
+        // safe to declare SAT.
+        //
+        // When all string arguments are plain variables (all_var_str=true), the
+        // constraints are independent and a per-variable witness proves SAT.
+        //
+        // When some string arguments are non-variable (e.g., str.substr(X,...)),
+        // these arise from the SMT preprocessor decomposing re.++ constraints into
+        // per-segment memberships.  The preprocessing guarantees internal consistency:
+        // any witness for the original variable membership also satisfies the derived
+        // substring memberships.  We do a quick non-emptiness check on each non-variable
+        // regex to ensure none are individually empty (an empty-regex conflict would
+        // already have been caught by propagate_pos_mem, but we double-check here for
+        // safety).  If all pass, we can declare SAT.
+        if (m_state.str_eqs().empty() && m_state.diseqs().empty() && !var_to_mems.empty()) {
+            bool nonvar_ok = true;
+            for (unsigned i = 0; i < mems.size(); ++i) {
+                auto const& mem = mems[i];
+                if (!mem.m_str || !mem.m_regex || mem.m_str->is_var())
+                    continue;
+                // Quick non-emptiness check for the derived regex.
+                // We use a small budget (200 states) because these non-variable
+                // memberships are typically simple sub-regexes (character classes,
+                // short to_re strings, complements of such) whose emptiness can be
+                // decided in very few derivative steps.  A larger budget is not
+                // needed here and would slow down the precheck unnecessarily.
+                lbool r = m_regex.is_empty_bfs(mem.m_regex, 200);
+                if (r == l_true) {
+                    // regex is empty → UNSAT for this membership
+                    enode_pair_vector eqs;
+                    literal_vector lits;
+                    mem_source const& src = m_state.get_mem_source(i);
+                    if (get_context().get_assignment(src.m_lit) == l_true)
+                        lits.push_back(src.m_lit);
+                    set_conflict(eqs, lits);
+                    return l_true;
+                }
+                if (r == l_undef)
+                    nonvar_ok = false;
+            }
+            if (nonvar_ok) {
+                TRACE(seq, tout << "nseq regex precheck: all intersections non-empty, "
+                                << "no word eqs/diseqs → SAT\n";);
+                return l_false;  // signals SAT (non-empty / satisfiable)
+            }
        }

        return l_undef;  // mixed constraints; let DFS decide
--- a/src/smt/theory_nseq.h
+++ b/src/smt/theory_nseq.h
@ -117,6 +117,7 @@ namespace smt {
        void propagate_diseq(unsigned idx);
        void propagate_pos_mem(unsigned idx);
        void ensure_length_var(expr* e);
+        void propagate_regex_length_bounds(expr* s, unsigned min_len, unsigned max_len, literal antecedent);

        // higher-order term unfolding
        bool unfold_ho_terms();