fix seq::derive::intersect_intervals dropping kept intervals on disjoint case

The intersect_intervals routine in seq_derive.cpp maintains a path-tracking disjoint union of character intervals. When intersecting the active suffix with a new constraint [lo, hi], it iterated the suffix and, on encountering the first interval disjoint from [lo, hi], reset the output cursor to the end-marker and broke out of the loop. This both threw away the intervals it had already kept and skipped every remaining interval, so e.g. [(0, 96), (98, max)] intersected with [98, 98] became empty instead of [(98, 98)]. Inside derive that silently killed valid branches of symbolic derivatives. For example D(a|b) collapsed to ite(c='a', eps, empty) -- the 'b' branch vanished -- which made the bisimulation procedure conclude bogus equalities such as a* == (a|b)*. On the regex-equivalence corpus this single bug accounted for ~510 false-unsat results vs master. Fix: drop only the disjoint interval and continue scanning the rest of the suffix. Add a small assertion-based regression test that builds D(a|b), checks both branches survive, and runs bisim on a* vs (a|b)*. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-19 15:16:29 +00:00 · 2026-06-15 01:38:55 -07:00 · 2026-06-15 01:38:55 -07:00 · fb6470a1a1
commit fb6470a1a1
parent cf0e43ab38
4 changed files with 81 additions and 10 deletions
--- a/src/ast/rewriter/seq_derive.cpp
+++ b/src/ast/rewriter/seq_derive.cpp
@ -1246,21 +1246,18 @@ namespace seq {
    // Intersect the active suffix m_intervals[m_intervals_start..end] with [lo, hi]
    void derive::intersect_intervals(unsigned lo, unsigned hi) {
        // Copy active suffix to end, update start, then filter
        unsigned old_start = m_intervals_start;
        unsigned old_sz = m_intervals.size();
-        for (unsigned i = old_start; i < old_sz; ++i)
+        for (unsigned i = m_intervals_start; i < old_sz; ++i)
            m_intervals.push_back(m_intervals[i]);
        m_intervals_start = old_sz;
-        // Filter in-place within new suffix
+        // Filter in-place within new suffix: drop intervals disjoint from [lo,hi],
        // keep the intersection for overlapping ones.
        unsigned j = m_intervals_start;
        for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) {
            auto [lo1, hi1] = m_intervals[i];
-            if (hi < lo1 || lo > hi1) {
+            if (hi < lo1 || lo > hi1)
-                j = old_sz;
+                continue;  // disjoint with this interval — drop it
-                break;
+            m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)};
            }
            if (hi1 >= lo)
                m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)};
        }
        m_intervals.shrink(j);
    }
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@ -113,6 +113,7 @@ add_executable(test-z3
  polynomial_factorization.cpp
  polynorm.cpp
  prime_generator.cpp
  seq_regex_bisim.cpp
  proof_checker.cpp
  qe_arith.cpp
  mbp_qel.cpp
--- a/src/test/main.cpp
+++ b/src/test/main.cpp
@ -193,7 +193,8 @@
    X(ho_matcher) \
    X(finite_set) \
    X(finite_set_rewriter) \
-    X(fpa)
+    X(fpa) \
    X(seq_regex_bisim)
 #define FOR_EACH_TEST(X, X_ARGV) \
    FOR_EACH_ALL_TEST(X, X_ARGV) \
--- a/src/test/seq_regex_bisim.cpp
+++ b/src/test/seq_regex_bisim.cpp
@ -0,0 +1,72 @@
 // Regression test for the seq::derive::intersect_intervals bug.
 //
 // Background: derive uses a path-tracking interval set to compute symbolic
 // derivatives.  The intersect_intervals routine used to react to a single
 // disjoint interval by dropping the entire kept suffix and skipping the rest
 // of the list, which silently killed valid branches in derivatives such as
 // D(a|b).  That made the bisimulation procedure conclude bogus equalities
 // like a* == (a|b)*.
 //
 // This test asserts the basic shape of the derivative and the bisim result
 // for that minimal case, so the bug cannot silently regress.
 #include "ast/ast.h"
 #include "ast/ast_pp.h"
 #include "ast/reg_decl_plugins.h"
 #include "ast/seq_decl_plugin.h"
 #include "ast/rewriter/seq_rewriter.h"
 #include "ast/rewriter/seq_regex_bisim.h"
 #include <iostream>
 static void test_a_star_neq_ab_star() {
    ast_manager m;
    reg_decl_plugins(m);
    seq_util u(m);
    seq_rewriter rw(m);
    sort_ref str_sort(u.str.mk_string_sort(), m);
    zstring sa("a"), sb("b");
    expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
    expr_ref re_b(u.re.mk_to_re(u.str.mk_string(sb)), m);
    expr_ref a_star(u.re.mk_star(re_a), m);
    expr_ref ab(u.re.mk_union(re_a, re_b), m);
    expr_ref ab_star(u.re.mk_star(ab), m);
    expr_ref d_ab = rw.mk_brz_derivative(ab);
    std::cout << "D(a|b) = " << mk_pp(d_ab, m) << "\n";
    // Both the 'a' branch and the 'b' branch of D(a|b) must reach epsilon.
    // Collect the regex leaves of the symbolic derivative and require at
    // least two distinct accepting leaves (one for 'a' and one for 'b').
    expr_ref_vector leaves(m);
    auto collect = [&](expr* e, auto&& self) -> void {
        expr* c, *t, *f;
        if (m.is_ite(e, c, t, f) || u.re.is_union(e, t, f) || u.re.is_antimirov_union(e, t, f)) {
            self(t, self);
            self(f, self);
            return;
        }
        if (u.re.is_empty(e)) return;
        leaves.push_back(e);
    };
    collect(d_ab, collect);
    unsigned nullable_leaves = 0;
    for (expr* l : leaves) {
        expr_ref n = rw.is_nullable(l);
        if (m.is_true(n)) ++nullable_leaves;
    }
    std::cout << "D(a|b) leaves=" << leaves.size()
              << " nullable=" << nullable_leaves << "\n";
    ENSURE(nullable_leaves >= 2);
    // Bisim must report the two languages are not equivalent.
    seq::regex_bisim bisim(rw);
    lbool eq = bisim.are_equivalent(a_star, ab_star);
    std::cout << "bisim(a*, (a|b)*) = "
              << (eq == l_true ? "true" : eq == l_false ? "false" : "undef") << "\n";
    ENSURE(eq == l_false);
 }
 void tst_seq_regex_bisim() {
    test_a_star_neq_ab_star();
 }