Stage 3: collapse boolean combinations of char-class regexes

Introduce src/ast/rewriter/regex_range_collapse.{h,cpp}, a translator between the boolean-combination-of-character-class fragment of regexes and the range_predicate value type added in Stage 2. Recognized fragment (translates to range_predicate): re.empty, re.full_char, re.range, re.union, re.intersection, re.diff of operands recursively in the fragment. Range bounds are accepted in three encodings: string constant ("a"), seq.unit of a const char (seq.unit (Char 97)), and length-1 zstring literal. NOT translated: re.complement -- this is sequence-level complement (Sigma* \ L), not character-class complement. Translating it would incorrectly turn re.comp(re.range "a" "z") into the character class [^a-z], which would drop the empty string and all length>=2 strings. Hook the translator into seq_rewriter at mk_re_union0, mk_re_union, mk_re_inter0, mk_re_inter, and mk_re_diff so that boolean combinations of character classes always reduce to a single canonical range-set form. mk_re_complement is intentionally not hooked. Materialization uses the canonical (seq.unit (Char N)) bound form (matching the rest of seq_rewriter) and right-associates the union with operands sorted by expr_id so the result matches the invariant expected by merge_regex_sets. Unit tests in src/test/regex_range_collapse.cpp cover the recognized fragment, the non-translatable cases, and round-trip identity for multi-range predicates. Corpus validation on bench/inputs/regex-equivalence (1523 .smt2): - 0 soundness regressions vs derive baseline. - Resolves 4 previously-soft-timeout files (now solved correctly). - Resolves 1 pre-existing wrong answer (mut_0404: master/derive say unsat, ground-truth annotation and Stage 3 say sat). - Wall-time: -2.2% vs Stage-3 starting point, -1.5% vs derive. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-20 15:40:37 +00:00 · 2026-06-14 18:49:31 -07:00 · 2026-06-14 18:49:31 -07:00 · c0c826cf5f
commit c0c826cf5f
parent bcdbc80ab8
8 changed files with 500 additions and 1 deletions
--- a/src/ast/rewriter/CMakeLists.txt
+++ b/src/ast/rewriter/CMakeLists.txt
@ -38,6 +38,7 @@ z3_add_component(rewriter
    range_predicate.cpp
    range_predicate_translator.cpp
    recfun_rewriter.cpp
+    regex_range_collapse.cpp
    rewriter.cpp
    seq_axioms.cpp
    seq_eq_solver.cpp
--- a/src/ast/rewriter/regex_range_collapse.cpp
+++ b/src/ast/rewriter/regex_range_collapse.cpp
@ -0,0 +1,145 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    regex_range_collapse.cpp
+
+Abstract:
+
+    Implementation of regex <-> range_predicate translation for the
+    boolean-combination-of-ranges fragment. See header for the recognized
+    grammar and the canonical regex AST emitted by materialization.
+
+Authors:
+
+    Margus Veanes (veanes) 2026
+
+--*/
+
+#include "ast/rewriter/regex_range_collapse.h"
+
+namespace seq {
+
+    bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
+        unsigned const max_char = u.max_char();
+        auto& re = u.re;
+
+        if (re.is_empty(r)) {
+            out = range_predicate::empty(max_char);
+            return true;
+        }
+        if (re.is_full_char(r)) {
+            out = range_predicate::top(max_char);
+            return true;
+        }
+        unsigned lo = 0, hi = 0;
+        expr* lo_e = nullptr;
+        expr* hi_e = nullptr;
+        if (re.is_range(r, lo_e, hi_e)) {
+            auto extract_char = [&](expr* e, unsigned& c) -> bool {
+                if (u.is_const_char(e, c)) return true;
+                expr* inner = nullptr;
+                if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true;
+                zstring s;
+                if (u.str.is_string(e, s) && s.length() == 1) {
+                    c = s[0];
+                    return true;
+                }
+                return false;
+            };
+            if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi))
+                return false;
+            // Empty/inverted range [lo > hi] is the empty regex.
+            if (lo > hi) {
+                out = range_predicate::empty(max_char);
+                return true;
+            }
+            out = range_predicate::range(lo, hi, max_char);
+            return true;
+        }
+        expr* a = nullptr;
+        expr* b = nullptr;
+        if (re.is_union(r, a, b)) {
+            range_predicate pa(max_char), pb(max_char);
+            if (!regex_to_range_predicate(u, a, pa)) return false;
+            if (!regex_to_range_predicate(u, b, pb)) return false;
+            out = pa | pb;
+            return true;
+        }
+        if (re.is_intersection(r, a, b)) {
+            range_predicate pa(max_char), pb(max_char);
+            if (!regex_to_range_predicate(u, a, pa)) return false;
+            if (!regex_to_range_predicate(u, b, pb)) return false;
+            out = pa & pb;
+            return true;
+        }
+        if (re.is_diff(r, a, b)) {
+            range_predicate pa(max_char), pb(max_char);
+            if (!regex_to_range_predicate(u, a, pa)) return false;
+            if (!regex_to_range_predicate(u, b, pb)) return false;
+            out = pa - pb;
+            return true;
+        }
+        // NOTE: re.complement is intentionally NOT handled here.
+        //   re.complement is the SEQUENCE-level complement: its language
+        //   includes the empty string, strings of length >= 2, and any
+        //   length-1 string outside the operand.  A character-class
+        //   range_predicate can only describe a set of length-1 strings,
+        //   so collapsing re.complement(R) to ~R (character-level
+        //   complement) would change semantics whenever R is wrapped in
+        //   any sequence-level context (e.g. re.diff at the top level,
+        //   or membership tests).  De-Morgan equivalences and the
+        //   special cases re.complement(re.empty) / re.complement(re.full)
+        //   are already handled directly in seq_rewriter::mk_re_complement.
+        return false;
+    }
+
+    static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
+        return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
+    }
+
+    static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
+        ast_manager& m = u.get_manager();
+        if (lo == 0 && hi == u.max_char())
+            return expr_ref(u.re.mk_full_char(re_sort), m);
+        // Use the canonical unit-character form (seq.unit (Char N)) for
+        // range bounds.  This matches the shape used elsewhere in
+        // seq_rewriter and avoids creating duplicate AST nodes with
+        // different ids for semantically identical ranges.
+        expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m);
+        expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m);
+        return expr_ref(u.re.mk_range(slo, shi), m);
+    }
+
+    expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
+        ast_manager& m = u.get_manager();
+        sort* re_sort = u.re.mk_re(seq_sort);
+        if (p.is_empty())
+            return expr_ref(u.re.mk_empty(re_sort), m);
+        unsigned const n = p.num_ranges();
+        SASSERT(n > 0);
+        if (n == 1) {
+            auto [lo, hi] = p[0];
+            return mk_single_range_regex(u, lo, hi, re_sort);
+        }
+        // Build single-range AST nodes first, then sort by expression id
+        // so the resulting right-associated union matches the canonical
+        // id-sorted shape that seq_rewriter::merge_regex_sets expects.
+        // Without this the merge algorithm produces incorrect unions
+        // when it has to combine our materialized output with another
+        // (id-sorted) regex set.
+        expr_ref_vector ranges(m);
+        for (unsigned i = 0; i < n; ++i) {
+            auto [lo, hi] = p[i];
+            ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort));
+        }
+        std::sort(ranges.data(), ranges.data() + ranges.size(),
+                  [](expr* a, expr* b) { return a->get_id() < b->get_id(); });
+        expr_ref acc(ranges.get(n - 1), m);
+        for (unsigned i = n - 1; i-- > 0; )
+            acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m);
+        return acc;
+    }
+
+}
--- a/src/ast/rewriter/regex_range_collapse.h
+++ b/src/ast/rewriter/regex_range_collapse.h
@ -0,0 +1,71 @@
+/*++
+Copyright (c) 2026 Microsoft Corporation
+
+Module Name:
+
+    regex_range_collapse.h
+
+Abstract:
+
+    Recognize regexes that are boolean combinations of character-class
+    primitives (re.empty, re.full_char, re.range with concrete chars,
+    and re.union/inter/comp/diff over translatable arguments), and
+    materialize a seq::range_predicate back into a canonical regex AST.
+
+    Together with seq_rewriter integration, this lets any boolean
+    combination of character-class regexes collapse to a canonical
+    multi-range form, so that equivalent character classes share AST
+    identity, and downstream consumers (derivative, OneStep, caching)
+    can short-circuit them as pure range predicates.
+
+Authors:
+
+    Margus Veanes (veanes) 2026
+
+--*/
+#pragma once
+
+#include "ast/rewriter/range_predicate.h"
+#include "ast/seq_decl_plugin.h"
+
+namespace seq {
+
+    /**
+     * If r is a boolean combination of character-class regex primitives
+     * over the unsigned character domain [0, max_char], compute the
+     * equivalent range_predicate and return true. Otherwise return false
+     * with out untouched.
+     *
+     * Recognized fragment (all character-class-preserving operations):
+     *   re.empty                              -> empty
+     *   re.full_char_set                      -> top
+     *   re.range "c_lo" "c_hi"  (concrete)    -> [c_lo, c_hi]
+     *   re.union r1 r2                        -> p1 | p2
+     *   re.intersection r1 r2                 -> p1 & p2
+     *   re.diff r1 r2                         -> p1 - p2
+     *
+     * Notably re.complement is NOT recognized: it is a SEQUENCE-level
+     * complement (over all of Σ*), not a character-class complement, so
+     * collapsing it would change semantics whenever the result is used
+     * in any non-character-class context.  Sequence-level rewrites for
+     * re.complement (double-comp, deMorgan, etc.) are handled directly
+     * in seq_rewriter::mk_re_complement.
+     */
+    bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out);
+
+    /**
+     * Canonical materialization of p as a regex AST over the given
+     * sequence sort. Two range_predicates with equal canonical
+     * representations produce structurally identical regex ASTs:
+     *
+     *   empty                  -> re.empty
+     *   top                    -> re.full_char_set
+     *   single range [lo, hi]  -> re.range "lo" "hi"
+     *   multiple ranges        -> right-associated re.union of single
+     *                              ranges, in increasing order of lo
+     *                              (matching the canonical range order
+     *                              held by range_predicate).
+     */
+    expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort);
+
+}
--- a/src/ast/rewriter/seq_rewriter.cpp
+++ b/src/ast/rewriter/seq_rewriter.cpp
@ -21,6 +21,7 @@ Authors:
 #include "util/uint_set.h"
 #include "ast/rewriter/seq_rewriter.h"
 #include "ast/rewriter/seq_regex_bisim.h"
+#include "ast/rewriter/regex_range_collapse.h"
 #include "ast/arith_decl_plugin.h"
 #include "ast/array_decl_plugin.h"
 #include "ast/ast_pp.h"
@ -3961,6 +3962,32 @@ bool seq_rewriter::is_subset(expr* r1, expr* r2) const {
    return m_subset.is_subset(r1, r2);
 }

+bool seq_rewriter::try_collapse_re_union(expr* a, expr* b, expr_ref& result) {
+    sort* seq_sort = nullptr;
+    if (!u().is_re(a->get_sort(), seq_sort))
+        return false;
+    seq::range_predicate pa(u().max_char()), pb(u().max_char());
+    if (!seq::regex_to_range_predicate(u(), a, pa))
+        return false;
+    if (!seq::regex_to_range_predicate(u(), b, pb))
+        return false;
+    result = seq::range_predicate_to_regex(u(), pa | pb, seq_sort);
+    return true;
+}
+
+bool seq_rewriter::try_collapse_re_inter(expr* a, expr* b, expr_ref& result) {
+    sort* seq_sort = nullptr;
+    if (!u().is_re(a->get_sort(), seq_sort))
+        return false;
+    seq::range_predicate pa(u().max_char()), pb(u().max_char());
+    if (!seq::regex_to_range_predicate(u(), a, pa))
+        return false;
+    if (!seq::regex_to_range_predicate(u(), b, pb))
+        return false;
+    result = seq::range_predicate_to_regex(u(), pa & pb, seq_sort);
+    return true;
+}
+
 br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
    if (a == b) {
        result = a;
@ -4005,11 +4032,15 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
        result = m().mk_ite(c, re().mk_union(a, r1), re().mk_union(a, r2));
        return BR_REWRITE3;
    }
+    if (try_collapse_re_union(a, b, result))
+        return BR_DONE;
    return BR_FAILED;
 }

 /* Creates a normalized union. */
 br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) {
+    if (try_collapse_re_union(a, b, result))
+        return BR_DONE;
    result = mk_regex_union_normalize(a, b);
    return BR_DONE;
 }
@ -4093,16 +4124,28 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) {
        result = m().mk_ite(c, re().mk_inter(a, r1), re().mk_inter(a, r2));
        return BR_REWRITE3;
    }
+    if (try_collapse_re_inter(a, b, result))
+        return BR_DONE;
    return BR_FAILED;
 }

 /* Creates a normalized intersection. */
 br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) {
+    if (try_collapse_re_inter(a, b, result))
+        return BR_DONE;
    result = mk_regex_inter_normalize(a, b);
    return BR_DONE;
 }

 br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) {
+    seq::range_predicate pa(u().max_char()), pb(u().max_char());
+    sort* seq_sort = nullptr;
+    if (u().is_re(a->get_sort(), seq_sort)
+        && seq::regex_to_range_predicate(u(), a, pa)
+        && seq::regex_to_range_predicate(u(), b, pb)) {
+        result = seq::range_predicate_to_regex(u(), pa - pb, seq_sort);
+        return BR_DONE;
+    }
    result = mk_regex_inter_normalize(a, re().mk_complement(b));
    return BR_REWRITE2;
 }
@ -5495,7 +5538,7 @@ void seq_rewriter::op_cache::cleanup() {
 lbool seq_rewriter::some_string_in_re(expr* r, zstring& s) {
    sort* rs;
    (void)rs;
-    // SASSERT(re().is_re(r, rs) && m_util.is_string(rs));
+    // SASSERT(u().is_re(r, rs) && m_util.is_string(rs));
    expr_mark visited;
    unsigned_vector str;

--- a/src/ast/rewriter/seq_rewriter.h
+++ b/src/ast/rewriter/seq_rewriter.h
@ -250,6 +250,14 @@ class seq_rewriter {
    br_status mk_re_union0(expr* a, expr* b, expr_ref& result);
    br_status mk_re_inter0(expr* a, expr* b, expr_ref& result);
    br_status mk_re_complement(expr* a, expr_ref& result);
+    // Range-set collapse helpers: if the operands form a boolean
+    // combination of character-class regexes, materialize the result as a
+    // canonical regex over a single range_predicate.  See
+    // ast/rewriter/regex_range_collapse.h for the recognized fragment.
+    // NOTE: re.complement is intentionally not in this set because it
+    // operates at the sequence level, not the character-class level.
+    bool try_collapse_re_union(expr* a, expr* b, expr_ref& result);
+    bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result);
    br_status mk_re_star(expr* a, expr_ref& result);
    br_status mk_re_diff(expr* a, expr* b, expr_ref& result);
    br_status mk_re_xor(expr* a, expr* b, expr_ref& result);