z3/src/ast/rewriter/regex_range_collapse.cpp

/*++
Copyright (c) 2026 Microsoft Corporation

Module Name:

    regex_range_collapse.cpp

Abstract:

    Implementation of regex <-> range_predicate translation for the
    boolean-combination-of-ranges fragment. See header for the recognized
    grammar and the canonical regex AST emitted by materialization.

Authors:

    Margus Veanes (veanes) 2026

--*/

#include "ast/rewriter/regex_range_collapse.h"

namespace seq {

    // Translate the regex `r` into a character-class predicate.
    //
    // Recognized shapes: the empty regex, the full-character regex, a range
    // whose two bounds are constant characters, and union / intersection /
    // difference of recognized shapes.
    //
    // Returns true and stores the translation in `out` on success.  Returns
    // false, leaving `out` unassigned by this call, when `r` (or one of its
    // operands) falls outside that fragment.
    bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
        auto& rex = u.re;
        unsigned const char_bound = u.max_char();

        // Leaves that need no further inspection.
        if (rex.is_empty(r)) {
            out = range_predicate::empty(char_bound);
            return true;
        }
        if (rex.is_full_char(r)) {
            out = range_predicate::top(char_bound);
            return true;
        }

        expr* lower_e = nullptr;
        expr* upper_e = nullptr;
        if (rex.is_range(r, lower_e, upper_e)) {
            // A bound may be written as a bare character constant, as a
            // unit sequence wrapping one, or as a string literal of
            // length one; the forms are probed in that order.
            auto bound_value = [&](expr* bound, unsigned& ch) -> bool {
                if (u.is_const_char(bound, ch))
                    return true;
                expr* payload = nullptr;
                if (u.str.is_unit(bound, payload) && u.is_const_char(payload, ch))
                    return true;
                zstring lit;
                if (!u.str.is_string(bound, lit) || lit.length() != 1)
                    return false;
                ch = lit[0];
                return true;
            };
            unsigned lower = 0, upper = 0;
            if (!(bound_value(lower_e, lower) && bound_value(upper_e, upper)))
                return false;
            // An inverted range (lower > upper) is the empty regex.
            if (lower > upper)
                out = range_predicate::empty(char_bound);
            else
                out = range_predicate::range(lower, upper, char_bound);
            return true;
        }

        // Binary set operators: classify first, then translate both
        // operands once and combine.
        //
        // re.complement is deliberately NOT part of this classification.
        // It is the SEQUENCE-level complement: its language contains the
        // empty string, every string of length >= 2, and every length-1
        // string outside the operand.  A range_predicate can only describe
        // a set of length-1 strings, so turning re.complement(R) into the
        // character-level complement ~R would change semantics whenever R
        // sits in a sequence-level context (e.g. re.diff at the top level,
        // or membership tests).  De-Morgan equivalences and the special
        // cases re.complement(re.empty) / re.complement(re.full) are
        // handled directly in seq_rewriter::mk_re_complement.
        enum class set_op { join, meet, minus };
        set_op op = set_op::join;
        expr* lhs = nullptr;
        expr* rhs = nullptr;
        if (rex.is_union(r, lhs, rhs))
            op = set_op::join;
        else if (rex.is_intersection(r, lhs, rhs))
            op = set_op::meet;
        else if (rex.is_diff(r, lhs, rhs))
            op = set_op::minus;
        else
            return false;

        // The right operand is only visited when the left one translated.
        range_predicate left(char_bound), right(char_bound);
        if (!regex_to_range_predicate(u, lhs, left) ||
            !regex_to_range_predicate(u, rhs, right))
            return false;

        switch (op) {
        case set_op::join:
            out = left | right;
            break;
        case set_op::meet:
            out = left & right;
            break;
        case set_op::minus:
            out = left - right;
            break;
        }
        return true;
    }

    // Build the string-literal expression consisting of the single
    // character `c`, owned by the manager of `u`.
    // NOTE(review): no caller is visible in this file -- confirm whether
    // this helper is still needed.
    static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
        zstring const one_char(c);
        expr* literal = u.str.mk_string(one_char);
        return expr_ref(literal, u.get_manager());
    }

    // Materialize the closed character interval [lo, hi] as a regex of
    // sort `re_sort`.  The interval spanning 0 .. u.max_char() becomes the
    // full-character regex; every other interval becomes a re.range node.
    static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
        ast_manager& mgr = u.get_manager();

        bool const spans_alphabet = (lo == 0) && (hi == u.max_char());
        if (spans_alphabet)
            return expr_ref(u.re.mk_full_char(re_sort), mgr);

        // Bounds are emitted in the canonical unit-character form
        // (seq.unit (Char N)).  That is the shape used elsewhere in
        // seq_rewriter, so semantically identical ranges do not end up as
        // duplicate AST nodes with different ids.
        auto unit_of = [&](unsigned ch) {
            return expr_ref(u.str.mk_unit(u.str.mk_char(ch)), mgr);
        };
        // Lower bound is built before the upper bound, as a fixed order.
        expr_ref lower = unit_of(lo);
        expr_ref upper = unit_of(hi);
        return expr_ref(u.re.mk_range(lower, upper), mgr);
    }

    // Materialize the predicate `p` as a regex over sequences of sort
    // `seq_sort`:
    //   - an empty predicate becomes the empty regex;
    //   - a single interval becomes one range (or full-character) node;
    //   - several intervals become a right-associated union of range
    //     nodes, ordered by expression id.
    expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
        ast_manager& m = u.get_manager();
        sort* re_sort = u.re.mk_re(seq_sort);

        if (p.is_empty())
            return expr_ref(u.re.mk_empty(re_sort), m);

        unsigned const count = p.num_ranges();
        SASSERT(count > 0);
        if (count == 1) {
            auto [first, last] = p[0];
            return mk_single_range_regex(u, first, last, re_sort);
        }

        // Create one node per interval, in predicate order.
        expr_ref_vector pieces(m);
        for (unsigned k = 0; k < count; ++k) {
            auto [first, last] = p[k];
            pieces.push_back(mk_single_range_regex(u, first, last, re_sort));
        }

        // seq_rewriter::merge_regex_sets expects a right-associated union
        // whose members are sorted by expression id.  Without this
        // ordering, merging the materialized output with another
        // (id-sorted) regex set produces incorrect unions.
        auto by_id = [](expr* x, expr* y) { return x->get_id() < y->get_id(); };
        std::sort(pieces.data(), pieces.data() + pieces.size(), by_id);

        // Fold from the back: u(p0, u(p1, ... u(p_{k-2}, p_{k-1}))).
        unsigned idx = count - 1;
        expr_ref result(pieces.get(idx), m);
        while (idx > 0) {
            --idx;
            result = expr_ref(u.re.mk_union(pieces.get(idx), result), m);
        }
        return result;
    }

}