From 61ba424ae6b214ff8f14ea2e8cf5bc09f3357c87 Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Mon, 15 Jun 2026 23:00:25 -0600 Subject: [PATCH] misc edits of work in progress --- src/ast/rewriter/CMakeLists.txt | 1 + src/ast/rewriter/regex_range_collapse.cpp | 145 ++++++++++++++ src/ast/rewriter/regex_range_collapse.h | 71 +++++++ src/ast/rewriter/seq_derive.cpp | 15 +- src/ast/rewriter/seq_regex_bisim.cpp | 4 +- src/ast/rewriter/seq_rewriter.cpp | 45 ++++- src/ast/rewriter/seq_rewriter.h | 8 + src/test/CMakeLists.txt | 1 + src/test/main.cpp | 1 + src/test/regex_range_collapse.cpp | 229 ++++++++++++++++++++++ 10 files changed, 507 insertions(+), 13 deletions(-) create mode 100644 src/ast/rewriter/regex_range_collapse.cpp create mode 100644 src/ast/rewriter/regex_range_collapse.h create mode 100644 src/test/regex_range_collapse.cpp diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index 73be5c5ed..aae3d7857 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -38,6 +38,7 @@ z3_add_component(rewriter range_predicate.cpp range_predicate_translator.cpp recfun_rewriter.cpp + regex_range_collapse.cpp rewriter.cpp seq_axioms.cpp seq_eq_solver.cpp diff --git a/src/ast/rewriter/regex_range_collapse.cpp b/src/ast/rewriter/regex_range_collapse.cpp new file mode 100644 index 000000000..1db3f60d2 --- /dev/null +++ b/src/ast/rewriter/regex_range_collapse.cpp @@ -0,0 +1,145 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + regex_range_collapse.cpp + +Abstract: + + Implementation of regex <-> range_predicate translation for the + boolean-combination-of-ranges fragment. See header for the recognized + grammar and the canonical regex AST emitted by materialization. + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/regex_range_collapse.h" + +namespace seq { + + bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) { + unsigned const max_char = u.max_char(); + auto& re = u.re; + + if (re.is_empty(r)) { + out = range_predicate::empty(max_char); + return true; + } + if (re.is_full_char(r)) { + out = range_predicate::top(max_char); + return true; + } + unsigned lo = 0, hi = 0; + expr* lo_e = nullptr; + expr* hi_e = nullptr; + if (re.is_range(r, lo_e, hi_e)) { + auto extract_char = [&](expr* e, unsigned& c) -> bool { + if (u.is_const_char(e, c)) return true; + expr* inner = nullptr; + if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true; + zstring s; + if (u.str.is_string(e, s) && s.length() == 1) { + c = s[0]; + return true; + } + return false; + }; + if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi)) + return false; + // Empty/inverted range [lo > hi] is the empty regex. + if (lo > hi) { + out = range_predicate::empty(max_char); + return true; + } + out = range_predicate::range(lo, hi, max_char); + return true; + } + expr* a = nullptr; + expr* b = nullptr; + if (re.is_union(r, a, b)) { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) return false; + if (!regex_to_range_predicate(u, b, pb)) return false; + out = pa | pb; + return true; + } + if (re.is_intersection(r, a, b)) { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) return false; + if (!regex_to_range_predicate(u, b, pb)) return false; + out = pa & pb; + return true; + } + if (re.is_diff(r, a, b)) { + range_predicate pa(max_char), pb(max_char); + if (!regex_to_range_predicate(u, a, pa)) return false; + if (!regex_to_range_predicate(u, b, pb)) return false; + out = pa - pb; + return true; + } + // NOTE: re.complement is intentionally NOT handled here. + // re.complement is the SEQUENCE-level complement: its language + // includes the empty string, strings of length >= 2, and any + // length-1 string outside the operand. A character-class + // range_predicate can only describe a set of length-1 strings, + // so collapsing re.complement(R) to ~R (character-level + // complement) would change semantics whenever R is wrapped in + // any sequence-level context (e.g. re.diff at the top level, + // or membership tests). De-Morgan equivalences and the + // special cases re.complement(re.empty) / re.complement(re.full) + // are already handled directly in seq_rewriter::mk_re_complement. + return false; + } + + static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) { + return expr_ref(u.str.mk_string(zstring(c)), u.get_manager()); + } + + static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) { + ast_manager& m = u.get_manager(); + if (lo == 0 && hi == u.max_char()) + return expr_ref(u.re.mk_full_char(re_sort), m); + // Use the canonical unit-character form (seq.unit (Char N)) for + // range bounds. This matches the shape used elsewhere in + // seq_rewriter and avoids creating duplicate AST nodes with + // different ids for semantically identical ranges. + expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m); + expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m); + return expr_ref(u.re.mk_range(slo, shi), m); + } + + expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) { + ast_manager& m = u.get_manager(); + sort* re_sort = u.re.mk_re(seq_sort); + if (p.is_empty()) + return expr_ref(u.re.mk_empty(re_sort), m); + unsigned const n = p.num_ranges(); + SASSERT(n > 0); + if (n == 1) { + auto [lo, hi] = p[0]; + return mk_single_range_regex(u, lo, hi, re_sort); + } + // Build single-range AST nodes first, then sort by expression id + // so the resulting right-associated union matches the canonical + // id-sorted shape that seq_rewriter::merge_regex_sets expects. + // Without this the merge algorithm produces incorrect unions + // when it has to combine our materialized output with another + // (id-sorted) regex set. + expr_ref_vector ranges(m); + for (unsigned i = 0; i < n; ++i) { + auto [lo, hi] = p[i]; + ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort)); + } + std::sort(ranges.data(), ranges.data() + ranges.size(), + [](expr* a, expr* b) { return a->get_id() < b->get_id(); }); + expr_ref acc(ranges.get(n - 1), m); + for (unsigned i = n - 1; i-- > 0; ) + acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m); + return acc; + } + +} diff --git a/src/ast/rewriter/regex_range_collapse.h b/src/ast/rewriter/regex_range_collapse.h new file mode 100644 index 000000000..5e02a67d9 --- /dev/null +++ b/src/ast/rewriter/regex_range_collapse.h @@ -0,0 +1,71 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + regex_range_collapse.h + +Abstract: + + Recognize regexes that are boolean combinations of character-class + primitives (re.empty, re.full_char, re.range with concrete chars, + and re.union/inter/comp/diff over translatable arguments), and + materialize a seq::range_predicate back into a canonical regex AST. + + Together with seq_rewriter integration, this lets any boolean + combination of character-class regexes collapse to a canonical + multi-range form, so that equivalent character classes share AST + identity, and downstream consumers (derivative, OneStep, caching) + can short-circuit them as pure range predicates. + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ +#pragma once + +#include "ast/rewriter/range_predicate.h" +#include "ast/seq_decl_plugin.h" + +namespace seq { + + /** + * If r is a boolean combination of character-class regex primitives + * over the unsigned character domain [0, max_char], compute the + * equivalent range_predicate and return true. Otherwise return false + * with out untouched. + * + * Recognized fragment (all character-class-preserving operations): + * re.empty -> empty + * re.full_char_set -> top + * re.range "c_lo" "c_hi" (concrete) -> [c_lo, c_hi] + * re.union r1 r2 -> p1 | p2 + * re.intersection r1 r2 -> p1 & p2 + * re.diff r1 r2 -> p1 - p2 + * + * Notably re.complement is NOT recognized: it is a SEQUENCE-level + * complement (over all of Σ*), not a character-class complement, so + * collapsing it would change semantics whenever the result is used + * in any non-character-class context. Sequence-level rewrites for + * re.complement (double-comp, deMorgan, etc.) are handled directly + * in seq_rewriter::mk_re_complement. + */ + bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out); + + /** + * Canonical materialization of p as a regex AST over the given + * sequence sort. Two range_predicates with equal canonical + * representations produce structurally identical regex ASTs: + * + * empty -> re.empty + * top -> re.full_char_set + * single range [lo, hi] -> re.range "lo" "hi" + * multiple ranges -> right-associated re.union of single + * ranges, in increasing order of lo + * (matching the canonical range order + * held by range_predicate). + */ + expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort); + +} diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp index 48db95e3b..d98885235 100644 --- a/src/ast/rewriter/seq_derive.cpp +++ b/src/ast/rewriter/seq_derive.cpp @@ -1252,21 +1252,18 @@ namespace seq { // Intersect the active suffix m_intervals[m_intervals_start..end] with [lo, hi] void derive::intersect_intervals(unsigned lo, unsigned hi) { // Copy active suffix to end, update start, then filter - unsigned old_start = m_intervals_start; unsigned old_sz = m_intervals.size(); - for (unsigned i = old_start; i < old_sz; ++i) + for (unsigned i = m_intervals_start; i < old_sz; ++i) m_intervals.push_back(m_intervals[i]); m_intervals_start = old_sz; - // Filter in-place within new suffix + // Filter in-place within new suffix: drop intervals disjoint from [lo,hi], + // keep the intersection for overlapping ones. unsigned j = m_intervals_start; for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) { auto [lo1, hi1] = m_intervals[i]; - if (hi < lo1 || lo > hi1) { - j = old_sz; - break; - } - if (hi1 >= lo) - m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)}; + if (hi < lo1 || lo > hi1) + continue; // disjoint with this interval — drop it + m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)}; } m_intervals.shrink(j); } diff --git a/src/ast/rewriter/seq_regex_bisim.cpp b/src/ast/rewriter/seq_regex_bisim.cpp index 5e849017c..9e1a8e498 100644 --- a/src/ast/rewriter/seq_regex_bisim.cpp +++ b/src/ast/rewriter/seq_regex_bisim.cpp @@ -102,9 +102,7 @@ namespace seq { expr* e = work.back(); work.pop_back(); expr* c = nullptr, * t = nullptr, * f = nullptr; - if (m.is_ite(e, c, t, f) || - m_util.re.is_union(e, t, f) || - m_util.re.is_antimirov_union(e, t, f)) { + if (m.is_ite(e, c, t, f)) { if (seen.insert_if_not_there(t)) work.push_back(t); if (seen.insert_if_not_there(f)) diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 6f6a4ea88..5cfb233f3 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -21,6 +21,7 @@ Authors: #include "util/uint_set.h" #include "ast/rewriter/seq_rewriter.h" #include "ast/rewriter/seq_regex_bisim.h" +#include "ast/rewriter/regex_range_collapse.h" #include "ast/arith_decl_plugin.h" #include "ast/array_decl_plugin.h" #include "ast/ast_pp.h" @@ -3961,6 +3962,32 @@ bool seq_rewriter::is_subset(expr* r1, expr* r2) const { return m_subset.is_subset(r1, r2); } +bool seq_rewriter::try_collapse_re_union(expr* a, expr* b, expr_ref& result) { + sort* seq_sort = nullptr; + if (!u().is_re(a->get_sort(), seq_sort)) + return false; + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + if (!seq::regex_to_range_predicate(u(), a, pa)) + return false; + if (!seq::regex_to_range_predicate(u(), b, pb)) + return false; + result = seq::range_predicate_to_regex(u(), pa | pb, seq_sort); + return true; +} + +bool seq_rewriter::try_collapse_re_inter(expr* a, expr* b, expr_ref& result) { + sort* seq_sort = nullptr; + if (!u().is_re(a->get_sort(), seq_sort)) + return false; + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + if (!seq::regex_to_range_predicate(u(), a, pa)) + return false; + if (!seq::regex_to_range_predicate(u(), b, pb)) + return false; + result = seq::range_predicate_to_regex(u(), pa & pb, seq_sort); + return true; +} + br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { if (a == b) { result = a; @@ -4005,11 +4032,15 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { result = m().mk_ite(c, re().mk_union(a, r1), re().mk_union(a, r2)); return BR_REWRITE3; } + if (try_collapse_re_union(a, b, result)) + return BR_DONE; return BR_FAILED; } /* Creates a normalized union. */ br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) { + if (try_collapse_re_union(a, b, result)) + return BR_DONE; result = mk_regex_union_normalize(a, b); return BR_DONE; } @@ -4093,16 +4124,28 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) { result = m().mk_ite(c, re().mk_inter(a, r1), re().mk_inter(a, r2)); return BR_REWRITE3; } + if (try_collapse_re_inter(a, b, result)) + return BR_DONE; return BR_FAILED; } /* Creates a normalized intersection. */ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { + if (try_collapse_re_inter(a, b, result)) + return BR_DONE; result = mk_regex_inter_normalize(a, b); return BR_DONE; } br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { + seq::range_predicate pa(u().max_char()), pb(u().max_char()); + sort* seq_sort = nullptr; + if (u().is_re(a->get_sort(), seq_sort) + && seq::regex_to_range_predicate(u(), a, pa) + && seq::regex_to_range_predicate(u(), b, pb)) { + result = seq::range_predicate_to_regex(u(), pa - pb, seq_sort); + return BR_DONE; + } result = mk_regex_inter_normalize(a, re().mk_complement(b)); return BR_REWRITE2; } @@ -5495,7 +5538,7 @@ void seq_rewriter::op_cache::cleanup() { lbool seq_rewriter::some_string_in_re(expr* r, zstring& s) { sort* rs; (void)rs; - // SASSERT(re().is_re(r, rs) && m_util.is_string(rs)); + // SASSERT(u().is_re(r, rs) && m_util.is_string(rs)); expr_mark visited; unsigned_vector str; diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index aa4603e6c..bd5dd2479 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -250,6 +250,14 @@ class seq_rewriter { br_status mk_re_union0(expr* a, expr* b, expr_ref& result); br_status mk_re_inter0(expr* a, expr* b, expr_ref& result); br_status mk_re_complement(expr* a, expr_ref& result); + // Range-set collapse helpers: if the operands form a boolean + // combination of character-class regexes, materialize the result as a + // canonical regex over a single range_predicate. See + // ast/rewriter/regex_range_collapse.h for the recognized fragment. + // NOTE: re.complement is intentionally not in this set because it + // operates at the sequence level, not the character-class level. + bool try_collapse_re_union(expr* a, expr* b, expr_ref& result); + bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result); br_status mk_re_star(expr* a, expr_ref& result); br_status mk_re_diff(expr* a, expr* b, expr_ref& result); br_status mk_re_xor(expr* a, expr* b, expr_ref& result); diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 5697c6e31..b0fa7ad8f 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -125,6 +125,7 @@ add_executable(test-z3 rational.cpp rcf.cpp region.cpp + regex_range_collapse.cpp sat_local_search.cpp sat_lookahead.cpp sat_user_scope.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index c9027c183..b10446aa9 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -115,6 +115,7 @@ X(arith_rewriter) \ X(range_predicate) \ X(range_predicate_translator) \ + X(regex_range_collapse) \ X(check_assumptions) \ X(smt_context) \ X(theory_dl) \ diff --git a/src/test/regex_range_collapse.cpp b/src/test/regex_range_collapse.cpp new file mode 100644 index 000000000..9317dc664 --- /dev/null +++ b/src/test/regex_range_collapse.cpp @@ -0,0 +1,229 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + regex_range_collapse.cpp - unit tests + +--*/ + +#include "ast/rewriter/regex_range_collapse.h" +#include "ast/reg_decl_plugins.h" +#include "ast/ast_pp.h" +#include "util/util.h" + +#include + +namespace { + + using seq::range_predicate; + using seq::regex_to_range_predicate; + using seq::range_predicate_to_regex; + + static void check(bool ok, char const* what) { + if (!ok) { + std::cerr << "regex_range_collapse FAILED: " << what << "\n"; + ENSURE(false); + } + } + + static expr_ref mk_singleton_str(seq_util& u, unsigned c) { + return expr_ref(u.str.mk_string(zstring(c)), u.get_manager()); + } + + static bool extract_range_chars(seq_util& u, expr* e, unsigned& lo, unsigned& hi) { + expr* lo_e = nullptr; expr* hi_e = nullptr; + if (!u.re.is_range(e, lo_e, hi_e)) + return false; + // Accept either string-constant or (seq.unit (Char N)) bound form. + if (u.re.is_range(e, lo, hi)) + return true; + expr* lc = nullptr; expr* hc = nullptr; + if (u.str.is_unit(lo_e, lc) && u.is_const_char(lc, lo) && + u.str.is_unit(hi_e, hc) && u.is_const_char(hc, hi)) + return true; + return false; + } + + static void run() { + ast_manager m; + reg_decl_plugins(m); + seq_util u(m); + unsigned const M = u.max_char(); + + sort* str_sort = u.str.mk_string_sort(); + sort* re_sort = u.re.mk_re(str_sort); + + // primitives + { + range_predicate p(M); + check(regex_to_range_predicate(u, u.re.mk_empty(re_sort), p) && p.is_empty(), + "re.empty -> empty"); + check(regex_to_range_predicate(u, u.re.mk_full_char(re_sort), p) && p.is_top(), + "re.full_char -> top"); + } + // re.range "a" "z" + { + range_predicate p(M); + expr_ref a = mk_singleton_str(u, 'a'); + expr_ref z = mk_singleton_str(u, 'z'); + expr_ref r(u.re.mk_range(a, z), m); + check(regex_to_range_predicate(u, r, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'z', + "re.range a z -> [a,z]"); + } + // Disjoint union: (a..z) | (0..9) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 2, + "(a-z)|(0-9) -> 2 ranges"); + // canonical order: lower lo first + check(p[0].first == '0' && p[0].second == '9' && p[1].first == 'a' && p[1].second == 'z', + "(a-z)|(0-9) ranges in canonical order"); + } + // Overlapping union: (a..c) | (b..f) -> (a..f) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'b'), mk_singleton_str(u, 'f')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'f', + "(a-c)|(b-f) -> (a-f)"); + } + // Adjacent union: (a..c) | (d..f) -> (a..f) (canonical predicate merges adjacent) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'd'), mk_singleton_str(u, 'f')), m); + expr_ref un(u.re.mk_union(r1, r2), m); + check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'f', + "(a-c)|(d-f) -> (a-f) via adjacency"); + } + // Disjoint intersection: (a..z) & (0..9) -> empty + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m); + expr_ref ix(u.re.mk_inter(r1, r2), m); + check(regex_to_range_predicate(u, ix, p) && p.is_empty(), + "(a-z)&(0-9) -> empty"); + } + // Overlapping intersection: (a..f) & (c..z) -> (c..f) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m); + expr_ref ix(u.re.mk_inter(r1, r2), m); + check(regex_to_range_predicate(u, ix, p) && p.num_ranges() == 1 && + p[0].first == 'c' && p[0].second == 'f', + "(a-f)&(c-z) -> (c-f)"); + } + // Complement: re.complement is intentionally NOT a char-class op + // (it operates over Σ*), so it must NOT be translated. + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref cmp(u.re.mk_complement(r1), m); + check(!regex_to_range_predicate(u, cmp, p), + "re.comp of range is NOT translatable (sequence-level complement)"); + } + // Diff: (a..f) \ (c..z) -> (a..b) + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m); + expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m); + expr_ref df(u.re.mk_diff(r1, r2), m); + check(regex_to_range_predicate(u, df, p) && p.num_ranges() == 1 && + p[0].first == 'a' && p[0].second == 'b', + "(a-f) \\ (c-z) -> (a-b)"); + } + // Negative: re.* of a range is NOT a char class + { + range_predicate p(M); + expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m); + expr_ref star(u.re.mk_star(r1), m); + check(!regex_to_range_predicate(u, star, p), + "re.* of range not translatable"); + } + + // ---- materialization round-trip ---- + + // empty -> re.empty + { + range_predicate p = range_predicate::empty(M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + check(u.re.is_empty(e), "empty -> re.empty"); + } + // top -> re.full_char + { + range_predicate p = range_predicate::top(M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + check(u.re.is_full_char(e), "top -> re.full_char"); + } + // single range -> re.range + { + range_predicate p = range_predicate::range('a', 'z', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, e, lo, hi) && lo == 'a' && hi == 'z', + "[a-z] -> re.range a z"); + } + // singleton -> re.range c c + { + range_predicate p = range_predicate::singleton('A', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, e, lo, hi) && lo == 'A' && hi == 'A', + "{A} -> re.range A A"); + } + // 2 ranges -> re.union(range_0, range_1) in canonical order + { + range_predicate p = range_predicate::range('0', '9', M) + | range_predicate::range('a', 'z', M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + expr* a = nullptr; expr* b = nullptr; + check(u.re.is_union(e, a, b), "2-range -> union"); + unsigned lo0 = 0, hi0 = 0, lo1 = 0, hi1 = 0; + check(extract_range_chars(u, a, lo0, hi0) && lo0 == '0' && hi0 == '9', + "union arg0 = (0-9) (canonical: lower lo first)"); + check(extract_range_chars(u, b, lo1, hi1) && lo1 == 'a' && hi1 == 'z', + "union arg1 = (a-z)"); + } + // 3 ranges -> right-associated union + { + range_predicate p = range_predicate::range(0, 5, M) + | range_predicate::range(10, 15, M) + | range_predicate::range(20, 25, M); + expr_ref e = range_predicate_to_regex(u, p, str_sort); + expr* a = nullptr; expr* rest = nullptr; + check(u.re.is_union(e, a, rest), "3-range -> union(...)"); + unsigned lo = 0, hi = 0; + check(extract_range_chars(u, a, lo, hi) && lo == 0 && hi == 5, "first arg = (0-5)"); + expr* b = nullptr; expr* c = nullptr; + check(u.re.is_union(rest, b, c), "rest is union(...,...)"); + check(extract_range_chars(u, b, lo, hi) && lo == 10 && hi == 15, "second range"); + check(extract_range_chars(u, c, lo, hi) && lo == 20 && hi == 25, "third range"); + } + // Round-trip identity for an arbitrary range-set + { + range_predicate p_in = range_predicate::range('a', 'c', M) + | range_predicate::range('m', 'p', M) + | range_predicate::range('x', 'z', M); + expr_ref e = range_predicate_to_regex(u, p_in, str_sort); + range_predicate p_out(M); + check(regex_to_range_predicate(u, e, p_out), "round-trip translatable"); + check(p_in == p_out, "round-trip equal"); + } + + std::cerr << "regex_range_collapse tests passed\n"; + } +} + +void tst_regex_range_collapse() { + run(); +}