mirror of
https://github.com/Z3Prover/z3
synced 2026-06-20 07:36:31 +00:00
misc edits of work in progress
This commit is contained in:
parent
bcdbc80ab8
commit
61ba424ae6
10 changed files with 507 additions and 13 deletions
|
|
@ -38,6 +38,7 @@ z3_add_component(rewriter
|
||||||
range_predicate.cpp
|
range_predicate.cpp
|
||||||
range_predicate_translator.cpp
|
range_predicate_translator.cpp
|
||||||
recfun_rewriter.cpp
|
recfun_rewriter.cpp
|
||||||
|
regex_range_collapse.cpp
|
||||||
rewriter.cpp
|
rewriter.cpp
|
||||||
seq_axioms.cpp
|
seq_axioms.cpp
|
||||||
seq_eq_solver.cpp
|
seq_eq_solver.cpp
|
||||||
|
|
|
||||||
145
src/ast/rewriter/regex_range_collapse.cpp
Normal file
145
src/ast/rewriter/regex_range_collapse.cpp
Normal file
|
|
@ -0,0 +1,145 @@
|
||||||
|
/*++
|
||||||
|
Copyright (c) 2026 Microsoft Corporation
|
||||||
|
|
||||||
|
Module Name:
|
||||||
|
|
||||||
|
regex_range_collapse.cpp
|
||||||
|
|
||||||
|
Abstract:
|
||||||
|
|
||||||
|
Implementation of regex <-> range_predicate translation for the
|
||||||
|
boolean-combination-of-ranges fragment. See header for the recognized
|
||||||
|
grammar and the canonical regex AST emitted by materialization.
|
||||||
|
|
||||||
|
Authors:
|
||||||
|
|
||||||
|
Margus Veanes (veanes) 2026
|
||||||
|
|
||||||
|
--*/
|
||||||
|
|
||||||
|
#include "ast/rewriter/regex_range_collapse.h"
|
||||||
|
|
||||||
|
namespace seq {
|
||||||
|
|
||||||
|
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
|
||||||
|
unsigned const max_char = u.max_char();
|
||||||
|
auto& re = u.re;
|
||||||
|
|
||||||
|
if (re.is_empty(r)) {
|
||||||
|
out = range_predicate::empty(max_char);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (re.is_full_char(r)) {
|
||||||
|
out = range_predicate::top(max_char);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
unsigned lo = 0, hi = 0;
|
||||||
|
expr* lo_e = nullptr;
|
||||||
|
expr* hi_e = nullptr;
|
||||||
|
if (re.is_range(r, lo_e, hi_e)) {
|
||||||
|
auto extract_char = [&](expr* e, unsigned& c) -> bool {
|
||||||
|
if (u.is_const_char(e, c)) return true;
|
||||||
|
expr* inner = nullptr;
|
||||||
|
if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true;
|
||||||
|
zstring s;
|
||||||
|
if (u.str.is_string(e, s) && s.length() == 1) {
|
||||||
|
c = s[0];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi))
|
||||||
|
return false;
|
||||||
|
// Empty/inverted range [lo > hi] is the empty regex.
|
||||||
|
if (lo > hi) {
|
||||||
|
out = range_predicate::empty(max_char);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
out = range_predicate::range(lo, hi, max_char);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
expr* a = nullptr;
|
||||||
|
expr* b = nullptr;
|
||||||
|
if (re.is_union(r, a, b)) {
|
||||||
|
range_predicate pa(max_char), pb(max_char);
|
||||||
|
if (!regex_to_range_predicate(u, a, pa)) return false;
|
||||||
|
if (!regex_to_range_predicate(u, b, pb)) return false;
|
||||||
|
out = pa | pb;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (re.is_intersection(r, a, b)) {
|
||||||
|
range_predicate pa(max_char), pb(max_char);
|
||||||
|
if (!regex_to_range_predicate(u, a, pa)) return false;
|
||||||
|
if (!regex_to_range_predicate(u, b, pb)) return false;
|
||||||
|
out = pa & pb;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (re.is_diff(r, a, b)) {
|
||||||
|
range_predicate pa(max_char), pb(max_char);
|
||||||
|
if (!regex_to_range_predicate(u, a, pa)) return false;
|
||||||
|
if (!regex_to_range_predicate(u, b, pb)) return false;
|
||||||
|
out = pa - pb;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// NOTE: re.complement is intentionally NOT handled here.
|
||||||
|
// re.complement is the SEQUENCE-level complement: its language
|
||||||
|
// includes the empty string, strings of length >= 2, and any
|
||||||
|
// length-1 string outside the operand. A character-class
|
||||||
|
// range_predicate can only describe a set of length-1 strings,
|
||||||
|
// so collapsing re.complement(R) to ~R (character-level
|
||||||
|
// complement) would change semantics whenever R is wrapped in
|
||||||
|
// any sequence-level context (e.g. re.diff at the top level,
|
||||||
|
// or membership tests). De-Morgan equivalences and the
|
||||||
|
// special cases re.complement(re.empty) / re.complement(re.full)
|
||||||
|
// are already handled directly in seq_rewriter::mk_re_complement.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
|
||||||
|
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
|
||||||
|
}
|
||||||
|
|
||||||
|
static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
|
||||||
|
ast_manager& m = u.get_manager();
|
||||||
|
if (lo == 0 && hi == u.max_char())
|
||||||
|
return expr_ref(u.re.mk_full_char(re_sort), m);
|
||||||
|
// Use the canonical unit-character form (seq.unit (Char N)) for
|
||||||
|
// range bounds. This matches the shape used elsewhere in
|
||||||
|
// seq_rewriter and avoids creating duplicate AST nodes with
|
||||||
|
// different ids for semantically identical ranges.
|
||||||
|
expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m);
|
||||||
|
expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m);
|
||||||
|
return expr_ref(u.re.mk_range(slo, shi), m);
|
||||||
|
}
|
||||||
|
|
||||||
|
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
|
||||||
|
ast_manager& m = u.get_manager();
|
||||||
|
sort* re_sort = u.re.mk_re(seq_sort);
|
||||||
|
if (p.is_empty())
|
||||||
|
return expr_ref(u.re.mk_empty(re_sort), m);
|
||||||
|
unsigned const n = p.num_ranges();
|
||||||
|
SASSERT(n > 0);
|
||||||
|
if (n == 1) {
|
||||||
|
auto [lo, hi] = p[0];
|
||||||
|
return mk_single_range_regex(u, lo, hi, re_sort);
|
||||||
|
}
|
||||||
|
// Build single-range AST nodes first, then sort by expression id
|
||||||
|
// so the resulting right-associated union matches the canonical
|
||||||
|
// id-sorted shape that seq_rewriter::merge_regex_sets expects.
|
||||||
|
// Without this the merge algorithm produces incorrect unions
|
||||||
|
// when it has to combine our materialized output with another
|
||||||
|
// (id-sorted) regex set.
|
||||||
|
expr_ref_vector ranges(m);
|
||||||
|
for (unsigned i = 0; i < n; ++i) {
|
||||||
|
auto [lo, hi] = p[i];
|
||||||
|
ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort));
|
||||||
|
}
|
||||||
|
std::sort(ranges.data(), ranges.data() + ranges.size(),
|
||||||
|
[](expr* a, expr* b) { return a->get_id() < b->get_id(); });
|
||||||
|
expr_ref acc(ranges.get(n - 1), m);
|
||||||
|
for (unsigned i = n - 1; i-- > 0; )
|
||||||
|
acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m);
|
||||||
|
return acc;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
71
src/ast/rewriter/regex_range_collapse.h
Normal file
71
src/ast/rewriter/regex_range_collapse.h
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*++
|
||||||
|
Copyright (c) 2026 Microsoft Corporation
|
||||||
|
|
||||||
|
Module Name:
|
||||||
|
|
||||||
|
regex_range_collapse.h
|
||||||
|
|
||||||
|
Abstract:
|
||||||
|
|
||||||
|
Recognize regexes that are boolean combinations of character-class
|
||||||
|
primitives (re.empty, re.full_char, re.range with concrete chars,
|
||||||
|
and re.union/inter/comp/diff over translatable arguments), and
|
||||||
|
materialize a seq::range_predicate back into a canonical regex AST.
|
||||||
|
|
||||||
|
Together with seq_rewriter integration, this lets any boolean
|
||||||
|
combination of character-class regexes collapse to a canonical
|
||||||
|
multi-range form, so that equivalent character classes share AST
|
||||||
|
identity, and downstream consumers (derivative, OneStep, caching)
|
||||||
|
can short-circuit them as pure range predicates.
|
||||||
|
|
||||||
|
Authors:
|
||||||
|
|
||||||
|
Margus Veanes (veanes) 2026
|
||||||
|
|
||||||
|
--*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ast/rewriter/range_predicate.h"
|
||||||
|
#include "ast/seq_decl_plugin.h"
|
||||||
|
|
||||||
|
namespace seq {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If r is a boolean combination of character-class regex primitives
|
||||||
|
* over the unsigned character domain [0, max_char], compute the
|
||||||
|
* equivalent range_predicate and return true. Otherwise return false
|
||||||
|
* with out untouched.
|
||||||
|
*
|
||||||
|
* Recognized fragment (all character-class-preserving operations):
|
||||||
|
* re.empty -> empty
|
||||||
|
* re.full_char_set -> top
|
||||||
|
* re.range "c_lo" "c_hi" (concrete) -> [c_lo, c_hi]
|
||||||
|
* re.union r1 r2 -> p1 | p2
|
||||||
|
* re.intersection r1 r2 -> p1 & p2
|
||||||
|
* re.diff r1 r2 -> p1 - p2
|
||||||
|
*
|
||||||
|
* Notably re.complement is NOT recognized: it is a SEQUENCE-level
|
||||||
|
* complement (over all of Σ*), not a character-class complement, so
|
||||||
|
* collapsing it would change semantics whenever the result is used
|
||||||
|
* in any non-character-class context. Sequence-level rewrites for
|
||||||
|
* re.complement (double-comp, deMorgan, etc.) are handled directly
|
||||||
|
* in seq_rewriter::mk_re_complement.
|
||||||
|
*/
|
||||||
|
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Canonical materialization of p as a regex AST over the given
|
||||||
|
* sequence sort. Two range_predicates with equal canonical
|
||||||
|
* representations produce structurally identical regex ASTs:
|
||||||
|
*
|
||||||
|
* empty -> re.empty
|
||||||
|
* top -> re.full_char_set
|
||||||
|
* single range [lo, hi] -> re.range "lo" "hi"
|
||||||
|
* multiple ranges -> right-associated re.union of single
|
||||||
|
* ranges, in increasing order of lo
|
||||||
|
* (matching the canonical range order
|
||||||
|
* held by range_predicate).
|
||||||
|
*/
|
||||||
|
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1252,21 +1252,18 @@ namespace seq {
|
||||||
// Intersect the active suffix m_intervals[m_intervals_start..end] with [lo, hi]
|
// Intersect the active suffix m_intervals[m_intervals_start..end] with [lo, hi]
|
||||||
void derive::intersect_intervals(unsigned lo, unsigned hi) {
|
void derive::intersect_intervals(unsigned lo, unsigned hi) {
|
||||||
// Copy active suffix to end, update start, then filter
|
// Copy active suffix to end, update start, then filter
|
||||||
unsigned old_start = m_intervals_start;
|
|
||||||
unsigned old_sz = m_intervals.size();
|
unsigned old_sz = m_intervals.size();
|
||||||
for (unsigned i = old_start; i < old_sz; ++i)
|
for (unsigned i = m_intervals_start; i < old_sz; ++i)
|
||||||
m_intervals.push_back(m_intervals[i]);
|
m_intervals.push_back(m_intervals[i]);
|
||||||
m_intervals_start = old_sz;
|
m_intervals_start = old_sz;
|
||||||
// Filter in-place within new suffix
|
// Filter in-place within new suffix: drop intervals disjoint from [lo,hi],
|
||||||
|
// keep the intersection for overlapping ones.
|
||||||
unsigned j = m_intervals_start;
|
unsigned j = m_intervals_start;
|
||||||
for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) {
|
for (unsigned i = m_intervals_start; i < m_intervals.size(); ++i) {
|
||||||
auto [lo1, hi1] = m_intervals[i];
|
auto [lo1, hi1] = m_intervals[i];
|
||||||
if (hi < lo1 || lo > hi1) {
|
if (hi < lo1 || lo > hi1)
|
||||||
j = old_sz;
|
continue; // disjoint with this interval — drop it
|
||||||
break;
|
m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)};
|
||||||
}
|
|
||||||
if (hi1 >= lo)
|
|
||||||
m_intervals[j++] = {std::max(lo1, lo), std::min(hi1, hi)};
|
|
||||||
}
|
}
|
||||||
m_intervals.shrink(j);
|
m_intervals.shrink(j);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -102,9 +102,7 @@ namespace seq {
|
||||||
expr* e = work.back();
|
expr* e = work.back();
|
||||||
work.pop_back();
|
work.pop_back();
|
||||||
expr* c = nullptr, * t = nullptr, * f = nullptr;
|
expr* c = nullptr, * t = nullptr, * f = nullptr;
|
||||||
if (m.is_ite(e, c, t, f) ||
|
if (m.is_ite(e, c, t, f)) {
|
||||||
m_util.re.is_union(e, t, f) ||
|
|
||||||
m_util.re.is_antimirov_union(e, t, f)) {
|
|
||||||
if (seen.insert_if_not_there(t))
|
if (seen.insert_if_not_there(t))
|
||||||
work.push_back(t);
|
work.push_back(t);
|
||||||
if (seen.insert_if_not_there(f))
|
if (seen.insert_if_not_there(f))
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ Authors:
|
||||||
#include "util/uint_set.h"
|
#include "util/uint_set.h"
|
||||||
#include "ast/rewriter/seq_rewriter.h"
|
#include "ast/rewriter/seq_rewriter.h"
|
||||||
#include "ast/rewriter/seq_regex_bisim.h"
|
#include "ast/rewriter/seq_regex_bisim.h"
|
||||||
|
#include "ast/rewriter/regex_range_collapse.h"
|
||||||
#include "ast/arith_decl_plugin.h"
|
#include "ast/arith_decl_plugin.h"
|
||||||
#include "ast/array_decl_plugin.h"
|
#include "ast/array_decl_plugin.h"
|
||||||
#include "ast/ast_pp.h"
|
#include "ast/ast_pp.h"
|
||||||
|
|
@ -3961,6 +3962,32 @@ bool seq_rewriter::is_subset(expr* r1, expr* r2) const {
|
||||||
return m_subset.is_subset(r1, r2);
|
return m_subset.is_subset(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool seq_rewriter::try_collapse_re_union(expr* a, expr* b, expr_ref& result) {
|
||||||
|
sort* seq_sort = nullptr;
|
||||||
|
if (!u().is_re(a->get_sort(), seq_sort))
|
||||||
|
return false;
|
||||||
|
seq::range_predicate pa(u().max_char()), pb(u().max_char());
|
||||||
|
if (!seq::regex_to_range_predicate(u(), a, pa))
|
||||||
|
return false;
|
||||||
|
if (!seq::regex_to_range_predicate(u(), b, pb))
|
||||||
|
return false;
|
||||||
|
result = seq::range_predicate_to_regex(u(), pa | pb, seq_sort);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool seq_rewriter::try_collapse_re_inter(expr* a, expr* b, expr_ref& result) {
|
||||||
|
sort* seq_sort = nullptr;
|
||||||
|
if (!u().is_re(a->get_sort(), seq_sort))
|
||||||
|
return false;
|
||||||
|
seq::range_predicate pa(u().max_char()), pb(u().max_char());
|
||||||
|
if (!seq::regex_to_range_predicate(u(), a, pa))
|
||||||
|
return false;
|
||||||
|
if (!seq::regex_to_range_predicate(u(), b, pb))
|
||||||
|
return false;
|
||||||
|
result = seq::range_predicate_to_regex(u(), pa & pb, seq_sort);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
|
br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
|
||||||
if (a == b) {
|
if (a == b) {
|
||||||
result = a;
|
result = a;
|
||||||
|
|
@ -4005,11 +4032,15 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
|
||||||
result = m().mk_ite(c, re().mk_union(a, r1), re().mk_union(a, r2));
|
result = m().mk_ite(c, re().mk_union(a, r1), re().mk_union(a, r2));
|
||||||
return BR_REWRITE3;
|
return BR_REWRITE3;
|
||||||
}
|
}
|
||||||
|
if (try_collapse_re_union(a, b, result))
|
||||||
|
return BR_DONE;
|
||||||
return BR_FAILED;
|
return BR_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Creates a normalized union. */
|
/* Creates a normalized union. */
|
||||||
br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) {
|
br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) {
|
||||||
|
if (try_collapse_re_union(a, b, result))
|
||||||
|
return BR_DONE;
|
||||||
result = mk_regex_union_normalize(a, b);
|
result = mk_regex_union_normalize(a, b);
|
||||||
return BR_DONE;
|
return BR_DONE;
|
||||||
}
|
}
|
||||||
|
|
@ -4093,16 +4124,28 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) {
|
||||||
result = m().mk_ite(c, re().mk_inter(a, r1), re().mk_inter(a, r2));
|
result = m().mk_ite(c, re().mk_inter(a, r1), re().mk_inter(a, r2));
|
||||||
return BR_REWRITE3;
|
return BR_REWRITE3;
|
||||||
}
|
}
|
||||||
|
if (try_collapse_re_inter(a, b, result))
|
||||||
|
return BR_DONE;
|
||||||
return BR_FAILED;
|
return BR_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Creates a normalized intersection. */
|
/* Creates a normalized intersection. */
|
||||||
br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) {
|
br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) {
|
||||||
|
if (try_collapse_re_inter(a, b, result))
|
||||||
|
return BR_DONE;
|
||||||
result = mk_regex_inter_normalize(a, b);
|
result = mk_regex_inter_normalize(a, b);
|
||||||
return BR_DONE;
|
return BR_DONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) {
|
br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) {
|
||||||
|
seq::range_predicate pa(u().max_char()), pb(u().max_char());
|
||||||
|
sort* seq_sort = nullptr;
|
||||||
|
if (u().is_re(a->get_sort(), seq_sort)
|
||||||
|
&& seq::regex_to_range_predicate(u(), a, pa)
|
||||||
|
&& seq::regex_to_range_predicate(u(), b, pb)) {
|
||||||
|
result = seq::range_predicate_to_regex(u(), pa - pb, seq_sort);
|
||||||
|
return BR_DONE;
|
||||||
|
}
|
||||||
result = mk_regex_inter_normalize(a, re().mk_complement(b));
|
result = mk_regex_inter_normalize(a, re().mk_complement(b));
|
||||||
return BR_REWRITE2;
|
return BR_REWRITE2;
|
||||||
}
|
}
|
||||||
|
|
@ -5495,7 +5538,7 @@ void seq_rewriter::op_cache::cleanup() {
|
||||||
lbool seq_rewriter::some_string_in_re(expr* r, zstring& s) {
|
lbool seq_rewriter::some_string_in_re(expr* r, zstring& s) {
|
||||||
sort* rs;
|
sort* rs;
|
||||||
(void)rs;
|
(void)rs;
|
||||||
// SASSERT(re().is_re(r, rs) && m_util.is_string(rs));
|
// SASSERT(u().is_re(r, rs) && m_util.is_string(rs));
|
||||||
expr_mark visited;
|
expr_mark visited;
|
||||||
unsigned_vector str;
|
unsigned_vector str;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -250,6 +250,14 @@ class seq_rewriter {
|
||||||
br_status mk_re_union0(expr* a, expr* b, expr_ref& result);
|
br_status mk_re_union0(expr* a, expr* b, expr_ref& result);
|
||||||
br_status mk_re_inter0(expr* a, expr* b, expr_ref& result);
|
br_status mk_re_inter0(expr* a, expr* b, expr_ref& result);
|
||||||
br_status mk_re_complement(expr* a, expr_ref& result);
|
br_status mk_re_complement(expr* a, expr_ref& result);
|
||||||
|
// Range-set collapse helpers: if the operands form a boolean
|
||||||
|
// combination of character-class regexes, materialize the result as a
|
||||||
|
// canonical regex over a single range_predicate. See
|
||||||
|
// ast/rewriter/regex_range_collapse.h for the recognized fragment.
|
||||||
|
// NOTE: re.complement is intentionally not in this set because it
|
||||||
|
// operates at the sequence level, not the character-class level.
|
||||||
|
bool try_collapse_re_union(expr* a, expr* b, expr_ref& result);
|
||||||
|
bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result);
|
||||||
br_status mk_re_star(expr* a, expr_ref& result);
|
br_status mk_re_star(expr* a, expr_ref& result);
|
||||||
br_status mk_re_diff(expr* a, expr* b, expr_ref& result);
|
br_status mk_re_diff(expr* a, expr* b, expr_ref& result);
|
||||||
br_status mk_re_xor(expr* a, expr* b, expr_ref& result);
|
br_status mk_re_xor(expr* a, expr* b, expr_ref& result);
|
||||||
|
|
|
||||||
|
|
@ -125,6 +125,7 @@ add_executable(test-z3
|
||||||
rational.cpp
|
rational.cpp
|
||||||
rcf.cpp
|
rcf.cpp
|
||||||
region.cpp
|
region.cpp
|
||||||
|
regex_range_collapse.cpp
|
||||||
sat_local_search.cpp
|
sat_local_search.cpp
|
||||||
sat_lookahead.cpp
|
sat_lookahead.cpp
|
||||||
sat_user_scope.cpp
|
sat_user_scope.cpp
|
||||||
|
|
|
||||||
|
|
@ -115,6 +115,7 @@
|
||||||
X(arith_rewriter) \
|
X(arith_rewriter) \
|
||||||
X(range_predicate) \
|
X(range_predicate) \
|
||||||
X(range_predicate_translator) \
|
X(range_predicate_translator) \
|
||||||
|
X(regex_range_collapse) \
|
||||||
X(check_assumptions) \
|
X(check_assumptions) \
|
||||||
X(smt_context) \
|
X(smt_context) \
|
||||||
X(theory_dl) \
|
X(theory_dl) \
|
||||||
|
|
|
||||||
229
src/test/regex_range_collapse.cpp
Normal file
229
src/test/regex_range_collapse.cpp
Normal file
|
|
@ -0,0 +1,229 @@
|
||||||
|
/*++
|
||||||
|
Copyright (c) 2026 Microsoft Corporation
|
||||||
|
|
||||||
|
Module Name:
|
||||||
|
|
||||||
|
regex_range_collapse.cpp - unit tests
|
||||||
|
|
||||||
|
--*/
|
||||||
|
|
||||||
|
#include "ast/rewriter/regex_range_collapse.h"
|
||||||
|
#include "ast/reg_decl_plugins.h"
|
||||||
|
#include "ast/ast_pp.h"
|
||||||
|
#include "util/util.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using seq::range_predicate;
|
||||||
|
using seq::regex_to_range_predicate;
|
||||||
|
using seq::range_predicate_to_regex;
|
||||||
|
|
||||||
|
static void check(bool ok, char const* what) {
|
||||||
|
if (!ok) {
|
||||||
|
std::cerr << "regex_range_collapse FAILED: " << what << "\n";
|
||||||
|
ENSURE(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static expr_ref mk_singleton_str(seq_util& u, unsigned c) {
|
||||||
|
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool extract_range_chars(seq_util& u, expr* e, unsigned& lo, unsigned& hi) {
|
||||||
|
expr* lo_e = nullptr; expr* hi_e = nullptr;
|
||||||
|
if (!u.re.is_range(e, lo_e, hi_e))
|
||||||
|
return false;
|
||||||
|
// Accept either string-constant or (seq.unit (Char N)) bound form.
|
||||||
|
if (u.re.is_range(e, lo, hi))
|
||||||
|
return true;
|
||||||
|
expr* lc = nullptr; expr* hc = nullptr;
|
||||||
|
if (u.str.is_unit(lo_e, lc) && u.is_const_char(lc, lo) &&
|
||||||
|
u.str.is_unit(hi_e, hc) && u.is_const_char(hc, hi))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run() {
|
||||||
|
ast_manager m;
|
||||||
|
reg_decl_plugins(m);
|
||||||
|
seq_util u(m);
|
||||||
|
unsigned const M = u.max_char();
|
||||||
|
|
||||||
|
sort* str_sort = u.str.mk_string_sort();
|
||||||
|
sort* re_sort = u.re.mk_re(str_sort);
|
||||||
|
|
||||||
|
// primitives
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
check(regex_to_range_predicate(u, u.re.mk_empty(re_sort), p) && p.is_empty(),
|
||||||
|
"re.empty -> empty");
|
||||||
|
check(regex_to_range_predicate(u, u.re.mk_full_char(re_sort), p) && p.is_top(),
|
||||||
|
"re.full_char -> top");
|
||||||
|
}
|
||||||
|
// re.range "a" "z"
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref a = mk_singleton_str(u, 'a');
|
||||||
|
expr_ref z = mk_singleton_str(u, 'z');
|
||||||
|
expr_ref r(u.re.mk_range(a, z), m);
|
||||||
|
check(regex_to_range_predicate(u, r, p) && p.num_ranges() == 1 &&
|
||||||
|
p[0].first == 'a' && p[0].second == 'z',
|
||||||
|
"re.range a z -> [a,z]");
|
||||||
|
}
|
||||||
|
// Disjoint union: (a..z) | (0..9)
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
|
||||||
|
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 2,
|
||||||
|
"(a-z)|(0-9) -> 2 ranges");
|
||||||
|
// canonical order: lower lo first
|
||||||
|
check(p[0].first == '0' && p[0].second == '9' && p[1].first == 'a' && p[1].second == 'z',
|
||||||
|
"(a-z)|(0-9) ranges in canonical order");
|
||||||
|
}
|
||||||
|
// Overlapping union: (a..c) | (b..f) -> (a..f)
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'b'), mk_singleton_str(u, 'f')), m);
|
||||||
|
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
|
||||||
|
p[0].first == 'a' && p[0].second == 'f',
|
||||||
|
"(a-c)|(b-f) -> (a-f)");
|
||||||
|
}
|
||||||
|
// Adjacent union: (a..c) | (d..f) -> (a..f) (canonical predicate merges adjacent)
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'd'), mk_singleton_str(u, 'f')), m);
|
||||||
|
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
|
||||||
|
p[0].first == 'a' && p[0].second == 'f',
|
||||||
|
"(a-c)|(d-f) -> (a-f) via adjacency");
|
||||||
|
}
|
||||||
|
// Disjoint intersection: (a..z) & (0..9) -> empty
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
|
||||||
|
expr_ref ix(u.re.mk_inter(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, ix, p) && p.is_empty(),
|
||||||
|
"(a-z)&(0-9) -> empty");
|
||||||
|
}
|
||||||
|
// Overlapping intersection: (a..f) & (c..z) -> (c..f)
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref ix(u.re.mk_inter(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, ix, p) && p.num_ranges() == 1 &&
|
||||||
|
p[0].first == 'c' && p[0].second == 'f',
|
||||||
|
"(a-f)&(c-z) -> (c-f)");
|
||||||
|
}
|
||||||
|
// Complement: re.complement is intentionally NOT a char-class op
|
||||||
|
// (it operates over Σ*), so it must NOT be translated.
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref cmp(u.re.mk_complement(r1), m);
|
||||||
|
check(!regex_to_range_predicate(u, cmp, p),
|
||||||
|
"re.comp of range is NOT translatable (sequence-level complement)");
|
||||||
|
}
|
||||||
|
// Diff: (a..f) \ (c..z) -> (a..b)
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
|
||||||
|
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref df(u.re.mk_diff(r1, r2), m);
|
||||||
|
check(regex_to_range_predicate(u, df, p) && p.num_ranges() == 1 &&
|
||||||
|
p[0].first == 'a' && p[0].second == 'b',
|
||||||
|
"(a-f) \\ (c-z) -> (a-b)");
|
||||||
|
}
|
||||||
|
// Negative: re.* of a range is NOT a char class
|
||||||
|
{
|
||||||
|
range_predicate p(M);
|
||||||
|
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||||
|
expr_ref star(u.re.mk_star(r1), m);
|
||||||
|
check(!regex_to_range_predicate(u, star, p),
|
||||||
|
"re.* of range not translatable");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- materialization round-trip ----
|
||||||
|
|
||||||
|
// empty -> re.empty
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::empty(M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
check(u.re.is_empty(e), "empty -> re.empty");
|
||||||
|
}
|
||||||
|
// top -> re.full_char
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::top(M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
check(u.re.is_full_char(e), "top -> re.full_char");
|
||||||
|
}
|
||||||
|
// single range -> re.range
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::range('a', 'z', M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
unsigned lo = 0, hi = 0;
|
||||||
|
check(extract_range_chars(u, e, lo, hi) && lo == 'a' && hi == 'z',
|
||||||
|
"[a-z] -> re.range a z");
|
||||||
|
}
|
||||||
|
// singleton -> re.range c c
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::singleton('A', M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
unsigned lo = 0, hi = 0;
|
||||||
|
check(extract_range_chars(u, e, lo, hi) && lo == 'A' && hi == 'A',
|
||||||
|
"{A} -> re.range A A");
|
||||||
|
}
|
||||||
|
// 2 ranges -> re.union(range_0, range_1) in canonical order
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::range('0', '9', M)
|
||||||
|
| range_predicate::range('a', 'z', M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
expr* a = nullptr; expr* b = nullptr;
|
||||||
|
check(u.re.is_union(e, a, b), "2-range -> union");
|
||||||
|
unsigned lo0 = 0, hi0 = 0, lo1 = 0, hi1 = 0;
|
||||||
|
check(extract_range_chars(u, a, lo0, hi0) && lo0 == '0' && hi0 == '9',
|
||||||
|
"union arg0 = (0-9) (canonical: lower lo first)");
|
||||||
|
check(extract_range_chars(u, b, lo1, hi1) && lo1 == 'a' && hi1 == 'z',
|
||||||
|
"union arg1 = (a-z)");
|
||||||
|
}
|
||||||
|
// 3 ranges -> right-associated union
|
||||||
|
{
|
||||||
|
range_predicate p = range_predicate::range(0, 5, M)
|
||||||
|
| range_predicate::range(10, 15, M)
|
||||||
|
| range_predicate::range(20, 25, M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||||
|
expr* a = nullptr; expr* rest = nullptr;
|
||||||
|
check(u.re.is_union(e, a, rest), "3-range -> union(...)");
|
||||||
|
unsigned lo = 0, hi = 0;
|
||||||
|
check(extract_range_chars(u, a, lo, hi) && lo == 0 && hi == 5, "first arg = (0-5)");
|
||||||
|
expr* b = nullptr; expr* c = nullptr;
|
||||||
|
check(u.re.is_union(rest, b, c), "rest is union(...,...)");
|
||||||
|
check(extract_range_chars(u, b, lo, hi) && lo == 10 && hi == 15, "second range");
|
||||||
|
check(extract_range_chars(u, c, lo, hi) && lo == 20 && hi == 25, "third range");
|
||||||
|
}
|
||||||
|
// Round-trip identity for an arbitrary range-set
|
||||||
|
{
|
||||||
|
range_predicate p_in = range_predicate::range('a', 'c', M)
|
||||||
|
| range_predicate::range('m', 'p', M)
|
||||||
|
| range_predicate::range('x', 'z', M);
|
||||||
|
expr_ref e = range_predicate_to_regex(u, p_in, str_sort);
|
||||||
|
range_predicate p_out(M);
|
||||||
|
check(regex_to_range_predicate(u, e, p_out), "round-trip translatable");
|
||||||
|
check(p_in == p_out, "round-trip equal");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cerr << "regex_range_collapse tests passed\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_regex_range_collapse() {
|
||||||
|
run();
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue