3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 15:16:29 +00:00
z3/src/ast/rewriter/regex_range_collapse.cpp
Margus Veanes c0c826cf5f Stage 3: collapse boolean combinations of char-class regexes
Introduce src/ast/rewriter/regex_range_collapse.{h,cpp}, a translator
between the boolean-combination-of-character-class fragment of regexes
and the range_predicate value type added in Stage 2.

Recognized fragment (translates to range_predicate):
  re.empty, re.full_char, re.range, re.union, re.intersection, re.diff
of operands recursively in the fragment.  Range bounds are accepted in
three encodings: a bare character constant (Char 97), seq.unit of a
constant character (seq.unit (Char 97)), and a length-1 string literal
("a").

NOT translated:
  re.complement -- this is sequence-level complement (Sigma* \ L), not
  character-class complement.  Translating it would incorrectly turn
  re.comp(re.range "a" "z") into the character class [^a-z], which would
  drop the empty string and all length>=2 strings.

Hook the translator into seq_rewriter at mk_re_union0, mk_re_union,
mk_re_inter0, mk_re_inter, and mk_re_diff so that boolean combinations
of character classes always reduce to a single canonical range-set
form.  mk_re_complement is intentionally not hooked.

Materialization uses the canonical (seq.unit (Char N)) bound form
(matching the rest of seq_rewriter) and right-associates the union
with operands sorted by expr_id so the result matches the invariant
expected by merge_regex_sets.

Unit tests in src/test/regex_range_collapse.cpp cover the recognized
fragment, the non-translatable cases, and round-trip identity for
multi-range predicates.

Corpus validation on bench/inputs/regex-equivalence (1523 .smt2):
- 0 soundness regressions vs derive baseline.
- Resolves 4 previously-soft-timeout files (now solved correctly).
- Resolves 1 pre-existing wrong answer (mut_0404: master/derive say
  unsat, ground-truth annotation and Stage 3 say sat).
- Wall-time: -2.2% vs Stage-3 starting point, -1.5% vs derive.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-15 03:28:01 -07:00

145 lines
5.6 KiB
C++

/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
regex_range_collapse.cpp
Abstract:
Implementation of regex <-> range_predicate translation for the
boolean-combination-of-ranges fragment. See header for the recognized
grammar and the canonical regex AST emitted by materialization.
Authors:
Margus Veanes (veanes) 2026
--*/
#include "ast/rewriter/regex_range_collapse.h"
namespace seq {
/**
 * Translate a regex made only of character-class operations into a
 * range_predicate.
 *
 * Recognized shapes: re.empty, re.full_char, re.range with constant
 * bounds, and re.union / re.intersection / re.diff whose two operands
 * are themselves recognized.
 *
 * @param u    sequence utilities; supplies the recognizers and max_char.
 * @param r    regex expression to translate.
 * @param out  receives the predicate; assigned only when true is returned.
 * @return     true when r is in the recognized fragment, false otherwise.
 */
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
    auto& rex = u.re;
    unsigned const char_limit = u.max_char();

    // Leaves without operands.
    if (rex.is_empty(r)) {
        out = range_predicate::empty(char_limit);
        return true;
    }
    if (rex.is_full_char(r)) {
        out = range_predicate::top(char_limit);
        return true;
    }

    // Decode a range bound into a character code. Tried in order:
    // a constant character, a seq.unit wrapping a constant character,
    // and a string literal of length exactly 1.
    auto bound_to_char = [&](expr* bound, unsigned& ch) -> bool {
        if (u.is_const_char(bound, ch))
            return true;
        expr* unit_arg = nullptr;
        if (u.str.is_unit(bound, unit_arg) && u.is_const_char(unit_arg, ch))
            return true;
        zstring lit;
        if (!u.str.is_string(bound, lit) || lit.length() != 1)
            return false;
        ch = lit[0];
        return true;
    };

    expr* lower_e = nullptr;
    expr* upper_e = nullptr;
    if (rex.is_range(r, lower_e, upper_e)) {
        unsigned lower = 0;
        unsigned upper = 0;
        bool const bounds_ok = bound_to_char(lower_e, lower) && bound_to_char(upper_e, upper);
        if (!bounds_ok)
            return false;
        // An inverted range (lower > upper) is the empty regex.
        if (lower > upper)
            out = range_predicate::empty(char_limit);
        else
            out = range_predicate::range(lower, upper, char_limit);
        return true;
    }

    // Binary boolean operators: translate both operands (left first,
    // stopping at the first failure) and combine the two predicates.
    expr* lhs = nullptr;
    expr* rhs = nullptr;
    if (rex.is_union(r, lhs, rhs)) {
        range_predicate pl(char_limit), pr(char_limit);
        bool const ok = regex_to_range_predicate(u, lhs, pl) && regex_to_range_predicate(u, rhs, pr);
        if (ok)
            out = pl | pr;
        return ok;
    }
    if (rex.is_intersection(r, lhs, rhs)) {
        range_predicate pl(char_limit), pr(char_limit);
        bool const ok = regex_to_range_predicate(u, lhs, pl) && regex_to_range_predicate(u, rhs, pr);
        if (ok)
            out = pl & pr;
        return ok;
    }
    if (rex.is_diff(r, lhs, rhs)) {
        range_predicate pl(char_limit), pr(char_limit);
        bool const ok = regex_to_range_predicate(u, lhs, pl) && regex_to_range_predicate(u, rhs, pr);
        if (ok)
            out = pl - pr;
        return ok;
    }

    // re.complement is deliberately left untranslated. It is the
    // SEQUENCE-level complement: its language contains the empty
    // string, every string of length >= 2, and every length-1 string
    // outside the operand. A range_predicate describes only a set of
    // length-1 strings, so mapping re.complement(R) to the
    // character-level ~R would change the meaning whenever R sits in
    // a sequence-level context (e.g. re.diff at the top level, or a
    // membership test). De-Morgan equivalences and the special cases
    // re.complement(re.empty) / re.complement(re.full) are handled
    // directly in seq_rewriter::mk_re_complement.
    return false;
}
// Wraps the character code `c` in a zstring and returns the matching
// string-literal expression, owned by u's ast_manager.
// NOTE(review): no caller of this helper appears in the visible part of
// this file; the range bounds built below use seq.unit instead.
static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
    ast_manager& m = u.get_manager();
    zstring const single(c);
    return expr_ref(u.str.mk_string(single), m);
}
// Builds the regex for one closed character interval [lo, hi].
// The interval [0, max_char] becomes re.full_char over `re_sort`;
// every other interval becomes re.range with both bounds in the
// unit-character form (seq.unit (Char N)). Using that single bound
// shape matches the one used elsewhere in seq_rewriter and avoids
// duplicate AST nodes with different ids for the same range.
static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
    ast_manager& m = u.get_manager();

    bool const covers_all = (lo == 0) && (hi == u.max_char());
    if (covers_all)
        return expr_ref(u.re.mk_full_char(re_sort), m);

    auto unit_of = [&](unsigned ch) {
        return expr_ref(u.str.mk_unit(u.str.mk_char(ch)), m);
    };
    expr_ref lower = unit_of(lo);
    expr_ref upper = unit_of(hi);
    return expr_ref(u.re.mk_range(lower, upper), m);
}
/**
 * Materialize a range_predicate as a regex expression.
 *
 * @param u         sequence utilities used to build the AST nodes.
 * @param p         predicate to materialize.
 * @param seq_sort  sequence sort; the result has sort re.mk_re(seq_sort).
 * @return          re.empty for an empty predicate, the single-range
 *                  regex when p has one range, otherwise a
 *                  right-associated re.union of the single-range
 *                  regexes ordered by expression id.
 */
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
    ast_manager& m = u.get_manager();
    sort* re_sort = u.re.mk_re(seq_sort);

    if (p.is_empty())
        return expr_ref(u.re.mk_empty(re_sort), m);

    unsigned const count = p.num_ranges();
    SASSERT(count > 0);

    if (count == 1) {
        auto [first_ch, last_ch] = p[0];
        return mk_single_range_regex(u, first_ch, last_ch, re_sort);
    }

    // Create one node per range, then order the nodes by expression id.
    // seq_rewriter::merge_regex_sets expects a right-associated,
    // id-sorted union; without the sort it produces incorrect unions
    // when combining this output with another id-sorted regex set.
    expr_ref_vector pieces(m);
    for (unsigned k = 0; k < count; ++k) {
        auto [first_ch, last_ch] = p[k];
        pieces.push_back(mk_single_range_regex(u, first_ch, last_ch, re_sort));
    }

    auto by_id = [](expr* x, expr* y) { return x->get_id() < y->get_id(); };
    auto first = pieces.data();
    std::sort(first, first + pieces.size(), by_id);

    // Fold from the right: union(p0, union(p1, ... p_{count-1})).
    unsigned idx = count - 1;
    expr_ref result(pieces.get(idx), m);
    while (idx > 0) {
        --idx;
        result = expr_ref(u.re.mk_union(pieces.get(idx), result), m);
    }
    return result;
}
}