3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 15:16:29 +00:00

Add OP_RE_CHARCLASS to collapse boolean combinations of char-class regexes

Introduces a first-class AST kind OP_RE_CHARCLASS for ground char-class
regexes, encoded as a nullary op with 2N int parameters that name the
sorted, disjoint, non-adjacent ranges of its predicate. The seq decl
plugin owns the construction and validation; seq_rewriter lifts existing
re.range / re.empty / re.full_char into CHARCLASS and collapses
union/intersection/difference of CHARCLASS-like operands into a single
node via seq::range_predicate.

Highlights:
* Move range_predicate.{h,cpp} from ast/rewriter/ into ast/ so the seq
  plugin can use it.
* mk_re_union / mk_re_inter / mk_re_diff and mk_regex_union_normalize /
  mk_regex_inter_normalize all short-circuit to a single CHARCLASS when
  both operands are charclass-like.
* mk_re_range lifts concrete-bounded ranges at construction.
* mk_antimirov_deriv_rec and mk_derivative_rec both handle the CHARCLASS
  case so derivatives reduce normally during simplification and bisim.
* is_nullable_rec, get_info_rec, can_skip_parenth, and the rex pretty
  printer all recognise the new kind.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Margus Veanes 2026-06-14 21:46:20 -07:00
parent 9651e520b7
commit 292a925826
8 changed files with 281 additions and 6 deletions

View file

@ -43,6 +43,7 @@ z3_add_component(ast
polymorphism_util.cpp
pp.cpp
quantifier_stat.cpp
range_predicate.cpp
recfun_decl_plugin.cpp
reg_decl_plugins.cpp
seq_decl_plugin.cpp

View file

@ -21,7 +21,7 @@ Authors:
--*/
#include "ast/rewriter/range_predicate.h"
#include "ast/range_predicate.h"
#include "util/debug.h"
#include <algorithm>
#include <ostream>

View file

@ -35,7 +35,6 @@ z3_add_component(rewriter
pb2bv_rewriter.cpp
push_app_ite.cpp
quant_hoist.cpp
range_predicate.cpp
recfun_rewriter.cpp
rewriter.cpp
seq_axioms.cpp

View file

@ -2659,7 +2659,8 @@ expr_ref seq_rewriter::is_nullable_rec(expr* r) {
else if (re().is_full_char(r) ||
re().is_empty(r) ||
re().is_of_pred(r) ||
re().is_range(r)) {
re().is_range(r) ||
re().is_charclass(r)) {
result = m().mk_false();
}
else if (re().is_plus(r, r1) ||
@ -3084,6 +3085,47 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref
else
result = re().mk_ite_simplify(c, mk_antimirov_deriv(e, r1, c1), mk_antimirov_deriv(e, r2, c2));
}
else if (re().is_charclass(r)) {
// CHARCLASS = disjoint union of ranges [lo_0,hi_0] | ... | [lo_{N-1},hi_{N-1}].
// D(e, CHARCLASS) = ite(P(e), epsilon, []) where P(e) is the disjunction
// of (lo_i <= e <= hi_i). We build a chain of nested ITEs, one per range,
// so the result is in antimirov BDD normal form.
app const* a = to_app(r);
func_decl* d = a->get_decl();
unsigned np = d->get_num_parameters();
if (np == 0) {
result = nothing();
}
else {
unsigned mx = u().max_char();
result = nothing();
// Iterate ranges in reverse so the first range becomes the outermost ITE.
for (unsigned k = np; k >= 2; k -= 2) {
unsigned i = k - 2;
unsigned lo_p = static_cast<unsigned>(d->get_parameter(i).get_int());
unsigned hi_p = static_cast<unsigned>(d->get_parameter(i + 1).get_int());
if (lo_p == 0 && hi_p == mx) {
// Range covers the entire char domain: derivative is unconditionally epsilon.
result = epsilon();
continue;
}
expr_ref lo_c(m_util.mk_char(lo_p), m());
expr_ref hi_c(m_util.mk_char(hi_p), m());
expr_ref cond(m());
if (lo_p == 0)
cond = u().mk_le(e, hi_c);
else if (hi_p == mx)
cond = u().mk_le(lo_c, e);
else
cond = m().mk_and(u().mk_le(lo_c, e), u().mk_le(e, hi_c));
expr_ref range(simplify_path(e, cond), m());
expr_ref psi(simplify_path(e, m().mk_and(path, range)), m());
if (m().is_false(psi))
continue;
result = re().mk_ite_simplify(range, epsilon(), result);
}
}
}
else if (re().is_range(r, r1, r2)) {
expr_ref range(m());
expr_ref psi(m().mk_false(), m());
@ -3318,6 +3360,15 @@ expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) {
SASSERT(m_util.is_re(r1));
SASSERT(m_util.is_re(r2));
expr_ref result(m());
// Char-class fast path: combine in O(1) via range_predicate when both
// operands are equivalent to a single character class.
seq::range_predicate pa, pb;
if (re().as_charclass(r1, pa) && re().as_charclass(r2, pb)) {
sort* seq_sort = nullptr;
VERIFY(u().is_re(r1, seq_sort));
result = re().mk_charclass(pa | pb, seq_sort);
return result;
}
std::function<bool(expr*, expr*&, expr*&)> test = [&](expr* t, expr*& a, expr*& b) { return re().is_union(t, a, b); };
std::function<expr* (expr*, expr*)> compose = [&](expr* r1, expr* r2) { return (is_subset(r1, r2) ? r2 : (is_subset(r2, r1) ? r1 : re().mk_union(r1, r2))); };
std::function<bool(expr *, expr *)> is_complement = [&](expr *a, expr *b) {
@ -3355,6 +3406,13 @@ expr_ref seq_rewriter::mk_regex_inter_normalize(expr* r1, expr* r2) {
SASSERT(m_util.is_re(r1));
SASSERT(m_util.is_re(r2));
expr_ref result(m());
seq::range_predicate pa, pb;
if (re().as_charclass(r1, pa) && re().as_charclass(r2, pb)) {
sort* seq_sort = nullptr;
VERIFY(u().is_re(r1, seq_sort));
result = re().mk_charclass(pa & pb, seq_sort);
return result;
}
if (re().is_epsilon(r2))
std::swap(r1, r2);
std::function<bool(expr*, expr*&, expr*&)> test = [&](expr* t, expr*& a, expr*& b) { return re().is_intersection(t, a, b); };
@ -4165,8 +4223,50 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
}
}
}
else if (re().is_charclass(r)) {
// Derivative of a char-class cc(P) = ite(P(ele), epsilon, []).
// We build it in transition-regex form: a disjunction over each
// range (lo, hi) of mk_der_inter(le(lo, ele), le(ele, hi)), with
// mk_der_cond normalization on each leaf so the result is a clean
// t-regex of ITE / and / or over single-character constraints.
app const* a = to_app(r);
func_decl* d = a->get_decl();
unsigned np = d->get_num_parameters();
if (np == 0)
return mk_empty();
expr_ref acc(m());
unsigned mx = u().max_char();
for (unsigned i = 0; i + 1 < np; i += 2) {
unsigned lo_p = static_cast<unsigned>(d->get_parameter(i).get_int());
unsigned hi_p = static_cast<unsigned>(d->get_parameter(i + 1).get_int());
expr_ref leaf(m());
if (lo_p == 0 && hi_p == mx) {
leaf = expr_ref(re().mk_to_re(str().mk_empty(seq_sort)), m());
}
else {
expr_ref p1(m()), p2(m());
if (lo_p > 0) {
p1 = u().mk_le(m_util.mk_char(lo_p), ele);
p1 = mk_der_cond(p1, ele, seq_sort);
}
if (hi_p < mx) {
p2 = u().mk_le(ele, m_util.mk_char(hi_p));
p2 = mk_der_cond(p2, ele, seq_sort);
}
if (!p1 && !p2)
leaf = expr_ref(re().mk_to_re(str().mk_empty(seq_sort)), m());
else if (!p1)
leaf = p2;
else if (!p2)
leaf = p1;
else
leaf = mk_der_inter(p1, p2);
}
acc = i == 0 ? leaf : mk_der_union(acc, leaf);
}
return acc;
}
else if (re().is_range(r, r1, r2)) {
// r1, r2 are sequences.
zstring s1, s2;
if (str().is_string(r1, s1) && str().is_string(r2, s2)) {
if (s1.length() == 1 && s2.length() == 1) {
@ -4763,6 +4863,16 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
result = b;
return BR_DONE;
}
// O(1) char-class collapse: if both operands are equivalent to a
// char-class (CHARCLASS, full_char, empty-char, range with concrete
// bounds), return the union as a single CHARCLASS node.
seq::range_predicate pa, pb;
if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) {
sort* seq_sort = nullptr;
VERIFY(u().is_re(a, seq_sort));
result = re().mk_charclass(pa | pb, seq_sort);
return BR_DONE;
}
return BR_FAILED;
}
@ -4830,6 +4940,13 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) {
result = a;
return BR_DONE;
}
seq::range_predicate pa, pb;
if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) {
sort* seq_sort = nullptr;
VERIFY(u().is_re(a, seq_sort));
result = re().mk_charclass(pa & pb, seq_sort);
return BR_DONE;
}
return BR_FAILED;
}
@ -4840,6 +4957,13 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) {
}
br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) {
seq::range_predicate pa, pb;
if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) {
sort* seq_sort = nullptr;
VERIFY(u().is_re(a, seq_sort));
result = re().mk_charclass(pa - pb, seq_sort);
return BR_DONE;
}
result = mk_regex_inter_normalize(a, re().mk_complement(b));
return BR_REWRITE2;
}
@ -5103,6 +5227,19 @@ br_status seq_rewriter::mk_re_range(expr* lo, expr* hi, expr_ref& result) {
return BR_DONE;
}
// Lift concrete-bound ranges into a single canonical CHARCLASS node so
// that boolean combinations with other char-class regexes collapse in
// O(1) via the smart constructors below. Only fires when the element
// sort is Char (i.e. the standard string regex domain).
sort* ele_sort = nullptr;
if (slo.length() == 1 && shi.length() == 1 && slo[0] <= shi[0]
&& u().is_seq(lo->get_sort(), ele_sort) && u().is_char(ele_sort)) {
unsigned mx = u().max_char();
seq::range_predicate p = seq::range_predicate::range(slo[0], shi[0], mx);
result = re().mk_charclass(p, lo->get_sort());
return BR_DONE;
}
return BR_FAILED;
}

View file

@ -240,6 +240,7 @@ void seq_decl_plugin::init() {
m_sigs[OP_RE_OF_PRED] = alloc(psig, m, "re.of.pred", 1, 1, &predA, reA);
m_sigs[OP_RE_REVERSE] = alloc(psig, m, "re.reverse", 1, 1, &reA, reA);
m_sigs[OP_RE_DERIVATIVE] = alloc(psig, m, "re.derivative", 1, 2, AreA, reA);
m_sigs[OP_RE_CHARCLASS] = alloc(psig, m, "re.charclass", 1, 0, nullptr, reA);
m_sigs[_OP_RE_ANTIMIROV_UNION] = alloc(psig, m, "re.union", 1, 2, reAreA, reA);
m_sigs[OP_SEQ_TO_RE] = alloc(psig, m, "seq.to.re", 1, 1, &seqA, reA);
m_sigs[OP_SEQ_IN_RE] = alloc(psig, m, "seq.in.re", 1, 2, seqAreA, boolT);
@ -497,6 +498,29 @@ func_decl* seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, p
}
m.raise_exception("Incorrect arguments used for re.^. Expected one non-negative integer parameter");
case OP_RE_CHARCLASS: {
m_has_re = true;
if (arity != 0)
m.raise_exception("re.charclass takes no arguments");
if ((num_parameters % 2) != 0)
m.raise_exception("re.charclass expects an even number of int parameters (lo, hi pairs)");
unsigned mx = get_char_plugin().max_char();
int prev_hi = -1;
for (unsigned i = 0; i + 1 < num_parameters; i += 2) {
if (!parameters[i].is_int() || !parameters[i + 1].is_int())
m.raise_exception("re.charclass parameters must be integers");
int lo = parameters[i].get_int();
int hi = parameters[i + 1].get_int();
if (lo < 0 || hi < 0 || lo > hi || static_cast<unsigned>(hi) > mx)
m.raise_exception("re.charclass range out of character domain");
if (lo <= prev_hi + 1)
m.raise_exception("re.charclass ranges must be sorted, disjoint, and non-adjacent");
prev_hi = hi;
}
if (!range) range = mk_reglan();
return m.mk_func_decl(m_sigs[k]->m_name, arity, domain, range,
func_decl_info(m_family_id, k, num_parameters, parameters));
}
case OP_STRING_CONST:
if (!(num_parameters == 1 && arity == 0 && parameters[0].is_zstring())) {
m.raise_exception("invalid string declaration");
@ -1140,6 +1164,74 @@ bool seq_util::rex::is_range(expr const* n, unsigned& lo, unsigned& hi) const {
return true;
}
app* seq_util::rex::mk_charclass(seq::range_predicate const& p, sort* seq_sort) {
sort* re_sort = mk_re(seq_sort);
if (p.is_empty())
return mk_empty(re_sort);
unsigned n = p.num_ranges();
vector<parameter> params;
for (unsigned i = 0; i < n; ++i) {
auto r = p[i];
params.push_back(parameter(static_cast<int>(r.first)));
params.push_back(parameter(static_cast<int>(r.second)));
}
return m.mk_app(m_fid, OP_RE_CHARCLASS, params.size(), params.data(), 0, nullptr, re_sort);
}
bool seq_util::rex::is_charclass(expr const* n, seq::range_predicate& p) const {
if (!is_charclass(n))
return false;
app const* a = to_app(n);
func_decl* d = a->get_decl();
unsigned np = d->get_num_parameters();
p = seq::range_predicate::empty(u.max_char());
unsigned mx = u.max_char();
for (unsigned i = 0; i + 1 < np; i += 2) {
unsigned lo = static_cast<unsigned>(d->get_parameter(i).get_int());
unsigned hi = static_cast<unsigned>(d->get_parameter(i + 1).get_int());
p = p | seq::range_predicate::range(lo, hi, mx);
}
return true;
}
bool seq_util::rex::as_charclass(expr* r, seq::range_predicate& p) const {
unsigned mx = u.max_char();
if (is_charclass(r, p))
return true;
if (is_full_char(r)) {
p = seq::range_predicate::top(mx);
return true;
}
if (is_empty(r)) {
sort* seq_sort = nullptr;
sort* ele_sort = nullptr;
if (u.is_re(r, seq_sort) && u.is_seq(seq_sort, ele_sort) && u.is_char(ele_sort)) {
p = seq::range_predicate::empty(mx);
return true;
}
return false;
}
expr* e1 = nullptr, *e2 = nullptr;
if (is_range(r, e1, e2)) {
zstring s1, s2;
if (u.str.is_string(e1, s1) && u.str.is_string(e2, s2)
&& s1.length() == 1 && s2.length() == 1 && s1[0] <= s2[0]) {
p = seq::range_predicate::range(s1[0], s2[0], mx);
return true;
}
unsigned ch1 = 0, ch2 = 0;
expr* c1 = nullptr, *c2 = nullptr;
if (u.str.is_unit(e1, c1) && u.str.is_unit(e2, c2)
&& u.is_const_char(c1, ch1) && u.is_const_char(c2, ch2)
&& ch1 <= ch2) {
p = seq::range_predicate::range(ch1, ch2, mx);
return true;
}
return false;
}
return false;
}
sort* seq_util::rex::to_seq(sort* re) {
(void)u;
@ -1347,7 +1439,7 @@ std::ostream& seq_util::rex::pp::print_range(std::ostream& out, expr* s1, expr*
*/
bool seq_util::rex::pp::can_skip_parenth(expr* r) const {
expr* s;
return ((re.is_to_re(r, s) && re.u.str.is_unit(s)) || re.is_range(r) || re.is_empty(r) || re.is_epsilon(r) || re.is_full_char(r));
return ((re.is_to_re(r, s) && re.u.str.is_unit(s)) || re.is_range(r) || re.is_empty(r) || re.is_epsilon(r) || re.is_full_char(r) || re.is_charclass(r));
}
/*
@ -1443,6 +1535,26 @@ std::ostream& seq_util::rex::pp::print(std::ostream& out, expr* e) const {
print(out, r2);
out << ")";
}
else if (re.is_charclass(e)) {
app const* a = to_app(e);
func_decl* d = a->get_decl();
unsigned np = d->get_num_parameters();
if (np == 0) {
out << (html_encode ? "&#x2205;" : "[]");
}
else {
out << "[";
for (unsigned i = 0; i + 1 < np; i += 2) {
int lo_p = d->get_parameter(i).get_int();
int hi_p = d->get_parameter(i + 1).get_int();
if (lo_p == hi_p)
out << "\\x" << std::hex << lo_p << std::dec;
else
out << "\\x" << std::hex << lo_p << "-\\x" << hi_p << std::dec;
}
out << "]";
}
}
else if (re.is_intersection(e, r1, r2)) {
out << "(";
print(out, r1);
@ -1674,6 +1786,7 @@ seq_util::rex::info seq_util::rex::mk_info_rec(app* e) const {
case OP_RE_RANGE:
case OP_RE_FULL_CHAR_SET:
case OP_RE_OF_PRED:
case OP_RE_CHARCLASS:
//TBD: check if the character predicate contains uninterpreted symbols or is nonground or is unsat
//TBD: check if the range is unsat
return info(true, l_false, 1, false);

View file

@ -24,6 +24,7 @@ Revision History:
#include "ast/ast.h"
#include "ast/char_decl_plugin.h"
#include "ast/range_predicate.h"
#include "util/lbool.h"
#include "util/zstring.h"
@ -78,6 +79,13 @@ enum seq_op_kind {
OP_RE_OF_PRED,
OP_RE_REVERSE,
OP_RE_DERIVATIVE, // Char -> RegEx -> RegEx
OP_RE_CHARCLASS, // 2N int parameters (lo_0, hi_0, ..., lo_{N-1}, hi_{N-1}) -> RegEx
// Canonical char-class regex: union of disjoint, sorted, non-adjacent
// ranges [lo_i, hi_i] over the character domain. N = 0 denotes the
// empty regex; the predicate covering the full domain is also
// representable. Equivalent in language to the union of re.range
// nodes, but a single hash-consed AST node enabling O(1) boolean
// combinations on regex char-classes.
// string specific operators.
@ -544,6 +552,13 @@ public:
app* mk_derivative(expr* ele, expr* r) { return m.mk_app(m_fid, OP_RE_DERIVATIVE, ele, r); }
app* mk_antimirov_union(expr* r1, expr* r2) { return m.mk_app(m_fid, _OP_RE_ANTIMIROV_UNION, r1, r2); }
// Construct a canonical char-class regex over the character domain.
// The seq_sort argument identifies the element sort of the regex
// (must be (Seq Char)). The predicate is encoded as 2*N int
// parameters [lo_0, hi_0, ..., lo_{N-1}, hi_{N-1}]. An empty
// predicate produces re.none of the same sort.
app* mk_charclass(seq::range_predicate const& p, sort* seq_sort);
bool is_to_re(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_TO_RE); }
bool is_concat(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CONCAT); }
bool is_union(expr const* n) const { return is_app_of(n, m_fid, OP_RE_UNION); }
@ -577,6 +592,16 @@ public:
bool is_reverse(expr const* n) const { return is_app_of(n, m_fid, OP_RE_REVERSE); }
bool is_derivative(expr const* n) const { return is_app_of(n, m_fid, OP_RE_DERIVATIVE); }
bool is_antimirov_union(expr const* n) const { return is_app_of(n, m_fid, _OP_RE_ANTIMIROV_UNION); }
bool is_charclass(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CHARCLASS); }
bool is_charclass(expr const* n, seq::range_predicate& p) const;
// True if `r` is semantically equivalent to a single char-class regex
// over Char. Handles OP_RE_CHARCLASS, re.full_char, re.empty (over a
// sequence sort whose element sort is Char), and re.range with two
// single-char concrete bounds. Returns the canonical predicate in `p`.
// This is the O(1) fast-path check used by smart constructors to
// collapse boolean combinations to a single CHARCLASS node.
bool as_charclass(expr* r, seq::range_predicate& p) const;
MATCH_UNARY(is_to_re);
MATCH_BINARY(is_concat);
MATCH_BINARY(is_union);

View file

@ -22,7 +22,7 @@ Author:
--*/
#include "ast/rewriter/range_predicate.h"
#include "ast/range_predicate.h"
#include "util/debug.h"
#include <cstdint>
#include <iostream>