3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-26 10:28:48 +00:00

Derive with ranges (#9963)

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Margus Veanes <margus@microsoft.com>
Co-authored-by: Margus Veanes <veanes@users.noreply.github.com>
This commit is contained in:
Nikolaj Bjorner 2026-06-25 18:47:25 -07:00 committed by GitHub
parent 0596c7c634
commit 22c2635786
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 3462 additions and 1000 deletions

3
.gitignore vendored
View file

@ -1,9 +1,12 @@
*~
rebase.cmd
reports/
crashes/
*.pyc
*.pyo
# Ignore callgrind files
callgrind.out.*
gmon.out
# .hpp files are automatically generated
*.hpp
.env

BIN
gmon.out

Binary file not shown.

View file

@ -39,7 +39,11 @@ z3_add_component(rewriter
rewriter.cpp
seq_axioms.cpp
seq_eq_solver.cpp
seq_derive.cpp
seq_subset.cpp
seq_derive.cpp
seq_range_collapse.cpp
seq_range_predicate.cpp
seq_rewriter.cpp
seq_regex_bisim.cpp
seq_skolem.cpp

View file

@ -19,6 +19,7 @@ Notes:
#include "ast/rewriter/bool_rewriter.h"
#include "params/bool_rewriter_params.hpp"
#include "ast/rewriter/rewriter_def.h"
#include "ast/rewriter/expr_safe_replace.h"
#include "ast/ast_lt.h"
#include "ast/for_each_expr.h"
#include <algorithm>
@ -1185,4 +1186,30 @@ void bool_rewriter::mk_ge2(expr* a, expr* b, expr* c, expr_ref& r) {
}
template class rewriter_tpl<bool_rewriter_cfg>;
bool bool_rewriter::decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el) {
expr *cond = nullptr, *r1 = nullptr, *r2 = nullptr;
if (m().is_ite(r, cond, r1, r2)) {
c = cond;
th = r1;
el = r2;
return true;
}
for (expr *e : subterms::ground(expr_ref(r, m()))) {
if (m().is_ite(e, cond, r1, r2)) {
m_rep1.reset();
m_rep2.reset();
m_rep1.insert(e, r1);
m_rep2.insert(e, r2);
c = cond;
th = r;
el = r;
m_rep1(th);
m_rep2(el);
return true;
}
}
return false;
}
template class rewriter_tpl<bool_rewriter_cfg>;

View file

@ -20,6 +20,7 @@ Notes:
#include "ast/ast.h"
#include "ast/rewriter/rewriter.h"
#include "ast/rewriter/expr_safe_replace.h"
#include "util/params.h"
/**
@ -64,6 +65,7 @@ class bool_rewriter {
ptr_vector<expr> m_todo1, m_todo2;
unsigned_vector m_counts1, m_counts2;
expr_mark m_marked;
expr_safe_replace m_rep1, m_rep2;
br_status mk_flat_and_core(unsigned num_args, expr * const * args, expr_ref & result);
br_status mk_flat_or_core(unsigned num_args, expr * const * args, expr_ref & result);
@ -87,7 +89,7 @@ class bool_rewriter {
expr_ref simplify_eq_ite(expr* value, expr* ite);
public:
bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0) {
bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0), m_rep1(m), m_rep2(m) {
updt_params(p);
}
ast_manager & m() const { return m_manager; }
@ -242,6 +244,11 @@ public:
void mk_nand(expr * arg1, expr * arg2, expr_ref & result);
void mk_nor(expr * arg1, expr * arg2, expr_ref & result);
void mk_ge2(expr* a, expr* b, expr* c, expr_ref& result);
// If r is, or contains, an if-then-else, decompose it into a top-level
// ite by hoisting the (first) inner ite condition: returns c, th, el such
// that r is equivalent to (ite c th el). Returns false if r has no ite.
bool decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el);
};
struct bool_rewriter_cfg : public default_rewriter_cfg {

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,265 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_derive.h
Abstract:
Symbolic derivative computation for regular expressions.
Produces an ITE-tree (transition regex) representation where
the free variable is de Bruijn index 0 representing the input character.
Based on the theory of symbolic derivatives and transition regexes:
- Veanes et al., "On Symbolic Derivatives and Transition Regexes" (LPAR 2024)
- Varatalu, Veanes, Ernits, "RE#" (POPL 2025)
- Stanford, Veanes, Bjørner, "Symbolic Boolean Derivatives" (PLDI 2021)
Authors:
Nikolaj Bjorner (nbjorner) 2025-06-03
--*/
#pragma once
#include "ast/seq_decl_plugin.h"
#include "ast/arith_decl_plugin.h"
#include "ast/array_decl_plugin.h"
#include "ast/rewriter/bool_rewriter.h"
#include "util/obj_pair_hashtable.h"
#include "util/obj_triple_hashtable.h"
#include <functional>
class seq_rewriter;
namespace seq {
enum class derivative_kind { antimirov_t, brzozowski_t };
/**
* Symbolic derivative engine for regular expressions.
*
* Given a regex r, operator()(r) computes a symbolic derivative δ(r)
* represented as an ITE-tree over character predicates (using de Bruijn
* variable 0 for the character). Evaluating the ITE-tree for a concrete
* character 'a' yields the classical Brzozowski derivative δ_a(r).
*
* The ITE-tree structure implicitly defines minterms (equivalence classes
* of characters indistinguishable by the regex).
*
* Key properties:
* - Results are memoized for termination on cyclic derivative graphs
* - Union/intersection operands are sorted for ACI canonicalization
* - Depth-bounded to prevent stack overflow
*/
class derive {
ast_manager& m;
seq_util m_util;
arith_util m_autil;
bool_rewriter m_br;
seq_rewriter& m_re;
// Cache: maps (ele, regex) pair to its derivative
obj_pair_map<expr, expr, expr*> m_acache, m_bcache;
obj_pair_map<expr, expr, expr*> m_atop_cache, m_btop_cache; // post-simplify cache
expr_ref_vector m_trail; // pin cached results
// Op cache for ITE-hoisting operations (union, inter, concat, complement)
// Path-aware caches: key is (a, b, path_expr) for binary ops, (a, path_expr) for complement
obj_triple_map<expr, expr, expr, expr *> m_aunion_cache, m_bunion_cache, m_ainter_cache, m_binter_cache, m_axor_cache, m_bxor_cache;
obj_pair_map<expr, expr, expr*> m_aconcat_cache, m_bconcat_cache;
obj_pair_map<expr, expr, expr*> m_acomplement_cache, m_bcomplement_cache;
// Depth limiting
unsigned m_depth { 0 };
static const unsigned m_max_depth = 512;
seq_util::rex& re() { return m_util.re; }
seq_util& u() { return m_util; }
derivative_kind m_derivative_kind = derivative_kind::antimirov_t;
// The element (character) for the current derivative computation
expr_ref m_ele;
// Path state for inline pruning during mk_inter/mk_union/mk_complement
using intervals_t = svector<std::pair<unsigned, unsigned>>;
// Path: vector of signed atoms
svector<std::pair<expr*, bool>> m_path;
// Intervals: feasible character ranges under current path (append-only)
intervals_t m_intervals;
unsigned m_intervals_start { 0 };
// Stack of saved states for push/pop
struct path_save { unsigned path_sz; unsigned intervals_sz; unsigned intervals_start; expr* path_expr; };
svector<path_save> m_path_stack;
// Boolean expression encoding of current path (for cache keys)
expr_ref m_path_expr;
// Path interface
lbool push(expr* c, bool sign); // l_true: implied, l_undef: pushed (must pop), l_false: contradicts
void pop(); // restore state to matching push
expr* get_path_expr() { return m_path_expr; }
obj_pair_map<expr, expr, expr *> &cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_acache : m_bcache;
}
obj_pair_map<expr, expr, expr *> &top_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_atop_cache : m_btop_cache;
}
obj_triple_map<expr, expr, expr, expr *> &union_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_aunion_cache : m_bunion_cache;
}
obj_triple_map<expr, expr, expr, expr *> &inter_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_ainter_cache : m_binter_cache;
}
obj_triple_map<expr, expr, expr, expr *> &xor_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_axor_cache : m_bxor_cache;
}
obj_pair_map<expr, expr, expr *> &concat_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_aconcat_cache : m_bconcat_cache;
}
obj_pair_map<expr, expr, expr *> &complement_cache() {
return m_derivative_kind == derivative_kind::antimirov_t ? m_acomplement_cache : m_bcomplement_cache;
}
// Hoist ITE: apply_op through ite(c, t, e) with path pruning
expr_ref apply_ite(expr* c, expr* t, expr* e, expr* r, std::function<expr_ref(expr*, expr*)> apply_op);
expr_ref apply_ite(expr* c, expr* t1, expr* e1, expr* t2, expr* e2, std::function<expr_ref(expr*, expr*)> apply_op);
expr_ref apply_ite(expr* c, expr* t, expr* e, std::function<expr_ref(expr*)> apply_op);
// Common ITE dispatch for binary ops (union/inter)
expr_ref hoist_ite(expr* a, expr* b, std::function<expr_ref(expr*, expr*)> apply_op);
// Evaluate a condition against the current path/intervals
lbool eval_path_cond(expr* c);
// Internal helpers for push
lbool push_path_atoms(expr* c, bool sign);
lbool push_intervals_impl(expr* c, bool sign);
// Core derivative computation
expr_ref derive_rec(expr* r);
expr_ref derive_core(expr* r);
// Helpers for specific regex constructs
expr_ref derive_to_re(expr* s, sort* seq_sort);
expr_ref derive_range(expr* lo, expr* hi, sort* seq_sort);
expr_ref derive_of_pred(expr* pred, sort* seq_sort);
// Nullable check: returns a Boolean expression
expr_ref is_nullable(expr* r);
expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort);
// Smart constructors with path-aware simplification and ACI canonicalization
expr_ref mk_union(expr* a, expr* b);
bool are_complements(expr* a, expr* b);
unsigned union_id(expr* e); // complement-aware ID for sorting
bool is_subset(expr* a, expr* b);
expr_ref mk_union_core(expr* a, expr* b);
expr_ref mk_inter(expr* a, expr* b);
expr_ref mk_inter_core(expr* a, expr* b);
expr_ref mk_concat(expr* a, expr* b);
expr_ref mk_complement(expr* a);
expr_ref mk_complement_core(expr* a);
expr_ref mk_xor(expr *a, expr *b);
expr_ref mk_xor_core(expr *a, expr *b);
expr_ref mk_core(decl_kind k, expr* a, expr* b);
expr_ref mk_ite(expr* c, expr* t, expr* e);
// Distribute concatenation through ITE/union in derivative
expr_ref mk_deriv_concat(expr* d, expr* tail);
expr_ref mk_deriv_concat_core(expr* d, expr* tail);
// Extract head character and tail from a sequence expression
bool get_head_tail(expr* s1, expr* s2, expr_ref& hd, expr_ref& tl);
// Predicate implication for character range conditions.
bool pred_implies(bool sign_a, expr* a, bool sign_b, expr* b);
bool pred_implies(expr* a, expr* b);
// Normalize reverse(r)
expr_ref mk_regex_reverse(expr* r);
// Condition evaluation helpers
lbool eval_cond(expr* cond);
lbool eval_range_cond(expr* c);
void intersect_intervals(unsigned lo, unsigned hi);
void exclude_interval(unsigned lo, unsigned hi);
// Cofactor enumeration over a transition regex (ITE-tree).
void get_cofactors_rec(expr* r, expr_ref_pair_vector& result);
// Re-apply union/intersection simplifications bottom-up to a cofactor
// leaf. decompose_ite substitutes ITE branch values structurally
// (no simplification), so leaves can contain un-normalized nodes such
// as union(R, none) or inter(R, none); this rebuilds them through
// mk_union/mk_inter so equal states share a canonical form.
expr_ref clean_leaf(expr* r);
sort* re_sort(expr* r) { return r->get_sort(); }
sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; }
sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; }
void reset();
void reset_op_caches();
public:
derive(ast_manager& m, seq_rewriter& re);
/**
* Compute the derivative of regex r with respect to element ele.
* When ele is a de Bruijn variable, produces a symbolic ITE-tree.
* When ele is a concrete character, produces the concrete derivative.
*/
expr_ref operator()(derivative_kind k, expr* ele, expr* r);
/**
* Convenience: symbolic derivative using de Bruijn var 0.
*/
expr_ref operator()(derivative_kind k, expr* r);
/**
* Nullable check: returns a Boolean expression that is true iff r accepts the empty string.
*/
expr_ref nullable(expr* r) { return is_nullable(r); }
/**
* Enumerate the cofactors (min-terms) of a transition regex r taken with
* respect to element ele. r is an ITE-tree over character predicates on
* ele; for every feasible path through the tree this produces a pair
* (path_condition, leaf_regex). Infeasible character-interval
* combinations are pruned using the same path/interval context that the
* derivative engine uses while hoisting ITEs.
*/
void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result);
/**
* Compute the symbolic derivative of r and enumerate its reachable
* leaves in fully ITE-hoisted normal form.
*
* Concretely this returns, for every feasible minterm (character
* class) of δ(r), a pair (path_condition, target_regex). Every
* if-then-else over the input character (including ones that would
* otherwise be buried under a concat/union) is hoisted to the top
* via the same path/interval pruning used by the derivative engine,
* so each target_regex is free of (:var 0) and its nullability is
* always decidable. Unions are kept intact as single leaves (a
* union leaf denotes a single bisimulation state). Infeasible
* minterms are pruned, so all returned leaves are reachable.
*
* This is the entry point the regex_bisim equivalence procedure
* uses: it consumes the target_regex of each pair and ignores the
* (redundant) path condition.
*/
void derivative_cofactors(expr* r, expr_ref_pair_vector& result);
};
}

View file

@ -0,0 +1,168 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_range_collapse.cpp
Abstract:
Implementation of regex <-> range_predicate translation for the
boolean-combination-of-ranges fragment. See header for the recognized
grammar and the canonical regex AST emitted by materialization.
Authors:
Margus Veanes (veanes) 2026
--*/
#include "ast/rewriter/seq_range_collapse.h"
namespace seq {
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
// The range algebra only models sets of single characters over the
// unsigned character domain [0, max_char]. Guard against any regex
// whose element type is not a sequence of characters (e.g. a regex
// over (Seq Int) or (Seq (Seq Char))): for such regexes the
// re.range/re.union/... matchers below would silently fabricate a
// character-class predicate and change semantics. Reject them up
// front so callers fall back to the generic regex path.
sort* seq_sort = nullptr;
if (!u.is_re(r, seq_sort) || !u.is_string(seq_sort))
return false;
unsigned const max_char = u.max_char();
auto& re = u.re;
if (re.is_empty(r)) {
out = range_predicate::empty(max_char);
return true;
}
if (re.is_full_char(r)) {
out = range_predicate::top(max_char);
return true;
}
unsigned lo = 0, hi = 0;
expr* lo_e = nullptr;
expr* hi_e = nullptr;
if (re.is_range(r, lo_e, hi_e)) {
auto extract_char = [&](expr* e, unsigned& c) -> bool {
if (u.is_const_char(e, c)) return true;
expr* inner = nullptr;
if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true;
zstring s;
if (u.str.is_string(e, s) && s.length() == 1) {
c = s[0];
return true;
}
return false;
};
if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi))
return false;
// Empty/inverted range [lo > hi] is the empty regex.
if (lo > hi) {
out = range_predicate::empty(max_char);
return true;
}
out = range_predicate::range(lo, hi, max_char);
return true;
}
expr *a = nullptr, *b = nullptr, *c = nullptr;
if (re.is_union(r, a, b)) {
range_predicate pa(max_char), pb(max_char);
if (!regex_to_range_predicate(u, a, pa)) return false;
if (!regex_to_range_predicate(u, b, pb)) return false;
out = pa | pb;
return true;
}
auto mk_diff = [&](expr *a, expr *b) -> bool {
range_predicate pa(max_char), pb(max_char);
if (!regex_to_range_predicate(u, a, pa))
return false;
if (!regex_to_range_predicate(u, b, pb))
return false;
out = pa - pb;
return true;
};
if (re.is_diff(r, a, b))
return mk_diff(a, b);
if (re.is_intersection(r, a, b) && re.is_complement(b, c))
return mk_diff(a, c);
if (re.is_intersection(r, a, b) && re.is_complement(a, c))
return mk_diff(b, c);
if (re.is_intersection(r, a, b)) {
range_predicate pa(max_char), pb(max_char);
if (!regex_to_range_predicate(u, a, pa)) return false;
if (!regex_to_range_predicate(u, b, pb)) return false;
out = pa & pb;
return true;
}
// NOTE: re.complement is intentionally NOT handled here.
// re.complement is the SEQUENCE-level complement: its language
// includes the empty string, strings of length >= 2, and any
// length-1 string outside the operand. A character-class
// range_predicate can only describe a set of length-1 strings,
// so collapsing re.complement(R) to ~R (character-level
// complement) would change semantics whenever R is wrapped in
// any sequence-level context (e.g. re.diff at the top level,
// or membership tests). De-Morgan equivalences and the
// special cases re.complement(re.empty) / re.complement(re.full)
// are already handled directly in seq_rewriter::mk_re_complement.
return false;
}
static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
}
static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
ast_manager& m = u.get_manager();
if (lo == 0 && hi == u.max_char())
return expr_ref(u.re.mk_full_char(re_sort), m);
// Use the canonical unit-character form (seq.unit (Char N)) for
// range bounds. This matches the shape used elsewhere in
// seq_rewriter and avoids creating duplicate AST nodes with
// different ids for semantically identical ranges.
expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m);
expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m);
return expr_ref(u.re.mk_range(slo, shi), m);
}
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
ast_manager& m = u.get_manager();
sort* re_sort = u.re.mk_re(seq_sort);
if (p.is_empty())
return expr_ref(u.re.mk_empty(re_sort), m);
unsigned const n = p.num_ranges();
SASSERT(n > 0);
if (n == 1) {
auto [lo, hi] = p[0];
return mk_single_range_regex(u, lo, hi, re_sort);
}
// Build single-range AST nodes first, then sort by expression id
// so the resulting right-associated union matches the canonical
// id-sorted shape that seq_rewriter::merge_regex_sets expects.
// Without this the merge algorithm produces incorrect unions
// when it has to combine our materialized output with another
// (id-sorted) regex set.
expr_ref_vector ranges(m);
for (unsigned i = 0; i < n; ++i) {
auto [lo, hi] = p[i];
ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort));
}
std::sort(ranges.data(), ranges.data() + ranges.size(),
[](expr* a, expr* b) { return a->get_id() < b->get_id(); });
expr_ref acc(ranges.get(n - 1), m);
for (unsigned i = n - 1; i-- > 0; )
acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m);
return acc;
}
}

View file

@ -0,0 +1,71 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_range_collapse.h
Abstract:
Recognize regexes that are boolean combinations of character-class
primitives (re.empty, re.full_char, re.range with concrete chars,
and re.union/inter/comp/diff over translatable arguments), and
materialize a seq::range_predicate back into a canonical regex AST.
Together with seq_rewriter integration, this lets any boolean
combination of character-class regexes collapse to a canonical
multi-range form, so that equivalent character classes share AST
identity, and downstream consumers (derivative, OneStep, caching)
can short-circuit them as pure range predicates.
Authors:
Margus Veanes (veanes) 2026
--*/
#pragma once
#include "ast/rewriter/seq_range_predicate.h"
#include "ast/seq_decl_plugin.h"
namespace seq {
/**
* If r is a boolean combination of character-class regex primitives
* over the unsigned character domain [0, max_char], compute the
* equivalent range_predicate and return true. Otherwise return false
* with out untouched.
*
* Recognized fragment (all character-class-preserving operations):
* re.empty -> empty
* re.full_char_set -> top
* re.range "c_lo" "c_hi" (concrete) -> [c_lo, c_hi]
* re.union r1 r2 -> p1 | p2
* re.intersection r1 r2 -> p1 & p2
* re.diff r1 r2 -> p1 - p2
*
* Notably re.complement is NOT recognized: it is a SEQUENCE-level
* complement (over all of Σ*), not a character-class complement, so
* collapsing it would change semantics whenever the result is used
* in any non-character-class context. Sequence-level rewrites for
* re.complement (double-comp, deMorgan, etc.) are handled directly
* in seq_rewriter::mk_re_complement.
*/
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out);
/**
* Canonical materialization of p as a regex AST over the given
* sequence sort. Two range_predicates with equal canonical
* representations produce structurally identical regex ASTs:
*
* empty -> re.empty
* top -> re.full_char_set
* single range [lo, hi] -> re.range "lo" "hi"
* multiple ranges -> right-associated re.union of single
* ranges, in increasing order of lo
* (matching the canonical range order
* held by range_predicate).
*/
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort);
}

View file

@ -0,0 +1,292 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_range_predicate.cpp
Abstract:
Implementation of the specialized range-algebra used by symbolic
derivative computation and regex rewriting. See seq_range_predicate.h
for the algebraic specification.
All Boolean operations are implemented as single linear sweeps over
the canonical sorted range vectors and produce canonical output
(sorted, disjoint, non-adjacent).
Authors:
Margus Veanes (veanes) 2026
--*/
#include "ast/rewriter/seq_range_predicate.h"
#include "util/debug.h"
#include <algorithm>
#include <ostream>
namespace seq {
// -----------------------------------------------------------------------
// Factories
// -----------------------------------------------------------------------
range_predicate range_predicate::empty(unsigned max_char) {
return range_predicate(max_char);
}
range_predicate range_predicate::top(unsigned max_char) {
range_predicate r(max_char);
r.m_ranges.push_back({0u, max_char});
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::singleton(unsigned c, unsigned max_char) {
SASSERT(c <= max_char);
range_predicate r(max_char);
r.m_ranges.push_back({c, c});
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::range(unsigned lo, unsigned hi, unsigned max_char) {
range_predicate r(max_char);
if (lo <= hi && lo <= max_char) {
unsigned clipped_hi = hi <= max_char ? hi : max_char;
r.m_ranges.push_back({lo, clipped_hi});
}
SASSERT(r.well_formed());
return r;
}
// -----------------------------------------------------------------------
// Invariants and observers
// -----------------------------------------------------------------------
bool range_predicate::well_formed() const {
for (unsigned i = 0; i < m_ranges.size(); ++i) {
auto [lo, hi] = m_ranges[i];
if (lo > hi) return false;
if (hi > m_max_char) return false;
if (i > 0) {
unsigned prev_hi = m_ranges[i - 1].second;
// Non-adjacent and sorted: prev_hi + 1 < lo, with care
// around prev_hi == UINT_MAX which we never expect because
// hi <= m_max_char.
if (prev_hi + 1 >= lo) return false;
}
}
return true;
}
bool range_predicate::contains(unsigned c) const {
// Binary search on first element of pairs.
unsigned lo = 0, hi = m_ranges.size();
while (lo < hi) {
unsigned mid = lo + (hi - lo) / 2;
auto [a, b] = m_ranges[mid];
if (c < a) hi = mid;
else if (c > b) lo = mid + 1;
else return true;
}
return false;
}
uint64_t range_predicate::cardinality() const {
uint64_t n = 0;
for (auto [lo, hi] : m_ranges)
n += static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo) + 1u;
return n;
}
// -----------------------------------------------------------------------
// Equality, ordering, hashing
// -----------------------------------------------------------------------
bool range_predicate::equals(range_predicate const& o) const {
if (m_max_char != o.m_max_char) return false;
if (m_ranges.size() != o.m_ranges.size()) return false;
for (unsigned i = 0; i < m_ranges.size(); ++i)
if (m_ranges[i] != o.m_ranges[i]) return false;
return true;
}
bool range_predicate::operator<(range_predicate const& o) const {
if (m_max_char != o.m_max_char)
return m_max_char < o.m_max_char;
unsigned n = std::min(m_ranges.size(), o.m_ranges.size());
for (unsigned i = 0; i < n; ++i) {
auto a = m_ranges[i];
auto b = o.m_ranges[i];
if (a.first != b.first) return a.first < b.first;
if (a.second != b.second) return a.second < b.second;
}
return m_ranges.size() < o.m_ranges.size();
}
unsigned range_predicate::hash() const {
// FNV-1a 32-bit over (max_char, then each (lo, hi)).
uint32_t h = 2166136261u;
auto step = [&](uint32_t x) {
h ^= x;
h *= 16777619u;
};
step(m_max_char);
for (auto [lo, hi] : m_ranges) {
step(lo);
step(hi);
}
return h;
}
// -----------------------------------------------------------------------
// Boolean operations
// -----------------------------------------------------------------------
namespace {
// Append (lo, hi) to result, merging with the previous range if
// adjacent or overlapping. Maintains canonical form.
inline void append_merged(svector<std::pair<unsigned, unsigned>>& result,
unsigned lo, unsigned hi) {
SASSERT(lo <= hi);
if (!result.empty() && result.back().second + 1 >= lo) {
if (result.back().second < hi)
result.back().second = hi;
} else {
result.push_back({lo, hi});
}
}
}
range_predicate range_predicate::operator|(range_predicate const& o) const {
SASSERT(m_max_char == o.m_max_char);
range_predicate r(m_max_char);
unsigned i = 0, j = 0;
const unsigned n = m_ranges.size();
const unsigned m = o.m_ranges.size();
while (i < n && j < m) {
auto a = m_ranges[i];
auto b = o.m_ranges[j];
if (a.first <= b.first) {
append_merged(r.m_ranges, a.first, a.second);
++i;
} else {
append_merged(r.m_ranges, b.first, b.second);
++j;
}
}
while (i < n) {
auto a = m_ranges[i++];
append_merged(r.m_ranges, a.first, a.second);
}
while (j < m) {
auto b = o.m_ranges[j++];
append_merged(r.m_ranges, b.first, b.second);
}
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::operator&(range_predicate const& o) const {
SASSERT(m_max_char == o.m_max_char);
range_predicate r(m_max_char);
unsigned i = 0, j = 0;
const unsigned n = m_ranges.size();
const unsigned m = o.m_ranges.size();
while (i < n && j < m) {
auto [a_lo, a_hi] = m_ranges[i];
auto [b_lo, b_hi] = o.m_ranges[j];
unsigned lo = std::max(a_lo, b_lo);
unsigned hi = std::min(a_hi, b_hi);
if (lo <= hi)
r.m_ranges.push_back({lo, hi});
// Advance the range that ends first.
if (a_hi < b_hi) ++i;
else if (b_hi < a_hi) ++j;
else { ++i; ++j; }
}
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::operator~() const {
range_predicate r(m_max_char);
unsigned cursor = 0;
for (auto [lo, hi] : m_ranges) {
if (cursor < lo)
r.m_ranges.push_back({cursor, lo - 1});
// Step past hi without overflow: hi <= m_max_char and we
// only step when more characters remain.
if (hi >= m_max_char) {
cursor = m_max_char + 1; // sentinel: no more characters
break;
}
cursor = hi + 1;
}
if (cursor <= m_max_char)
r.m_ranges.push_back({cursor, m_max_char});
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::operator-(range_predicate const& o) const {
SASSERT(m_max_char == o.m_max_char);
// A - B by linear sweep: for each range of A, subtract overlapping
// ranges of B. Both inputs are sorted so we advance j monotonically.
range_predicate r(m_max_char);
unsigned j = 0;
const unsigned m = o.m_ranges.size();
for (auto [a_lo, a_hi] : m_ranges) {
unsigned cursor = a_lo;
while (j < m && o.m_ranges[j].second < cursor)
++j;
unsigned k = j;
while (k < m && o.m_ranges[k].first <= a_hi) {
auto [b_lo, b_hi] = o.m_ranges[k];
if (cursor < b_lo)
r.m_ranges.push_back({cursor, std::min(a_hi, b_lo - 1)});
if (b_hi >= a_hi) {
cursor = a_hi + 1;
break;
}
cursor = b_hi + 1;
++k;
}
if (cursor <= a_hi)
r.m_ranges.push_back({cursor, a_hi});
}
SASSERT(r.well_formed());
return r;
}
range_predicate range_predicate::operator^(range_predicate const& o) const {
SASSERT(m_max_char == o.m_max_char);
// (A | B) - (A & B), but implemented directly with one linear sweep
// over the union of breakpoints.
return (*this | o) - (*this & o);
}
// -----------------------------------------------------------------------
// Display
// -----------------------------------------------------------------------
std::ostream& range_predicate::display(std::ostream& out) const {
if (m_ranges.empty()) {
return out << "[]";
}
out << "[";
bool first = true;
for (auto [lo, hi] : m_ranges) {
if (!first) out << ",";
first = false;
if (lo == hi)
out << lo;
else
out << lo << "-" << hi;
}
return out << "]";
}
}

View file

@ -0,0 +1,127 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_range_predicate.h
Abstract:
Specialized range-algebra over an unsigned character domain [0, max_char].
A range_predicate represents a subset of the character domain as a
sorted sequence of non-overlapping, non-adjacent, non-empty ranges:
[(lo_0, hi_0), (lo_1, hi_1), ...] with hi_i + 1 < lo_{i+1}.
The representation is canonical, so two range_predicates over the same
domain are extensionally equivalent iff their internal vectors are
elementwise equal.
All Boolean operations (union, intersection, complement, difference)
are linear in the total number of ranges and produce the canonical
representation.
Intended use:
* path conditions for symbolic derivative computation,
* OneStep predicates capturing length-1 acceptance,
* smart-constructor side conditions for regex rewrites such as
R & psi --> toregex(OneStep(R) & psi).
The type is a pure value: no ast_manager allocation occurs in its
construction or its Boolean operations. Conversion to and from
expr* is the responsibility of a separate translator (see callers
in seq_derive / seq_rewriter).
Authors:
Margus Veanes (veanes) 2026
--*/
#pragma once
#include "util/vector.h"
#include <iosfwd>
#include <utility>
namespace seq {
class range_predicate {
using range_t = std::pair<unsigned, unsigned>;
using ranges_t = svector<range_t>;
// Sorted by first; ranges are disjoint and non-adjacent;
// every range satisfies lo <= hi <= m_max_char.
ranges_t m_ranges;
unsigned m_max_char { 0 };
// Invariant check used in debug builds.
bool well_formed() const;
public:
range_predicate() = default;
explicit range_predicate(unsigned max_char) : m_max_char(max_char) {}
// ---------------- Factory functions ----------------
static range_predicate empty(unsigned max_char);
static range_predicate top(unsigned max_char);
static range_predicate singleton(unsigned c, unsigned max_char);
static range_predicate range(unsigned lo, unsigned hi, unsigned max_char);
// ---------------- Observers ----------------
unsigned max_char() const { return m_max_char; }
unsigned num_ranges() const { return m_ranges.size(); }
range_t operator[](unsigned i) const { return m_ranges[i]; }
ranges_t const& ranges() const { return m_ranges; }
bool is_empty() const { return m_ranges.empty(); }
bool is_top() const {
return m_ranges.size() == 1
&& m_ranges[0].first == 0
&& m_ranges[0].second == m_max_char;
}
bool is_singleton(unsigned& c) const {
if (m_ranges.size() != 1) return false;
if (m_ranges[0].first != m_ranges[0].second) return false;
c = m_ranges[0].first;
return true;
}
bool contains(unsigned c) const;
// Number of characters in the predicate (well-defined for any domain).
uint64_t cardinality() const;
// ---------------- Equality, ordering, hashing ----------------
bool equals(range_predicate const& o) const;
bool operator==(range_predicate const& o) const { return equals(o); }
bool operator!=(range_predicate const& o) const { return !equals(o); }
// Total order: lexicographic on the canonical range sequence,
// with shorter sequences ordered before longer prefixes.
// Predicates over different domains compare by max_char first.
bool operator<(range_predicate const& o) const;
bool less_than(range_predicate const& o) const { return *this < o; }
unsigned hash() const;
// ---------------- Boolean operations ----------------
range_predicate operator|(range_predicate const& o) const; // union
range_predicate operator&(range_predicate const& o) const; // intersection
range_predicate operator-(range_predicate const& o) const; // difference
range_predicate operator^(range_predicate const& o) const; // symmetric diff
range_predicate operator~() const; // complement
// ---------------- Display ----------------
std::ostream& display(std::ostream& out) const;
};
inline std::ostream& operator<<(std::ostream& out, range_predicate const& p) {
return p.display(out);
}
}

View file

@ -85,45 +85,6 @@ namespace seq {
return is_ground(r);
}
/*
Collect the leaves of a t-regex der (an ITE-tree whose leaves are
regex expressions) into the output vector. Empty (re.empty) leaves
are dropped.
Each leaf is treated as a single bisimulation state regardless of
its top-level shape (including re.union and re.antimirov_union):
descending into a union at the leaf would split one state into
several, which is semantically unsound for the bisimulation /
union-find merging that follows.
Returns false if we encountered an unexpected node (e.g. a free
variable creeping in) in that case the caller should bail out.
*/
bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) {
ptr_vector<expr> work;
obj_hashtable<expr> seen;
work.push_back(der);
seen.insert(der);
while (!work.empty()) {
expr* e = work.back();
work.pop_back();
expr* c = nullptr, * t = nullptr, * f = nullptr;
if (m.is_ite(e, c, t, f)) {
if (seen.insert_if_not_there(t))
work.push_back(t);
if (seen.insert_if_not_there(f))
work.push_back(f);
continue;
}
if (m_util.re.is_empty(e))
continue;
if (!m_util.is_re(e))
return false;
leaves.push_back(e);
}
return true;
}
/*
Fast inequivalence check based on the get_info().classical flag.
@ -232,15 +193,19 @@ namespace seq {
m_worklist.pop_back();
// Compute the symbolic derivative wrt the canonical variable
// (:var 0). The result is a transition regex (ITE tree) whose
// leaves are regex expressions. We use the classical Brzozowski
// entry point so the derivative stays as a single TRegex and
// does not lift unions to the top via antimirov nodes — this
// preserves the XOR-pair invariant the bisimulation relies on.
expr_ref d(m_rw.mk_brz_derivative(r), m);
// (:var 0) and enumerate its reachable leaves in fully
// ITE-hoisted normal form. Every if-then-else over the input
// character — even one that would otherwise be buried under a
// concat or union — is hoisted to the top and infeasible
// minterms are pruned, so each leaf is a ground regex free of
// (:var 0) whose nullability is always decidable. Unions are
// kept intact as single leaves (a union leaf denotes a single
// bisimulation state, never a split into separate states).
expr_ref_pair_vector cofs(m);
m_rw.brz_derivative_cofactors(r, cofs);
expr_ref_vector leaves(m);
if (!collect_leaves(d, leaves))
return l_undef;
for (auto const& p : cofs)
leaves.push_back(p.second);
// First pass: check for any nullable leaf (definitive
// distinguishing empty-continuation word) or any classically

View file

@ -74,7 +74,6 @@ namespace seq {
unsigned node_of(expr* r);
bool merge_leaf(expr* xor_pair);
bool collect_leaves(expr* der, expr_ref_vector& leaves);
lbool nullability(expr* r);
bool is_supported(expr* r);
// Returns true if the leaf l proves that the original pair is

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,7 @@ Notes:
#pragma once
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_derive.h"
#include "ast/ast_pp.h"
#include "ast/arith_decl_plugin.h"
#include "ast/rewriter/rewriter_types.h"
@ -128,15 +129,20 @@ class seq_rewriter {
void insert(decl_kind op, expr* a, expr* b, expr* c, expr* r);
};
friend class seq::derive;
seq_util m_util;
seq_subset m_subset;
arith_util m_autil;
bool_rewriter m_br;
seq::derive m_derive;
// re2automaton m_re2aut;
op_cache m_op_cache;
expr_ref_vector m_es, m_lhs, m_rhs;
bool m_coalesce_chars;
bool m_in_bisim { false };
bool m_coalesce_chars = false;
bool m_in_bisim { false };
unsigned m_re_deriv_depth { 0 };
static const unsigned m_max_re_deriv_depth = 512;
enum length_comparison {
shorter_c,
@ -179,8 +185,6 @@ class seq_rewriter {
expr_ref re_replace_char(expr *r, unsigned a_ch, unsigned b_ch, expr *a_str, expr *b_str);
// Calculate derivative, memoized and enforcing a normal form
expr_ref is_nullable_rec(expr* r);
expr_ref mk_derivative_rec(expr* ele, expr* r);
expr_ref mk_der_op(decl_kind k, expr* a, expr* b);
expr_ref mk_der_op_rec(decl_kind k, expr* a, expr* b);
expr_ref mk_der_concat(expr* a, expr* b);
@ -191,26 +195,10 @@ class seq_rewriter {
expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort);
expr_ref mk_der_antimirov_union(expr* r1, expr* r2);
bool ite_bdds_compatible(expr* a, expr* b);
/* if r has the form deriv(en..deriv(e1,to_re(s))..) returns 's = [e1..en]' else returns '() in r'*/
expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort);
#ifdef Z3DEBUG
bool check_deriv_normal_form(expr* r, int level = 3);
#endif
void mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref& result);
expr_ref mk_antimirov_deriv(expr* e, expr* r, expr* path);
expr_ref mk_in_antimirov_rec(expr* s, expr* d);
expr_ref mk_in_antimirov(expr* s, expr* d);
expr_ref mk_antimirov_deriv_intersection(expr* elem, expr* d1, expr* d2, expr* path);
expr_ref mk_antimirov_deriv_concat(expr* d, expr* r);
expr_ref mk_antimirov_deriv_negate(expr* elem, expr* d);
expr_ref mk_antimirov_deriv_union(expr* d1, expr* d2);
expr_ref mk_antimirov_deriv_restrict(expr* elem, expr* d1, expr* cond);
expr_ref mk_regex_reverse(expr* r);
expr_ref mk_regex_concat(expr* r1, expr* r2);
expr_ref merge_regex_sets(expr* r1, expr* r2, expr* unit, std::function<bool(expr*, expr*&, expr*&)>& decompose, std::function<expr* (expr*, expr*)>& compose);
// elem is (:var 0) and path a condition that may have (:var 0) as a free variable
@ -267,6 +255,14 @@ class seq_rewriter {
br_status mk_re_union0(expr* a, expr* b, expr_ref& result);
br_status mk_re_inter0(expr* a, expr* b, expr_ref& result);
br_status mk_re_complement(expr* a, expr_ref& result);
// Range-set collapse helpers: if the operands form a boolean
// combination of character-class regexes, materialize the result as a
// canonical regex over a single range_predicate. See
// ast/rewriter/seq_range_collapse.h for the recognized fragment.
// NOTE: re.complement is intentionally not in this set because it
// operates at the sequence level, not the character-class level.
bool try_collapse_re_union(expr* a, expr* b, expr_ref& result);
bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result);
br_status mk_re_star(expr* a, expr_ref& result);
br_status mk_re_diff(expr* a, expr* b, expr_ref& result);
br_status mk_re_xor(expr* a, expr* b, expr_ref& result);
@ -351,9 +347,9 @@ class seq_rewriter {
public:
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m),
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this),
m_op_cache(m), m_es(m),
m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
m_lhs(m), m_rhs(m) {
}
ast_manager & m() const { return m_util.get_manager(); }
family_id get_fid() const { return m_util.get_family_id(); }
@ -364,7 +360,7 @@ public:
static void get_param_descrs(param_descrs & r);
bool coalesce_chars() const { return m_coalesce_chars; }
// bool coalesce_chars() const { return m_coalesce_chars; }
br_status mk_app_core(func_decl * f, unsigned num_args, expr * const * args, expr_ref & result);
br_status mk_eq_core(expr * lhs, expr * rhs, expr_ref & result);
@ -380,6 +376,34 @@ public:
return result;
}
expr_ref mk_xor0(expr *a, expr *b) {
expr_ref result(m());
if (mk_re_xor0(a, b, result) == BR_FAILED)
result = re().mk_xor(a, b);
return result;
}
expr_ref mk_union(expr *a, expr *b) {
expr_ref result(m());
if (mk_re_union(a, b, result) == BR_FAILED)
result = re().mk_union(a, b);
return result;
}
expr_ref mk_inter(expr *a, expr *b) {
expr_ref result(m());
if (mk_re_inter(a, b, result) == BR_FAILED)
result = re().mk_inter(a, b);
return result;
}
expr_ref mk_complement(expr *a) {
expr_ref result(m());
if (mk_re_complement(a, result) == BR_FAILED)
result = re().mk_complement(a);
return result;
}
/*
* makes concat and simplifies
*/
@ -440,7 +464,31 @@ public:
procedure which relies on each leaf of D(p XOR q) being a coherent
XOR pair (D_v p) XOR (D_v q).
*/
expr_ref mk_brz_derivative(expr* r);
expr_ref mk_brz_derivative(expr *r) {
return mk_derivative(r);
}
/*
Enumerate the cofactors (min-terms) of a transition regex r taken with
respect to ele. Produces (path_condition, leaf_regex) pairs for every
feasible path through the ITE-tree, pruning infeasible character ranges.
Delegates to the derivative engine so the same path/interval context used
while hoisting ITEs is reused for the leaf simplification.
*/
void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result) {
m_derive.get_cofactors(ele, r, result);
}
/*
Compute the symbolic derivative of r and enumerate its reachable leaves
in fully ITE-hoisted normal form: a list of (path_condition, target)
pairs where every target is free of (:var 0) (so nullability is always
decidable) and unions are kept intact as single states. Used by
regex_bisim, which consumes the targets and ignores the path conditions.
*/
void brz_derivative_cofactors(expr* r, expr_ref_pair_vector& result) {
m_derive.derivative_cofactors(r, result);
}
// heuristic elimination of element from condition that comes form a derivative.
// special case optimization for conjunctions of equalities, disequalities and ranges.
@ -451,6 +499,8 @@ public:
/* Apply simplifications to the intersection to keep it normalized (r1 and r2 are not normalized)*/
expr_ref mk_regex_inter_normalize(expr* r1, expr* r2);
expr_ref mk_regex_concat(expr *r1, expr *r2);
/*
* Extract some string that is a member of r.
* Return true if a valid string was extracted.

View file

@ -19,7 +19,7 @@ Author:
bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
while (true) {
if (a == b)
return true;
if (m_re.is_empty(a))
@ -30,7 +30,7 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
return true;
if (depth >= m_max_depth)
return false;
return false;
expr* a1 = nullptr, * a2 = nullptr, * b1 = nullptr, * b2 = nullptr;
unsigned la, ua, lb, ub;
@ -39,16 +39,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
if (m_re.is_dot_plus(b) && m_re.get_info(a).nullable == l_false)
return true;
// a ⊆ a*
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth))
return true;
// e ⊆ a*
if (m_re.is_epsilon(a) && m_re.is_star(b, b1))
return true;
// R ⊆ R*
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth + 1))
// a ⊆ a*: if b = b1* and a ⊆ b1, then a ⊆ b1*
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth))
return true;
// R1* ⊆ R2* if R1 ⊆ R2
@ -112,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth))
return true;
// prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*).
// Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'.
// Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*.
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2))
return true;
// R ⊆ R'·Σ* if R ⊆ R'
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth))
return true;
@ -144,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
bool seq_subset::is_subset(expr* a, expr* b) const {
return is_subset_rec(a, b, 0);
}
bool seq_subset::ends_with(expr* a, expr* suf) const {
if (a == suf)
return true;
// Flatten both regexes into their sequence of concatenation factors
// (independent of left/right associativity) and test list-suffix equality.
ptr_vector<expr> af, sf;
flatten_concat(a, af);
flatten_concat(suf, sf);
if (sf.size() > af.size())
return false;
unsigned off = af.size() - sf.size();
for (unsigned i = 0; i < sf.size(); ++i)
if (af[off + i] != sf[i])
return false;
return true;
}
void seq_subset::flatten_concat(expr* a, ptr_vector<expr>& out) const {
expr* a1 = nullptr, * a2 = nullptr;
if (m_re.is_concat(a, a1, a2)) {
flatten_concat(a1, out);
flatten_concat(a2, out);
}
else
out.push_back(a);
}

View file

@ -24,6 +24,12 @@ class seq_subset {
bool is_subset_rec(expr* a, expr* b, unsigned depth) const;
// true if regex a, viewed as a flattened concatenation, has suf as a
// structural (concatenation) suffix.
bool ends_with(expr* a, expr* suf) const;
void flatten_concat(expr* a, ptr_vector<expr>& out) const;
public:
explicit seq_subset(seq_util::rex& re) : m_re(re) {}
bool is_subset(expr* a, expr* b) const;

View file

@ -461,6 +461,24 @@ namespace smt {
if (re().is_empty(r))
//trivially true
return;
// When one side is re.none the equation is a pure emptiness check on
// the other regex (symmetric_diff already returned it as r). Decide
// it directly by antimirov NFA reachability instead of running the
// bisimulation/XOR closure, which would build large un-canonicalized
// product states for intersections of contains-patterns.
if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) {
switch (re_is_empty(r)) {
case l_true:
STRACE(seq_regex_brief, tout << "empty:eq ";);
return; // languages equal (both empty): trivially true
case l_false:
STRACE(seq_regex_brief, tout << "empty:neq ";);
th.add_axiom(~th.mk_eq(r1, r2, false), false_literal);
return;
case l_undef:
break;
}
}
// Try the bisimulation procedure on ground regexes first. If it
// returns a definite answer, dispatch the corresponding axiom and
// bypass the symbolic emptiness/derivative closure.
@ -562,7 +580,7 @@ namespace smt {
lits.push_back(null_lit);
expr_ref_pair_vector cofactors(m);
get_cofactors(d, cofactors);
seq_rw().get_cofactors(hd, d, cofactors);
for (auto const& p : cofactors) {
if (is_member(p.second, u))
continue;
@ -671,6 +689,67 @@ namespace smt {
return result;
}
/*
Decide emptiness of a ground regex r via antimirov-mode NFA
reachability.
The symbolic derivative engine runs in antimirov mode, so the
derivative of an intersection distributes into a *set* of individual
product states inter(A_i, B_j) (each a small, ground regex) rather
than one giant union-of-intersections term. get_derivative_targets
enumerates these NFA successor states.
We short-circuit to l_false (non-empty) as soon as a reachable state
is nullable (accepts the empty word) or classical (a regex built only
from to_re/all/union/concat/star/plus/opt/loop, hence non-empty). An
intersection itself is never classical, but once one operand reduces
to Σ* the intersection collapses (via the derivative's subset
simplification) to the other, classical, operand.
If the worklist is exhausted with no such state, r is empty (l_true).
Returns l_undef if a step bound is hit, so callers can fall back to
the general procedure.
*/
lbool seq_regex::re_is_empty(expr* r) {
if (re().is_empty(r))
return l_true;
expr_ref_vector pinned(m);
obj_hashtable<expr> visited;
ptr_vector<expr> work;
work.push_back(r);
visited.insert(r);
pinned.push_back(r);
unsigned const bound = 100000;
unsigned steps = 0;
while (!work.empty()) {
if (++steps > bound)
return l_undef;
expr* s = work.back();
work.pop_back();
auto info = re().get_info(s);
if (!info.is_known())
return l_undef;
// ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty.
if (info.nullable == l_true || info.classical)
return l_false;
// Dead state: prune (min_length == UINT_MAX means no word is
// accepted from here).
if (info.min_length == UINT_MAX)
continue;
expr_ref_vector targets(m);
get_derivative_targets(s, targets);
for (expr* t : targets) {
if (visited.contains(t))
continue;
visited.insert(t);
pinned.push_back(t);
work.push_back(t);
}
}
return l_true;
}
/*
Return a list of all target regexes in the derivative of a regex r,
ignoring the conditions along each path.
@ -707,53 +786,26 @@ namespace smt {
/*
Return a list of all (cond, leaf) pairs in a given derivative
expression r.
expression r, where elem is the character symbol the derivative was
taken with respect to.
Note: this implementation is inefficient: it simply collects all expressions under an if and
iterates over all combinations.
The transition regexes produced by the symbolic derivative engine are
ITE-trees over character predicates ci on elem (equalities such as
elem = 'A', and ranges such as 'a' <= elem <= 'z'). These predicates
are typically mutually exclusive, so the number of feasible truth
assignments to {c1,..,ck} ("minterms") is small.
This method is still used by:
The enumeration is delegated to seq::derive (via seq_rw().get_cofactors)
so it reuses the very same path/interval context that the derivative
engine uses while hoisting ITEs: each feasible path through the ITE-tree
yields one (path_condition, leaf) cofactor, infeasible character-range
combinations are pruned, and the leaf is simplified with the path-aware
smart constructors.
This is used by:
propagate_is_empty
propagate_is_non_empty
*/
void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) {
obj_hashtable<expr> ifs;
expr* cond = nullptr, * r1 = nullptr, * r2 = nullptr;
for (expr* e : subterms::ground(expr_ref(r, m)))
if (m.is_ite(e, cond, r1, r2))
ifs.insert(cond);
expr_ref_vector rs(m);
vector<expr_ref_vector> conds;
conds.push_back(expr_ref_vector(m));
rs.push_back(r);
for (expr* c : ifs) {
unsigned sz = conds.size();
expr_safe_replace rep1(m);
expr_safe_replace rep2(m);
rep1.insert(c, m.mk_true());
rep2.insert(c, m.mk_false());
expr_ref r2(m);
for (unsigned i = 0; i < sz; ++i) {
expr_ref_vector cs = conds[i];
cs.push_back(mk_not(m, c));
conds.push_back(cs);
conds[i].push_back(c);
expr_ref r1(rs.get(i), m);
rep1(r1, r2);
rs[i] = r2;
rep2(r1, r2);
rs.push_back(r2);
}
}
for (unsigned i = 0; i < conds.size(); ++i) {
expr_ref conj = mk_and(conds[i]);
expr_ref r(rs.get(i), m);
ctx.get_rewriter()(r);
if (!m.is_false(conj) && !re().is_empty(r))
result.push_back(conj, r);
}
}
/*
is_empty(r, u) => ~is_nullable(r)
@ -781,7 +833,7 @@ namespace smt {
d = mk_derivative_wrapper(hd, r);
literal_vector lits;
expr_ref_pair_vector cofactors(m);
get_cofactors(d, cofactors);
seq_rw().get_cofactors(hd, d, cofactors);
for (auto const& p : cofactors) {
if (is_member(p.second, u))
continue;

View file

@ -164,7 +164,12 @@ namespace smt {
// returned by derivative_wrapper
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
void get_derivative_targets(expr* r, expr_ref_vector& targets);
void get_cofactors(expr* r, expr_ref_pair_vector& result);
// Decide emptiness of a ground regex by antimirov-mode NFA
// reachability: explore derivative target states, short-circuiting to
// "non-empty" on the first reachable nullable or classical state.
// Returns l_true (empty), l_false (non-empty), l_undef (gave up).
lbool re_is_empty(expr* r);
/*
Pretty print the regex of the state id to the out stream,

View file

@ -115,15 +115,18 @@ add_executable(test-z3
polynomial_factorization.cpp
polynorm.cpp
prime_generator.cpp
seq_regex_bisim.cpp
proof_checker.cpp
qe_arith.cpp
mbp_qel.cpp
quant_elim.cpp
quant_solve.cpp
random.cpp
range_predicate.cpp
rational.cpp
rcf.cpp
region.cpp
regex_range_collapse.cpp
sat_local_search.cpp
sat_lookahead.cpp
sat_user_scope.cpp

View file

@ -113,6 +113,8 @@
X(api_bug) \
X(api_special_relations) \
X(arith_rewriter) \
X(range_predicate) \
X(regex_range_collapse) \
X(seq_rewriter) \
X(check_assumptions) \
X(smt_context) \
@ -195,6 +197,7 @@
X(finite_set) \
X(finite_set_rewriter) \
X(fpa) \
X(seq_regex_bisim) \
X(term_enumeration) \
X(lcube)

View file

@ -0,0 +1,260 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
test/range_predicate.cpp
Abstract:
Unit tests for the range-algebra value type seq::range_predicate.
The tests exercise:
* factory constructors and canonical-form invariants,
* extensional equality and total ordering,
* Boolean operations (|, &, ~, -, ^) on hand-picked instances,
* exhaustive verification of de-Morgan and lattice laws on a
small character domain, by enumerating every subset.
Author:
Margus Veanes (veanes) 2026
--*/
#include "ast/rewriter/seq_range_predicate.h"
#include "util/debug.h"
#include <cstdint>
#include <iostream>
#include <sstream>
using seq::range_predicate;
namespace {
// Build a range_predicate from a bitmask over [0, max_char] for testing.
range_predicate from_mask(uint64_t mask, unsigned max_char) {
range_predicate r = range_predicate::empty(max_char);
for (unsigned c = 0; c <= max_char; ++c)
if ((mask >> c) & 1u)
r = r | range_predicate::singleton(c, max_char);
return r;
}
// Convert a range_predicate back to a bitmask for cross-checking.
uint64_t to_mask(range_predicate const& r) {
uint64_t mask = 0;
for (unsigned c = 0; c <= r.max_char(); ++c)
if (r.contains(c))
mask |= (uint64_t(1) << c);
return mask;
}
void test_factories() {
auto e = range_predicate::empty(255);
ENSURE(e.is_empty());
ENSURE(!e.is_top());
ENSURE(e.num_ranges() == 0);
ENSURE(e.cardinality() == 0);
auto t = range_predicate::top(255);
ENSURE(!t.is_empty());
ENSURE(t.is_top());
ENSURE(t.num_ranges() == 1);
ENSURE(t.cardinality() == 256);
ENSURE(t.contains(0));
ENSURE(t.contains(255));
auto s = range_predicate::singleton(42, 255);
ENSURE(s.num_ranges() == 1);
ENSURE(s.cardinality() == 1);
ENSURE(s.contains(42));
ENSURE(!s.contains(41));
unsigned c = 0;
ENSURE(s.is_singleton(c));
ENSURE(c == 42);
auto r = range_predicate::range(10, 20, 255);
ENSURE(r.num_ranges() == 1);
ENSURE(r.cardinality() == 11);
ENSURE(r.contains(10));
ENSURE(r.contains(20));
ENSURE(!r.contains(9));
ENSURE(!r.contains(21));
// Reversed bounds produce empty.
auto bad = range_predicate::range(20, 10, 255);
ENSURE(bad.is_empty());
// Clipping at max_char.
auto clipped = range_predicate::range(200, 1000, 255);
ENSURE(clipped.num_ranges() == 1);
ENSURE(clipped[0] == std::make_pair(200u, 255u));
}
void test_equality_and_order() {
auto a = range_predicate::range(1, 5, 31);
auto b = range_predicate::range(1, 5, 31);
auto c = range_predicate::range(1, 6, 31);
ENSURE(a == b);
ENSURE(a != c);
ENSURE(a.hash() == b.hash());
ENSURE(a < c || c < a);
ENSURE(!(a < a));
auto empty = range_predicate::empty(31);
ENSURE(empty < a);
// Canonical merging of adjacent ranges.
auto d = range_predicate::range(0, 4, 31) | range_predicate::range(5, 10, 31);
auto e = range_predicate::range(0, 10, 31);
ENSURE(d == e);
}
void test_union_intersection_hand() {
unsigned const M = 31;
auto a = range_predicate::range(0, 4, M) | range_predicate::range(10, 14, M);
auto b = range_predicate::range(3, 11, M);
auto u = a | b; // [0,14]
ENSURE(u.num_ranges() == 1);
ENSURE(u[0] == std::make_pair(0u, 14u));
auto i = a & b; // [3,4] U [10,11]
ENSURE(i.num_ranges() == 2);
ENSURE(i[0] == std::make_pair(3u, 4u));
ENSURE(i[1] == std::make_pair(10u, 11u));
auto d = a - b; // [0,2] U [12,14]
ENSURE(d.num_ranges() == 2);
ENSURE(d[0] == std::make_pair(0u, 2u));
ENSURE(d[1] == std::make_pair(12u, 14u));
auto x = a ^ b; // [0,2] U [5,9] U [12,14]
ENSURE(x.num_ranges() == 3);
ENSURE(x[0] == std::make_pair(0u, 2u));
ENSURE(x[1] == std::make_pair(5u, 9u));
ENSURE(x[2] == std::make_pair(12u, 14u));
}
void test_complement_hand() {
unsigned const M = 10;
auto e = range_predicate::empty(M);
ENSURE((~e).is_top());
auto t = range_predicate::top(M);
ENSURE((~t).is_empty());
// ~([2,3] U [7,8]) = [0,1] U [4,6] U [9,10]
auto a = range_predicate::range(2, 3, M) | range_predicate::range(7, 8, M);
auto na = ~a;
ENSURE(na.num_ranges() == 3);
ENSURE(na[0] == std::make_pair(0u, 1u));
ENSURE(na[1] == std::make_pair(4u, 6u));
ENSURE(na[2] == std::make_pair(9u, 10u));
// ~([0,4]) = [5,10]
auto b = range_predicate::range(0, 4, M);
auto nb = ~b;
ENSURE(nb.num_ranges() == 1);
ENSURE(nb[0] == std::make_pair(5u, 10u));
// ~([5,10]) = [0,4]
auto cnb = ~nb;
ENSURE(cnb == b);
}
// Exhaustively verify the lattice / de-Morgan laws on a small domain
// by enumerating every possible subset (bitmask).
void test_exhaustive_laws() {
unsigned const M = 5; // 6 characters -> 64 subsets
unsigned const N = 1u << (M + 1);
for (unsigned i = 0; i < N; ++i) {
range_predicate A = from_mask(i, M);
ENSURE(to_mask(A) == i);
// ~ ~ A == A
ENSURE(~~A == A);
// A | ~A == top
ENSURE((A | ~A).is_top());
// A & ~A == empty
ENSURE((A & ~A).is_empty());
// cardinality matches popcount
unsigned pop = 0;
for (unsigned k = 0; k <= M; ++k) if ((i >> k) & 1u) ++pop;
ENSURE(A.cardinality() == pop);
}
for (unsigned i = 0; i < N; ++i) {
range_predicate A = from_mask(i, M);
for (unsigned j = 0; j < N; ++j) {
range_predicate B = from_mask(j, M);
// Bitmask reference semantics.
ENSURE(to_mask(A | B) == (i | j));
ENSURE(to_mask(A & B) == (i & j));
ENSURE(to_mask(A - B) == (i & ~j & ((1u << (M + 1)) - 1u)));
ENSURE(to_mask(A ^ B) == (i ^ j));
// de-Morgan
ENSURE(~(A | B) == (~A & ~B));
ENSURE(~(A & B) == (~A | ~B));
// Commutativity
ENSURE((A | B) == (B | A));
ENSURE((A & B) == (B & A));
// (A - B) == A & ~B
ENSURE((A - B) == (A & ~B));
// (A ^ B) == (A | B) - (A & B)
ENSURE((A ^ B) == ((A | B) - (A & B)));
// Extensional equality is reflexive on equal masks.
if (i == j) {
ENSURE(A == B);
ENSURE(A.hash() == B.hash());
}
}
}
}
void test_total_order_strict() {
unsigned const M = 5;
unsigned const N = 1u << (M + 1);
// Strict total order: for any distinct A, B exactly one of A<B, B<A holds.
for (unsigned i = 0; i < N; ++i) {
range_predicate A = from_mask(i, M);
ENSURE(!(A < A));
for (unsigned j = i + 1; j < N; ++j) {
range_predicate B = from_mask(j, M);
bool lt = A < B;
bool gt = B < A;
ENSURE(lt != gt);
ENSURE(lt || gt);
}
}
}
void test_display() {
std::ostringstream oss;
oss << range_predicate::empty(31);
ENSURE(oss.str() == "[]");
oss.str("");
oss << range_predicate::range(3, 7, 31);
ENSURE(oss.str() == "[3-7]");
oss.str("");
oss << range_predicate::singleton(9, 31);
ENSURE(oss.str() == "[9]");
oss.str("");
auto p = range_predicate::range(0, 2, 31) | range_predicate::singleton(5, 31);
oss << p;
ENSURE(oss.str() == "[0-2,5]");
}
}
void tst_range_predicate() {
test_factories();
test_equality_and_order();
test_union_intersection_hand();
test_complement_hand();
test_exhaustive_laws();
test_total_order_strict();
test_display();
std::cout << "range_predicate unit tests passed\n";
}

View file

@ -0,0 +1,244 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
regex_range_collapse.cpp - unit tests
--*/
#include "ast/rewriter/seq_range_collapse.h"
#include "ast/reg_decl_plugins.h"
#include "ast/ast_pp.h"
#include "ast/arith_decl_plugin.h"
#include "util/util.h"
#include <iostream>
namespace {
using seq::range_predicate;
using seq::regex_to_range_predicate;
using seq::range_predicate_to_regex;
static void check(bool ok, char const* what) {
if (!ok) {
std::cerr << "regex_range_collapse FAILED: " << what << "\n";
ENSURE(false);
}
}
static expr_ref mk_singleton_str(seq_util& u, unsigned c) {
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
}
static bool extract_range_chars(seq_util& u, expr* e, unsigned& lo, unsigned& hi) {
expr* lo_e = nullptr; expr* hi_e = nullptr;
if (!u.re.is_range(e, lo_e, hi_e))
return false;
// Accept either string-constant or (seq.unit (Char N)) bound form.
if (u.re.is_range(e, lo, hi))
return true;
expr* lc = nullptr; expr* hc = nullptr;
if (u.str.is_unit(lo_e, lc) && u.is_const_char(lc, lo) &&
u.str.is_unit(hi_e, hc) && u.is_const_char(hc, hi))
return true;
return false;
}
static void run() {
ast_manager m;
reg_decl_plugins(m);
seq_util u(m);
unsigned const M = u.max_char();
sort* str_sort = u.str.mk_string_sort();
sort* re_sort = u.re.mk_re(str_sort);
// primitives
{
range_predicate p(M);
check(regex_to_range_predicate(u, u.re.mk_empty(re_sort), p) && p.is_empty(),
"re.empty -> empty");
check(regex_to_range_predicate(u, u.re.mk_full_char(re_sort), p) && p.is_top(),
"re.full_char -> top");
}
// re.range "a" "z"
{
range_predicate p(M);
expr_ref a = mk_singleton_str(u, 'a');
expr_ref z = mk_singleton_str(u, 'z');
expr_ref r(u.re.mk_range(a, z), m);
check(regex_to_range_predicate(u, r, p) && p.num_ranges() == 1 &&
p[0].first == 'a' && p[0].second == 'z',
"re.range a z -> [a,z]");
}
// Disjoint union: (a..z) | (0..9)
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
expr_ref un(u.re.mk_union(r1, r2), m);
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 2,
"(a-z)|(0-9) -> 2 ranges");
// canonical order: lower lo first
check(p[0].first == '0' && p[0].second == '9' && p[1].first == 'a' && p[1].second == 'z',
"(a-z)|(0-9) ranges in canonical order");
}
// Overlapping union: (a..c) | (b..f) -> (a..f)
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'b'), mk_singleton_str(u, 'f')), m);
expr_ref un(u.re.mk_union(r1, r2), m);
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
p[0].first == 'a' && p[0].second == 'f',
"(a-c)|(b-f) -> (a-f)");
}
// Adjacent union: (a..c) | (d..f) -> (a..f) (canonical predicate merges adjacent)
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'd'), mk_singleton_str(u, 'f')), m);
expr_ref un(u.re.mk_union(r1, r2), m);
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
p[0].first == 'a' && p[0].second == 'f',
"(a-c)|(d-f) -> (a-f) via adjacency");
}
// Disjoint intersection: (a..z) & (0..9) -> empty
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
expr_ref ix(u.re.mk_inter(r1, r2), m);
check(regex_to_range_predicate(u, ix, p) && p.is_empty(),
"(a-z)&(0-9) -> empty");
}
// Overlapping intersection: (a..f) & (c..z) -> (c..f)
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
expr_ref ix(u.re.mk_inter(r1, r2), m);
check(regex_to_range_predicate(u, ix, p) && p.num_ranges() == 1 &&
p[0].first == 'c' && p[0].second == 'f',
"(a-f)&(c-z) -> (c-f)");
}
// Complement: re.complement is intentionally NOT a char-class op
// (it operates over Σ*), so it must NOT be translated.
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
expr_ref cmp(u.re.mk_complement(r1), m);
check(!regex_to_range_predicate(u, cmp, p),
"re.comp of range is NOT translatable (sequence-level complement)");
}
// Diff: (a..f) \ (c..z) -> (a..b)
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
expr_ref df(u.re.mk_diff(r1, r2), m);
check(regex_to_range_predicate(u, df, p) && p.num_ranges() == 1 &&
p[0].first == 'a' && p[0].second == 'b',
"(a-f) \\ (c-z) -> (a-b)");
}
// Negative: re.* of a range is NOT a char class
{
range_predicate p(M);
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
expr_ref star(u.re.mk_star(r1), m);
check(!regex_to_range_predicate(u, star, p),
"re.* of range not translatable");
}
// Negative: a regex whose element type is NOT a sequence of
// characters (here (Seq Int)) must be rejected outright, even for
// shapes that structurally resemble char-class operators.
{
range_predicate p(M);
arith_util a(m);
sort* int_seq = u.str.mk_seq(a.mk_int());
sort* int_re = u.re.mk_re(int_seq);
check(!regex_to_range_predicate(u, u.re.mk_empty(int_re), p),
"re.empty over (Seq Int) is NOT a char class");
check(!regex_to_range_predicate(u, u.re.mk_full_char(int_re), p),
"re.full_char over (Seq Int) is NOT a char class");
}
// ---- materialization round-trip ----
// empty -> re.empty
{
range_predicate p = range_predicate::empty(M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
check(u.re.is_empty(e), "empty -> re.empty");
}
// top -> re.full_char
{
range_predicate p = range_predicate::top(M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
check(u.re.is_full_char(e), "top -> re.full_char");
}
// single range -> re.range
{
range_predicate p = range_predicate::range('a', 'z', M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
unsigned lo = 0, hi = 0;
check(extract_range_chars(u, e, lo, hi) && lo == 'a' && hi == 'z',
"[a-z] -> re.range a z");
}
// singleton -> re.range c c
{
range_predicate p = range_predicate::singleton('A', M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
unsigned lo = 0, hi = 0;
check(extract_range_chars(u, e, lo, hi) && lo == 'A' && hi == 'A',
"{A} -> re.range A A");
}
// 2 ranges -> re.union(range_0, range_1) in canonical order
{
range_predicate p = range_predicate::range('0', '9', M)
| range_predicate::range('a', 'z', M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
expr* a = nullptr; expr* b = nullptr;
check(u.re.is_union(e, a, b), "2-range -> union");
unsigned lo0 = 0, hi0 = 0, lo1 = 0, hi1 = 0;
check(extract_range_chars(u, a, lo0, hi0) && lo0 == '0' && hi0 == '9',
"union arg0 = (0-9) (canonical: lower lo first)");
check(extract_range_chars(u, b, lo1, hi1) && lo1 == 'a' && hi1 == 'z',
"union arg1 = (a-z)");
}
// 3 ranges -> right-associated union
{
range_predicate p = range_predicate::range(0, 5, M)
| range_predicate::range(10, 15, M)
| range_predicate::range(20, 25, M);
expr_ref e = range_predicate_to_regex(u, p, str_sort);
expr* a = nullptr; expr* rest = nullptr;
check(u.re.is_union(e, a, rest), "3-range -> union(...)");
unsigned lo = 0, hi = 0;
check(extract_range_chars(u, a, lo, hi) && lo == 0 && hi == 5, "first arg = (0-5)");
expr* b = nullptr; expr* c = nullptr;
check(u.re.is_union(rest, b, c), "rest is union(...,...)");
check(extract_range_chars(u, b, lo, hi) && lo == 10 && hi == 15, "second range");
check(extract_range_chars(u, c, lo, hi) && lo == 20 && hi == 25, "third range");
}
// Round-trip identity for an arbitrary range-set
{
range_predicate p_in = range_predicate::range('a', 'c', M)
| range_predicate::range('m', 'p', M)
| range_predicate::range('x', 'z', M);
expr_ref e = range_predicate_to_regex(u, p_in, str_sort);
range_predicate p_out(M);
check(regex_to_range_predicate(u, e, p_out), "round-trip translatable");
check(p_in == p_out, "round-trip equal");
}
std::cerr << "regex_range_collapse tests passed\n";
}
}
void tst_regex_range_collapse() {
run();
}

View file

@ -0,0 +1,127 @@
// Regression test for the seq::derive::intersect_intervals bug.
//
// Background: derive uses a path-tracking interval set to compute symbolic
// derivatives. The intersect_intervals routine used to react to a single
// disjoint interval by dropping the entire kept suffix and skipping the rest
// of the list, which silently killed valid branches in derivatives such as
// D(a|b). That made the bisimulation procedure conclude bogus equalities
// like a* == (a|b)*.
//
// This file also covers the seq::derive top-level-cache poisoning bug.
// `m_top_cache` is keyed only by the regex; the routine used to populate it
// while `m_ele` was set to a *concrete* character, baking that character
// into the cached "symbolic" derivative. Subsequent calls with the same
// regex but a different ele then returned a stale concrete answer instead
// of the true symbolic derivative. The simplest victim is
// (str.in_re "aP" (re.++ (re.* "a") "P"))
// which used to return false because the derivative wrt 'a' was cached and
// re-used as the derivative wrt 'P'.
#include "ast/ast.h"
#include "ast/ast_pp.h"
#include "ast/reg_decl_plugins.h"
#include "ast/seq_decl_plugin.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/rewriter/seq_regex_bisim.h"
#include "ast/rewriter/th_rewriter.h"
#include <iostream>
static void test_a_star_neq_ab_star() {
ast_manager m;
reg_decl_plugins(m);
seq_util u(m);
seq_rewriter rw(m);
sort_ref str_sort(u.str.mk_string_sort(), m);
zstring sa("a"), sb("b");
expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
expr_ref re_b(u.re.mk_to_re(u.str.mk_string(sb)), m);
expr_ref a_star(u.re.mk_star(re_a), m);
expr_ref ab(u.re.mk_union(re_a, re_b), m);
expr_ref ab_star(u.re.mk_star(ab), m);
expr_ref d_ab = rw.mk_brz_derivative(ab);
std::cout << "D(a|b) = " << mk_pp(d_ab, m) << "\n";
// Both the 'a' branch and the 'b' branch of D(a|b) must reach epsilon.
// Collect the regex leaves of the symbolic derivative and require at
// least two distinct accepting leaves (one for 'a' and one for 'b').
expr_ref_vector leaves(m);
auto collect = [&](expr* e, auto&& self) -> void {
expr* c, *t, *f;
if (m.is_ite(e, c, t, f) || u.re.is_union(e, t, f) || u.re.is_antimirov_union(e, t, f)) {
self(t, self);
self(f, self);
return;
}
if (u.re.is_empty(e)) return;
leaves.push_back(e);
};
collect(d_ab, collect);
unsigned nullable_leaves = 0;
for (expr* l : leaves) {
expr_ref n = rw.is_nullable(l);
if (m.is_true(n)) ++nullable_leaves;
}
std::cout << "D(a|b) leaves=" << leaves.size()
<< " nullable=" << nullable_leaves << "\n";
ENSURE(nullable_leaves >= 2);
// Bisim must report the two languages are not equivalent.
seq::regex_bisim bisim(rw);
lbool eq = bisim.are_equivalent(a_star, ab_star);
std::cout << "bisim(a*, (a|b)*) = "
<< (eq == l_true ? "true" : eq == l_false ? "false" : "undef") << "\n";
ENSURE(eq == l_false);
}
// Regression for the derive top-level-cache poisoning bug.
// Take r = (re.* "a") ++ "P" and check str.in_re "aP" r. Before the fix
// the first per-char derivative call (wrt 'a') populated m_top_cache with
// 'a' baked into the symbolic ITE-tree, so the next call (wrt 'P') returned
// that stale cached value instead of computing D_P(r) = epsilon, making
// str.in_re wrongly return false.
static void test_derive_cache_per_ele() {
ast_manager m;
reg_decl_plugins(m);
seq_util u(m);
seq_rewriter rw(m);
sort_ref str_sort(u.str.mk_string_sort(), m);
zstring sa("a"), sP("P"), s_aP("aP");
expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
expr_ref re_P(u.re.mk_to_re(u.str.mk_string(sP)), m);
expr_ref a_star(u.re.mk_star(re_a), m);
expr_ref r(u.re.mk_concat(a_star, re_P), m);
expr_ref aP(u.str.mk_string(s_aP), m);
// Compute D_'a'(a*P) and D_'P'(a*P) directly via mk_derivative.
// Before the fix, m_top_cache was populated while m_ele = ele (the
// concrete char), so the second call hit the stale cached answer from
// the first. After the fix the cache is keyed by a symbolic var, so
// each concrete-ele substitution produces the right answer.
expr_ref ch_a(u.mk_char('a'), m);
expr_ref ch_P(u.mk_char('P'), m);
expr_ref d_a = rw.mk_derivative(ch_a, r);
expr_ref d_P = rw.mk_derivative(ch_P, r);
std::cout << "D_a(a*P) = " << mk_pp(d_a, m) << "\n";
std::cout << "D_P(a*P) = " << mk_pp(d_P, m) << "\n";
// D_P(a*P) must be nullable (it accepts the empty suffix), while
// D_a(a*P) must not be (it still needs a trailing 'P').
expr_ref n_a = rw.is_nullable(d_a);
expr_ref n_P = rw.is_nullable(d_P);
th_rewriter trw(m);
trw(n_a);
trw(n_P);
std::cout << "nullable(D_a) = " << mk_pp(n_a, m) << "\n";
std::cout << "nullable(D_P) = " << mk_pp(n_P, m) << "\n";
ENSURE(m.is_false(n_a));
ENSURE(m.is_true(n_P));
}
void tst_seq_regex_bisim() {
test_a_star_neq_ab_star();
test_derive_cache_per_ele();
}