mirror of
https://github.com/Z3Prover/z3
synced 2026-06-26 10:28:48 +00:00
Derive with ranges (#9963)
Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Margus Veanes <margus@microsoft.com> Co-authored-by: Margus Veanes <veanes@users.noreply.github.com>
This commit is contained in:
parent
0596c7c634
commit
22c2635786
24 changed files with 3462 additions and 1000 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,9 +1,12 @@
|
|||
*~
|
||||
rebase.cmd
|
||||
reports/
|
||||
crashes/
|
||||
*.pyc
|
||||
*.pyo
|
||||
# Ignore callgrind files
|
||||
callgrind.out.*
|
||||
gmon.out
|
||||
# .hpp files are automatically generated
|
||||
*.hpp
|
||||
.env
|
||||
|
|
|
|||
BIN
gmon.out
BIN
gmon.out
Binary file not shown.
|
|
@ -39,7 +39,11 @@ z3_add_component(rewriter
|
|||
rewriter.cpp
|
||||
seq_axioms.cpp
|
||||
seq_eq_solver.cpp
|
||||
seq_derive.cpp
|
||||
seq_subset.cpp
|
||||
seq_derive.cpp
|
||||
seq_range_collapse.cpp
|
||||
seq_range_predicate.cpp
|
||||
seq_rewriter.cpp
|
||||
seq_regex_bisim.cpp
|
||||
seq_skolem.cpp
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Notes:
|
|||
#include "ast/rewriter/bool_rewriter.h"
|
||||
#include "params/bool_rewriter_params.hpp"
|
||||
#include "ast/rewriter/rewriter_def.h"
|
||||
#include "ast/rewriter/expr_safe_replace.h"
|
||||
#include "ast/ast_lt.h"
|
||||
#include "ast/for_each_expr.h"
|
||||
#include <algorithm>
|
||||
|
|
@ -1185,4 +1186,30 @@ void bool_rewriter::mk_ge2(expr* a, expr* b, expr* c, expr_ref& r) {
|
|||
}
|
||||
|
||||
|
||||
template class rewriter_tpl<bool_rewriter_cfg>;
|
||||
bool bool_rewriter::decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el) {
|
||||
expr *cond = nullptr, *r1 = nullptr, *r2 = nullptr;
|
||||
if (m().is_ite(r, cond, r1, r2)) {
|
||||
c = cond;
|
||||
th = r1;
|
||||
el = r2;
|
||||
return true;
|
||||
}
|
||||
for (expr *e : subterms::ground(expr_ref(r, m()))) {
|
||||
if (m().is_ite(e, cond, r1, r2)) {
|
||||
m_rep1.reset();
|
||||
m_rep2.reset();
|
||||
m_rep1.insert(e, r1);
|
||||
m_rep2.insert(e, r2);
|
||||
c = cond;
|
||||
th = r;
|
||||
el = r;
|
||||
m_rep1(th);
|
||||
m_rep2(el);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
template class rewriter_tpl<bool_rewriter_cfg>;
|
||||
|
|
@ -20,6 +20,7 @@ Notes:
|
|||
|
||||
#include "ast/ast.h"
|
||||
#include "ast/rewriter/rewriter.h"
|
||||
#include "ast/rewriter/expr_safe_replace.h"
|
||||
#include "util/params.h"
|
||||
|
||||
/**
|
||||
|
|
@ -64,6 +65,7 @@ class bool_rewriter {
|
|||
ptr_vector<expr> m_todo1, m_todo2;
|
||||
unsigned_vector m_counts1, m_counts2;
|
||||
expr_mark m_marked;
|
||||
expr_safe_replace m_rep1, m_rep2;
|
||||
|
||||
br_status mk_flat_and_core(unsigned num_args, expr * const * args, expr_ref & result);
|
||||
br_status mk_flat_or_core(unsigned num_args, expr * const * args, expr_ref & result);
|
||||
|
|
@ -87,7 +89,7 @@ class bool_rewriter {
|
|||
expr_ref simplify_eq_ite(expr* value, expr* ite);
|
||||
|
||||
public:
|
||||
bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0) {
|
||||
bool_rewriter(ast_manager & m, params_ref const & p = params_ref()):m_manager(m), m_local_ctx_cost(0), m_rep1(m), m_rep2(m) {
|
||||
updt_params(p);
|
||||
}
|
||||
ast_manager & m() const { return m_manager; }
|
||||
|
|
@ -242,6 +244,11 @@ public:
|
|||
void mk_nand(expr * arg1, expr * arg2, expr_ref & result);
|
||||
void mk_nor(expr * arg1, expr * arg2, expr_ref & result);
|
||||
void mk_ge2(expr* a, expr* b, expr* c, expr_ref& result);
|
||||
|
||||
// If r is, or contains, an if-then-else, decompose it into a top-level
|
||||
// ite by hoisting the (first) inner ite condition: returns c, th, el such
|
||||
// that r is equivalent to (ite c th el). Returns false if r has no ite.
|
||||
bool decompose_ite(expr *r, expr_ref &c, expr_ref &th, expr_ref &el);
|
||||
};
|
||||
|
||||
struct bool_rewriter_cfg : public default_rewriter_cfg {
|
||||
|
|
|
|||
1416
src/ast/rewriter/seq_derive.cpp
Normal file
1416
src/ast/rewriter/seq_derive.cpp
Normal file
File diff suppressed because it is too large
Load diff
265
src/ast/rewriter/seq_derive.h
Normal file
265
src/ast/rewriter/seq_derive.h
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_derive.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Symbolic derivative computation for regular expressions.
|
||||
Produces an ITE-tree (transition regex) representation where
|
||||
the free variable is de Bruijn index 0 representing the input character.
|
||||
|
||||
Based on the theory of symbolic derivatives and transition regexes:
|
||||
- Veanes et al., "On Symbolic Derivatives and Transition Regexes" (LPAR 2024)
|
||||
- Varatalu, Veanes, Ernits, "RE#" (POPL 2025)
|
||||
- Stanford, Veanes, Bjørner, "Symbolic Boolean Derivatives" (PLDI 2021)
|
||||
|
||||
Authors:
|
||||
|
||||
Nikolaj Bjorner (nbjorner) 2025-06-03
|
||||
|
||||
--*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/arith_decl_plugin.h"
|
||||
#include "ast/array_decl_plugin.h"
|
||||
#include "ast/rewriter/bool_rewriter.h"
|
||||
#include "util/obj_pair_hashtable.h"
|
||||
#include "util/obj_triple_hashtable.h"
|
||||
#include <functional>
|
||||
|
||||
class seq_rewriter;
|
||||
|
||||
namespace seq {
|
||||
|
||||
enum class derivative_kind { antimirov_t, brzozowski_t };
|
||||
/**
|
||||
* Symbolic derivative engine for regular expressions.
|
||||
*
|
||||
* Given a regex r, operator()(r) computes a symbolic derivative δ(r)
|
||||
* represented as an ITE-tree over character predicates (using de Bruijn
|
||||
* variable 0 for the character). Evaluating the ITE-tree for a concrete
|
||||
* character 'a' yields the classical Brzozowski derivative δ_a(r).
|
||||
*
|
||||
* The ITE-tree structure implicitly defines minterms (equivalence classes
|
||||
* of characters indistinguishable by the regex).
|
||||
*
|
||||
* Key properties:
|
||||
* - Results are memoized for termination on cyclic derivative graphs
|
||||
* - Union/intersection operands are sorted for ACI canonicalization
|
||||
* - Depth-bounded to prevent stack overflow
|
||||
*/
|
||||
class derive {
|
||||
ast_manager& m;
|
||||
seq_util m_util;
|
||||
arith_util m_autil;
|
||||
bool_rewriter m_br;
|
||||
seq_rewriter& m_re;
|
||||
|
||||
// Cache: maps (ele, regex) pair to its derivative
|
||||
obj_pair_map<expr, expr, expr*> m_acache, m_bcache;
|
||||
obj_pair_map<expr, expr, expr*> m_atop_cache, m_btop_cache; // post-simplify cache
|
||||
expr_ref_vector m_trail; // pin cached results
|
||||
|
||||
// Op cache for ITE-hoisting operations (union, inter, concat, complement)
|
||||
// Path-aware caches: key is (a, b, path_expr) for binary ops, (a, path_expr) for complement
|
||||
obj_triple_map<expr, expr, expr, expr *> m_aunion_cache, m_bunion_cache, m_ainter_cache, m_binter_cache, m_axor_cache, m_bxor_cache;
|
||||
obj_pair_map<expr, expr, expr*> m_aconcat_cache, m_bconcat_cache;
|
||||
obj_pair_map<expr, expr, expr*> m_acomplement_cache, m_bcomplement_cache;
|
||||
|
||||
// Depth limiting
|
||||
unsigned m_depth { 0 };
|
||||
static const unsigned m_max_depth = 512;
|
||||
|
||||
seq_util::rex& re() { return m_util.re; }
|
||||
seq_util& u() { return m_util; }
|
||||
|
||||
derivative_kind m_derivative_kind = derivative_kind::antimirov_t;
|
||||
|
||||
// The element (character) for the current derivative computation
|
||||
expr_ref m_ele;
|
||||
|
||||
// Path state for inline pruning during mk_inter/mk_union/mk_complement
|
||||
using intervals_t = svector<std::pair<unsigned, unsigned>>;
|
||||
|
||||
// Path: vector of signed atoms
|
||||
svector<std::pair<expr*, bool>> m_path;
|
||||
// Intervals: feasible character ranges under current path (append-only)
|
||||
intervals_t m_intervals;
|
||||
unsigned m_intervals_start { 0 };
|
||||
// Stack of saved states for push/pop
|
||||
struct path_save { unsigned path_sz; unsigned intervals_sz; unsigned intervals_start; expr* path_expr; };
|
||||
svector<path_save> m_path_stack;
|
||||
// Boolean expression encoding of current path (for cache keys)
|
||||
expr_ref m_path_expr;
|
||||
|
||||
// Path interface
|
||||
lbool push(expr* c, bool sign); // l_true: implied, l_undef: pushed (must pop), l_false: contradicts
|
||||
void pop(); // restore state to matching push
|
||||
expr* get_path_expr() { return m_path_expr; }
|
||||
|
||||
obj_pair_map<expr, expr, expr *> &cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_acache : m_bcache;
|
||||
}
|
||||
|
||||
obj_pair_map<expr, expr, expr *> &top_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_atop_cache : m_btop_cache;
|
||||
}
|
||||
|
||||
obj_triple_map<expr, expr, expr, expr *> &union_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_aunion_cache : m_bunion_cache;
|
||||
}
|
||||
|
||||
obj_triple_map<expr, expr, expr, expr *> &inter_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_ainter_cache : m_binter_cache;
|
||||
}
|
||||
|
||||
obj_triple_map<expr, expr, expr, expr *> &xor_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_axor_cache : m_bxor_cache;
|
||||
}
|
||||
|
||||
obj_pair_map<expr, expr, expr *> &concat_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_aconcat_cache : m_bconcat_cache;
|
||||
}
|
||||
|
||||
obj_pair_map<expr, expr, expr *> &complement_cache() {
|
||||
return m_derivative_kind == derivative_kind::antimirov_t ? m_acomplement_cache : m_bcomplement_cache;
|
||||
}
|
||||
|
||||
// Hoist ITE: apply_op through ite(c, t, e) with path pruning
|
||||
expr_ref apply_ite(expr* c, expr* t, expr* e, expr* r, std::function<expr_ref(expr*, expr*)> apply_op);
|
||||
expr_ref apply_ite(expr* c, expr* t1, expr* e1, expr* t2, expr* e2, std::function<expr_ref(expr*, expr*)> apply_op);
|
||||
expr_ref apply_ite(expr* c, expr* t, expr* e, std::function<expr_ref(expr*)> apply_op);
|
||||
// Common ITE dispatch for binary ops (union/inter)
|
||||
expr_ref hoist_ite(expr* a, expr* b, std::function<expr_ref(expr*, expr*)> apply_op);
|
||||
|
||||
// Evaluate a condition against the current path/intervals
|
||||
lbool eval_path_cond(expr* c);
|
||||
|
||||
// Internal helpers for push
|
||||
lbool push_path_atoms(expr* c, bool sign);
|
||||
lbool push_intervals_impl(expr* c, bool sign);
|
||||
|
||||
// Core derivative computation
|
||||
expr_ref derive_rec(expr* r);
|
||||
expr_ref derive_core(expr* r);
|
||||
|
||||
// Helpers for specific regex constructs
|
||||
expr_ref derive_to_re(expr* s, sort* seq_sort);
|
||||
expr_ref derive_range(expr* lo, expr* hi, sort* seq_sort);
|
||||
expr_ref derive_of_pred(expr* pred, sort* seq_sort);
|
||||
|
||||
// Nullable check: returns a Boolean expression
|
||||
expr_ref is_nullable(expr* r);
|
||||
expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort);
|
||||
|
||||
// Smart constructors with path-aware simplification and ACI canonicalization
|
||||
expr_ref mk_union(expr* a, expr* b);
|
||||
bool are_complements(expr* a, expr* b);
|
||||
unsigned union_id(expr* e); // complement-aware ID for sorting
|
||||
bool is_subset(expr* a, expr* b);
|
||||
expr_ref mk_union_core(expr* a, expr* b);
|
||||
expr_ref mk_inter(expr* a, expr* b);
|
||||
expr_ref mk_inter_core(expr* a, expr* b);
|
||||
expr_ref mk_concat(expr* a, expr* b);
|
||||
expr_ref mk_complement(expr* a);
|
||||
expr_ref mk_complement_core(expr* a);
|
||||
expr_ref mk_xor(expr *a, expr *b);
|
||||
expr_ref mk_xor_core(expr *a, expr *b);
|
||||
expr_ref mk_core(decl_kind k, expr* a, expr* b);
|
||||
expr_ref mk_ite(expr* c, expr* t, expr* e);
|
||||
|
||||
// Distribute concatenation through ITE/union in derivative
|
||||
expr_ref mk_deriv_concat(expr* d, expr* tail);
|
||||
expr_ref mk_deriv_concat_core(expr* d, expr* tail);
|
||||
|
||||
// Extract head character and tail from a sequence expression
|
||||
bool get_head_tail(expr* s1, expr* s2, expr_ref& hd, expr_ref& tl);
|
||||
|
||||
// Predicate implication for character range conditions.
|
||||
bool pred_implies(bool sign_a, expr* a, bool sign_b, expr* b);
|
||||
bool pred_implies(expr* a, expr* b);
|
||||
|
||||
// Normalize reverse(r)
|
||||
expr_ref mk_regex_reverse(expr* r);
|
||||
|
||||
// Condition evaluation helpers
|
||||
lbool eval_cond(expr* cond);
|
||||
lbool eval_range_cond(expr* c);
|
||||
void intersect_intervals(unsigned lo, unsigned hi);
|
||||
void exclude_interval(unsigned lo, unsigned hi);
|
||||
|
||||
// Cofactor enumeration over a transition regex (ITE-tree).
|
||||
void get_cofactors_rec(expr* r, expr_ref_pair_vector& result);
|
||||
|
||||
// Re-apply union/intersection simplifications bottom-up to a cofactor
|
||||
// leaf. decompose_ite substitutes ITE branch values structurally
|
||||
// (no simplification), so leaves can contain un-normalized nodes such
|
||||
// as union(R, none) or inter(R, none); this rebuilds them through
|
||||
// mk_union/mk_inter so equal states share a canonical form.
|
||||
expr_ref clean_leaf(expr* r);
|
||||
|
||||
sort* re_sort(expr* r) { return r->get_sort(); }
|
||||
sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; }
|
||||
sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; }
|
||||
|
||||
void reset();
|
||||
void reset_op_caches();
|
||||
|
||||
public:
|
||||
derive(ast_manager& m, seq_rewriter& re);
|
||||
|
||||
/**
|
||||
* Compute the derivative of regex r with respect to element ele.
|
||||
* When ele is a de Bruijn variable, produces a symbolic ITE-tree.
|
||||
* When ele is a concrete character, produces the concrete derivative.
|
||||
*/
|
||||
expr_ref operator()(derivative_kind k, expr* ele, expr* r);
|
||||
|
||||
/**
|
||||
* Convenience: symbolic derivative using de Bruijn var 0.
|
||||
*/
|
||||
expr_ref operator()(derivative_kind k, expr* r);
|
||||
|
||||
/**
|
||||
* Nullable check: returns a Boolean expression that is true iff r accepts the empty string.
|
||||
*/
|
||||
expr_ref nullable(expr* r) { return is_nullable(r); }
|
||||
|
||||
/**
|
||||
* Enumerate the cofactors (min-terms) of a transition regex r taken with
|
||||
* respect to element ele. r is an ITE-tree over character predicates on
|
||||
* ele; for every feasible path through the tree this produces a pair
|
||||
* (path_condition, leaf_regex). Infeasible character-interval
|
||||
* combinations are pruned using the same path/interval context that the
|
||||
* derivative engine uses while hoisting ITEs.
|
||||
*/
|
||||
void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result);
|
||||
|
||||
/**
|
||||
* Compute the symbolic derivative of r and enumerate its reachable
|
||||
* leaves in fully ITE-hoisted normal form.
|
||||
*
|
||||
* Concretely this returns, for every feasible minterm (character
|
||||
* class) of δ(r), a pair (path_condition, target_regex). Every
|
||||
* if-then-else over the input character (including ones that would
|
||||
* otherwise be buried under a concat/union) is hoisted to the top
|
||||
* via the same path/interval pruning used by the derivative engine,
|
||||
* so each target_regex is free of (:var 0) and its nullability is
|
||||
* always decidable. Unions are kept intact as single leaves (a
|
||||
* union leaf denotes a single bisimulation state). Infeasible
|
||||
* minterms are pruned, so all returned leaves are reachable.
|
||||
*
|
||||
* This is the entry point the regex_bisim equivalence procedure
|
||||
* uses: it consumes the target_regex of each pair and ignores the
|
||||
* (redundant) path condition.
|
||||
*/
|
||||
void derivative_cofactors(expr* r, expr_ref_pair_vector& result);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
168
src/ast/rewriter/seq_range_collapse.cpp
Normal file
168
src/ast/rewriter/seq_range_collapse.cpp
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_range_collapse.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Implementation of regex <-> range_predicate translation for the
|
||||
boolean-combination-of-ranges fragment. See header for the recognized
|
||||
grammar and the canonical regex AST emitted by materialization.
|
||||
|
||||
Authors:
|
||||
|
||||
Margus Veanes (veanes) 2026
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_range_collapse.h"
|
||||
|
||||
namespace seq {
|
||||
|
||||
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out) {
|
||||
// The range algebra only models sets of single characters over the
|
||||
// unsigned character domain [0, max_char]. Guard against any regex
|
||||
// whose element type is not a sequence of characters (e.g. a regex
|
||||
// over (Seq Int) or (Seq (Seq Char))): for such regexes the
|
||||
// re.range/re.union/... matchers below would silently fabricate a
|
||||
// character-class predicate and change semantics. Reject them up
|
||||
// front so callers fall back to the generic regex path.
|
||||
sort* seq_sort = nullptr;
|
||||
if (!u.is_re(r, seq_sort) || !u.is_string(seq_sort))
|
||||
return false;
|
||||
|
||||
unsigned const max_char = u.max_char();
|
||||
auto& re = u.re;
|
||||
|
||||
if (re.is_empty(r)) {
|
||||
out = range_predicate::empty(max_char);
|
||||
return true;
|
||||
}
|
||||
if (re.is_full_char(r)) {
|
||||
out = range_predicate::top(max_char);
|
||||
return true;
|
||||
}
|
||||
unsigned lo = 0, hi = 0;
|
||||
expr* lo_e = nullptr;
|
||||
expr* hi_e = nullptr;
|
||||
if (re.is_range(r, lo_e, hi_e)) {
|
||||
auto extract_char = [&](expr* e, unsigned& c) -> bool {
|
||||
if (u.is_const_char(e, c)) return true;
|
||||
expr* inner = nullptr;
|
||||
if (u.str.is_unit(e, inner) && u.is_const_char(inner, c)) return true;
|
||||
zstring s;
|
||||
if (u.str.is_string(e, s) && s.length() == 1) {
|
||||
c = s[0];
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
if (!extract_char(lo_e, lo) || !extract_char(hi_e, hi))
|
||||
return false;
|
||||
// Empty/inverted range [lo > hi] is the empty regex.
|
||||
if (lo > hi) {
|
||||
out = range_predicate::empty(max_char);
|
||||
return true;
|
||||
}
|
||||
out = range_predicate::range(lo, hi, max_char);
|
||||
return true;
|
||||
}
|
||||
expr *a = nullptr, *b = nullptr, *c = nullptr;
|
||||
if (re.is_union(r, a, b)) {
|
||||
range_predicate pa(max_char), pb(max_char);
|
||||
if (!regex_to_range_predicate(u, a, pa)) return false;
|
||||
if (!regex_to_range_predicate(u, b, pb)) return false;
|
||||
out = pa | pb;
|
||||
return true;
|
||||
}
|
||||
auto mk_diff = [&](expr *a, expr *b) -> bool {
|
||||
range_predicate pa(max_char), pb(max_char);
|
||||
if (!regex_to_range_predicate(u, a, pa))
|
||||
return false;
|
||||
if (!regex_to_range_predicate(u, b, pb))
|
||||
return false;
|
||||
out = pa - pb;
|
||||
return true;
|
||||
};
|
||||
if (re.is_diff(r, a, b))
|
||||
return mk_diff(a, b);
|
||||
|
||||
if (re.is_intersection(r, a, b) && re.is_complement(b, c))
|
||||
return mk_diff(a, c);
|
||||
|
||||
if (re.is_intersection(r, a, b) && re.is_complement(a, c))
|
||||
return mk_diff(b, c);
|
||||
|
||||
if (re.is_intersection(r, a, b)) {
|
||||
range_predicate pa(max_char), pb(max_char);
|
||||
if (!regex_to_range_predicate(u, a, pa)) return false;
|
||||
if (!regex_to_range_predicate(u, b, pb)) return false;
|
||||
out = pa & pb;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// NOTE: re.complement is intentionally NOT handled here.
|
||||
// re.complement is the SEQUENCE-level complement: its language
|
||||
// includes the empty string, strings of length >= 2, and any
|
||||
// length-1 string outside the operand. A character-class
|
||||
// range_predicate can only describe a set of length-1 strings,
|
||||
// so collapsing re.complement(R) to ~R (character-level
|
||||
// complement) would change semantics whenever R is wrapped in
|
||||
// any sequence-level context (e.g. re.diff at the top level,
|
||||
// or membership tests). De-Morgan equivalences and the
|
||||
// special cases re.complement(re.empty) / re.complement(re.full)
|
||||
// are already handled directly in seq_rewriter::mk_re_complement.
|
||||
return false;
|
||||
}
|
||||
|
||||
static expr_ref mk_unit_string_from_char(seq_util& u, unsigned c) {
|
||||
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
|
||||
}
|
||||
|
||||
static expr_ref mk_single_range_regex(seq_util& u, unsigned lo, unsigned hi, sort* re_sort) {
|
||||
ast_manager& m = u.get_manager();
|
||||
if (lo == 0 && hi == u.max_char())
|
||||
return expr_ref(u.re.mk_full_char(re_sort), m);
|
||||
// Use the canonical unit-character form (seq.unit (Char N)) for
|
||||
// range bounds. This matches the shape used elsewhere in
|
||||
// seq_rewriter and avoids creating duplicate AST nodes with
|
||||
// different ids for semantically identical ranges.
|
||||
expr_ref slo(u.str.mk_unit(u.str.mk_char(lo)), m);
|
||||
expr_ref shi(u.str.mk_unit(u.str.mk_char(hi)), m);
|
||||
return expr_ref(u.re.mk_range(slo, shi), m);
|
||||
}
|
||||
|
||||
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort) {
|
||||
ast_manager& m = u.get_manager();
|
||||
sort* re_sort = u.re.mk_re(seq_sort);
|
||||
if (p.is_empty())
|
||||
return expr_ref(u.re.mk_empty(re_sort), m);
|
||||
unsigned const n = p.num_ranges();
|
||||
SASSERT(n > 0);
|
||||
if (n == 1) {
|
||||
auto [lo, hi] = p[0];
|
||||
return mk_single_range_regex(u, lo, hi, re_sort);
|
||||
}
|
||||
// Build single-range AST nodes first, then sort by expression id
|
||||
// so the resulting right-associated union matches the canonical
|
||||
// id-sorted shape that seq_rewriter::merge_regex_sets expects.
|
||||
// Without this the merge algorithm produces incorrect unions
|
||||
// when it has to combine our materialized output with another
|
||||
// (id-sorted) regex set.
|
||||
expr_ref_vector ranges(m);
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
auto [lo, hi] = p[i];
|
||||
ranges.push_back(mk_single_range_regex(u, lo, hi, re_sort));
|
||||
}
|
||||
std::sort(ranges.data(), ranges.data() + ranges.size(),
|
||||
[](expr* a, expr* b) { return a->get_id() < b->get_id(); });
|
||||
expr_ref acc(ranges.get(n - 1), m);
|
||||
for (unsigned i = n - 1; i-- > 0; )
|
||||
acc = expr_ref(u.re.mk_union(ranges.get(i), acc), m);
|
||||
return acc;
|
||||
}
|
||||
|
||||
}
|
||||
71
src/ast/rewriter/seq_range_collapse.h
Normal file
71
src/ast/rewriter/seq_range_collapse.h
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_range_collapse.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Recognize regexes that are boolean combinations of character-class
|
||||
primitives (re.empty, re.full_char, re.range with concrete chars,
|
||||
and re.union/inter/comp/diff over translatable arguments), and
|
||||
materialize a seq::range_predicate back into a canonical regex AST.
|
||||
|
||||
Together with seq_rewriter integration, this lets any boolean
|
||||
combination of character-class regexes collapse to a canonical
|
||||
multi-range form, so that equivalent character classes share AST
|
||||
identity, and downstream consumers (derivative, OneStep, caching)
|
||||
can short-circuit them as pure range predicates.
|
||||
|
||||
Authors:
|
||||
|
||||
Margus Veanes (veanes) 2026
|
||||
|
||||
--*/
|
||||
#pragma once
|
||||
|
||||
#include "ast/rewriter/seq_range_predicate.h"
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
|
||||
namespace seq {
|
||||
|
||||
/**
|
||||
* If r is a boolean combination of character-class regex primitives
|
||||
* over the unsigned character domain [0, max_char], compute the
|
||||
* equivalent range_predicate and return true. Otherwise return false
|
||||
* with out untouched.
|
||||
*
|
||||
* Recognized fragment (all character-class-preserving operations):
|
||||
* re.empty -> empty
|
||||
* re.full_char_set -> top
|
||||
* re.range "c_lo" "c_hi" (concrete) -> [c_lo, c_hi]
|
||||
* re.union r1 r2 -> p1 | p2
|
||||
* re.intersection r1 r2 -> p1 & p2
|
||||
* re.diff r1 r2 -> p1 - p2
|
||||
*
|
||||
* Notably re.complement is NOT recognized: it is a SEQUENCE-level
|
||||
* complement (over all of Σ*), not a character-class complement, so
|
||||
* collapsing it would change semantics whenever the result is used
|
||||
* in any non-character-class context. Sequence-level rewrites for
|
||||
* re.complement (double-comp, deMorgan, etc.) are handled directly
|
||||
* in seq_rewriter::mk_re_complement.
|
||||
*/
|
||||
bool regex_to_range_predicate(seq_util& u, expr* r, range_predicate& out);
|
||||
|
||||
/**
|
||||
* Canonical materialization of p as a regex AST over the given
|
||||
* sequence sort. Two range_predicates with equal canonical
|
||||
* representations produce structurally identical regex ASTs:
|
||||
*
|
||||
* empty -> re.empty
|
||||
* top -> re.full_char_set
|
||||
* single range [lo, hi] -> re.range "lo" "hi"
|
||||
* multiple ranges -> right-associated re.union of single
|
||||
* ranges, in increasing order of lo
|
||||
* (matching the canonical range order
|
||||
* held by range_predicate).
|
||||
*/
|
||||
expr_ref range_predicate_to_regex(seq_util& u, range_predicate const& p, sort* seq_sort);
|
||||
|
||||
}
|
||||
292
src/ast/rewriter/seq_range_predicate.cpp
Normal file
292
src/ast/rewriter/seq_range_predicate.cpp
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_range_predicate.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Implementation of the specialized range-algebra used by symbolic
|
||||
derivative computation and regex rewriting. See seq_range_predicate.h
|
||||
for the algebraic specification.
|
||||
|
||||
All Boolean operations are implemented as single linear sweeps over
|
||||
the canonical sorted range vectors and produce canonical output
|
||||
(sorted, disjoint, non-adjacent).
|
||||
|
||||
Authors:
|
||||
|
||||
Margus Veanes (veanes) 2026
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_range_predicate.h"
|
||||
#include "util/debug.h"
|
||||
#include <algorithm>
|
||||
#include <ostream>
|
||||
|
||||
namespace seq {
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Factories
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
range_predicate range_predicate::empty(unsigned max_char) {
|
||||
return range_predicate(max_char);
|
||||
}
|
||||
|
||||
range_predicate range_predicate::top(unsigned max_char) {
|
||||
range_predicate r(max_char);
|
||||
r.m_ranges.push_back({0u, max_char});
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::singleton(unsigned c, unsigned max_char) {
|
||||
SASSERT(c <= max_char);
|
||||
range_predicate r(max_char);
|
||||
r.m_ranges.push_back({c, c});
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::range(unsigned lo, unsigned hi, unsigned max_char) {
|
||||
range_predicate r(max_char);
|
||||
if (lo <= hi && lo <= max_char) {
|
||||
unsigned clipped_hi = hi <= max_char ? hi : max_char;
|
||||
r.m_ranges.push_back({lo, clipped_hi});
|
||||
}
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Invariants and observers
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
bool range_predicate::well_formed() const {
|
||||
for (unsigned i = 0; i < m_ranges.size(); ++i) {
|
||||
auto [lo, hi] = m_ranges[i];
|
||||
if (lo > hi) return false;
|
||||
if (hi > m_max_char) return false;
|
||||
if (i > 0) {
|
||||
unsigned prev_hi = m_ranges[i - 1].second;
|
||||
// Non-adjacent and sorted: prev_hi + 1 < lo, with care
|
||||
// around prev_hi == UINT_MAX which we never expect because
|
||||
// hi <= m_max_char.
|
||||
if (prev_hi + 1 >= lo) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool range_predicate::contains(unsigned c) const {
|
||||
// Binary search on first element of pairs.
|
||||
unsigned lo = 0, hi = m_ranges.size();
|
||||
while (lo < hi) {
|
||||
unsigned mid = lo + (hi - lo) / 2;
|
||||
auto [a, b] = m_ranges[mid];
|
||||
if (c < a) hi = mid;
|
||||
else if (c > b) lo = mid + 1;
|
||||
else return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t range_predicate::cardinality() const {
|
||||
uint64_t n = 0;
|
||||
for (auto [lo, hi] : m_ranges)
|
||||
n += static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo) + 1u;
|
||||
return n;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Equality, ordering, hashing
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
bool range_predicate::equals(range_predicate const& o) const {
|
||||
if (m_max_char != o.m_max_char) return false;
|
||||
if (m_ranges.size() != o.m_ranges.size()) return false;
|
||||
for (unsigned i = 0; i < m_ranges.size(); ++i)
|
||||
if (m_ranges[i] != o.m_ranges[i]) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool range_predicate::operator<(range_predicate const& o) const {
|
||||
if (m_max_char != o.m_max_char)
|
||||
return m_max_char < o.m_max_char;
|
||||
unsigned n = std::min(m_ranges.size(), o.m_ranges.size());
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
auto a = m_ranges[i];
|
||||
auto b = o.m_ranges[i];
|
||||
if (a.first != b.first) return a.first < b.first;
|
||||
if (a.second != b.second) return a.second < b.second;
|
||||
}
|
||||
return m_ranges.size() < o.m_ranges.size();
|
||||
}
|
||||
|
||||
unsigned range_predicate::hash() const {
|
||||
// FNV-1a 32-bit over (max_char, then each (lo, hi)).
|
||||
uint32_t h = 2166136261u;
|
||||
auto step = [&](uint32_t x) {
|
||||
h ^= x;
|
||||
h *= 16777619u;
|
||||
};
|
||||
step(m_max_char);
|
||||
for (auto [lo, hi] : m_ranges) {
|
||||
step(lo);
|
||||
step(hi);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Boolean operations
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
namespace {
|
||||
// Append (lo, hi) to result, merging with the previous range if
|
||||
// adjacent or overlapping. Maintains canonical form.
|
||||
inline void append_merged(svector<std::pair<unsigned, unsigned>>& result,
|
||||
unsigned lo, unsigned hi) {
|
||||
SASSERT(lo <= hi);
|
||||
if (!result.empty() && result.back().second + 1 >= lo) {
|
||||
if (result.back().second < hi)
|
||||
result.back().second = hi;
|
||||
} else {
|
||||
result.push_back({lo, hi});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
range_predicate range_predicate::operator|(range_predicate const& o) const {
|
||||
SASSERT(m_max_char == o.m_max_char);
|
||||
range_predicate r(m_max_char);
|
||||
unsigned i = 0, j = 0;
|
||||
const unsigned n = m_ranges.size();
|
||||
const unsigned m = o.m_ranges.size();
|
||||
while (i < n && j < m) {
|
||||
auto a = m_ranges[i];
|
||||
auto b = o.m_ranges[j];
|
||||
if (a.first <= b.first) {
|
||||
append_merged(r.m_ranges, a.first, a.second);
|
||||
++i;
|
||||
} else {
|
||||
append_merged(r.m_ranges, b.first, b.second);
|
||||
++j;
|
||||
}
|
||||
}
|
||||
while (i < n) {
|
||||
auto a = m_ranges[i++];
|
||||
append_merged(r.m_ranges, a.first, a.second);
|
||||
}
|
||||
while (j < m) {
|
||||
auto b = o.m_ranges[j++];
|
||||
append_merged(r.m_ranges, b.first, b.second);
|
||||
}
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::operator&(range_predicate const& o) const {
|
||||
SASSERT(m_max_char == o.m_max_char);
|
||||
range_predicate r(m_max_char);
|
||||
unsigned i = 0, j = 0;
|
||||
const unsigned n = m_ranges.size();
|
||||
const unsigned m = o.m_ranges.size();
|
||||
while (i < n && j < m) {
|
||||
auto [a_lo, a_hi] = m_ranges[i];
|
||||
auto [b_lo, b_hi] = o.m_ranges[j];
|
||||
unsigned lo = std::max(a_lo, b_lo);
|
||||
unsigned hi = std::min(a_hi, b_hi);
|
||||
if (lo <= hi)
|
||||
r.m_ranges.push_back({lo, hi});
|
||||
// Advance the range that ends first.
|
||||
if (a_hi < b_hi) ++i;
|
||||
else if (b_hi < a_hi) ++j;
|
||||
else { ++i; ++j; }
|
||||
}
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::operator~() const {
|
||||
range_predicate r(m_max_char);
|
||||
unsigned cursor = 0;
|
||||
for (auto [lo, hi] : m_ranges) {
|
||||
if (cursor < lo)
|
||||
r.m_ranges.push_back({cursor, lo - 1});
|
||||
// Step past hi without overflow: hi <= m_max_char and we
|
||||
// only step when more characters remain.
|
||||
if (hi >= m_max_char) {
|
||||
cursor = m_max_char + 1; // sentinel: no more characters
|
||||
break;
|
||||
}
|
||||
cursor = hi + 1;
|
||||
}
|
||||
if (cursor <= m_max_char)
|
||||
r.m_ranges.push_back({cursor, m_max_char});
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::operator-(range_predicate const& o) const {
|
||||
SASSERT(m_max_char == o.m_max_char);
|
||||
// A - B by linear sweep: for each range of A, subtract overlapping
|
||||
// ranges of B. Both inputs are sorted so we advance j monotonically.
|
||||
range_predicate r(m_max_char);
|
||||
unsigned j = 0;
|
||||
const unsigned m = o.m_ranges.size();
|
||||
for (auto [a_lo, a_hi] : m_ranges) {
|
||||
unsigned cursor = a_lo;
|
||||
while (j < m && o.m_ranges[j].second < cursor)
|
||||
++j;
|
||||
unsigned k = j;
|
||||
while (k < m && o.m_ranges[k].first <= a_hi) {
|
||||
auto [b_lo, b_hi] = o.m_ranges[k];
|
||||
if (cursor < b_lo)
|
||||
r.m_ranges.push_back({cursor, std::min(a_hi, b_lo - 1)});
|
||||
if (b_hi >= a_hi) {
|
||||
cursor = a_hi + 1;
|
||||
break;
|
||||
}
|
||||
cursor = b_hi + 1;
|
||||
++k;
|
||||
}
|
||||
if (cursor <= a_hi)
|
||||
r.m_ranges.push_back({cursor, a_hi});
|
||||
}
|
||||
SASSERT(r.well_formed());
|
||||
return r;
|
||||
}
|
||||
|
||||
range_predicate range_predicate::operator^(range_predicate const& o) const {
|
||||
SASSERT(m_max_char == o.m_max_char);
|
||||
// (A | B) - (A & B), but implemented directly with one linear sweep
|
||||
// over the union of breakpoints.
|
||||
return (*this | o) - (*this & o);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Display
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
std::ostream& range_predicate::display(std::ostream& out) const {
|
||||
if (m_ranges.empty()) {
|
||||
return out << "[]";
|
||||
}
|
||||
out << "[";
|
||||
bool first = true;
|
||||
for (auto [lo, hi] : m_ranges) {
|
||||
if (!first) out << ",";
|
||||
first = false;
|
||||
if (lo == hi)
|
||||
out << lo;
|
||||
else
|
||||
out << lo << "-" << hi;
|
||||
}
|
||||
return out << "]";
|
||||
}
|
||||
|
||||
}
|
||||
127
src/ast/rewriter/seq_range_predicate.h
Normal file
127
src/ast/rewriter/seq_range_predicate.h
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
seq_range_predicate.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Specialized range-algebra over an unsigned character domain [0, max_char].
|
||||
|
||||
A range_predicate represents a subset of the character domain as a
|
||||
sorted sequence of non-overlapping, non-adjacent, non-empty ranges:
|
||||
|
||||
[(lo_0, hi_0), (lo_1, hi_1), ...] with hi_i + 1 < lo_{i+1}.
|
||||
|
||||
The representation is canonical, so two range_predicates over the same
|
||||
domain are extensionally equivalent iff their internal vectors are
|
||||
elementwise equal.
|
||||
|
||||
All Boolean operations (union, intersection, complement, difference)
|
||||
are linear in the total number of ranges and produce the canonical
|
||||
representation.
|
||||
|
||||
Intended use:
|
||||
* path conditions for symbolic derivative computation,
|
||||
* OneStep predicates capturing length-1 acceptance,
|
||||
* smart-constructor side conditions for regex rewrites such as
|
||||
R & psi --> toregex(OneStep(R) & psi).
|
||||
|
||||
The type is a pure value: no ast_manager allocation occurs in its
|
||||
construction or its Boolean operations. Conversion to and from
|
||||
expr* is the responsibility of a separate translator (see callers
|
||||
in seq_derive / seq_rewriter).
|
||||
|
||||
Authors:
|
||||
|
||||
Margus Veanes (veanes) 2026
|
||||
|
||||
--*/
|
||||
#pragma once
|
||||
|
||||
#include "util/vector.h"
|
||||
#include <iosfwd>
|
||||
#include <utility>
|
||||
|
||||
namespace seq {
|
||||
|
||||
class range_predicate {
|
||||
using range_t = std::pair<unsigned, unsigned>;
|
||||
using ranges_t = svector<range_t>;
|
||||
|
||||
// Sorted by first; ranges are disjoint and non-adjacent;
|
||||
// every range satisfies lo <= hi <= m_max_char.
|
||||
ranges_t m_ranges;
|
||||
unsigned m_max_char { 0 };
|
||||
|
||||
// Invariant check used in debug builds.
|
||||
bool well_formed() const;
|
||||
|
||||
public:
|
||||
range_predicate() = default;
|
||||
explicit range_predicate(unsigned max_char) : m_max_char(max_char) {}
|
||||
|
||||
// ---------------- Factory functions ----------------
|
||||
|
||||
static range_predicate empty(unsigned max_char);
|
||||
static range_predicate top(unsigned max_char);
|
||||
static range_predicate singleton(unsigned c, unsigned max_char);
|
||||
static range_predicate range(unsigned lo, unsigned hi, unsigned max_char);
|
||||
|
||||
// ---------------- Observers ----------------
|
||||
|
||||
unsigned max_char() const { return m_max_char; }
|
||||
unsigned num_ranges() const { return m_ranges.size(); }
|
||||
range_t operator[](unsigned i) const { return m_ranges[i]; }
|
||||
ranges_t const& ranges() const { return m_ranges; }
|
||||
|
||||
bool is_empty() const { return m_ranges.empty(); }
|
||||
bool is_top() const {
|
||||
return m_ranges.size() == 1
|
||||
&& m_ranges[0].first == 0
|
||||
&& m_ranges[0].second == m_max_char;
|
||||
}
|
||||
bool is_singleton(unsigned& c) const {
|
||||
if (m_ranges.size() != 1) return false;
|
||||
if (m_ranges[0].first != m_ranges[0].second) return false;
|
||||
c = m_ranges[0].first;
|
||||
return true;
|
||||
}
|
||||
bool contains(unsigned c) const;
|
||||
|
||||
// Number of characters in the predicate (well-defined for any domain).
|
||||
uint64_t cardinality() const;
|
||||
|
||||
// ---------------- Equality, ordering, hashing ----------------
|
||||
|
||||
bool equals(range_predicate const& o) const;
|
||||
bool operator==(range_predicate const& o) const { return equals(o); }
|
||||
bool operator!=(range_predicate const& o) const { return !equals(o); }
|
||||
|
||||
// Total order: lexicographic on the canonical range sequence,
|
||||
// with shorter sequences ordered before longer prefixes.
|
||||
// Predicates over different domains compare by max_char first.
|
||||
bool operator<(range_predicate const& o) const;
|
||||
bool less_than(range_predicate const& o) const { return *this < o; }
|
||||
|
||||
unsigned hash() const;
|
||||
|
||||
// ---------------- Boolean operations ----------------
|
||||
|
||||
range_predicate operator|(range_predicate const& o) const; // union
|
||||
range_predicate operator&(range_predicate const& o) const; // intersection
|
||||
range_predicate operator-(range_predicate const& o) const; // difference
|
||||
range_predicate operator^(range_predicate const& o) const; // symmetric diff
|
||||
range_predicate operator~() const; // complement
|
||||
|
||||
// ---------------- Display ----------------
|
||||
|
||||
std::ostream& display(std::ostream& out) const;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& out, range_predicate const& p) {
|
||||
return p.display(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -85,45 +85,6 @@ namespace seq {
|
|||
return is_ground(r);
|
||||
}
|
||||
|
||||
/*
|
||||
Collect the leaves of a t-regex der (an ITE-tree whose leaves are
|
||||
regex expressions) into the output vector. Empty (re.empty) leaves
|
||||
are dropped.
|
||||
|
||||
Each leaf is treated as a single bisimulation state regardless of
|
||||
its top-level shape (including re.union and re.antimirov_union):
|
||||
descending into a union at the leaf would split one state into
|
||||
several, which is semantically unsound for the bisimulation /
|
||||
union-find merging that follows.
|
||||
|
||||
Returns false if we encountered an unexpected node (e.g. a free
|
||||
variable creeping in) — in that case the caller should bail out.
|
||||
*/
|
||||
bool regex_bisim::collect_leaves(expr* der, expr_ref_vector& leaves) {
|
||||
ptr_vector<expr> work;
|
||||
obj_hashtable<expr> seen;
|
||||
work.push_back(der);
|
||||
seen.insert(der);
|
||||
while (!work.empty()) {
|
||||
expr* e = work.back();
|
||||
work.pop_back();
|
||||
expr* c = nullptr, * t = nullptr, * f = nullptr;
|
||||
if (m.is_ite(e, c, t, f)) {
|
||||
if (seen.insert_if_not_there(t))
|
||||
work.push_back(t);
|
||||
if (seen.insert_if_not_there(f))
|
||||
work.push_back(f);
|
||||
continue;
|
||||
}
|
||||
if (m_util.re.is_empty(e))
|
||||
continue;
|
||||
if (!m_util.is_re(e))
|
||||
return false;
|
||||
leaves.push_back(e);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
Fast inequivalence check based on the get_info().classical flag.
|
||||
|
||||
|
|
@ -232,15 +193,19 @@ namespace seq {
|
|||
m_worklist.pop_back();
|
||||
|
||||
// Compute the symbolic derivative wrt the canonical variable
|
||||
// (:var 0). The result is a transition regex (ITE tree) whose
|
||||
// leaves are regex expressions. We use the classical Brzozowski
|
||||
// entry point so the derivative stays as a single TRegex and
|
||||
// does not lift unions to the top via antimirov nodes — this
|
||||
// preserves the XOR-pair invariant the bisimulation relies on.
|
||||
expr_ref d(m_rw.mk_brz_derivative(r), m);
|
||||
// (:var 0) and enumerate its reachable leaves in fully
|
||||
// ITE-hoisted normal form. Every if-then-else over the input
|
||||
// character — even one that would otherwise be buried under a
|
||||
// concat or union — is hoisted to the top and infeasible
|
||||
// minterms are pruned, so each leaf is a ground regex free of
|
||||
// (:var 0) whose nullability is always decidable. Unions are
|
||||
// kept intact as single leaves (a union leaf denotes a single
|
||||
// bisimulation state, never a split into separate states).
|
||||
expr_ref_pair_vector cofs(m);
|
||||
m_rw.brz_derivative_cofactors(r, cofs);
|
||||
expr_ref_vector leaves(m);
|
||||
if (!collect_leaves(d, leaves))
|
||||
return l_undef;
|
||||
for (auto const& p : cofs)
|
||||
leaves.push_back(p.second);
|
||||
|
||||
// First pass: check for any nullable leaf (definitive
|
||||
// distinguishing empty-continuation word) or any classically
|
||||
|
|
|
|||
|
|
@ -74,7 +74,6 @@ namespace seq {
|
|||
|
||||
unsigned node_of(expr* r);
|
||||
bool merge_leaf(expr* xor_pair);
|
||||
bool collect_leaves(expr* der, expr_ref_vector& leaves);
|
||||
lbool nullability(expr* r);
|
||||
bool is_supported(expr* r);
|
||||
// Returns true if the leaf l proves that the original pair is
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -19,6 +19,7 @@ Notes:
|
|||
#pragma once
|
||||
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_derive.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "ast/arith_decl_plugin.h"
|
||||
#include "ast/rewriter/rewriter_types.h"
|
||||
|
|
@ -128,15 +129,20 @@ class seq_rewriter {
|
|||
void insert(decl_kind op, expr* a, expr* b, expr* c, expr* r);
|
||||
};
|
||||
|
||||
friend class seq::derive;
|
||||
|
||||
seq_util m_util;
|
||||
seq_subset m_subset;
|
||||
arith_util m_autil;
|
||||
bool_rewriter m_br;
|
||||
seq::derive m_derive;
|
||||
// re2automaton m_re2aut;
|
||||
op_cache m_op_cache;
|
||||
expr_ref_vector m_es, m_lhs, m_rhs;
|
||||
bool m_coalesce_chars;
|
||||
bool m_in_bisim { false };
|
||||
bool m_coalesce_chars = false;
|
||||
bool m_in_bisim { false };
|
||||
unsigned m_re_deriv_depth { 0 };
|
||||
static const unsigned m_max_re_deriv_depth = 512;
|
||||
|
||||
enum length_comparison {
|
||||
shorter_c,
|
||||
|
|
@ -179,8 +185,6 @@ class seq_rewriter {
|
|||
expr_ref re_replace_char(expr *r, unsigned a_ch, unsigned b_ch, expr *a_str, expr *b_str);
|
||||
|
||||
// Calculate derivative, memoized and enforcing a normal form
|
||||
expr_ref is_nullable_rec(expr* r);
|
||||
expr_ref mk_derivative_rec(expr* ele, expr* r);
|
||||
expr_ref mk_der_op(decl_kind k, expr* a, expr* b);
|
||||
expr_ref mk_der_op_rec(decl_kind k, expr* a, expr* b);
|
||||
expr_ref mk_der_concat(expr* a, expr* b);
|
||||
|
|
@ -191,26 +195,10 @@ class seq_rewriter {
|
|||
expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort);
|
||||
expr_ref mk_der_antimirov_union(expr* r1, expr* r2);
|
||||
bool ite_bdds_compatible(expr* a, expr* b);
|
||||
/* if r has the form deriv(en..deriv(e1,to_re(s))..) returns 's = [e1..en]' else returns '() in r'*/
|
||||
expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort);
|
||||
#ifdef Z3DEBUG
|
||||
bool check_deriv_normal_form(expr* r, int level = 3);
|
||||
#endif
|
||||
|
||||
void mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref& result);
|
||||
|
||||
expr_ref mk_antimirov_deriv(expr* e, expr* r, expr* path);
|
||||
expr_ref mk_in_antimirov_rec(expr* s, expr* d);
|
||||
expr_ref mk_in_antimirov(expr* s, expr* d);
|
||||
|
||||
expr_ref mk_antimirov_deriv_intersection(expr* elem, expr* d1, expr* d2, expr* path);
|
||||
expr_ref mk_antimirov_deriv_concat(expr* d, expr* r);
|
||||
expr_ref mk_antimirov_deriv_negate(expr* elem, expr* d);
|
||||
expr_ref mk_antimirov_deriv_union(expr* d1, expr* d2);
|
||||
expr_ref mk_antimirov_deriv_restrict(expr* elem, expr* d1, expr* cond);
|
||||
expr_ref mk_regex_reverse(expr* r);
|
||||
expr_ref mk_regex_concat(expr* r1, expr* r2);
|
||||
|
||||
expr_ref merge_regex_sets(expr* r1, expr* r2, expr* unit, std::function<bool(expr*, expr*&, expr*&)>& decompose, std::function<expr* (expr*, expr*)>& compose);
|
||||
|
||||
// elem is (:var 0) and path a condition that may have (:var 0) as a free variable
|
||||
|
|
@ -267,6 +255,14 @@ class seq_rewriter {
|
|||
br_status mk_re_union0(expr* a, expr* b, expr_ref& result);
|
||||
br_status mk_re_inter0(expr* a, expr* b, expr_ref& result);
|
||||
br_status mk_re_complement(expr* a, expr_ref& result);
|
||||
// Range-set collapse helpers: if the operands form a boolean
|
||||
// combination of character-class regexes, materialize the result as a
|
||||
// canonical regex over a single range_predicate. See
|
||||
// ast/rewriter/seq_range_collapse.h for the recognized fragment.
|
||||
// NOTE: re.complement is intentionally not in this set because it
|
||||
// operates at the sequence level, not the character-class level.
|
||||
bool try_collapse_re_union(expr* a, expr* b, expr_ref& result);
|
||||
bool try_collapse_re_inter(expr* a, expr* b, expr_ref& result);
|
||||
br_status mk_re_star(expr* a, expr_ref& result);
|
||||
br_status mk_re_diff(expr* a, expr* b, expr_ref& result);
|
||||
br_status mk_re_xor(expr* a, expr* b, expr_ref& result);
|
||||
|
|
@ -351,9 +347,9 @@ class seq_rewriter {
|
|||
|
||||
public:
|
||||
seq_rewriter(ast_manager & m, params_ref const & p = params_ref()):
|
||||
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), // m_re2aut(m),
|
||||
m_util(m), m_subset(m_util.re), m_autil(m), m_br(m, p), m_derive(m, *this),
|
||||
m_op_cache(m), m_es(m),
|
||||
m_lhs(m), m_rhs(m), m_coalesce_chars(true) {
|
||||
m_lhs(m), m_rhs(m) {
|
||||
}
|
||||
ast_manager & m() const { return m_util.get_manager(); }
|
||||
family_id get_fid() const { return m_util.get_family_id(); }
|
||||
|
|
@ -364,7 +360,7 @@ public:
|
|||
static void get_param_descrs(param_descrs & r);
|
||||
|
||||
|
||||
bool coalesce_chars() const { return m_coalesce_chars; }
|
||||
// bool coalesce_chars() const { return m_coalesce_chars; }
|
||||
|
||||
br_status mk_app_core(func_decl * f, unsigned num_args, expr * const * args, expr_ref & result);
|
||||
br_status mk_eq_core(expr * lhs, expr * rhs, expr_ref & result);
|
||||
|
|
@ -380,6 +376,34 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
expr_ref mk_xor0(expr *a, expr *b) {
|
||||
expr_ref result(m());
|
||||
if (mk_re_xor0(a, b, result) == BR_FAILED)
|
||||
result = re().mk_xor(a, b);
|
||||
return result;
|
||||
}
|
||||
|
||||
expr_ref mk_union(expr *a, expr *b) {
|
||||
expr_ref result(m());
|
||||
if (mk_re_union(a, b, result) == BR_FAILED)
|
||||
result = re().mk_union(a, b);
|
||||
return result;
|
||||
}
|
||||
|
||||
expr_ref mk_inter(expr *a, expr *b) {
|
||||
expr_ref result(m());
|
||||
if (mk_re_inter(a, b, result) == BR_FAILED)
|
||||
result = re().mk_inter(a, b);
|
||||
return result;
|
||||
}
|
||||
|
||||
expr_ref mk_complement(expr *a) {
|
||||
expr_ref result(m());
|
||||
if (mk_re_complement(a, result) == BR_FAILED)
|
||||
result = re().mk_complement(a);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* makes concat and simplifies
|
||||
*/
|
||||
|
|
@ -440,7 +464,31 @@ public:
|
|||
procedure which relies on each leaf of D(p XOR q) being a coherent
|
||||
XOR pair (D_v p) XOR (D_v q).
|
||||
*/
|
||||
expr_ref mk_brz_derivative(expr* r);
|
||||
expr_ref mk_brz_derivative(expr *r) {
|
||||
return mk_derivative(r);
|
||||
}
|
||||
|
||||
/*
|
||||
Enumerate the cofactors (min-terms) of a transition regex r taken with
|
||||
respect to ele. Produces (path_condition, leaf_regex) pairs for every
|
||||
feasible path through the ITE-tree, pruning infeasible character ranges.
|
||||
Delegates to the derivative engine so the same path/interval context used
|
||||
while hoisting ITEs is reused for the leaf simplification.
|
||||
*/
|
||||
void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result) {
|
||||
m_derive.get_cofactors(ele, r, result);
|
||||
}
|
||||
|
||||
/*
|
||||
Compute the symbolic derivative of r and enumerate its reachable leaves
|
||||
in fully ITE-hoisted normal form: a list of (path_condition, target)
|
||||
pairs where every target is free of (:var 0) (so nullability is always
|
||||
decidable) and unions are kept intact as single states. Used by
|
||||
regex_bisim, which consumes the targets and ignores the path conditions.
|
||||
*/
|
||||
void brz_derivative_cofactors(expr* r, expr_ref_pair_vector& result) {
|
||||
m_derive.derivative_cofactors(r, result);
|
||||
}
|
||||
|
||||
// heuristic elimination of element from condition that comes form a derivative.
|
||||
// special case optimization for conjunctions of equalities, disequalities and ranges.
|
||||
|
|
@ -451,6 +499,8 @@ public:
|
|||
/* Apply simplifications to the intersection to keep it normalized (r1 and r2 are not normalized)*/
|
||||
expr_ref mk_regex_inter_normalize(expr* r1, expr* r2);
|
||||
|
||||
expr_ref mk_regex_concat(expr *r1, expr *r2);
|
||||
|
||||
/*
|
||||
* Extract some string that is a member of r.
|
||||
* Return true if a valid string was extracted.
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ Author:
|
|||
|
||||
bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
|
||||
while (true) {
|
||||
|
||||
|
||||
if (a == b)
|
||||
return true;
|
||||
if (m_re.is_empty(a))
|
||||
|
|
@ -30,7 +30,7 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
|
|||
return true;
|
||||
|
||||
if (depth >= m_max_depth)
|
||||
return false;
|
||||
return false;
|
||||
|
||||
expr* a1 = nullptr, * a2 = nullptr, * b1 = nullptr, * b2 = nullptr;
|
||||
unsigned la, ua, lb, ub;
|
||||
|
|
@ -39,16 +39,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
|
|||
if (m_re.is_dot_plus(b) && m_re.get_info(a).nullable == l_false)
|
||||
return true;
|
||||
|
||||
// a ⊆ a*
|
||||
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth))
|
||||
return true;
|
||||
|
||||
// e ⊆ a*
|
||||
if (m_re.is_epsilon(a) && m_re.is_star(b, b1))
|
||||
return true;
|
||||
|
||||
// R ⊆ R*
|
||||
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth + 1))
|
||||
// a ⊆ a*: if b = b1* and a ⊆ b1, then a ⊆ b1*
|
||||
if (m_re.is_star(b, b1) && is_subset_rec(a, b1, depth))
|
||||
return true;
|
||||
|
||||
// R1* ⊆ R2* if R1 ⊆ R2
|
||||
|
|
@ -112,6 +108,12 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
|
|||
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && is_subset_rec(a, b2, depth))
|
||||
return true;
|
||||
|
||||
// prefix absorption: P·R' ⊆ Σ*·R' for any prefix P (since P ⊆ Σ*).
|
||||
// Detect that a has R' (= b2) as a concatenation suffix, where b = Σ*·R'.
|
||||
// Covers contains-patterns, e.g. Σ*·a·Σ*·b·Σ* ⊆ Σ*·b·Σ*.
|
||||
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b1) && ends_with(a, b2))
|
||||
return true;
|
||||
|
||||
// R ⊆ R'·Σ* if R ⊆ R'
|
||||
if (m_re.is_concat(b, b1, b2) && m_re.is_full_seq(b2) && is_subset_rec(a, b1, depth))
|
||||
return true;
|
||||
|
|
@ -144,3 +146,30 @@ bool seq_subset::is_subset_rec(expr* a, expr* b, unsigned depth) const {
|
|||
bool seq_subset::is_subset(expr* a, expr* b) const {
|
||||
return is_subset_rec(a, b, 0);
|
||||
}
|
||||
|
||||
bool seq_subset::ends_with(expr* a, expr* suf) const {
|
||||
if (a == suf)
|
||||
return true;
|
||||
// Flatten both regexes into their sequence of concatenation factors
|
||||
// (independent of left/right associativity) and test list-suffix equality.
|
||||
ptr_vector<expr> af, sf;
|
||||
flatten_concat(a, af);
|
||||
flatten_concat(suf, sf);
|
||||
if (sf.size() > af.size())
|
||||
return false;
|
||||
unsigned off = af.size() - sf.size();
|
||||
for (unsigned i = 0; i < sf.size(); ++i)
|
||||
if (af[off + i] != sf[i])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void seq_subset::flatten_concat(expr* a, ptr_vector<expr>& out) const {
|
||||
expr* a1 = nullptr, * a2 = nullptr;
|
||||
if (m_re.is_concat(a, a1, a2)) {
|
||||
flatten_concat(a1, out);
|
||||
flatten_concat(a2, out);
|
||||
}
|
||||
else
|
||||
out.push_back(a);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,12 @@ class seq_subset {
|
|||
|
||||
bool is_subset_rec(expr* a, expr* b, unsigned depth) const;
|
||||
|
||||
// true if regex a, viewed as a flattened concatenation, has suf as a
|
||||
// structural (concatenation) suffix.
|
||||
bool ends_with(expr* a, expr* suf) const;
|
||||
|
||||
void flatten_concat(expr* a, ptr_vector<expr>& out) const;
|
||||
|
||||
public:
|
||||
explicit seq_subset(seq_util::rex& re) : m_re(re) {}
|
||||
bool is_subset(expr* a, expr* b) const;
|
||||
|
|
|
|||
|
|
@ -461,6 +461,24 @@ namespace smt {
|
|||
if (re().is_empty(r))
|
||||
//trivially true
|
||||
return;
|
||||
// When one side is re.none the equation is a pure emptiness check on
|
||||
// the other regex (symmetric_diff already returned it as r). Decide
|
||||
// it directly by antimirov NFA reachability instead of running the
|
||||
// bisimulation/XOR closure, which would build large un-canonicalized
|
||||
// product states for intersections of contains-patterns.
|
||||
if ((re().is_empty(r1) || re().is_empty(r2)) && is_ground(r)) {
|
||||
switch (re_is_empty(r)) {
|
||||
case l_true:
|
||||
STRACE(seq_regex_brief, tout << "empty:eq ";);
|
||||
return; // languages equal (both empty): trivially true
|
||||
case l_false:
|
||||
STRACE(seq_regex_brief, tout << "empty:neq ";);
|
||||
th.add_axiom(~th.mk_eq(r1, r2, false), false_literal);
|
||||
return;
|
||||
case l_undef:
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Try the bisimulation procedure on ground regexes first. If it
|
||||
// returns a definite answer, dispatch the corresponding axiom and
|
||||
// bypass the symbolic emptiness/derivative closure.
|
||||
|
|
@ -562,7 +580,7 @@ namespace smt {
|
|||
lits.push_back(null_lit);
|
||||
|
||||
expr_ref_pair_vector cofactors(m);
|
||||
get_cofactors(d, cofactors);
|
||||
seq_rw().get_cofactors(hd, d, cofactors);
|
||||
for (auto const& p : cofactors) {
|
||||
if (is_member(p.second, u))
|
||||
continue;
|
||||
|
|
@ -671,6 +689,67 @@ namespace smt {
|
|||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
Decide emptiness of a ground regex r via antimirov-mode NFA
|
||||
reachability.
|
||||
|
||||
The symbolic derivative engine runs in antimirov mode, so the
|
||||
derivative of an intersection distributes into a *set* of individual
|
||||
product states inter(A_i, B_j) (each a small, ground regex) rather
|
||||
than one giant union-of-intersections term. get_derivative_targets
|
||||
enumerates these NFA successor states.
|
||||
|
||||
We short-circuit to l_false (non-empty) as soon as a reachable state
|
||||
is nullable (accepts the empty word) or classical (a regex built only
|
||||
from to_re/all/union/concat/star/plus/opt/loop, hence non-empty). An
|
||||
intersection itself is never classical, but once one operand reduces
|
||||
to Σ* the intersection collapses (via the derivative's subset
|
||||
simplification) to the other, classical, operand.
|
||||
|
||||
If the worklist is exhausted with no such state, r is empty (l_true).
|
||||
Returns l_undef if a step bound is hit, so callers can fall back to
|
||||
the general procedure.
|
||||
*/
|
||||
lbool seq_regex::re_is_empty(expr* r) {
|
||||
if (re().is_empty(r))
|
||||
return l_true;
|
||||
expr_ref_vector pinned(m);
|
||||
obj_hashtable<expr> visited;
|
||||
ptr_vector<expr> work;
|
||||
work.push_back(r);
|
||||
visited.insert(r);
|
||||
pinned.push_back(r);
|
||||
unsigned const bound = 100000;
|
||||
unsigned steps = 0;
|
||||
while (!work.empty()) {
|
||||
if (++steps > bound)
|
||||
return l_undef;
|
||||
expr* s = work.back();
|
||||
work.pop_back();
|
||||
auto info = re().get_info(s);
|
||||
if (!info.is_known())
|
||||
return l_undef;
|
||||
// ε ∈ L(s) or s is a non-empty classical regex ⇒ L(r) non-empty.
|
||||
if (info.nullable == l_true || info.classical)
|
||||
return l_false;
|
||||
// Dead state: prune (min_length == UINT_MAX means no word is
|
||||
// accepted from here).
|
||||
if (info.min_length == UINT_MAX)
|
||||
continue;
|
||||
expr_ref_vector targets(m);
|
||||
get_derivative_targets(s, targets);
|
||||
for (expr* t : targets) {
|
||||
if (visited.contains(t))
|
||||
continue;
|
||||
visited.insert(t);
|
||||
pinned.push_back(t);
|
||||
work.push_back(t);
|
||||
}
|
||||
}
|
||||
return l_true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Return a list of all target regexes in the derivative of a regex r,
|
||||
ignoring the conditions along each path.
|
||||
|
|
@ -707,53 +786,26 @@ namespace smt {
|
|||
|
||||
/*
|
||||
Return a list of all (cond, leaf) pairs in a given derivative
|
||||
expression r.
|
||||
expression r, where elem is the character symbol the derivative was
|
||||
taken with respect to.
|
||||
|
||||
Note: this implementation is inefficient: it simply collects all expressions under an if and
|
||||
iterates over all combinations.
|
||||
The transition regexes produced by the symbolic derivative engine are
|
||||
ITE-trees over character predicates ci on elem (equalities such as
|
||||
elem = 'A', and ranges such as 'a' <= elem <= 'z'). These predicates
|
||||
are typically mutually exclusive, so the number of feasible truth
|
||||
assignments to {c1,..,ck} ("minterms") is small.
|
||||
|
||||
This method is still used by:
|
||||
The enumeration is delegated to seq::derive (via seq_rw().get_cofactors)
|
||||
so it reuses the very same path/interval context that the derivative
|
||||
engine uses while hoisting ITEs: each feasible path through the ITE-tree
|
||||
yields one (path_condition, leaf) cofactor, infeasible character-range
|
||||
combinations are pruned, and the leaf is simplified with the path-aware
|
||||
smart constructors.
|
||||
|
||||
This is used by:
|
||||
propagate_is_empty
|
||||
propagate_is_non_empty
|
||||
*/
|
||||
void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) {
|
||||
obj_hashtable<expr> ifs;
|
||||
expr* cond = nullptr, * r1 = nullptr, * r2 = nullptr;
|
||||
for (expr* e : subterms::ground(expr_ref(r, m)))
|
||||
if (m.is_ite(e, cond, r1, r2))
|
||||
ifs.insert(cond);
|
||||
|
||||
expr_ref_vector rs(m);
|
||||
vector<expr_ref_vector> conds;
|
||||
conds.push_back(expr_ref_vector(m));
|
||||
rs.push_back(r);
|
||||
for (expr* c : ifs) {
|
||||
unsigned sz = conds.size();
|
||||
expr_safe_replace rep1(m);
|
||||
expr_safe_replace rep2(m);
|
||||
rep1.insert(c, m.mk_true());
|
||||
rep2.insert(c, m.mk_false());
|
||||
expr_ref r2(m);
|
||||
for (unsigned i = 0; i < sz; ++i) {
|
||||
expr_ref_vector cs = conds[i];
|
||||
cs.push_back(mk_not(m, c));
|
||||
conds.push_back(cs);
|
||||
conds[i].push_back(c);
|
||||
expr_ref r1(rs.get(i), m);
|
||||
rep1(r1, r2);
|
||||
rs[i] = r2;
|
||||
rep2(r1, r2);
|
||||
rs.push_back(r2);
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < conds.size(); ++i) {
|
||||
expr_ref conj = mk_and(conds[i]);
|
||||
expr_ref r(rs.get(i), m);
|
||||
ctx.get_rewriter()(r);
|
||||
if (!m.is_false(conj) && !re().is_empty(r))
|
||||
result.push_back(conj, r);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
is_empty(r, u) => ~is_nullable(r)
|
||||
|
|
@ -781,7 +833,7 @@ namespace smt {
|
|||
d = mk_derivative_wrapper(hd, r);
|
||||
literal_vector lits;
|
||||
expr_ref_pair_vector cofactors(m);
|
||||
get_cofactors(d, cofactors);
|
||||
seq_rw().get_cofactors(hd, d, cofactors);
|
||||
for (auto const& p : cofactors) {
|
||||
if (is_member(p.second, u))
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -164,7 +164,12 @@ namespace smt {
|
|||
// returned by derivative_wrapper
|
||||
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
|
||||
void get_derivative_targets(expr* r, expr_ref_vector& targets);
|
||||
void get_cofactors(expr* r, expr_ref_pair_vector& result);
|
||||
|
||||
// Decide emptiness of a ground regex by antimirov-mode NFA
|
||||
// reachability: explore derivative target states, short-circuiting to
|
||||
// "non-empty" on the first reachable nullable or classical state.
|
||||
// Returns l_true (empty), l_false (non-empty), l_undef (gave up).
|
||||
lbool re_is_empty(expr* r);
|
||||
|
||||
/*
|
||||
Pretty print the regex of the state id to the out stream,
|
||||
|
|
|
|||
|
|
@ -115,15 +115,18 @@ add_executable(test-z3
|
|||
polynomial_factorization.cpp
|
||||
polynorm.cpp
|
||||
prime_generator.cpp
|
||||
seq_regex_bisim.cpp
|
||||
proof_checker.cpp
|
||||
qe_arith.cpp
|
||||
mbp_qel.cpp
|
||||
quant_elim.cpp
|
||||
quant_solve.cpp
|
||||
random.cpp
|
||||
range_predicate.cpp
|
||||
rational.cpp
|
||||
rcf.cpp
|
||||
region.cpp
|
||||
regex_range_collapse.cpp
|
||||
sat_local_search.cpp
|
||||
sat_lookahead.cpp
|
||||
sat_user_scope.cpp
|
||||
|
|
|
|||
|
|
@ -113,6 +113,8 @@
|
|||
X(api_bug) \
|
||||
X(api_special_relations) \
|
||||
X(arith_rewriter) \
|
||||
X(range_predicate) \
|
||||
X(regex_range_collapse) \
|
||||
X(seq_rewriter) \
|
||||
X(check_assumptions) \
|
||||
X(smt_context) \
|
||||
|
|
@ -195,6 +197,7 @@
|
|||
X(finite_set) \
|
||||
X(finite_set_rewriter) \
|
||||
X(fpa) \
|
||||
X(seq_regex_bisim) \
|
||||
X(term_enumeration) \
|
||||
X(lcube)
|
||||
|
||||
|
|
|
|||
260
src/test/range_predicate.cpp
Normal file
260
src/test/range_predicate.cpp
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
test/range_predicate.cpp
|
||||
|
||||
Abstract:
|
||||
|
||||
Unit tests for the range-algebra value type seq::range_predicate.
|
||||
|
||||
The tests exercise:
|
||||
* factory constructors and canonical-form invariants,
|
||||
* extensional equality and total ordering,
|
||||
* Boolean operations (|, &, ~, -, ^) on hand-picked instances,
|
||||
* exhaustive verification of de-Morgan and lattice laws on a
|
||||
small character domain, by enumerating every subset.
|
||||
|
||||
Author:
|
||||
|
||||
Margus Veanes (veanes) 2026
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_range_predicate.h"
|
||||
#include "util/debug.h"
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
using seq::range_predicate;
|
||||
|
||||
namespace {
|
||||
|
||||
// Build a range_predicate from a bitmask over [0, max_char] for testing.
|
||||
range_predicate from_mask(uint64_t mask, unsigned max_char) {
|
||||
range_predicate r = range_predicate::empty(max_char);
|
||||
for (unsigned c = 0; c <= max_char; ++c)
|
||||
if ((mask >> c) & 1u)
|
||||
r = r | range_predicate::singleton(c, max_char);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Convert a range_predicate back to a bitmask for cross-checking.
|
||||
uint64_t to_mask(range_predicate const& r) {
|
||||
uint64_t mask = 0;
|
||||
for (unsigned c = 0; c <= r.max_char(); ++c)
|
||||
if (r.contains(c))
|
||||
mask |= (uint64_t(1) << c);
|
||||
return mask;
|
||||
}
|
||||
|
||||
void test_factories() {
|
||||
auto e = range_predicate::empty(255);
|
||||
ENSURE(e.is_empty());
|
||||
ENSURE(!e.is_top());
|
||||
ENSURE(e.num_ranges() == 0);
|
||||
ENSURE(e.cardinality() == 0);
|
||||
|
||||
auto t = range_predicate::top(255);
|
||||
ENSURE(!t.is_empty());
|
||||
ENSURE(t.is_top());
|
||||
ENSURE(t.num_ranges() == 1);
|
||||
ENSURE(t.cardinality() == 256);
|
||||
ENSURE(t.contains(0));
|
||||
ENSURE(t.contains(255));
|
||||
|
||||
auto s = range_predicate::singleton(42, 255);
|
||||
ENSURE(s.num_ranges() == 1);
|
||||
ENSURE(s.cardinality() == 1);
|
||||
ENSURE(s.contains(42));
|
||||
ENSURE(!s.contains(41));
|
||||
unsigned c = 0;
|
||||
ENSURE(s.is_singleton(c));
|
||||
ENSURE(c == 42);
|
||||
|
||||
auto r = range_predicate::range(10, 20, 255);
|
||||
ENSURE(r.num_ranges() == 1);
|
||||
ENSURE(r.cardinality() == 11);
|
||||
ENSURE(r.contains(10));
|
||||
ENSURE(r.contains(20));
|
||||
ENSURE(!r.contains(9));
|
||||
ENSURE(!r.contains(21));
|
||||
|
||||
// Reversed bounds produce empty.
|
||||
auto bad = range_predicate::range(20, 10, 255);
|
||||
ENSURE(bad.is_empty());
|
||||
|
||||
// Clipping at max_char.
|
||||
auto clipped = range_predicate::range(200, 1000, 255);
|
||||
ENSURE(clipped.num_ranges() == 1);
|
||||
ENSURE(clipped[0] == std::make_pair(200u, 255u));
|
||||
}
|
||||
|
||||
void test_equality_and_order() {
|
||||
auto a = range_predicate::range(1, 5, 31);
|
||||
auto b = range_predicate::range(1, 5, 31);
|
||||
auto c = range_predicate::range(1, 6, 31);
|
||||
ENSURE(a == b);
|
||||
ENSURE(a != c);
|
||||
ENSURE(a.hash() == b.hash());
|
||||
ENSURE(a < c || c < a);
|
||||
ENSURE(!(a < a));
|
||||
|
||||
auto empty = range_predicate::empty(31);
|
||||
ENSURE(empty < a);
|
||||
|
||||
// Canonical merging of adjacent ranges.
|
||||
auto d = range_predicate::range(0, 4, 31) | range_predicate::range(5, 10, 31);
|
||||
auto e = range_predicate::range(0, 10, 31);
|
||||
ENSURE(d == e);
|
||||
}
|
||||
|
||||
void test_union_intersection_hand() {
|
||||
unsigned const M = 31;
|
||||
auto a = range_predicate::range(0, 4, M) | range_predicate::range(10, 14, M);
|
||||
auto b = range_predicate::range(3, 11, M);
|
||||
|
||||
auto u = a | b; // [0,14]
|
||||
ENSURE(u.num_ranges() == 1);
|
||||
ENSURE(u[0] == std::make_pair(0u, 14u));
|
||||
|
||||
auto i = a & b; // [3,4] U [10,11]
|
||||
ENSURE(i.num_ranges() == 2);
|
||||
ENSURE(i[0] == std::make_pair(3u, 4u));
|
||||
ENSURE(i[1] == std::make_pair(10u, 11u));
|
||||
|
||||
auto d = a - b; // [0,2] U [12,14]
|
||||
ENSURE(d.num_ranges() == 2);
|
||||
ENSURE(d[0] == std::make_pair(0u, 2u));
|
||||
ENSURE(d[1] == std::make_pair(12u, 14u));
|
||||
|
||||
auto x = a ^ b; // [0,2] U [5,9] U [12,14]
|
||||
ENSURE(x.num_ranges() == 3);
|
||||
ENSURE(x[0] == std::make_pair(0u, 2u));
|
||||
ENSURE(x[1] == std::make_pair(5u, 9u));
|
||||
ENSURE(x[2] == std::make_pair(12u, 14u));
|
||||
}
|
||||
|
||||
void test_complement_hand() {
|
||||
unsigned const M = 10;
|
||||
auto e = range_predicate::empty(M);
|
||||
ENSURE((~e).is_top());
|
||||
auto t = range_predicate::top(M);
|
||||
ENSURE((~t).is_empty());
|
||||
|
||||
// ~([2,3] U [7,8]) = [0,1] U [4,6] U [9,10]
|
||||
auto a = range_predicate::range(2, 3, M) | range_predicate::range(7, 8, M);
|
||||
auto na = ~a;
|
||||
ENSURE(na.num_ranges() == 3);
|
||||
ENSURE(na[0] == std::make_pair(0u, 1u));
|
||||
ENSURE(na[1] == std::make_pair(4u, 6u));
|
||||
ENSURE(na[2] == std::make_pair(9u, 10u));
|
||||
|
||||
// ~([0,4]) = [5,10]
|
||||
auto b = range_predicate::range(0, 4, M);
|
||||
auto nb = ~b;
|
||||
ENSURE(nb.num_ranges() == 1);
|
||||
ENSURE(nb[0] == std::make_pair(5u, 10u));
|
||||
|
||||
// ~([5,10]) = [0,4]
|
||||
auto cnb = ~nb;
|
||||
ENSURE(cnb == b);
|
||||
}
|
||||
|
||||
// Exhaustively verify the lattice / de-Morgan laws on a small domain
|
||||
// by enumerating every possible subset (bitmask).
|
||||
void test_exhaustive_laws() {
|
||||
unsigned const M = 5; // 6 characters -> 64 subsets
|
||||
unsigned const N = 1u << (M + 1);
|
||||
for (unsigned i = 0; i < N; ++i) {
|
||||
range_predicate A = from_mask(i, M);
|
||||
ENSURE(to_mask(A) == i);
|
||||
// ~ ~ A == A
|
||||
ENSURE(~~A == A);
|
||||
// A | ~A == top
|
||||
ENSURE((A | ~A).is_top());
|
||||
// A & ~A == empty
|
||||
ENSURE((A & ~A).is_empty());
|
||||
// cardinality matches popcount
|
||||
unsigned pop = 0;
|
||||
for (unsigned k = 0; k <= M; ++k) if ((i >> k) & 1u) ++pop;
|
||||
ENSURE(A.cardinality() == pop);
|
||||
}
|
||||
for (unsigned i = 0; i < N; ++i) {
|
||||
range_predicate A = from_mask(i, M);
|
||||
for (unsigned j = 0; j < N; ++j) {
|
||||
range_predicate B = from_mask(j, M);
|
||||
// Bitmask reference semantics.
|
||||
ENSURE(to_mask(A | B) == (i | j));
|
||||
ENSURE(to_mask(A & B) == (i & j));
|
||||
ENSURE(to_mask(A - B) == (i & ~j & ((1u << (M + 1)) - 1u)));
|
||||
ENSURE(to_mask(A ^ B) == (i ^ j));
|
||||
// de-Morgan
|
||||
ENSURE(~(A | B) == (~A & ~B));
|
||||
ENSURE(~(A & B) == (~A | ~B));
|
||||
// Commutativity
|
||||
ENSURE((A | B) == (B | A));
|
||||
ENSURE((A & B) == (B & A));
|
||||
// (A - B) == A & ~B
|
||||
ENSURE((A - B) == (A & ~B));
|
||||
// (A ^ B) == (A | B) - (A & B)
|
||||
ENSURE((A ^ B) == ((A | B) - (A & B)));
|
||||
// Extensional equality is reflexive on equal masks.
|
||||
if (i == j) {
|
||||
ENSURE(A == B);
|
||||
ENSURE(A.hash() == B.hash());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_total_order_strict() {
|
||||
unsigned const M = 5;
|
||||
unsigned const N = 1u << (M + 1);
|
||||
// Strict total order: for any distinct A, B exactly one of A<B, B<A holds.
|
||||
for (unsigned i = 0; i < N; ++i) {
|
||||
range_predicate A = from_mask(i, M);
|
||||
ENSURE(!(A < A));
|
||||
for (unsigned j = i + 1; j < N; ++j) {
|
||||
range_predicate B = from_mask(j, M);
|
||||
bool lt = A < B;
|
||||
bool gt = B < A;
|
||||
ENSURE(lt != gt);
|
||||
ENSURE(lt || gt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_display() {
|
||||
std::ostringstream oss;
|
||||
oss << range_predicate::empty(31);
|
||||
ENSURE(oss.str() == "[]");
|
||||
|
||||
oss.str("");
|
||||
oss << range_predicate::range(3, 7, 31);
|
||||
ENSURE(oss.str() == "[3-7]");
|
||||
|
||||
oss.str("");
|
||||
oss << range_predicate::singleton(9, 31);
|
||||
ENSURE(oss.str() == "[9]");
|
||||
|
||||
oss.str("");
|
||||
auto p = range_predicate::range(0, 2, 31) | range_predicate::singleton(5, 31);
|
||||
oss << p;
|
||||
ENSURE(oss.str() == "[0-2,5]");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void tst_range_predicate() {
|
||||
test_factories();
|
||||
test_equality_and_order();
|
||||
test_union_intersection_hand();
|
||||
test_complement_hand();
|
||||
test_exhaustive_laws();
|
||||
test_total_order_strict();
|
||||
test_display();
|
||||
std::cout << "range_predicate unit tests passed\n";
|
||||
}
|
||||
244
src/test/regex_range_collapse.cpp
Normal file
244
src/test/regex_range_collapse.cpp
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
/*++
|
||||
Copyright (c) 2026 Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
|
||||
regex_range_collapse.cpp - unit tests
|
||||
|
||||
--*/
|
||||
|
||||
#include "ast/rewriter/seq_range_collapse.h"
|
||||
#include "ast/reg_decl_plugins.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "ast/arith_decl_plugin.h"
|
||||
#include "util/util.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace {
|
||||
|
||||
using seq::range_predicate;
|
||||
using seq::regex_to_range_predicate;
|
||||
using seq::range_predicate_to_regex;
|
||||
|
||||
static void check(bool ok, char const* what) {
|
||||
if (!ok) {
|
||||
std::cerr << "regex_range_collapse FAILED: " << what << "\n";
|
||||
ENSURE(false);
|
||||
}
|
||||
}
|
||||
|
||||
static expr_ref mk_singleton_str(seq_util& u, unsigned c) {
|
||||
return expr_ref(u.str.mk_string(zstring(c)), u.get_manager());
|
||||
}
|
||||
|
||||
static bool extract_range_chars(seq_util& u, expr* e, unsigned& lo, unsigned& hi) {
|
||||
expr* lo_e = nullptr; expr* hi_e = nullptr;
|
||||
if (!u.re.is_range(e, lo_e, hi_e))
|
||||
return false;
|
||||
// Accept either string-constant or (seq.unit (Char N)) bound form.
|
||||
if (u.re.is_range(e, lo, hi))
|
||||
return true;
|
||||
expr* lc = nullptr; expr* hc = nullptr;
|
||||
if (u.str.is_unit(lo_e, lc) && u.is_const_char(lc, lo) &&
|
||||
u.str.is_unit(hi_e, hc) && u.is_const_char(hc, hi))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void run() {
|
||||
ast_manager m;
|
||||
reg_decl_plugins(m);
|
||||
seq_util u(m);
|
||||
unsigned const M = u.max_char();
|
||||
|
||||
sort* str_sort = u.str.mk_string_sort();
|
||||
sort* re_sort = u.re.mk_re(str_sort);
|
||||
|
||||
// primitives
|
||||
{
|
||||
range_predicate p(M);
|
||||
check(regex_to_range_predicate(u, u.re.mk_empty(re_sort), p) && p.is_empty(),
|
||||
"re.empty -> empty");
|
||||
check(regex_to_range_predicate(u, u.re.mk_full_char(re_sort), p) && p.is_top(),
|
||||
"re.full_char -> top");
|
||||
}
|
||||
// re.range "a" "z"
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref a = mk_singleton_str(u, 'a');
|
||||
expr_ref z = mk_singleton_str(u, 'z');
|
||||
expr_ref r(u.re.mk_range(a, z), m);
|
||||
check(regex_to_range_predicate(u, r, p) && p.num_ranges() == 1 &&
|
||||
p[0].first == 'a' && p[0].second == 'z',
|
||||
"re.range a z -> [a,z]");
|
||||
}
|
||||
// Disjoint union: (a..z) | (0..9)
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
|
||||
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 2,
|
||||
"(a-z)|(0-9) -> 2 ranges");
|
||||
// canonical order: lower lo first
|
||||
check(p[0].first == '0' && p[0].second == '9' && p[1].first == 'a' && p[1].second == 'z',
|
||||
"(a-z)|(0-9) ranges in canonical order");
|
||||
}
|
||||
// Overlapping union: (a..c) | (b..f) -> (a..f)
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'b'), mk_singleton_str(u, 'f')), m);
|
||||
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
|
||||
p[0].first == 'a' && p[0].second == 'f',
|
||||
"(a-c)|(b-f) -> (a-f)");
|
||||
}
|
||||
// Adjacent union: (a..c) | (d..f) -> (a..f) (canonical predicate merges adjacent)
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'c')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'd'), mk_singleton_str(u, 'f')), m);
|
||||
expr_ref un(u.re.mk_union(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, un, p) && p.num_ranges() == 1 &&
|
||||
p[0].first == 'a' && p[0].second == 'f',
|
||||
"(a-c)|(d-f) -> (a-f) via adjacency");
|
||||
}
|
||||
// Disjoint intersection: (a..z) & (0..9) -> empty
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, '0'), mk_singleton_str(u, '9')), m);
|
||||
expr_ref ix(u.re.mk_inter(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, ix, p) && p.is_empty(),
|
||||
"(a-z)&(0-9) -> empty");
|
||||
}
|
||||
// Overlapping intersection: (a..f) & (c..z) -> (c..f)
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref ix(u.re.mk_inter(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, ix, p) && p.num_ranges() == 1 &&
|
||||
p[0].first == 'c' && p[0].second == 'f',
|
||||
"(a-f)&(c-z) -> (c-f)");
|
||||
}
|
||||
// Complement: re.complement is intentionally NOT a char-class op
|
||||
// (it operates over Σ*), so it must NOT be translated.
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref cmp(u.re.mk_complement(r1), m);
|
||||
check(!regex_to_range_predicate(u, cmp, p),
|
||||
"re.comp of range is NOT translatable (sequence-level complement)");
|
||||
}
|
||||
// Diff: (a..f) \ (c..z) -> (a..b)
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'f')), m);
|
||||
expr_ref r2(u.re.mk_range(mk_singleton_str(u, 'c'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref df(u.re.mk_diff(r1, r2), m);
|
||||
check(regex_to_range_predicate(u, df, p) && p.num_ranges() == 1 &&
|
||||
p[0].first == 'a' && p[0].second == 'b',
|
||||
"(a-f) \\ (c-z) -> (a-b)");
|
||||
}
|
||||
// Negative: re.* of a range is NOT a char class
|
||||
{
|
||||
range_predicate p(M);
|
||||
expr_ref r1(u.re.mk_range(mk_singleton_str(u, 'a'), mk_singleton_str(u, 'z')), m);
|
||||
expr_ref star(u.re.mk_star(r1), m);
|
||||
check(!regex_to_range_predicate(u, star, p),
|
||||
"re.* of range not translatable");
|
||||
}
|
||||
|
||||
// Negative: a regex whose element type is NOT a sequence of
|
||||
// characters (here (Seq Int)) must be rejected outright, even for
|
||||
// shapes that structurally resemble char-class operators.
|
||||
{
|
||||
range_predicate p(M);
|
||||
arith_util a(m);
|
||||
sort* int_seq = u.str.mk_seq(a.mk_int());
|
||||
sort* int_re = u.re.mk_re(int_seq);
|
||||
check(!regex_to_range_predicate(u, u.re.mk_empty(int_re), p),
|
||||
"re.empty over (Seq Int) is NOT a char class");
|
||||
check(!regex_to_range_predicate(u, u.re.mk_full_char(int_re), p),
|
||||
"re.full_char over (Seq Int) is NOT a char class");
|
||||
}
|
||||
|
||||
// ---- materialization round-trip ----
|
||||
|
||||
// empty -> re.empty
|
||||
{
|
||||
range_predicate p = range_predicate::empty(M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
check(u.re.is_empty(e), "empty -> re.empty");
|
||||
}
|
||||
// top -> re.full_char
|
||||
{
|
||||
range_predicate p = range_predicate::top(M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
check(u.re.is_full_char(e), "top -> re.full_char");
|
||||
}
|
||||
// single range -> re.range
|
||||
{
|
||||
range_predicate p = range_predicate::range('a', 'z', M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
unsigned lo = 0, hi = 0;
|
||||
check(extract_range_chars(u, e, lo, hi) && lo == 'a' && hi == 'z',
|
||||
"[a-z] -> re.range a z");
|
||||
}
|
||||
// singleton -> re.range c c
|
||||
{
|
||||
range_predicate p = range_predicate::singleton('A', M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
unsigned lo = 0, hi = 0;
|
||||
check(extract_range_chars(u, e, lo, hi) && lo == 'A' && hi == 'A',
|
||||
"{A} -> re.range A A");
|
||||
}
|
||||
// 2 ranges -> re.union(range_0, range_1) in canonical order
|
||||
{
|
||||
range_predicate p = range_predicate::range('0', '9', M)
|
||||
| range_predicate::range('a', 'z', M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
expr* a = nullptr; expr* b = nullptr;
|
||||
check(u.re.is_union(e, a, b), "2-range -> union");
|
||||
unsigned lo0 = 0, hi0 = 0, lo1 = 0, hi1 = 0;
|
||||
check(extract_range_chars(u, a, lo0, hi0) && lo0 == '0' && hi0 == '9',
|
||||
"union arg0 = (0-9) (canonical: lower lo first)");
|
||||
check(extract_range_chars(u, b, lo1, hi1) && lo1 == 'a' && hi1 == 'z',
|
||||
"union arg1 = (a-z)");
|
||||
}
|
||||
// 3 ranges -> right-associated union
|
||||
{
|
||||
range_predicate p = range_predicate::range(0, 5, M)
|
||||
| range_predicate::range(10, 15, M)
|
||||
| range_predicate::range(20, 25, M);
|
||||
expr_ref e = range_predicate_to_regex(u, p, str_sort);
|
||||
expr* a = nullptr; expr* rest = nullptr;
|
||||
check(u.re.is_union(e, a, rest), "3-range -> union(...)");
|
||||
unsigned lo = 0, hi = 0;
|
||||
check(extract_range_chars(u, a, lo, hi) && lo == 0 && hi == 5, "first arg = (0-5)");
|
||||
expr* b = nullptr; expr* c = nullptr;
|
||||
check(u.re.is_union(rest, b, c), "rest is union(...,...)");
|
||||
check(extract_range_chars(u, b, lo, hi) && lo == 10 && hi == 15, "second range");
|
||||
check(extract_range_chars(u, c, lo, hi) && lo == 20 && hi == 25, "third range");
|
||||
}
|
||||
// Round-trip identity for an arbitrary range-set
|
||||
{
|
||||
range_predicate p_in = range_predicate::range('a', 'c', M)
|
||||
| range_predicate::range('m', 'p', M)
|
||||
| range_predicate::range('x', 'z', M);
|
||||
expr_ref e = range_predicate_to_regex(u, p_in, str_sort);
|
||||
range_predicate p_out(M);
|
||||
check(regex_to_range_predicate(u, e, p_out), "round-trip translatable");
|
||||
check(p_in == p_out, "round-trip equal");
|
||||
}
|
||||
|
||||
std::cerr << "regex_range_collapse tests passed\n";
|
||||
}
|
||||
}
|
||||
|
||||
void tst_regex_range_collapse() {
|
||||
run();
|
||||
}
|
||||
127
src/test/seq_regex_bisim.cpp
Normal file
127
src/test/seq_regex_bisim.cpp
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
// Regression test for the seq::derive::intersect_intervals bug.
|
||||
//
|
||||
// Background: derive uses a path-tracking interval set to compute symbolic
|
||||
// derivatives. The intersect_intervals routine used to react to a single
|
||||
// disjoint interval by dropping the entire kept suffix and skipping the rest
|
||||
// of the list, which silently killed valid branches in derivatives such as
|
||||
// D(a|b). That made the bisimulation procedure conclude bogus equalities
|
||||
// like a* == (a|b)*.
|
||||
//
|
||||
// This file also covers the seq::derive top-level-cache poisoning bug.
|
||||
// `m_top_cache` is keyed only by the regex; the routine used to populate it
|
||||
// while `m_ele` was set to a *concrete* character, baking that character
|
||||
// into the cached "symbolic" derivative. Subsequent calls with the same
|
||||
// regex but a different ele then returned a stale concrete answer instead
|
||||
// of the true symbolic derivative. The simplest victim is
|
||||
// (str.in_re "aP" (re.++ (re.* "a") "P"))
|
||||
// which used to return false because the derivative wrt 'a' was cached and
|
||||
// re-used as the derivative wrt 'P'.
|
||||
#include "ast/ast.h"
|
||||
#include "ast/ast_pp.h"
|
||||
#include "ast/reg_decl_plugins.h"
|
||||
#include "ast/seq_decl_plugin.h"
|
||||
#include "ast/rewriter/seq_rewriter.h"
|
||||
#include "ast/rewriter/seq_regex_bisim.h"
|
||||
#include "ast/rewriter/th_rewriter.h"
|
||||
#include <iostream>
|
||||
|
||||
static void test_a_star_neq_ab_star() {
|
||||
ast_manager m;
|
||||
reg_decl_plugins(m);
|
||||
seq_util u(m);
|
||||
seq_rewriter rw(m);
|
||||
|
||||
sort_ref str_sort(u.str.mk_string_sort(), m);
|
||||
|
||||
zstring sa("a"), sb("b");
|
||||
expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
|
||||
expr_ref re_b(u.re.mk_to_re(u.str.mk_string(sb)), m);
|
||||
expr_ref a_star(u.re.mk_star(re_a), m);
|
||||
expr_ref ab(u.re.mk_union(re_a, re_b), m);
|
||||
expr_ref ab_star(u.re.mk_star(ab), m);
|
||||
|
||||
expr_ref d_ab = rw.mk_brz_derivative(ab);
|
||||
std::cout << "D(a|b) = " << mk_pp(d_ab, m) << "\n";
|
||||
|
||||
// Both the 'a' branch and the 'b' branch of D(a|b) must reach epsilon.
|
||||
// Collect the regex leaves of the symbolic derivative and require at
|
||||
// least two distinct accepting leaves (one for 'a' and one for 'b').
|
||||
expr_ref_vector leaves(m);
|
||||
auto collect = [&](expr* e, auto&& self) -> void {
|
||||
expr* c, *t, *f;
|
||||
if (m.is_ite(e, c, t, f) || u.re.is_union(e, t, f) || u.re.is_antimirov_union(e, t, f)) {
|
||||
self(t, self);
|
||||
self(f, self);
|
||||
return;
|
||||
}
|
||||
if (u.re.is_empty(e)) return;
|
||||
leaves.push_back(e);
|
||||
};
|
||||
collect(d_ab, collect);
|
||||
unsigned nullable_leaves = 0;
|
||||
for (expr* l : leaves) {
|
||||
expr_ref n = rw.is_nullable(l);
|
||||
if (m.is_true(n)) ++nullable_leaves;
|
||||
}
|
||||
std::cout << "D(a|b) leaves=" << leaves.size()
|
||||
<< " nullable=" << nullable_leaves << "\n";
|
||||
ENSURE(nullable_leaves >= 2);
|
||||
|
||||
// Bisim must report the two languages are not equivalent.
|
||||
seq::regex_bisim bisim(rw);
|
||||
lbool eq = bisim.are_equivalent(a_star, ab_star);
|
||||
std::cout << "bisim(a*, (a|b)*) = "
|
||||
<< (eq == l_true ? "true" : eq == l_false ? "false" : "undef") << "\n";
|
||||
ENSURE(eq == l_false);
|
||||
}
|
||||
|
||||
// Regression for the derive top-level-cache poisoning bug.
|
||||
// Take r = (re.* "a") ++ "P" and check str.in_re "aP" r. Before the fix
|
||||
// the first per-char derivative call (wrt 'a') populated m_top_cache with
|
||||
// 'a' baked into the symbolic ITE-tree, so the next call (wrt 'P') returned
|
||||
// that stale cached value instead of computing D_P(r) = epsilon, making
|
||||
// str.in_re wrongly return false.
|
||||
static void test_derive_cache_per_ele() {
|
||||
ast_manager m;
|
||||
reg_decl_plugins(m);
|
||||
seq_util u(m);
|
||||
seq_rewriter rw(m);
|
||||
|
||||
sort_ref str_sort(u.str.mk_string_sort(), m);
|
||||
|
||||
zstring sa("a"), sP("P"), s_aP("aP");
|
||||
expr_ref re_a(u.re.mk_to_re(u.str.mk_string(sa)), m);
|
||||
expr_ref re_P(u.re.mk_to_re(u.str.mk_string(sP)), m);
|
||||
expr_ref a_star(u.re.mk_star(re_a), m);
|
||||
expr_ref r(u.re.mk_concat(a_star, re_P), m);
|
||||
expr_ref aP(u.str.mk_string(s_aP), m);
|
||||
|
||||
// Compute D_'a'(a*P) and D_'P'(a*P) directly via mk_derivative.
|
||||
// Before the fix, m_top_cache was populated while m_ele = ele (the
|
||||
// concrete char), so the second call hit the stale cached answer from
|
||||
// the first. After the fix the cache is keyed by a symbolic var, so
|
||||
// each concrete-ele substitution produces the right answer.
|
||||
expr_ref ch_a(u.mk_char('a'), m);
|
||||
expr_ref ch_P(u.mk_char('P'), m);
|
||||
expr_ref d_a = rw.mk_derivative(ch_a, r);
|
||||
expr_ref d_P = rw.mk_derivative(ch_P, r);
|
||||
std::cout << "D_a(a*P) = " << mk_pp(d_a, m) << "\n";
|
||||
std::cout << "D_P(a*P) = " << mk_pp(d_P, m) << "\n";
|
||||
|
||||
// D_P(a*P) must be nullable (it accepts the empty suffix), while
|
||||
// D_a(a*P) must not be (it still needs a trailing 'P').
|
||||
expr_ref n_a = rw.is_nullable(d_a);
|
||||
expr_ref n_P = rw.is_nullable(d_P);
|
||||
th_rewriter trw(m);
|
||||
trw(n_a);
|
||||
trw(n_P);
|
||||
std::cout << "nullable(D_a) = " << mk_pp(n_a, m) << "\n";
|
||||
std::cout << "nullable(D_P) = " << mk_pp(n_P, m) << "\n";
|
||||
ENSURE(m.is_false(n_a));
|
||||
ENSURE(m.is_true(n_P));
|
||||
}
|
||||
|
||||
void tst_seq_regex_bisim() {
|
||||
test_a_star_neq_ab_star();
|
||||
test_derive_cache_per_ele();
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue