mirror of
https://github.com/Z3Prover/z3
synced 2026-06-29 03:48:51 +00:00
The previous commit reduced unions by recursively inserting each disjunct into the other operand, which recurses with depth proportional to the union width. On wide range-product unions (z3test 5721 sub#2) that overflowed the stack (exit 0xC00000FD), turning a timeout into a crash. Reformulate mk_union_core to flatten both operands into a disjunct set via an explicit worklist and reduce it with add_union_elem (a bounded loop applying subsumption, prefix factoring and same-condition ITE merge against every existing member). No width-proportional recursion remains. 5731 stays fixed (0.04s), 5728 stays at ~0.02s, 5721 sub#2 no longer crashes (cleanly times out as before), 92/92 unit tests pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
266 lines
11 KiB
C++
266 lines
11 KiB
C++
/*++
|
|
Copyright (c) 2026 Microsoft Corporation
|
|
|
|
Module Name:
|
|
|
|
seq_derive.h
|
|
|
|
Abstract:
|
|
|
|
Symbolic derivative computation for regular expressions.
|
|
Produces an ITE-tree (transition regex) representation where
|
|
the free variable is de Bruijn index 0 representing the input character.
|
|
|
|
Based on the theory of symbolic derivatives and transition regexes:
|
|
- Veanes et al., "On Symbolic Derivatives and Transition Regexes" (LPAR 2024)
|
|
- Varatalu, Veanes, Ernits, "RE#" (POPL 2025)
|
|
- Stanford, Veanes, Bjørner, "Symbolic Boolean Derivatives" (PLDI 2021)
|
|
|
|
Authors:
|
|
|
|
Nikolaj Bjorner (nbjorner) 2025-06-03
|
|
|
|
--*/
|
|
|
|
#pragma once
|
|
|
|
#include "ast/seq_decl_plugin.h"
|
|
#include "ast/arith_decl_plugin.h"
|
|
#include "ast/array_decl_plugin.h"
|
|
#include "ast/rewriter/bool_rewriter.h"
|
|
#include "util/obj_pair_hashtable.h"
|
|
#include "util/obj_triple_hashtable.h"
|
|
#include <functional>
|
|
|
|
class seq_rewriter;
|
|
|
|
namespace seq {
|
|
|
|
// Selects which flavour of derivative the engine computes. The derive class
// keeps a separate set of caches per kind (the m_a* members for antimirov_t,
// the m_b* members for brzozowski_t) and defaults to antimirov_t.
enum class derivative_kind { antimirov_t, brzozowski_t };
|
|
/**
|
|
* Symbolic derivative engine for regular expressions.
|
|
*
|
|
* Given a regex r, operator()(r) computes a symbolic derivative δ(r)
|
|
* represented as an ITE-tree over character predicates (using de Bruijn
|
|
* variable 0 for the character). Evaluating the ITE-tree for a concrete
|
|
* character 'a' yields the classical Brzozowski derivative δ_a(r).
|
|
*
|
|
* The ITE-tree structure implicitly defines minterms (equivalence classes
|
|
* of characters indistinguishable by the regex).
|
|
*
|
|
* Key properties:
|
|
* - Results are memoized for termination on cyclic derivative graphs
|
|
* - Union/intersection operands are sorted for ACI canonicalization
|
|
* - Depth-bounded to prevent stack overflow
|
|
*/
|
|
class derive {
|
|
ast_manager& m;
|
|
seq_util m_util;
|
|
arith_util m_autil;
|
|
bool_rewriter m_br;
|
|
seq_rewriter& m_re;
|
|
|
|
// Cache: maps (ele, regex) pair to its derivative
|
|
obj_pair_map<expr, expr, expr*> m_acache, m_bcache;
|
|
obj_pair_map<expr, expr, expr*> m_atop_cache, m_btop_cache; // post-simplify cache
|
|
expr_ref_vector m_trail; // pin cached results
|
|
|
|
// Op cache for ITE-hoisting operations (union, inter, concat, complement)
|
|
// Path-aware caches: key is (a, b, path_expr) for binary ops, (a, path_expr) for complement
|
|
obj_triple_map<expr, expr, expr, expr *> m_aunion_cache, m_bunion_cache, m_ainter_cache, m_binter_cache, m_axor_cache, m_bxor_cache;
|
|
obj_pair_map<expr, expr, expr*> m_aconcat_cache, m_bconcat_cache;
|
|
obj_pair_map<expr, expr, expr*> m_acomplement_cache, m_bcomplement_cache;
|
|
|
|
// Depth limiting
|
|
unsigned m_depth { 0 };
|
|
static const unsigned m_max_depth = 512;
|
|
|
|
seq_util::rex& re() { return m_util.re; }
|
|
seq_util& u() { return m_util; }
|
|
|
|
derivative_kind m_derivative_kind = derivative_kind::antimirov_t;
|
|
|
|
// The element (character) for the current derivative computation
|
|
expr_ref m_ele;
|
|
|
|
// Path state for inline pruning during mk_inter/mk_union/mk_complement
|
|
using intervals_t = svector<std::pair<unsigned, unsigned>>;
|
|
|
|
// Path: vector of signed atoms
|
|
svector<std::pair<expr*, bool>> m_path;
|
|
// Intervals: feasible character ranges under current path (append-only)
|
|
intervals_t m_intervals;
|
|
unsigned m_intervals_start { 0 };
|
|
// Stack of saved states for push/pop
|
|
struct path_save { unsigned path_sz; unsigned intervals_sz; unsigned intervals_start; expr* path_expr; };
|
|
svector<path_save> m_path_stack;
|
|
// Boolean expression encoding of current path (for cache keys)
|
|
expr_ref m_path_expr;
|
|
|
|
// Path interface
|
|
lbool push(expr* c, bool sign); // l_true: implied, l_undef: pushed (must pop), l_false: contradicts
|
|
void pop(); // restore state to matching push
|
|
expr* get_path_expr() { return m_path_expr; }
|
|
|
|
obj_pair_map<expr, expr, expr *> &cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_acache : m_bcache;
|
|
}
|
|
|
|
obj_pair_map<expr, expr, expr *> &top_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_atop_cache : m_btop_cache;
|
|
}
|
|
|
|
obj_triple_map<expr, expr, expr, expr *> &union_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_aunion_cache : m_bunion_cache;
|
|
}
|
|
|
|
obj_triple_map<expr, expr, expr, expr *> &inter_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_ainter_cache : m_binter_cache;
|
|
}
|
|
|
|
obj_triple_map<expr, expr, expr, expr *> &xor_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_axor_cache : m_bxor_cache;
|
|
}
|
|
|
|
obj_pair_map<expr, expr, expr *> &concat_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_aconcat_cache : m_bconcat_cache;
|
|
}
|
|
|
|
obj_pair_map<expr, expr, expr *> &complement_cache() {
|
|
return m_derivative_kind == derivative_kind::antimirov_t ? m_acomplement_cache : m_bcomplement_cache;
|
|
}
|
|
|
|
// Hoist ITE: apply_op through ite(c, t, e) with path pruning
|
|
expr_ref apply_ite(expr* c, expr* t, expr* e, expr* r, std::function<expr_ref(expr*, expr*)> apply_op);
|
|
expr_ref apply_ite(expr* c, expr* t1, expr* e1, expr* t2, expr* e2, std::function<expr_ref(expr*, expr*)> apply_op);
|
|
expr_ref apply_ite(expr* c, expr* t, expr* e, std::function<expr_ref(expr*)> apply_op);
|
|
// Common ITE dispatch for binary ops (union/inter)
|
|
expr_ref hoist_ite(expr* a, expr* b, std::function<expr_ref(expr*, expr*)> apply_op);
|
|
|
|
// Evaluate a condition against the current path/intervals
|
|
lbool eval_path_cond(expr* c);
|
|
|
|
// Internal helpers for push
|
|
lbool push_path_atoms(expr* c, bool sign);
|
|
lbool push_intervals_impl(expr* c, bool sign);
|
|
|
|
// Core derivative computation
|
|
expr_ref derive_rec(expr* r);
|
|
expr_ref derive_core(expr* r);
|
|
|
|
// Helpers for specific regex constructs
|
|
expr_ref derive_to_re(expr* s, sort* seq_sort);
|
|
expr_ref derive_range(expr* lo, expr* hi, sort* seq_sort);
|
|
expr_ref derive_of_pred(expr* pred, sort* seq_sort);
|
|
|
|
// Nullable check: returns a Boolean expression
|
|
expr_ref is_nullable(expr* r);
|
|
expr_ref is_nullable_symbolic_regex(expr* r, sort* seq_sort);
|
|
|
|
// Smart constructors with path-aware simplification and ACI canonicalization
|
|
expr_ref mk_union(expr* a, expr* b);
|
|
bool are_complements(expr* a, expr* b);
|
|
unsigned union_id(expr* e); // complement-aware ID for sorting
|
|
bool is_subset(expr* a, expr* b);
|
|
expr_ref mk_union_core(expr* a, expr* b);
|
|
void add_union_elem(expr_ref_vector& set, expr* e);
|
|
expr_ref mk_inter(expr* a, expr* b);
|
|
expr_ref mk_inter_core(expr* a, expr* b);
|
|
expr_ref mk_concat(expr* a, expr* b);
|
|
expr_ref mk_complement(expr* a);
|
|
expr_ref mk_complement_core(expr* a);
|
|
expr_ref mk_xor(expr *a, expr *b);
|
|
expr_ref mk_xor_core(expr *a, expr *b);
|
|
expr_ref mk_core(decl_kind k, expr* a, expr* b);
|
|
expr_ref mk_ite(expr* c, expr* t, expr* e);
|
|
|
|
// Distribute concatenation through ITE/union in derivative
|
|
expr_ref mk_deriv_concat(expr* d, expr* tail);
|
|
expr_ref mk_deriv_concat_core(expr* d, expr* tail);
|
|
|
|
// Extract head character and tail from a sequence expression
|
|
bool get_head_tail(expr* s1, expr* s2, expr_ref& hd, expr_ref& tl);
|
|
|
|
// Predicate implication for character range conditions.
|
|
bool pred_implies(bool sign_a, expr* a, bool sign_b, expr* b);
|
|
bool pred_implies(expr* a, expr* b);
|
|
|
|
// Normalize reverse(r)
|
|
expr_ref mk_regex_reverse(expr* r);
|
|
|
|
// Condition evaluation helpers
|
|
lbool eval_cond(expr* cond);
|
|
lbool eval_range_cond(expr* c);
|
|
void intersect_intervals(unsigned lo, unsigned hi);
|
|
void exclude_interval(unsigned lo, unsigned hi);
|
|
|
|
// Cofactor enumeration over a transition regex (ITE-tree).
|
|
void get_cofactors_rec(expr* r, expr_ref_pair_vector& result);
|
|
|
|
// Re-apply union/intersection simplifications bottom-up to a cofactor
|
|
// leaf. decompose_ite substitutes ITE branch values structurally
|
|
// (no simplification), so leaves can contain un-normalized nodes such
|
|
// as union(R, none) or inter(R, none); this rebuilds them through
|
|
// mk_union/mk_inter so equal states share a canonical form.
|
|
expr_ref clean_leaf(expr* r);
|
|
|
|
sort* re_sort(expr* r) { return r->get_sort(); }
|
|
sort* seq_sort(expr* r) { sort* s = nullptr; m_util.is_re(r, s); return s; }
|
|
sort* ele_sort(expr* r) { sort* s = seq_sort(r); sort* e = nullptr; m_util.is_seq(s, e); return e; }
|
|
|
|
void reset();
|
|
void reset_op_caches();
|
|
|
|
public:
|
|
derive(ast_manager& m, seq_rewriter& re);
|
|
|
|
/**
|
|
* Compute the derivative of regex r with respect to element ele.
|
|
* When ele is a de Bruijn variable, produces a symbolic ITE-tree.
|
|
* When ele is a concrete character, produces the concrete derivative.
|
|
*/
|
|
expr_ref operator()(derivative_kind k, expr* ele, expr* r);
|
|
|
|
/**
|
|
* Convenience: symbolic derivative using de Bruijn var 0.
|
|
*/
|
|
expr_ref operator()(derivative_kind k, expr* r);
|
|
|
|
/**
|
|
* Nullable check: returns a Boolean expression that is true iff r accepts the empty string.
|
|
*/
|
|
expr_ref nullable(expr* r) { return is_nullable(r); }
|
|
|
|
/**
|
|
* Enumerate the cofactors (min-terms) of a transition regex r taken with
|
|
* respect to element ele. r is an ITE-tree over character predicates on
|
|
* ele; for every feasible path through the tree this produces a pair
|
|
* (path_condition, leaf_regex). Infeasible character-interval
|
|
* combinations are pruned using the same path/interval context that the
|
|
* derivative engine uses while hoisting ITEs.
|
|
*/
|
|
void get_cofactors(expr* ele, expr* r, expr_ref_pair_vector& result);
|
|
|
|
/**
|
|
* Compute the symbolic derivative of r and enumerate its reachable
|
|
* leaves in fully ITE-hoisted normal form.
|
|
*
|
|
* Concretely this returns, for every feasible minterm (character
|
|
* class) of δ(r), a pair (path_condition, target_regex). Every
|
|
* if-then-else over the input character (including ones that would
|
|
* otherwise be buried under a concat/union) is hoisted to the top
|
|
* via the same path/interval pruning used by the derivative engine,
|
|
* so each target_regex is free of (:var 0) and its nullability is
|
|
* always decidable. Unions are kept intact as single leaves (a
|
|
* union leaf denotes a single bisimulation state). Infeasible
|
|
* minterms are pruned, so all returned leaves are reachable.
|
|
*
|
|
* This is the entry point the regex_bisim equivalence procedure
|
|
* uses: it consumes the target_regex of each pair and ignores the
|
|
* (redundant) path condition.
|
|
*/
|
|
void derivative_cofactors(expr* r, expr_ref_pair_vector& result);
|
|
|
|
};
|
|
|
|
}
|