3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2025-04-06 17:44:08 +00:00

Regex solver updates (#4636)

* std::cout debugging statements

* comment out std::cout debugging as this is now a shared fork

* convert std::cout to TRACE statements for seq_rewriter and seq_regex

* add cases to min_length and max_length for regexes

* bug fix

* update min_length and max_length functions for REs

* initial pass on simplifying derivative normal forms by eliminating redundant predicates locally

* add seq_regex_brief trace statements

* working on debugging ref count issue

* fix ref count bug and convert trace statements to seq_regex_brief

* add compact tracing for cache hits/misses

* seq_regex fix cache hit/miss tracing and wrapper around is_nullable

* minor

* label and disable more experimental changes for testing

* minor documentation / tracing

* a few more @EXP annotations

* dead state elimination skeleton code

* progress on dead state elimination

* more progress on dead state elimination

* refactor dead state class to separate self-contained state_graph class

* finish factoring state_graph to only work with unsigned values, and implement separate functionality for expr* logic

* implement get_all_derivatives, add debug tracing

* trace statements for debugging is_nullable loop bug

* fix is_nullable loop bug

* comment out local nullable change and mark experimental

* pretty printing for state_graph

* rewrite state graph to remove the fragile assumption that all edges from a state are added at a time

* start of general cycle detection check + fix some comments

* implement full cycle detection procedure

* normalize derivative conditions to form 'ele <= a'

* order derivative conditions by character code

* fix confusing names m_to and m_from

* assign increasing state IDs from 1 instead of using get_id on AST node

* remove elim_condition call in get_dall_derivatives

* use u_map instead of uint_map to avoid memory leak

* remove unnecessary call to is_ground

* debugging

* small improvements to seq_regex_brief tracing

* fix bug on evil2 example

* save work

* new propagate code

* work in progress on using same seq sort for deriv calls

* avoid re-computing derivatives: use same head var for every derivative call

* use min_length on regexes to prune search

* simple implementation of can_be_in_cycle using rank function idea

* add a disabled experimental change

* minor cleanup comments, etc.

* seq_rewriter cleanup for PR

* typo noticed by Nikolaj

* move state graph to util/state_graph

* re-add accidentally removed line

* clean up seq_regex code removing obsolete functions and comments

* a few more cleanup items

* oops, missed merge change to fix compilation

* disabled change to lift unions to the top level and treat them seperately in seq_regex solver

* added get_overapprox_regex to over-approximate regex membership constraints

* replace calls to is_epsilon with a centrally available method in seq_decl_plugin

* simplifications and modifications in get_overapprox_regex and related

* added approximation support for sequence expressions that use ite

* removed is_app check that was redundant

* tweak differences with upstream

* rewrite derivative leaves

* enable Antimorov-style derivatives via lifting unions in the solver

* TODO placeholders for outputting state graph

* change order in seq_regex propagate_in_re

* implement a more restricted form of Antimorov derivatives via a special op code to indicate lifting unions

* minor

* new Antimorov optimizations based on BDD compatibility checking

* seq regex tracing for # of derivatives

* fix get_cofactors (currently this fix is buggy)

* partially revert get_cofactors buggy change

* re-implement get_cofactors to more efficiently explore nodes in the derivative expression

* dgml generation for state graph

* fix release build

* improved dgml output

* bug fixes in dgml generation

* dot output support for state_graph and moved dgml and dot output under CASSERT

* updated tracing of what regex corresponds to what state id with /tr:state_graph

* clean up & document Antimorov derivative support

* remove op cache tracing

* remove re_rank experimental idea

* small fix

* fix Antimorov derivative (important change for the good performance)

* remove unused and unnecessary code

* implemented simpler efficient get_cofactors alternative mk_deriv_accept

* simplifications in propagate_accept, and trace unusual cases

* document the various seq_regex tracing & debugging command-line options

* fix debug build (broken tracing)

* guard eager Antimorov lifting for possible disabling

* fix bug in propagate_accept Rule 1

* disable eager version of Antimorov lifting for performance reasons

* remove some remaining obsolete comments

Co-authored-by: calebstanford-msr <t-casta@microsoft.com>
Co-authored-by: Margus Veanes <margus@microsoft.com>
This commit is contained in:
Caleb Stanford 2020-08-13 15:47:36 -04:00 committed by GitHub
parent 9df6c10ad8
commit 2c02264a94
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 556 additions and 140 deletions

View file

@ -557,6 +557,12 @@ br_status seq_rewriter::mk_app_core(func_decl * f, unsigned num_args, expr * con
st = mk_re_concat(args[0], args[1], result);
}
break;
case _OP_RE_ANTIMOROV_UNION:
SASSERT(num_args == 2);
// Rewrite Antimorov union to real union
result = re().mk_union(args[0], args[1]);
st = BR_REWRITE1;
break;
case OP_RE_UNION:
if (num_args == 1) {
result = args[0];
@ -2365,11 +2371,26 @@ br_status seq_rewriter::mk_re_reverse(expr* r, expr_ref& result) {
}
}
/***************************************************
***** Begin Derivative Code *****
***************************************************/
/*
Symbolic derivative: seq -> regex -> regex
seq should be single char
*/
This is the rewriter entrypoint for computing a derivative.
Use mk_derivative from seq_decl_plugin instead to create a derivative
expression without computing it (simplifying).
This calls mk_derivative, the main logic which builds a derivative
recursively, but mk_derivative doesn't guarantee full simplification.
Once the derivative is built, we return BR_REWRITE_FULL so that
any remaining possible simplification is performed from the bottom up.
Rewriting also replaces _OP_RE_ANTIMOROV_UNION, which is produced
by is_derivative, with real union.
*/
br_status seq_rewriter::mk_re_derivative(expr* ele, expr* r, expr_ref& result) {
result = mk_derivative(ele, r);
// TBD: we may even declare BR_DONE here and potentially miss some simplifications
@ -2377,14 +2398,105 @@ br_status seq_rewriter::mk_re_derivative(expr* ele, expr* r, expr_ref& result) {
}
/*
Memoized, recursive implementation of the symbolic derivative such that
the result is in an optimized BDD form.
Note: Derivative Normal Form
Definition of BDD form:
if-then-elses are pushed outwards
and sorted by condition ID (cond->get_id()), from largest on
the outside to smallest on the inside.
Duplicate nested conditions are eliminated.
When computing derivatives recursively, we preserve the following
BDD normal form:
- At the top level, the derivative is a union of Antimorov derivatives
(Conceptually each element of the union is a different derivative).
We currently express this derivative using an internal op code:
_OP_RE_ANTIMOROV_UNION
- An Antimorov derivative is a nested if-then-else term.
if-then-elses are pushed outwards and sorted by condition ID
(cond->get_id()), from largest on the outside to smallest on the
inside. Duplicate nested conditions are eliminated.
- The leaves of the if-then-else BDD can have unions themselves,
but these are interpreted as Regex union, not as separate Antimorov
derivatives.
To debug the normal form, call Z3 with -dbg:seq_regex:
this calls check_deriv_normal_form (below) periodically.
The main logic is in mk_der_op_rec for combining normal forms
(some also in mk_der_compl_rec).
*/
#ifdef Z3DEBUG
/*
Debugging to check the derivative normal form that we assume
(see definition above).
This may fail on unusual/unexpected REs, such as those containing
regex variables, but this is by design as this is only checked
during debugging, and we have not considered how normal form
should apply in such cases.
*/
bool seq_rewriter::check_deriv_normal_form(expr* r, int level) {
if (level == 3) { // top level
STRACE("seq_verbose", tout
<< "Checking derivative normal form invariant...";);
}
expr *r1 = nullptr, *r2 = nullptr, *p = nullptr, *s = nullptr;
unsigned lo = 0, hi = 0;
STRACE("seq_verbose", tout << " (level " << level << ")";);
int new_level = 0;
if (re().is_antimorov_union(r)) {
SASSERT(level >= 2);
new_level = 2;
}
else if (m().is_ite(r)) {
SASSERT(level >= 1);
new_level = 1;
}
SASSERT(!re().is_diff(r));
SASSERT(!re().is_opt(r));
SASSERT(!re().is_plus(r));
if (re().is_antimorov_union(r, r1, r2) ||
re().is_concat(r, r1, r2) ||
re().is_union(r, r1, r2) ||
re().is_intersection(r, r1, r2) ||
m().is_ite(r, p, r1, r2)) {
check_deriv_normal_form(r1, new_level);
check_deriv_normal_form(r2, new_level);
}
else if (re().is_star(r, r1) ||
re().is_complement(r, r1) ||
re().is_loop(r, r1, lo) ||
re().is_loop(r, r1, lo, hi)) {
check_deriv_normal_form(r1, new_level);
}
else if (re().is_reverse(r, r1)) {
SASSERT(re().is_to_re(r1));
}
else if (re().is_full_seq(r) ||
re().is_empty(r) ||
re().is_range(r) ||
re().is_full_char(r) ||
re().is_of_pred(r) ||
re().is_to_re(r, s)) {
// OK
}
else {
SASSERT(false);
}
if (level == 3) {
STRACE("seq_verbose", tout << " passed!" << std::endl;);
}
return true;
}
#endif
/*
Memoized, recursive implementation of the symbolic derivative such that
the result is in normal form.
Functions without _rec are memoized wrappers, which call the _rec
version if lookup fails.
The main logic is in mk_der_op_rec for combining normal forms.
*/
expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) {
STRACE("seq_verbose", tout << "derivative: " << mk_pp(ele, m())
@ -2396,9 +2508,14 @@ expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) {
}
STRACE("seq_verbose", tout << "derivative result: "
<< mk_pp(result, m()) << std::endl;);
CASSERT("seq_regex", check_deriv_normal_form(r));
return result;
}
expr_ref seq_rewriter::mk_der_antimorov_union(expr* r1, expr* r2) {
return mk_der_op(_OP_RE_ANTIMOROV_UNION, r1, r2);
}
expr_ref seq_rewriter::mk_der_union(expr* r1, expr* r2) {
return mk_der_op(OP_RE_UNION, r1, r2);
}
@ -2466,15 +2583,37 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) {
}
/*
Apply a binary operation, preserving BDD normal form on derivative expressions.
Utility function to decide if two BDDs (nested if-then-else terms)
have exactly the same structure and conditions.
*/
bool seq_rewriter::ite_bdds_compatabile(expr* a, expr* b) {
expr* ca = nullptr, *a1 = nullptr, *a2 = nullptr;
expr* cb = nullptr, *b1 = nullptr, *b2 = nullptr;
if (m().is_ite(a, ca, a1, a2) && m().is_ite(b, cb, b1, b2)) {
return (ca == cb) && ite_bdds_compatabile(a1, b1)
&& ite_bdds_compatabile(a2, b2);
}
else if (m().is_ite(a) || m().is_ite(b)) {
return false;
}
else {
return true;
}
}
/*
Apply a binary operation, preserving normal form on derivative expressions.
Preconditions:
- k is a binary op code on REs: one of concat, intersection, or union
(not difference)
- a and b are in BDD form
- k is one of the following binary op codes on REs:
OP_RE_INTERSECT
OP_RE_UNION
OP_RE_CONCAT
_OP_RE_ANTIMOROV_UNION
- a and b are in normal form (check_deriv_normal_form)
Postcondition:
- result is in BDD form
- result is in normal form (check_deriv_normal_form)
*/
expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) {
STRACE("seq_verbose", tout << "mk_der_op_rec: " << k
@ -2483,6 +2622,7 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) {
expr* ca = nullptr, *a1 = nullptr, *a2 = nullptr;
expr* cb = nullptr, *b1 = nullptr, *b2 = nullptr;
expr_ref result(m());
// Simplify if-then-elses whenever possible
auto mk_ite = [&](expr* c, expr* a, expr* b) {
return (a == b) ? a : m().mk_ite(c, a, b);
@ -2497,6 +2637,46 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) {
m().is_not(e, e);
return e->get_id();
};
// Choose when to lift a union to the top level, by converting
// it to an Antimorov union
// This implements a restricted form of Antimorov derivatives
if (k == OP_RE_UNION) {
if (re().is_antimorov_union(a) || re().is_antimorov_union(b)) {
k = _OP_RE_ANTIMOROV_UNION;
}
#if 0
// Disabled: eager Antimorov lifting unless BDDs are compatible
// Note: the check for BDD compatibility could be made more
// sophisticated: in an Antimorov union of n terms, we really
// want to check if any pair of them is compatible.
else if (m().is_ite(a) && m().is_ite(b) &&
!ite_bdds_compatabile(a, b)) {
k = _OP_RE_ANTIMOROV_UNION;
}
#endif
}
if (k == _OP_RE_ANTIMOROV_UNION) {
result = re().mk_antimorov_union(a, b);
return result;
}
if (re().is_antimorov_union(a, a1, a2)) {
expr_ref r1(m()), r2(m());
r1 = mk_der_op(k, a1, b);
r2 = mk_der_op(k, a2, b);
result = re().mk_antimorov_union(r1, r2);
return result;
}
if (re().is_antimorov_union(b, b1, b2)) {
expr_ref r1(m()), r2(m());
r1 = mk_der_op(k, a, b1);
r2 = mk_der_op(k, a, b2);
result = re().mk_antimorov_union(r1, r2);
return result;
}
// Remaining non-union case: combine two if-then-else BDDs
// (underneath top-level Antimorov unions)
if (m().is_ite(a, ca, a1, a2)) {
expr_ref r1(m()), r2(m());
expr_ref notca(m().mk_not(ca), m());
@ -2594,6 +2774,7 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) {
result = mk_der_op_rec(k, a, b);
m_op_cache.insert(k, a, b, result);
}
CASSERT("seq_regex", check_deriv_normal_form(result));
return result;
}
@ -2603,13 +2784,22 @@ expr_ref seq_rewriter::mk_der_compl(expr* r) {
expr_ref result(m_op_cache.find(OP_RE_COMPLEMENT, r, nullptr), m());
if (!result) {
expr* c = nullptr, * r1 = nullptr, * r2 = nullptr;
if (m().is_ite(r, c, r1, r2)) {
if (re().is_antimorov_union(r, r1, r2)) {
// Convert union to intersection
// Result: Antimorov union at top level is lost, pushed inside ITEs
expr_ref res1(m()), res2(m());
res1 = mk_der_compl(r1);
res2 = mk_der_compl(r2);
result = mk_der_inter(res1, res2);
}
else if (m().is_ite(r, c, r1, r2)) {
result = m().mk_ite(c, mk_der_compl(r1), mk_der_compl(r2));
}
else if (BR_FAILED == mk_re_complement(r, result))
result = re().mk_complement(r);
m_op_cache.insert(OP_RE_COMPLEMENT, r, nullptr, result);
}
CASSERT("seq_regex", check_deriv_normal_form(result));
return result;
}
@ -2675,6 +2865,7 @@ expr_ref seq_rewriter::mk_der_cond(expr* cond, expr* ele, sort* seq_sort) {
}
STRACE("seq_verbose", tout << "mk_der_cond result: "
<< mk_pp(result, m()) << std::endl;);
CASSERT("seq_regex", check_deriv_normal_form(result));
return result;
}
@ -2696,7 +2887,11 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
}
expr_ref dr2 = mk_derivative(ele, r2);
is_n = re_predicate(is_n, seq_sort);
return mk_der_union(result, mk_der_concat(is_n, dr2));
// Instead of mk_der_union here, we use mk_der_antimorov_union to
// force the two cases to be considered separately and lifted to
// the top level. This avoids blowup in cases where determinization
// is expensive.
return mk_der_antimorov_union(result, mk_der_concat(is_n, dr2));
}
else if (re().is_star(r, r1)) {
return mk_der_concat(mk_derivative(ele, r1), r);
@ -2843,6 +3038,11 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) {
return expr_ref(re().mk_derivative(ele, r), m());
}
/*************************************************
***** End Derivative Code *****
*************************************************/
/*
* pattern match against all ++ "abc" ++ all ++ "def" ++ all regexes.
*/
@ -3128,11 +3328,11 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) {
result = b;
return BR_DONE;
}
if (is_epsilon(a)) {
if (re().is_epsilon(a)) {
result = b;
return BR_DONE;
}
if (is_epsilon(b)) {
if (re().is_epsilon(b)) {
result = a;
return BR_DONE;
}
@ -3249,11 +3449,11 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) {
result = b;
return BR_DONE;
}
if (re().is_star(a) && is_epsilon(b)) {
if (re().is_star(a) && re().is_epsilon(b)) {
result = a;
return BR_DONE;
}
if (re().is_star(b) && is_epsilon(a)) {
if (re().is_star(b) && re().is_epsilon(a)) {
result = b;
return BR_DONE;
}
@ -3558,11 +3758,11 @@ br_status seq_rewriter::mk_re_star(expr* a, expr_ref& result) {
result = re().mk_star(re().mk_union(b, c1));
return BR_REWRITE2;
}
if (is_epsilon(b)) {
if (re().is_epsilon(b)) {
result = re().mk_star(c);
return BR_REWRITE2;
}
if (is_epsilon(c)) {
if (re().is_epsilon(c)) {
result = re().mk_star(b);
return BR_REWRITE2;
}
@ -3599,7 +3799,7 @@ br_status seq_rewriter::mk_re_plus(expr* a, expr_ref& result) {
result = a;
return BR_DONE;
}
if (is_epsilon(a)) {
if (re().is_epsilon(a)) {
result = a;
return BR_DONE;
}
@ -4252,12 +4452,6 @@ bool seq_rewriter::reduce_by_length(expr_ref_vector& ls, expr_ref_vector& rs,
return true;
}
bool seq_rewriter::is_epsilon(expr* e) const {
expr* e1;
return re().is_to_re(e, e1) && str().is_empty(e1);
}
/**
reduce for the case where rs = a constant string,
ls contains a substring that matches no substring of rs.
@ -4357,6 +4551,8 @@ void seq_rewriter::op_cache::cleanup() {
if (m_table.size() >= m_max_cache_size) {
m_trail.reset();
m_table.reset();
STRACE("seq_regex", tout << "Op cache reset!" << std::endl;);
STRACE("seq_regex_brief", tout << "(OP CACHE RESET) ";);
STRACE("seq_verbose", tout << "Derivative op cache reset" << std::endl;);
}
}

View file

@ -191,6 +191,11 @@ class seq_rewriter {
expr_ref mk_der_inter(expr* a, expr* b);
expr_ref mk_der_compl(expr* a);
expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort);
expr_ref mk_der_antimorov_union(expr* r1, expr* r2);
bool ite_bdds_compatabile(expr* a, expr* b);
#ifdef Z3DEBUG
bool check_deriv_normal_form(expr* r, int level = 3);
#endif
bool lt_char(expr* ch1, expr* ch2);
bool eq_char(expr* ch1, expr* ch2);
@ -277,7 +282,6 @@ class seq_rewriter {
void add_next(u_map<expr*>& next, expr_ref_vector& trail, unsigned idx, expr* cond);
bool is_sequence(expr* e, expr_ref_vector& seq);
bool is_sequence(eautomaton& aut, expr_ref_vector& seq);
bool is_epsilon(expr* e) const;
bool get_lengths(expr* e, expr_ref_vector& lens, rational& pos);
bool reduce_back(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& new_eqs);
bool reduce_front(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& new_eqs);

View file

@ -616,6 +616,7 @@ void seq_decl_plugin::init() {
m_sigs[OP_RE_OF_PRED] = alloc(psig, m, "re.of.pred", 1, 1, &predA, reA);
m_sigs[OP_RE_REVERSE] = alloc(psig, m, "re.reverse", 1, 1, &reA, reA);
m_sigs[OP_RE_DERIVATIVE] = alloc(psig, m, "re.derivative", 1, 2, AreA, reA);
m_sigs[_OP_RE_ANTIMOROV_UNION] = alloc(psig, m, "re.union", 1, 2, reAreA, reA);
m_sigs[OP_SEQ_TO_RE] = alloc(psig, m, "seq.to.re", 1, 1, &seqA, reA);
m_sigs[OP_SEQ_IN_RE] = alloc(psig, m, "seq.in.re", 1, 2, seqAreA, boolT);
m_sigs[OP_SEQ_REPLACE_RE_ALL] = alloc(psig, m, "str.replace_re_all", 1, 3, seqAreAseqA, seqA);
@ -760,6 +761,7 @@ func_decl * seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters,
case OP_RE_COMPLEMENT:
case OP_RE_REVERSE:
case OP_RE_DERIVATIVE:
case _OP_RE_ANTIMOROV_UNION:
m_has_re = true;
// fall-through
case OP_SEQ_UNIT:

View file

@ -109,6 +109,7 @@ enum seq_op_kind {
_OP_REGEXP_EMPTY,
_OP_REGEXP_FULL_CHAR,
_OP_RE_IS_NULLABLE,
_OP_RE_ANTIMOROV_UNION, // Lifted union for antimorov-style derivatives
_OP_SEQ_SKOLEM,
LAST_SEQ_OP
};
@ -237,9 +238,10 @@ class seq_util {
mutable scoped_ptr<bv_util> m_bv;
bv_util& bv() const;
public:
unsigned max_plus(unsigned x, unsigned y) const;
unsigned max_mul(unsigned x, unsigned y) const;
public:
ast_manager& get_manager() const { return m; }
@ -437,6 +439,7 @@ public:
app* mk_of_pred(expr* p);
app* mk_reverse(expr* r) { return m.mk_app(m_fid, OP_RE_REVERSE, r); }
app* mk_derivative(expr* ele, expr* r) { return m.mk_app(m_fid, OP_RE_DERIVATIVE, ele, r); }
app* mk_antimorov_union(expr* r1, expr* r2) { return m.mk_app(m_fid, _OP_RE_ANTIMOROV_UNION, r1, r2); }
bool is_to_re(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_TO_RE); }
bool is_concat(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CONCAT); }
@ -455,6 +458,7 @@ public:
bool is_of_pred(expr const* n) const { return is_app_of(n, m_fid, OP_RE_OF_PRED); }
bool is_reverse(expr const* n) const { return is_app_of(n, m_fid, OP_RE_REVERSE); }
bool is_derivative(expr const* n) const { return is_app_of(n, m_fid, OP_RE_DERIVATIVE); }
bool is_antimorov_union(expr const* n) const { return is_app_of(n, m_fid, _OP_RE_ANTIMOROV_UNION); }
MATCH_UNARY(is_to_re);
MATCH_BINARY(is_concat);
MATCH_BINARY(is_union);
@ -468,6 +472,7 @@ public:
MATCH_UNARY(is_of_pred);
MATCH_UNARY(is_reverse);
MATCH_BINARY(is_derivative);
MATCH_BINARY(is_antimorov_union);
bool is_loop(expr const* n, expr*& body, unsigned& lo, unsigned& hi) const;
bool is_loop(expr const* n, expr*& body, unsigned& lo) const;
bool is_loop(expr const* n, expr*& body, expr*& lo, expr*& hi) const;

View file

@ -109,24 +109,6 @@ namespace smt {
return;
}
// Convert a non-ground sequence into an additional regex and
// strengthen the original regex constraint into an intersection
// for example:
// (x ++ "a" ++ y) in b*
// is coverted to
// (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*)
if (!m.is_value(s)) {
expr_ref s_approx = get_overapprox_regex(s);
if (!re().is_full_seq(s_approx)) {
r = re().mk_inter(r, s_approx);
TRACE("seq_regex", tout
<< "get_overapprox_regex(" << mk_pp(s, m)
<< ") = " << mk_pp(s_approx, m) << std::endl;);
STRACE("seq_regex_brief", tout
<< "overapprox=" << state_str(r) << " ";);
}
}
if (coallesce_in_re(lit)) {
TRACE("seq_regex", tout
<< "simplified conjunctions to an intersection" << std::endl;);
@ -141,6 +123,26 @@ namespace smt {
return;
}
// Convert a non-ground sequence into an additional regex and
// strengthen the original regex constraint into an intersection
// for example:
// (x ++ "a" ++ y) in b*
// is coverted to
// (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*)
expr_ref _r_temp_owner(m);
if (!m.is_value(s)) {
expr_ref s_approx = get_overapprox_regex(s);
if (!re().is_full_seq(s_approx)) {
r = re().mk_inter(r, s_approx);
_r_temp_owner = r;
TRACE("seq_regex", tout
<< "get_overapprox_regex(" << mk_pp(s, m)
<< ") = " << mk_pp(s_approx, m) << std::endl;);
STRACE("seq_regex_brief", tout
<< "overapprox=" << state_str(r) << " ";);
}
}
expr_ref zero(a().mk_int(0), m);
expr_ref acc = sk().mk_accept(s, zero, r);
literal acc_lit = th.mk_literal(acc);
@ -213,6 +215,7 @@ namespace smt {
*
* Rule 1. (accept s i r) => len(s) >= i + min_len(r)
* Rule 2. (accept s i r) & len(s) <= i => nullable(r)
* (only necessary if min_len fails and returns 0 for non-nullable r)
* Rule 3. (accept s i r) and len(s) > i =>
* (accept s (i + 1) (derivative s[i] r)
*
@ -258,24 +261,36 @@ namespace smt {
STRACE("seq_regex_brief", tout << "(unfold) ";);
// Rule 1: use min_length to prune search
expr_ref s_to_re(re().mk_to_re(s), m);
expr_ref s_plus_r(re().mk_concat(s_to_re, r), m);
unsigned min_len = re().min_length(s_plus_r);
literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len);
unsigned min_len = re().min_length(r);
unsigned min_len_plus_i = u().max_plus(min_len, idx);
literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len_plus_i);
th.propagate_lit(nullptr, 1, &lit, len_s_ge_min);
// Axiom equivalent to the above: th.add_axiom(~lit, len_s_ge_min);
// Rule 2: nullable check
literal len_s_le_i = th.m_ax.mk_le(th.mk_len(s), idx);
expr_ref is_nullable = is_nullable_wrapper(r);
if (m.is_false(is_nullable)) {
th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i);
}
else if (!m.is_true(is_nullable)) {
// is_nullable did not simplify
literal is_nullable_lit = th.mk_literal(is_nullable_wrapper(r));
ctx.mark_as_relevant(is_nullable_lit);
th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit);
if (min_len == 0) {
expr_ref is_nullable = is_nullable_wrapper(r);
if (m.is_false(is_nullable)) {
STRACE("seq_regex", tout
<< "Warning: min_length returned 0 for non-nullable regex"
<< std::endl;);
STRACE("seq_regex_brief", tout
<< " (Warning: min_length returned 0 for"
<< " non-nullable regex)";);
th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i);
}
else if (!m.is_true(is_nullable)) {
// is_nullable did not simplify
STRACE("seq_regex", tout
<< "Warning: is_nullable did not simplify to true or false"
<< std::endl;);
STRACE("seq_regex_brief", tout
<< " (Warning: is_nullable did not simplify)";);
literal is_nullable_lit = th.mk_literal(is_nullable);
ctx.mark_as_relevant(is_nullable_lit);
th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit);
}
}
// Rule 3: derivative unfolding
@ -283,24 +298,11 @@ namespace smt {
expr_ref hd = th.mk_nth(s, i);
expr_ref deriv(m);
deriv = derivative_wrapper(hd, r);
expr_ref accept_deriv(m);
accept_deriv = mk_deriv_accept(s, idx + 1, deriv);
accept_next.push_back(~lit);
accept_next.push_back(len_s_le_i);
expr_ref_pair_vector cofactors(m);
get_cofactors(deriv, cofactors);
for (auto const& p : cofactors) {
if (m.is_false(p.first) || re().is_empty(p.second)) continue;
expr_ref cond(p.first, m);
expr_ref deriv_leaf(p.second, m);
expr_ref acc = sk().mk_accept(s, a().mk_int(idx + 1), deriv_leaf);
expr_ref choice(m.mk_and(cond, acc), m);
literal choice_lit = th.mk_literal(choice);
accept_next.push_back(choice_lit);
// TBD: try prioritizing unvisited states here over visited
// ones (in the state graph), to improve performance
STRACE("seq_regex_verbose", tout << "added choice: "
<< mk_pp(choice, m) << std::endl;);
}
accept_next.push_back(th.mk_literal(accept_deriv));
th.add_axiom(accept_next);
}
@ -442,7 +444,7 @@ namespace smt {
expr_ref r = symmetric_diff(r1, r2);
expr_ref emp(re().mk_empty(m.get_sort(r)), m);
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
expr_ref is_empty = sk().mk_is_empty(r, emp, n);
expr_ref is_empty = sk().mk_is_empty(r, r, n);
th.add_axiom(~th.mk_eq(r1, r2, false), th.mk_literal(is_empty));
}
@ -455,7 +457,7 @@ namespace smt {
expr_ref r = symmetric_diff(r1, r2);
expr_ref emp(re().mk_empty(m.get_sort(r)), m);
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
expr_ref is_non_empty = sk().mk_is_non_empty(r, emp, n);
expr_ref is_non_empty = sk().mk_is_non_empty(r, r, n);
th.add_axiom(th.mk_eq(r1, r2, false), th.mk_literal(is_non_empty));
}
@ -517,22 +519,98 @@ namespace smt {
th.add_axiom(lits);
}
void seq_regex::get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result) {
expr* cond = nullptr, *th = nullptr, *el = nullptr;
if (m.is_ite(r, cond, th, el)) {
conds.push_back(cond);
get_cofactors(th, conds, result);
conds.pop_back();
conds.push_back(mk_not(m, cond));
get_cofactors(el, conds, result);
conds.pop_back();
}
else {
expr_ref conj = mk_and(conds);
result.push_back(conj, r);
/*
Given a string s, index i, and a derivative regex d, return an
expression that is equivalent to
accept s i r
but which pushes accept s i r into the leaves (next derivatives to
explore).
Input r is of type regex; output is of type bool.
Example:
mk_deriv_accept(s, i, (ite a r1 r2) u (ite b r3 r4))
= (or (ite a (accept s i r1) (accept s i r2))
(ite b (accept s i r3) (accept s i r4)))
*/
expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) {
vector<expr*> to_visit;
to_visit.push_back(r);
obj_map<expr, expr*> re_to_bool;
expr_ref_vector _temp_bool_owner(m); // temp owner for bools we create
// DFS
while (to_visit.size() > 0) {
expr* e = to_visit.back();
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
if (!re_to_bool.contains(e)) {
// First visit: add children
STRACE("seq_regex_verbose", tout << "1";);
if (m.is_ite(e, econd, e1, e2) ||
re().is_union(e, e1, e2)) {
to_visit.push_back(e1);
to_visit.push_back(e2);
}
// Mark first visit by adding nullptr to the map
re_to_bool.insert(e, nullptr);
}
else if (re_to_bool.find(e) == nullptr) {
// Second visit: set value
STRACE("seq_regex_verbose", tout << "2";);
to_visit.pop_back();
if (m.is_ite(e, econd, e1, e2)) {
expr* b1 = re_to_bool.find(e1);
expr* b2 = re_to_bool.find(e2);
expr* b = m.mk_ite(econd, b1, b2);
_temp_bool_owner.push_back(b);
re_to_bool.find(e) = b;
}
else if (re().is_union(e, e1, e2)) {
expr* b1 = re_to_bool.find(e1);
expr* b2 = re_to_bool.find(e2);
expr* b = m.mk_or(b1, b2);
_temp_bool_owner.push_back(b);
re_to_bool.find(e) = b;
}
else {
expr* iplus1 = a().mk_int(i);
_temp_bool_owner.push_back(iplus1);
expr_ref acc_leaf = sk().mk_accept(s, iplus1, e);
_temp_bool_owner.push_back(acc_leaf);
re_to_bool.find(e) = acc_leaf;
STRACE("seq_regex_verbose", tout
<< "mk_deriv_accept: added accept leaf: "
<< mk_pp(acc_leaf, m) << std::endl;);
}
}
else {
STRACE("seq_regex_verbose", tout << "3";);
// Remaining visits: skip
to_visit.pop_back();
}
}
// Finalize
expr_ref result(m);
result = re_to_bool.find(r); // Assigns ownership of all exprs in
// re_to_bool for after this completes
rewrite(result);
return result;
}
/*
Return a list of all leaves in the derivative of a regex r,
ignoring the conditions along each path.
Warning: Although the derivative
normal form tries to eliminate unsat condition paths, one cannot
assume that the path to each leaf is satisfiable in general
(e.g. when regexes are created using re.pred).
So not all results may correspond to satisfiable predicates.
It is OK to rely on the results being satisfiable for completeness,
but not soundness.
*/
void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) {
// Get derivative
sort* seq_sort = nullptr;
@ -541,14 +619,74 @@ namespace smt {
expr_ref hd = mk_first(r, n);
expr_ref d(m);
d = derivative_wrapper(hd, r);
// Use get_cofactors method and try to filter out unsatisfiable conds
expr_ref_pair_vector cofactors(m);
get_cofactors(d, cofactors);
STRACE("seq_regex_verbose", tout << "getting all derivatives of: " << mk_pp(r, m) << std::endl;);
for (auto const& p : cofactors) {
if (m.is_false(p.first) || re().is_empty(p.second)) continue;
STRACE("seq_regex_verbose", tout << "adding derivative: " << mk_pp(p.second, m) << std::endl;);
results.push_back(p.second);
// DFS
vector<expr*> to_visit;
to_visit.push_back(d);
obj_map<expr, bool> visited; // set<expr> (bool is used as a unit type)
while (to_visit.size() > 0) {
expr* e = to_visit.back();
to_visit.pop_back();
if (visited.contains(e)) continue;
visited.insert(e, true);
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
if (m.is_ite(e, econd, e1, e2) ||
re().is_union(e, e1, e2)) {
to_visit.push_back(e1);
to_visit.push_back(e2);
}
else if (!re().is_empty(e)) {
results.push_back(e);
STRACE("seq_regex_verbose", tout
<< "get_all_derivatives: added deriv: "
<< mk_pp(e, m) << std::endl;);
}
}
STRACE("seq_regex", tout << "Number of derivatives: "
<< results.size() << std::endl;);
STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";);
}
/*
Return a list of all (cond, leaf) pairs in a given derivative
expression r.
Note: this recursive implementation is inefficient, since if nodes
are repeated often in the expression DAG, they may be visited
many times. For this reason, prefer mk_deriv_accept and
get_all_derivatives when possible.
This method is still used by:
propagate_is_empty
propagate_is_non_empty
*/
void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) {
expr_ref_vector conds(m);
get_cofactors_rec(r, conds, result);
STRACE("seq_regex", tout << "Number of derivatives: "
<< result.size() << std::endl;);
STRACE("seq_regex_brief", tout << "#derivs=" << result.size() << " ";);
}
void seq_regex::get_cofactors_rec(expr* r, expr_ref_vector& conds,
expr_ref_pair_vector& result) {
expr* cond = nullptr, *r1 = nullptr, *r2 = nullptr;
if (m.is_ite(r, cond, r1, r2)) {
conds.push_back(cond);
get_cofactors_rec(r1, conds, result);
conds.pop_back();
conds.push_back(mk_not(m, cond));
get_cofactors_rec(r2, conds, result);
conds.pop_back();
}
else if (re().is_union(r, r1, r2)) {
get_cofactors_rec(r1, conds, result);
get_cofactors_rec(r2, conds, result);
}
else {
expr_ref conj = mk_and(conds);
if (!m.is_false(conj) && !re().is_empty(r))
result.push_back(conj, r);
}
}
@ -618,6 +756,9 @@ namespace smt {
m_expr_to_state.insert(e, new_id);
STRACE("seq_regex_brief", tout << "new(" << expr_id_str(e)
<< ")=" << state_str(e) << " ";);
STRACE("seq_regex", tout
<< "New state ID: " << new_id
<< " = " << mk_pp(e, m) << std::endl;);
}
return m_expr_to_state.find(e);
}
@ -627,6 +768,7 @@ namespace smt {
return m_state_to_expr.get(id);
}
bool seq_regex::can_be_in_cycle(expr *r1, expr *r2) {
// TBD: This can be used to optimize the state graph:
// return false here if it is known that r1 -> r2 can never be
@ -649,10 +791,11 @@ namespace smt {
STRACE("seq_regex_brief", tout << "(MAX SIZE REACHED) ";);
return false;
}
STRACE("seq_regex", tout << "Updating state graph for regex "
<< mk_pp(r, m) << ") ";);
// Add state
m_state_graph.add_state(r_id);
STRACE("state_graph", tout << "regex(" << r_id << ") = " << mk_pp(r, m) << std::endl;);
STRACE("seq_regex", tout << "Updating state graph for regex "
<< mk_pp(r, m) << ") " << std::endl;);
STRACE("seq_regex_brief", tout << std::endl << "USG("
<< state_str(r) << ") ";);
expr_ref r_nullable = is_nullable_wrapper(r);
@ -663,18 +806,20 @@ namespace smt {
// Add edges to all derivatives
expr_ref_vector derivatives(m);
STRACE("seq_regex_verbose", tout
<< std::endl << " getting all derivs: " << r_id << " ";);
<< "getting all derivs: " << r_id << " " << std::endl;);
get_all_derivatives(r, derivatives);
for (auto const& dr: derivatives) {
unsigned dr_id = get_state_id(dr);
STRACE("seq_regex_verbose", tout
<< std::endl << " traversing deriv: " << dr_id << " ";);
<< " traversing deriv: " << dr_id << " " << std::endl;);
m_state_graph.add_state(dr_id);
STRACE("state_graph", tout << "regex(" << dr_id << ") = " << mk_pp(dr, m) << std::endl;);
bool maybecycle = can_be_in_cycle(r, dr);
m_state_graph.add_edge(r_id, dr_id, maybecycle);
}
m_state_graph.mark_done(r_id);
}
STRACE("seq_regex", m_state_graph.display(tout););
STRACE("seq_regex_brief", tout << std::endl;);
STRACE("seq_regex_brief", m_state_graph.display(tout););
return true;

View file

@ -23,6 +23,71 @@ Author:
#include "smt/smt_context.h"
#include "smt/seq_skolem.h"
/*
*** Tracing and debugging in this module and related modules ***
Tracing and debugging for the regex solver are split across several
command-line flags.
TRACING
-tr:seq_regex and -tr:seq_regex_brief
These are the main tags to trace what the regex solver is doing.
They mostly trace the same things, except that seq_regex_brief
avoids printing out expressions and tries to abbreviate the output
as much as possible. seq_regex_brief shows the following output:
Top-level propagations:
PIR: Propagating an in_re constraint
PE/PNE: Propagating an empty/non-empty constraint
PEQ/PNEQ: Propagating a not-equal constraint
PA: Propagating an accept constraint
In tracing, arguments are generally put in parentheses.
To achieve abbreviated output, expressions are traced in one of two
ways:
id243 (expr ID): the regex or expression with id 243
3 (state ID): the regex with state ID 3
When a regex is newly assigned to a state ID, we print this:
new(id606)=4
Of these, PA is the most important, and traces as follows:
PA(x@i,r): propagate accept for string x at index i, regex r.
(empty), (dead), (blocked), (unfold): info about whether this
PA was cut off early, or unfolded into the derivatives
(next states)
d(r1)=r2: r2 is the derivative of r1
n(r1)=b: b = whether r1 is nullable or not
USG(r): updating state graph for regex r (add all derivatives)
-tr:state_graph
This is the tracing done by util/state_graph, the data structure
that seq_regex uses to track live and dead regexes, which can
altneratively be used to get a high-level picture of what states
are being explored and updated as the solver progresses.
-tr:seq_regex_verbose
Used for some more frequent tracing (in the style of seq_regex,
not in the style of seq_regex_brief)
-tr:seq and -tr:seq_verbose
These are the underlying sequence theory tracing, often used by
the rewriter.
DEBUGGING AND VIEWING STATE GRAPH GRAPHICAL OUTPUT
-dbg:seq_regex
Debugging that checks invariants. Currently, checks that derivative
normal form is correctly preserved in the rewriter.
-dbg:state_graph
Debugging for the state graph, which
1. Checks state graph invariants, and
2. Generates the files .z3-state-graph.dgml and .z3-state-graph.dot
which can be used to visually view the state graph being explored,
during or after executing Z3.
The output can be viewed:
- Using Visual Studio for .dgml
- Using a tool such as xdot (`xdot .z3-state-graph.dot`) for .dot
*/
namespace smt {
class theory_seq;
@ -93,12 +158,13 @@ namespace smt {
expr_ref is_nullable_wrapper(expr* r);
expr_ref derivative_wrapper(expr* hd, expr* r);
void get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result);
void get_cofactors(expr* r, expr_ref_pair_vector& result) {
expr_ref_vector conds(m);
get_cofactors(r, conds, result);
}
// Various support for unfolding derivative expressions that are
// returned by derivative_wrapper
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
void get_all_derivatives(expr* r, expr_ref_vector& results);
void get_cofactors(expr* r, expr_ref_pair_vector& result);
void get_cofactors_rec(expr* r, expr_ref_vector& conds,
expr_ref_pair_vector& result);
public:

View file

@ -324,13 +324,41 @@ bool state_graph::is_done(state s) const {
return m_seen.contains(s) && !m_unexplored.contains(m_state_ufind.find(s));
}
/*
Pretty printing
*/
std::ostream& state_graph::display(std::ostream& o) const {
o << "---------- State Graph ----------" << std::endl
<< "Seen:";
for (auto s : m_seen) {
o << " " << s;
state s_root = m_state_ufind.find(s);
if (s_root != s)
o << "(=" << s_root << ")";
}
o << std::endl
<< "Live:" << m_live << std::endl
<< "Dead:" << m_dead << std::endl
<< "Unknown:" << m_unknown << std::endl
<< "Unexplored:" << m_unexplored << std::endl
<< "Edges:" << std::endl;
for (auto s1 : m_seen) {
if (m_state_ufind.is_root(s1)) {
o << " " << s1 << " -> " << m_targets[s1] << std::endl;
}
}
o << "---------------------------------" << std::endl;
return o;
}
#ifdef Z3DEBUG
/*
Class invariants check (and associated auxiliary functions)
check_invariant performs a sequence of SASSERT assertions,
then always returns true.
*/
#ifdef Z3DEBUG
bool state_graph::is_subset(state_set set1, state_set set2) const {
for (auto s1: set1) {
if (!set2.contains(s1)) return false;
@ -387,37 +415,7 @@ bool state_graph::check_invariant() const {
STRACE("state_graph", tout << "(invariant passed) ";);
return true;
}
#endif
/*
Pretty printing
*/
std::ostream& state_graph::display(std::ostream& o) const {
o << "---------- State Graph ----------" << std::endl
<< "Seen:";
for (auto s: m_seen) {
o << " " << s;
state s_root = m_state_ufind.find(s);
if (s_root != s)
o << "(=" << s_root << ")";
}
o << std::endl
<< "Live:" << m_live << std::endl
<< "Dead:" << m_dead << std::endl
<< "Unknown:" << m_unknown << std::endl
<< "Unexplored:" << m_unexplored << std::endl
<< "Edges:" << std::endl;
for (auto s1: m_seen) {
if (m_state_ufind.is_root(s1)) {
o << " " << s1 << " -> " << m_targets[s1] << std::endl;
}
}
o << "---------------------------------" << std::endl;
return o;
}
#ifdef Z3DEBUG
/*
Output the whole state graph in dgml format into the file '.z3-state-graph.dgml'
*/