3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2025-04-23 09:05:31 +00:00

Updated regex derivative engine (#5567)

* updated derivative engine

* some edit

* further improvements in derivative code

* more deriv code edits and re::to_str update

* optimized mk_deriv_accept

* fixed PR comments

* small syntax fix

* updated some simplifications

* bugfix:forgot to_re before reverse

* fixed PR comments

* more PR comment fixes

* more PR comment fixes

* forgot to delete

* deleting unused definition

* fixes

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>

* fixes

Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>

Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
Margus Veanes 2021-10-08 13:04:49 -07:00 committed by GitHub
parent c0c3e685e7
commit 146f4621c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 893 additions and 280 deletions

View file

@ -12,6 +12,7 @@ Abstract:
Author:
Nikolaj Bjorner (nbjorner) 2020-5-22
Margus Veanes 2021
--*/
@ -253,10 +254,8 @@ namespace smt {
* (accept s (i + 1) (derivative s[i] r)
*
* Acceptance of a derivative is unfolded into a disjunction over
* all derivatives. Effectively, this implements the following rule,
* but all in one step:
* (accept s i (ite c r1 r2)) =>
* c & (accept s i r1) \/ ~c & (accept s i r2)
* all derivatives. Effectively, this implements the following rule:
* (accept s i (ite c r1 r2)) => (ite c (accept s i r1) (accept s i r2))
*/
void seq_regex::propagate_accept(literal lit) {
SASSERT(!lit.sign());
@ -302,6 +301,7 @@ namespace smt {
unsigned min_len = re().min_length(r);
unsigned min_len_plus_i = u().max_plus(min_len, idx);
literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len_plus_i);
// Acc(s,i,r) ==> |s| >= i + minlength(r)
th.propagate_lit(nullptr, 1, &lit, len_s_ge_min);
// Axiom equivalent to the above: th.add_axiom(~lit, len_s_ge_min);
@ -316,6 +316,8 @@ namespace smt {
STRACE("seq_regex_brief", tout
<< " (Warning: min_length returned 0 for"
<< " non-nullable regex)";);
// since nullable(r) = false:
// Acc(s,i,r) ==> |s|>i
th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i);
}
else if (!m.is_true(is_nullable)) {
@ -327,7 +329,9 @@ namespace smt {
<< " (Warning: is_nullable did not simplify)";);
literal is_nullable_lit = th.mk_literal(is_nullable);
ctx.mark_as_relevant(is_nullable_lit);
// Acc(s,i,r) & |s|<=i ==> nullable(r)
th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit);
//TODO: what if is_nullable contains an in_re
if (str().is_in_re(is_nullable))
th.add_unhandled_expr(is_nullable);
}
@ -335,14 +339,19 @@ namespace smt {
// Rule 3: derivative unfolding
literal_vector accept_next;
expr_ref hd = th.mk_nth(s, i);
expr_ref s_i = th.mk_nth(s, i);
expr_ref deriv(m);
deriv = derivative_wrapper(hd, r);
deriv = mk_derivative_wrapper(s_i, r);
STRACE("seq_regex", tout
<< "mk_derivative_wrapper: " << re().to_str(deriv) << std::endl;);
expr_ref accept_deriv(m);
accept_deriv = mk_deriv_accept(s, idx + 1, deriv);
accept_next.push_back(~lit);
accept_next.push_back(len_s_le_i);
accept_next.push_back(th.mk_literal(accept_deriv));
// Acc(s, i, r) => (|s|<=i or Acc(s, i+1, D(s_i,r)))
// where Acc(s, i+1, ite(c, t, f)) = ite(c, Acc(s, i+1, t), Acc(s, i+1, t))
// and Acc(s, i+1, r U s) = Acc(s, i+1, r) or Acc(s, i+1, s)
th.add_axiom(accept_next);
}
@ -418,20 +427,21 @@ namespace smt {
/*
Wrapper around calls to is_nullable from the seq rewriter.
Note: the nullable wrapper and derivative wrapper actually use
TODO: clean up the following:
Note: the is_nullable_wrapper and mk_derivative_wrapper actually use
different sequence rewriters; these are at:
m_seq_rewrite
(returned by seq_rw())
th.m_rewrite.m_imp->m_cfg.m_seq_rw
(private, can't be accessed directly)
As a result operations are cached separately for the nullable
and derivative calls. TBD if caching them using the same rewriter
makes any difference.
and derivative calls.
*/
expr_ref seq_regex::is_nullable_wrapper(expr* r) {
STRACE("seq_regex", tout << "nullable: " << mk_pp(r, m) << std::endl;);
expr_ref result = seq_rw().is_nullable(r);
//TODO: rewrite seems unnecessary here
rewrite(result);
STRACE("seq_regex", tout << "nullable result: " << mk_pp(result, m) << std::endl;);
@ -442,39 +452,28 @@ namespace smt {
}
/*
Wrapper around the regex symbolic derivative from the seq rewriter.
Ensures that the derivative is written in a normalized BDD form
with optimizations for if-then-else expressions involving the head.
Note: the nullable wrapper and derivative wrapper actually use
different sequence rewriters; these are at:
m_seq_rewrite
(returned by seq_rw())
th.m_rewrite.m_imp->m_cfg.m_seq_rw
(private, can't be accessed directly)
As a result operations are cached separately for the nullable
and derivative calls. TBD if caching them using the same rewriter
makes any difference.
First creates a derivatrive of r wrt x=(:var 0) and then replaces x by ele.
This will create a cached entry for the generic derivative of r that is independent of ele.
*/
expr_ref seq_regex::derivative_wrapper(expr* hd, expr* r) {
STRACE("seq_regex", tout << "derivative(" << mk_pp(hd, m) << "): " << mk_pp(r, m) << std::endl;);
expr_ref seq_regex::mk_derivative_wrapper(expr* ele, expr* r) {
STRACE("seq_regex", tout << "derivative(" << mk_pp(ele, m) << "): " << mk_pp(r, m) << std::endl;);
// Use canonical variable for head
expr_ref hd_canon(m.mk_var(0, hd->get_sort()), m);
expr_ref result(re().mk_derivative(hd_canon, r), m);
rewrite(result);
// Uses canonical variable (:var 0) for the derivative element
expr_ref der(seq_rw().mk_derivative(r), m);
// Substitute with real head
// Substitute (:var 0) with the actual element
var_subst subst(m);
expr_ref_vector sub(m);
sub.push_back(hd);
result = subst(result, sub);
sub.push_back(ele);
der = subst(der, sub);
STRACE("seq_regex", tout << "derivative result: " << mk_pp(result, m) << std::endl;);
STRACE("seq_regex", tout << "derivative result: " << mk_pp(der, m) << std::endl;);
STRACE("seq_regex_brief", tout << "d(" << state_str(r) << ")="
<< state_str(result) << " ";);
<< state_str(der) << " ";);
return result;
//TODO: simplify der further, if ele implies further simplifications
//e.g. if ele='b' then de(ite (x='a') t f) simplifies to t
return der;
}
void seq_regex::propagate_eq(expr* r1, expr* r2) {
@ -557,7 +556,7 @@ namespace smt {
literal null_lit = th.mk_literal(is_nullable);
expr_ref hd = mk_first(r, n);
expr_ref d(m);
d = derivative_wrapper(hd, r);
d = mk_derivative_wrapper(hd, r);
literal_vector lits;
lits.push_back(~lit);
@ -584,11 +583,10 @@ namespace smt {
}
/*
Given a string s, index i, and a derivative regex d, return an
Given a string s, index i, and a derivative r, return an
expression that is equivalent to
accept s i r
but which pushes accept s i r into the leaves (next derivatives to
explore).
but which pushes accept s i r into the leaves
Input r is of type regex; output is of type bool.
@ -600,14 +598,18 @@ namespace smt {
expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) {
vector<expr*> to_visit;
to_visit.push_back(r);
obj_map<expr, expr*> re_to_bool;
obj_map<expr, expr*> re_to_accept;
expr_ref_vector _temp_bool_owner(m); // temp owner for bools we create
// DFS
bool s_is_longer_than_i = str().min_length(s) > i;
expr* i_int = a().mk_int(i);
_temp_bool_owner.push_back(i_int);
// DFS, avoids duplicating derivative construction that has already been done
while (to_visit.size() > 0) {
expr* e = to_visit.back();
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
if (!re_to_bool.contains(e)) {
if (!re_to_accept.contains(e)) {
// First visit: add children
STRACE("seq_regex_verbose", tout << "1";);
if (m.is_ite(e, econd, e1, e2) ||
@ -616,36 +618,40 @@ namespace smt {
to_visit.push_back(e2);
}
// Mark first visit by adding nullptr to the map
re_to_bool.insert(e, nullptr);
re_to_accept.insert(e, nullptr);
}
else if (re_to_bool.find(e) == nullptr) {
else if (re_to_accept.find(e) == nullptr) {
// Second visit: set value
STRACE("seq_regex_verbose", tout << "2";);
to_visit.pop_back();
if (m.is_ite(e, econd, e1, e2)) {
expr* b1 = re_to_bool.find(e1);
expr* b2 = re_to_bool.find(e2);
expr* b = m.mk_ite(econd, b1, b2);
expr* b1 = re_to_accept.find(e1);
expr* b2 = re_to_accept.find(e2);
expr* b = m.is_true(econd) || b1 == b2 ? b1 : m.is_false(econd) ? b2 : m.mk_ite(econd, b1, b2);
_temp_bool_owner.push_back(b);
re_to_bool.find(e) = b;
re_to_accept.find(e) = b;
}
else if (re().is_empty(e) || (s_is_longer_than_i && re().is_epsilon(e)))
{
// s[i..] in [] <==> false, also: s[i..] in () <==> false when |s|>i
re_to_accept.find(e) = m.mk_false();
}
else if (re().is_full_seq(e) || (s_is_longer_than_i && re().is_dot_plus(e)))
{
// s[i..] in .* <==> true, also: s[i..] in .+ <==> true when |s|>i
re_to_accept.find(e) = m.mk_true();
}
/*
else if (re().is_empty(e))
{
re_to_bool.find(e) = m.mk_false();
}
else if (re().is_epsilon(e))
{
expr* iplus1 = a().mk_int(i);
expr* one = a().mk_int(1);
_temp_bool_owner.push_back(iplus1);
_temp_bool_owner.push_back(one);
//the substring starting after position iplus1 must be empty
expr* s_end = str().mk_substr(s, iplus1, one);
//the substring starting after position i must be empty
expr* s_end = str().mk_substr(s, i_int, one);
expr* s_end_is_epsilon = m.mk_eq(s_end, str().mk_empty(m.get_sort(s)));
_temp_bool_owner.push_back(s_end_is_epsilon);
re_to_bool.find(e) = s_end_is_epsilon;
re_to_accept.find(e) = s_end_is_epsilon;
STRACE("seq_regex_verbose", tout
<< "added empty sequence leaf: "
@ -653,18 +659,16 @@ namespace smt {
}
*/
else if (re().is_union(e, e1, e2)) {
expr* b1 = re_to_bool.find(e1);
expr* b2 = re_to_bool.find(e2);
expr* b = m.mk_or(b1, b2);
expr* b1 = re_to_accept.find(e1);
expr* b2 = re_to_accept.find(e2);
expr* b = m.is_false(b1) || b1 == b2 ? b2 : m.is_false(b2) ? b1 : m.mk_or(b1, b2);
_temp_bool_owner.push_back(b);
re_to_bool.find(e) = b;
re_to_accept.find(e) = b;
}
else {
expr* iplus1 = a().mk_int(i);
_temp_bool_owner.push_back(iplus1);
expr_ref acc_leaf = sk().mk_accept(s, iplus1, e);
expr_ref acc_leaf = sk().mk_accept(s, i_int, e);
_temp_bool_owner.push_back(acc_leaf);
re_to_bool.find(e) = acc_leaf;
re_to_accept.find(e) = acc_leaf;
STRACE("seq_regex_verbose", tout
<< "mk_deriv_accept: added accept leaf: "
@ -680,59 +684,44 @@ namespace smt {
// Finalize
expr_ref result(m);
result = re_to_bool.find(r); // Assigns ownership of all exprs in
// re_to_bool for after this completes
result = re_to_accept.find(r); // Assigns ownership of all exprs in
// re_to_accept for after this completes
rewrite(result);
return result;
}
/*
Return a list of all leaves in the derivative of a regex r,
Return a list of all target regexes in the derivative of a regex r,
ignoring the conditions along each path.
Warning: Although the derivative
normal form tries to eliminate unsat condition paths, one cannot
assume that the path to each leaf is satisfiable in general
(e.g. when regexes are created using re.pred).
So not all results may correspond to satisfiable predicates.
It is OK to rely on the results being satisfiable for completeness,
but not soundness.
The derivative construction uses (:var 0) and tries
to eliminate unsat condition paths but it does not perform
full satisfiability checks and it is not guaranteed
that all targets are actually reachable
*/
void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) {
// Get derivative
sort* seq_sort = nullptr;
VERIFY(u().is_re(r, seq_sort));
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
expr_ref hd = mk_first(r, n);
expr_ref d(m);
d = derivative_wrapper(hd, r);
void seq_regex::get_derivative_targets(expr* r, expr_ref_vector& targets) {
// constructs the derivative wrt (:var 0)
expr_ref d(seq_rw().mk_derivative(r), m);
// DFS
vector<expr*> to_visit;
to_visit.push_back(d);
obj_map<expr, bool> visited; // set<expr> (bool is used as a unit type)
while (to_visit.size() > 0) {
expr* e = to_visit.back();
to_visit.pop_back();
if (visited.contains(e)) continue;
visited.insert(e, true);
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
if (m.is_ite(e, econd, e1, e2) ||
re().is_union(e, e1, e2)) {
to_visit.push_back(e1);
to_visit.push_back(e2);
}
else if (!re().is_empty(e)) {
results.push_back(e);
STRACE("seq_regex_verbose", tout
<< "get_all_derivatives: added deriv: "
<< mk_pp(e, m) << std::endl;);
// use DFS to collect all the targets (leaf regexes) in d.
expr* _1 = nullptr, * e1 = nullptr, * e2 = nullptr;
obj_hashtable<expr>::entry* _2 = nullptr;
vector<expr*> workset;
workset.push_back(d);
obj_hashtable<expr> done;
done.insert(d);
while (workset.size() > 0) {
expr* e = workset.back();
workset.pop_back();
if (m.is_ite(e, _1, e1, e2) || re().is_union(e, e1, e2)) {
if (done.insert_if_not_there_core(e1, _2))
workset.push_back(e1);
if (done.insert_if_not_there_core(e2, _2))
workset.push_back(e2);
}
else if (!re().is_empty(e))
targets.push_back(e);
}
STRACE("seq_regex", tout << "Number of derivatives: "
<< results.size() << std::endl;);
STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";);
}
/*
@ -800,7 +789,7 @@ namespace smt {
th.add_axiom(~lit, ~th.mk_literal(is_nullable));
expr_ref hd = mk_first(r, n);
expr_ref d(m);
d = derivative_wrapper(hd, r);
d = mk_derivative_wrapper(hd, r);
literal_vector lits;
expr_ref_pair_vector cofactors(m);
get_cofactors(d, cofactors);
@ -884,7 +873,7 @@ namespace smt {
STRACE("state_graph",
if (!m_state_graph.is_seen(r_id))
tout << std::endl << "state(" << r_id << ") = " << seq_util::rex::pp(re(), r) << std::endl << "info(" << r_id << ") = " << re().get_info(r) << std::endl;);
tout << std::endl << "state(" << r_id << ") = " << re().to_str(r) << std::endl << "info(" << r_id << ") = " << re().get_info(r) << std::endl;);
// Add state
m_state_graph.add_state(r_id);
STRACE("seq_regex", tout << "Updating state graph for regex "
@ -900,14 +889,14 @@ namespace smt {
expr_ref_vector derivatives(m);
STRACE("seq_regex_verbose", tout
<< "getting all derivs: " << r_id << " " << std::endl;);
get_all_derivatives(r, derivatives);
get_derivative_targets(r, derivatives);
for (auto const& dr: derivatives) {
unsigned dr_id = get_state_id(dr);
STRACE("seq_regex_verbose", tout
<< std::endl << " traversing deriv: " << dr_id << " ";);
STRACE("state_graph",
if (!m_state_graph.is_seen(dr_id))
tout << "state(" << dr_id << ") = " << seq_util::rex::pp(re(), dr) << std::endl << "info(" << dr_id << ") = " << re().get_info(dr) << std::endl;);
tout << "state(" << dr_id << ") = " << re().to_str(dr) << std::endl << "info(" << dr_id << ") = " << re().get_info(dr) << std::endl;);
// Add state
m_state_graph.add_state(dr_id);
bool maybecycle = can_be_in_cycle(r, dr);

View file

@ -158,12 +158,12 @@ namespace smt {
expr_ref symmetric_diff(expr* r1, expr* r2);
expr_ref is_nullable_wrapper(expr* r);
expr_ref derivative_wrapper(expr* hd, expr* r);
expr_ref mk_derivative_wrapper(expr* hd, expr* r);
// Various support for unfolding derivative expressions that are
// returned by derivative_wrapper
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
void get_all_derivatives(expr* r, expr_ref_vector& results);
void get_derivative_targets(expr* r, expr_ref_vector& targets);
void get_cofactors(expr* r, expr_ref_pair_vector& result);
void get_cofactors_rec(expr* r, expr_ref_vector& conds,
expr_ref_pair_vector& result);