mirror of
https://github.com/Z3Prover/z3
synced 2025-04-23 09:05:31 +00:00
Updated regex derivative engine (#5567)
* updated derivative engine * some edit * further improvements in derivative code * more deriv code edits and re::to_str update * optimized mk_deriv_accept * fixed PR comments * small syntax fix * updated some simplifications * bugfix:forgot to_re before reverse * fixed PR comments * more PR comment fixes * more PR comment fixes * forgot to delete * deleting unused definition * fixes Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com> * fixes Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com> Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
parent
c0c3e685e7
commit
146f4621c5
7 changed files with 893 additions and 280 deletions
|
@ -12,6 +12,7 @@ Abstract:
|
|||
Author:
|
||||
|
||||
Nikolaj Bjorner (nbjorner) 2020-5-22
|
||||
Margus Veanes 2021
|
||||
|
||||
--*/
|
||||
|
||||
|
@ -253,10 +254,8 @@ namespace smt {
|
|||
* (accept s (i + 1) (derivative s[i] r)
|
||||
*
|
||||
* Acceptance of a derivative is unfolded into a disjunction over
|
||||
* all derivatives. Effectively, this implements the following rule,
|
||||
* but all in one step:
|
||||
* (accept s i (ite c r1 r2)) =>
|
||||
* c & (accept s i r1) \/ ~c & (accept s i r2)
|
||||
* all derivatives. Effectively, this implements the following rule:
|
||||
* (accept s i (ite c r1 r2)) => (ite c (accept s i r1) (accept s i r2))
|
||||
*/
|
||||
void seq_regex::propagate_accept(literal lit) {
|
||||
SASSERT(!lit.sign());
|
||||
|
@ -302,6 +301,7 @@ namespace smt {
|
|||
unsigned min_len = re().min_length(r);
|
||||
unsigned min_len_plus_i = u().max_plus(min_len, idx);
|
||||
literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len_plus_i);
|
||||
// Acc(s,i,r) ==> |s| >= i + minlength(r)
|
||||
th.propagate_lit(nullptr, 1, &lit, len_s_ge_min);
|
||||
// Axiom equivalent to the above: th.add_axiom(~lit, len_s_ge_min);
|
||||
|
||||
|
@ -316,6 +316,8 @@ namespace smt {
|
|||
STRACE("seq_regex_brief", tout
|
||||
<< " (Warning: min_length returned 0 for"
|
||||
<< " non-nullable regex)";);
|
||||
// since nullable(r) = false:
|
||||
// Acc(s,i,r) ==> |s|>i
|
||||
th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i);
|
||||
}
|
||||
else if (!m.is_true(is_nullable)) {
|
||||
|
@ -327,7 +329,9 @@ namespace smt {
|
|||
<< " (Warning: is_nullable did not simplify)";);
|
||||
literal is_nullable_lit = th.mk_literal(is_nullable);
|
||||
ctx.mark_as_relevant(is_nullable_lit);
|
||||
// Acc(s,i,r) & |s|<=i ==> nullable(r)
|
||||
th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit);
|
||||
//TODO: what if is_nullable contains an in_re
|
||||
if (str().is_in_re(is_nullable))
|
||||
th.add_unhandled_expr(is_nullable);
|
||||
}
|
||||
|
@ -335,14 +339,19 @@ namespace smt {
|
|||
|
||||
// Rule 3: derivative unfolding
|
||||
literal_vector accept_next;
|
||||
expr_ref hd = th.mk_nth(s, i);
|
||||
expr_ref s_i = th.mk_nth(s, i);
|
||||
expr_ref deriv(m);
|
||||
deriv = derivative_wrapper(hd, r);
|
||||
deriv = mk_derivative_wrapper(s_i, r);
|
||||
STRACE("seq_regex", tout
|
||||
<< "mk_derivative_wrapper: " << re().to_str(deriv) << std::endl;);
|
||||
expr_ref accept_deriv(m);
|
||||
accept_deriv = mk_deriv_accept(s, idx + 1, deriv);
|
||||
accept_next.push_back(~lit);
|
||||
accept_next.push_back(len_s_le_i);
|
||||
accept_next.push_back(th.mk_literal(accept_deriv));
|
||||
// Acc(s, i, r) => (|s|<=i or Acc(s, i+1, D(s_i,r)))
|
||||
// where Acc(s, i+1, ite(c, t, f)) = ite(c, Acc(s, i+1, t), Acc(s, i+1, t))
|
||||
// and Acc(s, i+1, r U s) = Acc(s, i+1, r) or Acc(s, i+1, s)
|
||||
th.add_axiom(accept_next);
|
||||
}
|
||||
|
||||
|
@ -418,20 +427,21 @@ namespace smt {
|
|||
/*
|
||||
Wrapper around calls to is_nullable from the seq rewriter.
|
||||
|
||||
Note: the nullable wrapper and derivative wrapper actually use
|
||||
TODO: clean up the following:
|
||||
Note: the is_nullable_wrapper and mk_derivative_wrapper actually use
|
||||
different sequence rewriters; these are at:
|
||||
m_seq_rewrite
|
||||
(returned by seq_rw())
|
||||
th.m_rewrite.m_imp->m_cfg.m_seq_rw
|
||||
(private, can't be accessed directly)
|
||||
As a result operations are cached separately for the nullable
|
||||
and derivative calls. TBD if caching them using the same rewriter
|
||||
makes any difference.
|
||||
and derivative calls.
|
||||
*/
|
||||
expr_ref seq_regex::is_nullable_wrapper(expr* r) {
|
||||
STRACE("seq_regex", tout << "nullable: " << mk_pp(r, m) << std::endl;);
|
||||
|
||||
expr_ref result = seq_rw().is_nullable(r);
|
||||
//TODO: rewrite seems unnecessary here
|
||||
rewrite(result);
|
||||
|
||||
STRACE("seq_regex", tout << "nullable result: " << mk_pp(result, m) << std::endl;);
|
||||
|
@ -442,39 +452,28 @@ namespace smt {
|
|||
}
|
||||
|
||||
/*
|
||||
Wrapper around the regex symbolic derivative from the seq rewriter.
|
||||
Ensures that the derivative is written in a normalized BDD form
|
||||
with optimizations for if-then-else expressions involving the head.
|
||||
|
||||
Note: the nullable wrapper and derivative wrapper actually use
|
||||
different sequence rewriters; these are at:
|
||||
m_seq_rewrite
|
||||
(returned by seq_rw())
|
||||
th.m_rewrite.m_imp->m_cfg.m_seq_rw
|
||||
(private, can't be accessed directly)
|
||||
As a result operations are cached separately for the nullable
|
||||
and derivative calls. TBD if caching them using the same rewriter
|
||||
makes any difference.
|
||||
First creates a derivatrive of r wrt x=(:var 0) and then replaces x by ele.
|
||||
This will create a cached entry for the generic derivative of r that is independent of ele.
|
||||
*/
|
||||
expr_ref seq_regex::derivative_wrapper(expr* hd, expr* r) {
|
||||
STRACE("seq_regex", tout << "derivative(" << mk_pp(hd, m) << "): " << mk_pp(r, m) << std::endl;);
|
||||
expr_ref seq_regex::mk_derivative_wrapper(expr* ele, expr* r) {
|
||||
STRACE("seq_regex", tout << "derivative(" << mk_pp(ele, m) << "): " << mk_pp(r, m) << std::endl;);
|
||||
|
||||
// Use canonical variable for head
|
||||
expr_ref hd_canon(m.mk_var(0, hd->get_sort()), m);
|
||||
expr_ref result(re().mk_derivative(hd_canon, r), m);
|
||||
rewrite(result);
|
||||
// Uses canonical variable (:var 0) for the derivative element
|
||||
expr_ref der(seq_rw().mk_derivative(r), m);
|
||||
|
||||
// Substitute with real head
|
||||
// Substitute (:var 0) with the actual element
|
||||
var_subst subst(m);
|
||||
expr_ref_vector sub(m);
|
||||
sub.push_back(hd);
|
||||
result = subst(result, sub);
|
||||
sub.push_back(ele);
|
||||
der = subst(der, sub);
|
||||
|
||||
STRACE("seq_regex", tout << "derivative result: " << mk_pp(result, m) << std::endl;);
|
||||
STRACE("seq_regex", tout << "derivative result: " << mk_pp(der, m) << std::endl;);
|
||||
STRACE("seq_regex_brief", tout << "d(" << state_str(r) << ")="
|
||||
<< state_str(result) << " ";);
|
||||
<< state_str(der) << " ";);
|
||||
|
||||
return result;
|
||||
//TODO: simplify der further, if ele implies further simplifications
|
||||
//e.g. if ele='b' then de(ite (x='a') t f) simplifies to t
|
||||
return der;
|
||||
}
|
||||
|
||||
void seq_regex::propagate_eq(expr* r1, expr* r2) {
|
||||
|
@ -557,7 +556,7 @@ namespace smt {
|
|||
literal null_lit = th.mk_literal(is_nullable);
|
||||
expr_ref hd = mk_first(r, n);
|
||||
expr_ref d(m);
|
||||
d = derivative_wrapper(hd, r);
|
||||
d = mk_derivative_wrapper(hd, r);
|
||||
|
||||
literal_vector lits;
|
||||
lits.push_back(~lit);
|
||||
|
@ -584,11 +583,10 @@ namespace smt {
|
|||
}
|
||||
|
||||
/*
|
||||
Given a string s, index i, and a derivative regex d, return an
|
||||
Given a string s, index i, and a derivative r, return an
|
||||
expression that is equivalent to
|
||||
accept s i r
|
||||
but which pushes accept s i r into the leaves (next derivatives to
|
||||
explore).
|
||||
but which pushes accept s i r into the leaves
|
||||
|
||||
Input r is of type regex; output is of type bool.
|
||||
|
||||
|
@ -600,14 +598,18 @@ namespace smt {
|
|||
expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) {
|
||||
vector<expr*> to_visit;
|
||||
to_visit.push_back(r);
|
||||
obj_map<expr, expr*> re_to_bool;
|
||||
obj_map<expr, expr*> re_to_accept;
|
||||
expr_ref_vector _temp_bool_owner(m); // temp owner for bools we create
|
||||
|
||||
// DFS
|
||||
bool s_is_longer_than_i = str().min_length(s) > i;
|
||||
expr* i_int = a().mk_int(i);
|
||||
_temp_bool_owner.push_back(i_int);
|
||||
|
||||
// DFS, avoids duplicating derivative construction that has already been done
|
||||
while (to_visit.size() > 0) {
|
||||
expr* e = to_visit.back();
|
||||
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
|
||||
if (!re_to_bool.contains(e)) {
|
||||
if (!re_to_accept.contains(e)) {
|
||||
// First visit: add children
|
||||
STRACE("seq_regex_verbose", tout << "1";);
|
||||
if (m.is_ite(e, econd, e1, e2) ||
|
||||
|
@ -616,36 +618,40 @@ namespace smt {
|
|||
to_visit.push_back(e2);
|
||||
}
|
||||
// Mark first visit by adding nullptr to the map
|
||||
re_to_bool.insert(e, nullptr);
|
||||
re_to_accept.insert(e, nullptr);
|
||||
}
|
||||
else if (re_to_bool.find(e) == nullptr) {
|
||||
else if (re_to_accept.find(e) == nullptr) {
|
||||
// Second visit: set value
|
||||
STRACE("seq_regex_verbose", tout << "2";);
|
||||
to_visit.pop_back();
|
||||
if (m.is_ite(e, econd, e1, e2)) {
|
||||
expr* b1 = re_to_bool.find(e1);
|
||||
expr* b2 = re_to_bool.find(e2);
|
||||
expr* b = m.mk_ite(econd, b1, b2);
|
||||
expr* b1 = re_to_accept.find(e1);
|
||||
expr* b2 = re_to_accept.find(e2);
|
||||
expr* b = m.is_true(econd) || b1 == b2 ? b1 : m.is_false(econd) ? b2 : m.mk_ite(econd, b1, b2);
|
||||
_temp_bool_owner.push_back(b);
|
||||
re_to_bool.find(e) = b;
|
||||
re_to_accept.find(e) = b;
|
||||
}
|
||||
else if (re().is_empty(e) || (s_is_longer_than_i && re().is_epsilon(e)))
|
||||
{
|
||||
// s[i..] in [] <==> false, also: s[i..] in () <==> false when |s|>i
|
||||
re_to_accept.find(e) = m.mk_false();
|
||||
}
|
||||
else if (re().is_full_seq(e) || (s_is_longer_than_i && re().is_dot_plus(e)))
|
||||
{
|
||||
// s[i..] in .* <==> true, also: s[i..] in .+ <==> true when |s|>i
|
||||
re_to_accept.find(e) = m.mk_true();
|
||||
}
|
||||
/*
|
||||
else if (re().is_empty(e))
|
||||
{
|
||||
re_to_bool.find(e) = m.mk_false();
|
||||
}
|
||||
else if (re().is_epsilon(e))
|
||||
{
|
||||
expr* iplus1 = a().mk_int(i);
|
||||
expr* one = a().mk_int(1);
|
||||
_temp_bool_owner.push_back(iplus1);
|
||||
_temp_bool_owner.push_back(one);
|
||||
//the substring starting after position iplus1 must be empty
|
||||
expr* s_end = str().mk_substr(s, iplus1, one);
|
||||
//the substring starting after position i must be empty
|
||||
expr* s_end = str().mk_substr(s, i_int, one);
|
||||
expr* s_end_is_epsilon = m.mk_eq(s_end, str().mk_empty(m.get_sort(s)));
|
||||
|
||||
_temp_bool_owner.push_back(s_end_is_epsilon);
|
||||
re_to_bool.find(e) = s_end_is_epsilon;
|
||||
re_to_accept.find(e) = s_end_is_epsilon;
|
||||
|
||||
STRACE("seq_regex_verbose", tout
|
||||
<< "added empty sequence leaf: "
|
||||
|
@ -653,18 +659,16 @@ namespace smt {
|
|||
}
|
||||
*/
|
||||
else if (re().is_union(e, e1, e2)) {
|
||||
expr* b1 = re_to_bool.find(e1);
|
||||
expr* b2 = re_to_bool.find(e2);
|
||||
expr* b = m.mk_or(b1, b2);
|
||||
expr* b1 = re_to_accept.find(e1);
|
||||
expr* b2 = re_to_accept.find(e2);
|
||||
expr* b = m.is_false(b1) || b1 == b2 ? b2 : m.is_false(b2) ? b1 : m.mk_or(b1, b2);
|
||||
_temp_bool_owner.push_back(b);
|
||||
re_to_bool.find(e) = b;
|
||||
re_to_accept.find(e) = b;
|
||||
}
|
||||
else {
|
||||
expr* iplus1 = a().mk_int(i);
|
||||
_temp_bool_owner.push_back(iplus1);
|
||||
expr_ref acc_leaf = sk().mk_accept(s, iplus1, e);
|
||||
expr_ref acc_leaf = sk().mk_accept(s, i_int, e);
|
||||
_temp_bool_owner.push_back(acc_leaf);
|
||||
re_to_bool.find(e) = acc_leaf;
|
||||
re_to_accept.find(e) = acc_leaf;
|
||||
|
||||
STRACE("seq_regex_verbose", tout
|
||||
<< "mk_deriv_accept: added accept leaf: "
|
||||
|
@ -680,59 +684,44 @@ namespace smt {
|
|||
|
||||
// Finalize
|
||||
expr_ref result(m);
|
||||
result = re_to_bool.find(r); // Assigns ownership of all exprs in
|
||||
// re_to_bool for after this completes
|
||||
result = re_to_accept.find(r); // Assigns ownership of all exprs in
|
||||
// re_to_accept for after this completes
|
||||
rewrite(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
Return a list of all leaves in the derivative of a regex r,
|
||||
Return a list of all target regexes in the derivative of a regex r,
|
||||
ignoring the conditions along each path.
|
||||
|
||||
Warning: Although the derivative
|
||||
normal form tries to eliminate unsat condition paths, one cannot
|
||||
assume that the path to each leaf is satisfiable in general
|
||||
(e.g. when regexes are created using re.pred).
|
||||
So not all results may correspond to satisfiable predicates.
|
||||
It is OK to rely on the results being satisfiable for completeness,
|
||||
but not soundness.
|
||||
The derivative construction uses (:var 0) and tries
|
||||
to eliminate unsat condition paths but it does not perform
|
||||
full satisfiability checks and it is not guaranteed
|
||||
that all targets are actually reachable
|
||||
*/
|
||||
void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) {
|
||||
// Get derivative
|
||||
sort* seq_sort = nullptr;
|
||||
VERIFY(u().is_re(r, seq_sort));
|
||||
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
|
||||
expr_ref hd = mk_first(r, n);
|
||||
expr_ref d(m);
|
||||
d = derivative_wrapper(hd, r);
|
||||
void seq_regex::get_derivative_targets(expr* r, expr_ref_vector& targets) {
|
||||
// constructs the derivative wrt (:var 0)
|
||||
expr_ref d(seq_rw().mk_derivative(r), m);
|
||||
|
||||
// DFS
|
||||
vector<expr*> to_visit;
|
||||
to_visit.push_back(d);
|
||||
obj_map<expr, bool> visited; // set<expr> (bool is used as a unit type)
|
||||
while (to_visit.size() > 0) {
|
||||
expr* e = to_visit.back();
|
||||
to_visit.pop_back();
|
||||
if (visited.contains(e)) continue;
|
||||
visited.insert(e, true);
|
||||
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
|
||||
if (m.is_ite(e, econd, e1, e2) ||
|
||||
re().is_union(e, e1, e2)) {
|
||||
to_visit.push_back(e1);
|
||||
to_visit.push_back(e2);
|
||||
}
|
||||
else if (!re().is_empty(e)) {
|
||||
results.push_back(e);
|
||||
STRACE("seq_regex_verbose", tout
|
||||
<< "get_all_derivatives: added deriv: "
|
||||
<< mk_pp(e, m) << std::endl;);
|
||||
// use DFS to collect all the targets (leaf regexes) in d.
|
||||
expr* _1 = nullptr, * e1 = nullptr, * e2 = nullptr;
|
||||
obj_hashtable<expr>::entry* _2 = nullptr;
|
||||
vector<expr*> workset;
|
||||
workset.push_back(d);
|
||||
obj_hashtable<expr> done;
|
||||
done.insert(d);
|
||||
while (workset.size() > 0) {
|
||||
expr* e = workset.back();
|
||||
workset.pop_back();
|
||||
if (m.is_ite(e, _1, e1, e2) || re().is_union(e, e1, e2)) {
|
||||
if (done.insert_if_not_there_core(e1, _2))
|
||||
workset.push_back(e1);
|
||||
if (done.insert_if_not_there_core(e2, _2))
|
||||
workset.push_back(e2);
|
||||
}
|
||||
else if (!re().is_empty(e))
|
||||
targets.push_back(e);
|
||||
}
|
||||
|
||||
STRACE("seq_regex", tout << "Number of derivatives: "
|
||||
<< results.size() << std::endl;);
|
||||
STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -800,7 +789,7 @@ namespace smt {
|
|||
th.add_axiom(~lit, ~th.mk_literal(is_nullable));
|
||||
expr_ref hd = mk_first(r, n);
|
||||
expr_ref d(m);
|
||||
d = derivative_wrapper(hd, r);
|
||||
d = mk_derivative_wrapper(hd, r);
|
||||
literal_vector lits;
|
||||
expr_ref_pair_vector cofactors(m);
|
||||
get_cofactors(d, cofactors);
|
||||
|
@ -884,7 +873,7 @@ namespace smt {
|
|||
|
||||
STRACE("state_graph",
|
||||
if (!m_state_graph.is_seen(r_id))
|
||||
tout << std::endl << "state(" << r_id << ") = " << seq_util::rex::pp(re(), r) << std::endl << "info(" << r_id << ") = " << re().get_info(r) << std::endl;);
|
||||
tout << std::endl << "state(" << r_id << ") = " << re().to_str(r) << std::endl << "info(" << r_id << ") = " << re().get_info(r) << std::endl;);
|
||||
// Add state
|
||||
m_state_graph.add_state(r_id);
|
||||
STRACE("seq_regex", tout << "Updating state graph for regex "
|
||||
|
@ -900,14 +889,14 @@ namespace smt {
|
|||
expr_ref_vector derivatives(m);
|
||||
STRACE("seq_regex_verbose", tout
|
||||
<< "getting all derivs: " << r_id << " " << std::endl;);
|
||||
get_all_derivatives(r, derivatives);
|
||||
get_derivative_targets(r, derivatives);
|
||||
for (auto const& dr: derivatives) {
|
||||
unsigned dr_id = get_state_id(dr);
|
||||
STRACE("seq_regex_verbose", tout
|
||||
<< std::endl << " traversing deriv: " << dr_id << " ";);
|
||||
STRACE("state_graph",
|
||||
if (!m_state_graph.is_seen(dr_id))
|
||||
tout << "state(" << dr_id << ") = " << seq_util::rex::pp(re(), dr) << std::endl << "info(" << dr_id << ") = " << re().get_info(dr) << std::endl;);
|
||||
tout << "state(" << dr_id << ") = " << re().to_str(dr) << std::endl << "info(" << dr_id << ") = " << re().get_info(dr) << std::endl;);
|
||||
// Add state
|
||||
m_state_graph.add_state(dr_id);
|
||||
bool maybecycle = can_be_in_cycle(r, dr);
|
||||
|
|
|
@ -158,12 +158,12 @@ namespace smt {
|
|||
expr_ref symmetric_diff(expr* r1, expr* r2);
|
||||
|
||||
expr_ref is_nullable_wrapper(expr* r);
|
||||
expr_ref derivative_wrapper(expr* hd, expr* r);
|
||||
expr_ref mk_derivative_wrapper(expr* hd, expr* r);
|
||||
|
||||
// Various support for unfolding derivative expressions that are
|
||||
// returned by derivative_wrapper
|
||||
expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r);
|
||||
void get_all_derivatives(expr* r, expr_ref_vector& results);
|
||||
void get_derivative_targets(expr* r, expr_ref_vector& targets);
|
||||
void get_cofactors(expr* r, expr_ref_pair_vector& result);
|
||||
void get_cofactors_rec(expr* r, expr_ref_vector& conds,
|
||||
expr_ref_pair_vector& result);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue