3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2025-04-10 19:27:06 +00:00
z3/src/smt/seq_regex.cpp
Caleb Stanford 2c02264a94
Regex solver updates (#4636)
* std::cout debugging statements

* comment out std::cout debugging as this is now a shared fork

* convert std::cout to TRACE statements for seq_rewriter and seq_regex

* add cases to min_length and max_length for regexes

* bug fix

* update min_length and max_length functions for REs

* initial pass on simplifying derivative normal forms by eliminating redundant predicates locally

* add seq_regex_brief trace statements

* working on debugging ref count issue

* fix ref count bug and convert trace statements to seq_regex_brief

* add compact tracing for cache hits/misses

* seq_regex fix cache hit/miss tracing and wrapper around is_nullable

* minor

* label and disable more experimental changes for testing

* minor documentation / tracing

* a few more @EXP annotations

* dead state elimination skeleton code

* progress on dead state elimination

* more progress on dead state elimination

* refactor dead state class to separate self-contained state_graph class

* finish factoring state_graph to only work with unsigned values, and implement separate functionality for expr* logic

* implement get_all_derivatives, add debug tracing

* trace statements for debugging is_nullable loop bug

* fix is_nullable loop bug

* comment out local nullable change and mark experimental

* pretty printing for state_graph

* rewrite state graph to remove the fragile assumption that all edges from a state are added at a time

* start of general cycle detection check + fix some comments

* implement full cycle detection procedure

* normalize derivative conditions to form 'ele <= a'

* order derivative conditions by character code

* fix confusing names m_to and m_from

* assign increasing state IDs from 1 instead of using get_id on AST node

* remove elim_condition call in get_dall_derivatives

* use u_map instead of uint_map to avoid memory leak

* remove unnecessary call to is_ground

* debugging

* small improvements to seq_regex_brief tracing

* fix bug on evil2 example

* save work

* new propagate code

* work in progress on using same seq sort for deriv calls

* avoid re-computing derivatives: use same head var for every derivative call

* use min_length on regexes to prune search

* simple implementation of can_be_in_cycle using rank function idea

* add a disabled experimental change

* minor cleanup comments, etc.

* seq_rewriter cleanup for PR

* typo noticed by Nikolaj

* move state graph to util/state_graph

* re-add accidentally removed line

* clean up seq_regex code removing obsolete functions and comments

* a few more cleanup items

* oops, missed merge change to fix compilation

* disabled change to lift unions to the top level and treat them separately in seq_regex solver

* added get_overapprox_regex to over-approximate regex membership constraints

* replace calls to is_epsilon with a centrally available method in seq_decl_plugin

* simplifications and modifications in get_overapprox_regex and related

* added approximation support for sequence expressions that use ite

* removed is_app check that was redundant

* tweak differences with upstream

* rewrite derivative leaves

* enable Antimirov-style derivatives via lifting unions in the solver

* TODO placeholders for outputting state graph

* change order in seq_regex propagate_in_re

* implement a more restricted form of Antimirov derivatives via a special op code to indicate lifting unions

* minor

* new Antimirov optimizations based on BDD compatibility checking

* seq regex tracing for # of derivatives

* fix get_cofactors (currently this fix is buggy)

* partially revert get_cofactors buggy change

* re-implement get_cofactors to more efficiently explore nodes in the derivative expression

* dgml generation for state graph

* fix release build

* improved dgml output

* bug fixes in dgml generation

* dot output support for state_graph and moved dgml and dot output under CASSERT

* updated tracing of what regex corresponds to what state id with /tr:state_graph

* clean up & document Antimirov derivative support

* remove op cache tracing

* remove re_rank experimental idea

* small fix

* fix Antimirov derivative (important change for the good performance)

* remove unused and unnecessary code

* implemented simpler efficient get_cofactors alternative mk_deriv_accept

* simplifications in propagate_accept, and trace unusual cases

* document the various seq_regex tracing & debugging command-line options

* fix debug build (broken tracing)

* guard eager Antimirov lifting for possible disabling

* fix bug in propagate_accept Rule 1

* disable eager version of Antimirov lifting for performance reasons

* remove some remaining obsolete comments

Co-authored-by: calebstanford-msr <t-casta@microsoft.com>
Co-authored-by: Margus Veanes <margus@microsoft.com>
2020-08-13 12:47:36 -07:00

839 lines
31 KiB
C++

/*++
Copyright (c) 2020 Microsoft Corporation
Module Name:
seq_regex.cpp
Abstract:
Solver for regexes
Author:
Nikolaj Bjorner (nbjorner) 2020-5-22
--*/
#include "smt/seq_regex.h"
#include "smt/theory_seq.h"
#include "ast/expr_abstract.h"
namespace smt {
// Build the regex solver on top of its owning sequence theory. The context
// and the AST manager are obtained from the theory; the state-to-expression
// table is created over that manager.
seq_regex::seq_regex(theory_seq& th)
    : th(th)
    , ctx(th.get_context())
    , m(th.get_manager())
    , m_state_to_expr(m)
{
}
// Sequence utility object of the owning theory (th.m_util).
seq_util& seq_regex::u() {
    return th.m_util;
}
// Regex part of the theory's sequence utilities (th.m_util.re).
class seq_util::re& seq_regex::re() {
    return th.m_util.re;
}
// String part of the theory's sequence utilities (th.m_util.str).
class seq_util::str& seq_regex::str() {
    return th.m_util.str;
}
// Sequence rewriter held directly by the theory (th.m_seq_rewrite).
seq_rewriter& seq_regex::seq_rw() {
    return th.m_seq_rewrite;
}
// Skolem-term factory of the owning theory (th.m_sk).
seq_skolem& seq_regex::sk() {
    return th.m_sk;
}
// Arithmetic utility object of the owning theory (th.m_autil).
arith_util& seq_regex::a() {
    return th.m_autil;
}
// Apply the theory's rewriter (th.m_rewrite) to e; e is passed by reference.
void seq_regex::rewrite(expr_ref& e) {
    th.m_rewrite(e);
}
/**
 * Recognize a membership (str.in_re s R) whose regex is a "contains"
 * pattern and turn it into a string equality:
 *
 *    s in (all ++ x ++ all ++ y ++ all)
 *    =>
 *    s = fresh1 ++ x ++ fresh2 ++ y ++ fresh3
 *
 * Returns true after propagating the equality under lit when the regex
 * matched the pattern, and false (without propagating) otherwise.
 *
 * TBD General rewrite possible:
 *
 *    s in (R ++ Q)
 *    =>
 *    s = x ++ y and x in R and y in Q
 */
bool seq_regex::is_string_equality(literal lit) {
    expr* str_arg = nullptr;
    expr* re_arg = nullptr;
    expr* atom = ctx.bool_var2expr(lit.var());
    expr_ref id(a().mk_int(atom->get_id()), m);
    VERIFY(str().is_in_re(atom, str_arg, re_arg));
    sort* seq_sort = m.get_sort(str_arg);

    vector<expr_ref_vector> patterns;
    if (!seq_rw().is_re_contains_pattern(re_arg, patterns))
        return false;

    // Skolem term for the gap number idx; it is parameterized by the id of
    // the membership atom.
    auto mk_cont = [&](unsigned idx) {
        return sk().mk("seq.cont", id, a().mk_int(idx), seq_sort);
    };

    // Interleave the gap terms with the patterns: gap0 p0 gap1 p1 ... gapN.
    unsigned idx = 0;
    expr_ref_vector pieces(m);
    pieces.push_back(mk_cont(idx));
    for (auto const& p : patterns) {
        pieces.append(p);
        pieces.push_back(mk_cont(++idx));
    }
    expr_ref rhs = th.mk_concat(pieces, seq_sort);
    th.propagate_eq(lit, str_arg, rhs, true);
    return true;
}
/**
 * Propagate the atom (str.in_re s r)
 *
 * Propagation implements the following inference rules
 *
 * (not (str.in_re s r)) => (str.in_re s (complement r))
 * (str.in_re s r) => r != {}
 *
 * (str.in_re s r) => (accept s 0 r)
 */
void seq_regex::propagate_in_re(literal lit) {
    expr* str_arg = nullptr;
    expr* regex = nullptr;
    expr* atom = ctx.bool_var2expr(lit.var());
    VERIFY(str().is_in_re(atom, str_arg, regex));

    TRACE("seq_regex", tout << "propagate in RE: " << lit.sign() << " " << mk_pp(atom, m) << std::endl;);
    STRACE("seq_regex_brief", tout << "PIR(" << mk_pp(str_arg, m) << ","
                                   << state_str(regex) << ") ";);

    // Negative membership literals are converted to positive ones:
    //    ~(s in R)  =>  s in C(R)
    if (lit.sign()) {
        expr_ref compl_membership(re().mk_in_re(str_arg, re().mk_complement(regex)), m);
        rewrite(compl_membership);
        literal compl_lit = th.mk_literal(compl_membership);
        if (lit == compl_lit) {
            // is-nullable doesn't simplify for regexes with uninterpreted subterms
            th.add_unhandled_expr(compl_membership);
        }
        th.propagate_lit(nullptr, 1, &lit, compl_lit);
        return;
    }

    if (coallesce_in_re(lit)) {
        TRACE("seq_regex", tout
              << "simplified conjunctions to an intersection" << std::endl;);
        STRACE("seq_regex_brief", tout << "coallesce_in_re ";);
        return;
    }

    if (is_string_equality(lit)) {
        TRACE("seq_regex", tout
              << "simplified regex using string equality" << std::endl;);
        STRACE("seq_regex_brief", tout << "string_eq ";);
        return;
    }

    // A sequence that is not a value contributes an additional regex, and
    // the original constraint is strengthened into an intersection.
    // For example:
    //     (x ++ "a" ++ y) in b*
    // is converted to
    //     (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*)
    expr_ref regex_owner(m); // holds a reference to the intersection, if one is built
    if (!m.is_value(str_arg)) {
        expr_ref approx = get_overapprox_regex(str_arg);
        if (!re().is_full_seq(approx)) {
            regex = re().mk_inter(regex, approx);
            regex_owner = regex;
            TRACE("seq_regex", tout
                  << "get_overapprox_regex(" << mk_pp(str_arg, m)
                  << ") = " << mk_pp(approx, m) << std::endl;);
            STRACE("seq_regex_brief", tout
                   << "overapprox=" << state_str(regex) << " ";);
        }
    }

    // (str.in_re s r) => (accept s 0 r)
    expr_ref zero(a().mk_int(0), m);
    expr_ref acc = sk().mk_accept(str_arg, zero, regex);
    literal acc_lit = th.mk_literal(acc);

    TRACE("seq", tout << "propagate " << acc << "\n";);

    th.propagate_lit(nullptr, 1, &lit, acc_lit);
}
/**
 * Compute a regex s_approx that over-approximates the string expression s:
 * for any valuation v(s) of s, v(s) in L(s_approx).
 *
 *  - a value s is mapped to (to_re s);
 *  - a concatenation is mapped to the concatenation of the approximations
 *    of its pieces, where two adjacent dotstar pieces are not concatenated
 *    twice;
 *  - an if-then-else is mapped to the union of the approximations of its
 *    two branches, or to dotstar as soon as one branch is a full sequence;
 *  - everything else is mapped to dotstar (the trivial approximation).
 */
expr_ref seq_regex::get_overapprox_regex(expr* s) {
    expr_ref as_regex(re().mk_to_re(s), m);
    expr_ref dotstar(re().mk_full_seq(m.get_sort(as_regex)), m);

    if (m.is_value(s))
        return as_regex;

    if (str().is_concat(s)) {
        expr_ref_vector pieces(m);
        str().get_concat(s, pieces);
        expr_ref acc(m), piece_re(m), prev(m);
        for (expr* piece : pieces) {
            piece_re = get_overapprox_regex(piece);
            if (!acc)
                acc = piece_re;
            else if (prev != dotstar || piece_re != dotstar)
                // skipped only when both the previous and this piece are dotstar
                acc = re().mk_concat(acc, piece_re);
            prev = piece_re;
        }
        if (!acc)
            acc = re().mk_epsilon(m.get_sort(s));
        return acc;
    }

    expr* cond = nullptr, *then_br = nullptr, *else_br = nullptr;
    if (m.is_ite(s, cond, then_br, else_br)) {
        // if either branch approximates to .* then the result is also .*
        expr_ref then_re = get_overapprox_regex(then_br);
        if (re().is_full_seq(then_re))
            return then_re;
        expr_ref else_re = get_overapprox_regex(else_br);
        if (re().is_full_seq(else_re))
            return else_re;
        return expr_ref(re().mk_union(then_re, else_re), m);
    }

    // TBD: other app expressions that can be approximated
    return dotstar;
}
/**
 * Propagate the atom (accept s i r)
 *
 * Propagation triggers updating the state graph for dead state detection:
 *   (accept s i r) => update_state_graph(r)
 *   (accept s i r) & dead(r) => false
 *
 * Propagation is also blocked under certain conditions to throttle
 * state space exploration past a certain point: see block_unfolding
 *
 * Otherwise, propagation implements the following inference rules:
 *
 * Rule 1. (accept s i r) => len(s) >= i + min_len(r)
 * Rule 2. (accept s i r) & len(s) <= i => nullable(r)
 *         (only necessary if min_len fails and returns 0 for non-nullable r)
 * Rule 3. (accept s i r) and len(s) > i =>
 *         (accept s (i + 1) (derivative s[i] r)
 *
 * Acceptance of a derivative is unfolded into a disjunction over
 * all derivatives. Effectively, this implements the following rule,
 * but all in one step:
 *   (accept s i (ite c r1 r2)) =>
 *       c & (accept s i r1) \/ ~c & (accept s i r2)
 */
void seq_regex::propagate_accept(literal lit) {
    SASSERT(!lit.sign());

    expr* str_arg = nullptr;
    expr* pos = nullptr;      // the index i as an expression
    expr* regex = nullptr;
    expr* atom = ctx.bool_var2expr(lit.var());
    unsigned idx = 0;         // the index i as a number
    VERIFY(sk().is_accept(atom, str_arg, pos, idx, regex));

    TRACE("seq_regex", tout << "propagate accept: "
                            << mk_pp(atom, m) << std::endl;);
    STRACE("seq_regex_brief", tout << std::endl
                                   << "PA(" << mk_pp(str_arg, m) << "@" << idx
                                   << "," << state_str(regex) << ") ";);

    // The empty regex accepts nothing.
    if (re().is_empty(regex)) {
        STRACE("seq_regex_brief", tout << "(empty) ";);
        th.add_axiom(~lit);
        return;
    }

    // Dead state detection.
    update_state_graph(regex);
    if (m_state_graph.is_dead(get_state_id(regex))) {
        STRACE("seq_regex_brief", tout << "(dead) ";);
        th.add_axiom(~lit);
        return;
    }

    // Throttle the unfolding.
    if (block_unfolding(lit, idx)) {
        STRACE("seq_regex_brief", tout << "(blocked) ";);
        return;
    }

    STRACE("seq_regex_brief", tout << "(unfold) ";);

    // Rule 1: use min_length to prune search
    unsigned min_len = re().min_length(regex);
    unsigned lower_bound = u().max_plus(min_len, idx);
    literal len_ge_lower = th.m_ax.mk_ge(th.mk_len(str_arg), lower_bound);
    th.propagate_lit(nullptr, 1, &lit, len_ge_lower);
    // Axiom equivalent to the above: th.add_axiom(~lit, len_ge_lower);

    // Rule 2: nullable check
    literal len_le_idx = th.m_ax.mk_le(th.mk_len(str_arg), idx);
    if (min_len == 0) {
        expr_ref nullable = is_nullable_wrapper(regex);
        if (m.is_false(nullable)) {
            STRACE("seq_regex", tout
                   << "Warning: min_length returned 0 for non-nullable regex"
                   << std::endl;);
            STRACE("seq_regex_brief", tout
                   << " (Warning: min_length returned 0 for"
                   << " non-nullable regex)";);
            th.propagate_lit(nullptr, 1, &lit, ~len_le_idx);
        }
        else if (!m.is_true(nullable)) {
            // is_nullable did not simplify
            STRACE("seq_regex", tout
                   << "Warning: is_nullable did not simplify to true or false"
                   << std::endl;);
            STRACE("seq_regex_brief", tout
                   << " (Warning: is_nullable did not simplify)";);
            literal nullable_lit = th.mk_literal(nullable);
            ctx.mark_as_relevant(nullable_lit);
            th.add_axiom(~lit, ~len_le_idx, nullable_lit);
        }
    }

    // Rule 3: derivative unfolding
    literal_vector clause;
    expr_ref head = th.mk_nth(str_arg, pos);
    expr_ref deriv = derivative_wrapper(head, regex);
    expr_ref accept_deriv = mk_deriv_accept(str_arg, idx + 1, deriv);
    clause.push_back(~lit);
    clause.push_back(len_le_idx);
    clause.push_back(th.mk_literal(accept_deriv));
    th.add_axiom(clause);
}
/**
 * Put a limit to the unfolding of s.
 *
 * Unfolding at index i is blocked when all of the following hold:
 *   - i exceeds the theory's maximal unfolding depth,
 *   - the theory has a max-unfolding literal and it is assigned true,
 *   - the context is not at base level.
 * In that case lit is used to propagate the negation of the max-unfolding
 * literal and true is returned; otherwise nothing is propagated and the
 * result is false.
 */
bool seq_regex::block_unfolding(literal lit, unsigned i) {
    if (i <= th.m_max_unfolding_depth)
        return false;
    if (th.m_max_unfolding_lit == null_literal)
        return false;
    if (ctx.get_assignment(th.m_max_unfolding_lit) != l_true)
        return false;
    if (ctx.at_base_level())
        return false;
    th.propagate_lit(nullptr, 1, &lit, ~th.m_max_unfolding_lit);
    return true;
}
/**
* Combine a conjunction of membership relations for the same string
* within the same Regex.
*
* NOTE: this optimization is currently disabled: the first statement of
* the body returns false, so everything after it is unreachable and the
* caller always continues with its other propagation rules.
*
* The (unreachable) remainder scans the recorded memberships m_s_in_re for
* active entries whose string has the same enode root as s and whose regex
* differs from the current one, deactivates them, and replaces them by one
* membership in the intersection, justified by the collected literals.
*/
bool seq_regex::coallesce_in_re(literal lit) {
return false; // disabled
// ---- unreachable from here on ----
expr* s = nullptr, *r = nullptr;
expr* e = ctx.bool_var2expr(lit.var());
VERIFY(str().is_in_re(e, s, r));
// regex accumulates the intersection of r with all matching entries
expr_ref regex(r, m);
literal_vector lits;
for (unsigned i = 0; i < m_s_in_re.size(); ++i) {
auto const& entry = m_s_in_re[i];
if (!entry.m_active)
continue;
enode* n1 = th.ensure_enode(entry.m_s);
enode* n2 = th.ensure_enode(s);
// only entries whose string is in the same equivalence class as s
if (n1->get_root() != n2->get_root())
continue;
// nothing to intersect when the regex is already the same
if (entry.m_re == regex)
continue;
// record the entry on the trail before deactivating it
th.m_trail_stack.push(vector_value_trail<theory_seq, s_in_re, true>(m_s_in_re, i));
m_s_in_re[i].m_active = false;
IF_VERBOSE(11, verbose_stream() << "Intersect " << regex << " " <<
mk_pp(entry.m_re, m) << " " << mk_pp(s, m) << " " << mk_pp(entry.m_s, m) << std::endl;);
regex = re().mk_inter(entry.m_re, regex);
rewrite(regex);
lits.push_back(~entry.m_lit);
// the two strings are distinct terms: the equality is part of the justification
if (n1 != n2)
lits.push_back(~th.mk_eq(n1->get_owner(), n2->get_owner(), false));
}
m_s_in_re.push_back(s_in_re(lit, s, regex));
// NOTE(review): the trail is reached via th.get_trail_stack() here but via
// th.m_trail_stack above -- presumably the same object; confirm.
th.get_trail_stack().push(push_back_vector<theory_seq, vector<s_in_re>>(m_s_in_re));
// no entry was merged: report that nothing was simplified
if (lits.empty())
return false;
// clause: (entry literals and equalities and lit) => s in intersection
lits.push_back(~lit);
lits.push_back(th.mk_literal(re().mk_in_re(s, regex)));
th.add_axiom(lits);
return true;
}
// Build the rewritten symmetric difference of two regexes,
//    (r1 \ r2) union (r2 \ r1).
// When one of the operands is the empty regex, the other operand is used
// directly instead of constructing the difference terms.
expr_ref seq_regex::symmetric_diff(expr* r1, expr* r2) {
    // Normalize so that an empty operand, if there is one, ends up in r2.
    if (re().is_empty(r1))
        std::swap(r1, r2);

    expr_ref diff(m);
    if (!re().is_empty(r2))
        diff = re().mk_union(re().mk_diff(r1, r2), re().mk_diff(r2, r1));
    else
        diff = r1;
    rewrite(diff);
    return diff;
}
/*
    Wrapper around calls to is_nullable from the seq rewriter: the
    nullability condition of r is computed with seq_rw() and then
    simplified with rewrite().

    Note: the nullable wrapper and derivative wrapper actually use
    different sequence rewriters; these are at:
        m_seq_rewrite
            (returned by seq_rw())
        th.m_rewrite.m_imp->m_cfg.m_seq_rw
            (private, can't be accessed directly)
    As a result operations are cached separately for the nullable
    and derivative calls. TBD if caching them using the same rewriter
    makes any difference.
*/
expr_ref seq_regex::is_nullable_wrapper(expr* r) {
    STRACE("seq_regex", tout << "nullable: " << mk_pp(r, m) << std::endl;);

    expr_ref nullable = seq_rw().is_nullable(r);
    rewrite(nullable);

    STRACE("seq_regex", tout << "nullable result: " << mk_pp(nullable, m) << std::endl;);
    STRACE("seq_regex_brief", tout << "n(" << state_str(r) << ")="
                                   << mk_pp(nullable, m) << " ";);

    return nullable;
}
/*
    Wrapper around the regex symbolic derivative from the seq rewriter.
    Ensures that the derivative is written in a normalized BDD form
    with optimizations for if-then-else expressions involving the head.

    The derivative is first computed and rewritten with respect to the
    variable with index 0 as head, and only afterwards the actual head hd
    is substituted for that variable.

    Note: the nullable wrapper and derivative wrapper actually use
    different sequence rewriters; these are at:
        m_seq_rewrite
            (returned by seq_rw())
        th.m_rewrite.m_imp->m_cfg.m_seq_rw
            (private, can't be accessed directly)
    As a result operations are cached separately for the nullable
    and derivative calls. TBD if caching them using the same rewriter
    makes any difference.
*/
expr_ref seq_regex::derivative_wrapper(expr* hd, expr* r) {
    STRACE("seq_regex", tout << "derivative(" << mk_pp(hd, m) << "): " << mk_pp(r, m) << std::endl;);

    // Use canonical variable for head
    expr_ref canon_head(m.mk_var(0, m.get_sort(hd)), m);
    expr_ref deriv(re().mk_derivative(canon_head, r), m);
    rewrite(deriv);

    // Substitute with real head
    var_subst substitute(m);
    expr_ref_vector actuals(m);
    actuals.push_back(hd);
    deriv = substitute(deriv, actuals);

    STRACE("seq_regex", tout << "derivative result: " << mk_pp(deriv, m) << std::endl;);
    STRACE("seq_regex_brief", tout << "d(" << state_str(r) << ")="
                                   << state_str(deriv) << " ";);

    return deriv;
}
// Propagate an equality between two regexes:
//    r1 = r2  =>  is_empty(symmetric_diff(r1, r2))
// The emptiness atom is a Skolem term that also carries a fresh constant
// named "re.char" of the sequence sort of r1.
void seq_regex::propagate_eq(expr* r1, expr* r2) {
    TRACE("seq_regex", tout << "propagate EQ: " << mk_pp(r1, m) << ", " << mk_pp(r2, m) << std::endl;);
    STRACE("seq_regex_brief", tout << "PEQ ";);

    sort* seq_sort = nullptr;
    VERIFY(u().is_re(r1, seq_sort));

    expr_ref diff = symmetric_diff(r1, r2);
    expr_ref emp(re().mk_empty(m.get_sort(diff)), m);
    expr_ref fresh(m.mk_fresh_const("re.char", seq_sort), m);
    expr_ref diff_is_empty = sk().mk_is_empty(diff, diff, fresh);

    th.add_axiom(~th.mk_eq(r1, r2, false), th.mk_literal(diff_is_empty));
}
// Propagate a disequality between two regexes:
//    r1 != r2  =>  is_non_empty(symmetric_diff(r1, r2))
// The non-emptiness atom is a Skolem term that also carries a fresh
// constant named "re.char" of the sequence sort of r1.
void seq_regex::propagate_ne(expr* r1, expr* r2) {
    TRACE("seq_regex", tout << "propagate NEQ: " << mk_pp(r1, m) << ", " << mk_pp(r2, m) << std::endl;);
    STRACE("seq_regex_brief", tout << "PNEQ ";);

    sort* seq_sort = nullptr;
    VERIFY(u().is_re(r1, seq_sort));

    expr_ref diff = symmetric_diff(r1, r2);
    expr_ref emp(re().mk_empty(m.get_sort(diff)), m);
    expr_ref fresh(m.mk_fresh_const("re.char", seq_sort), m);
    expr_ref diff_is_non_empty = sk().mk_is_non_empty(diff, diff, fresh);

    th.add_axiom(th.mk_eq(r1, r2, false), th.mk_literal(diff_is_non_empty));
}
// Check whether r occurs in the union u, where u is walked as a
// left-nested union (((u0 union u1) union u2) ...): r is compared with
// every right operand along the way and finally with the innermost term.
// Comparison is by pointer identity.
bool seq_regex::is_member(expr* r, expr* u) {
    expr* right = nullptr;
    for (;;) {
        // Peel one union: u becomes the left operand, right the right one.
        if (!re().is_union(u, u, right))
            return r == u;
        if (r == right)
            return true;
    }
}
/**
 * is_non_empty(r, u) => nullable or \/_i (c_i and is_non_empty(r_i, u union r_i))
 *
 * for each (c_i, r_i) in cofactors (min-terms) of the derivative of r
 *
 * is_non_empty(r_i, u union r_i) := false if r_i in u
 *
 * Nothing is added when r simplifies to nullable. Cofactors whose
 * condition rewrites to false are dropped, and a condition that rewrites
 * to true is omitted from its disjunct.
 */
void seq_regex::propagate_is_non_empty(literal lit) {
    expr* atom = ctx.bool_var2expr(lit.var());
    expr* r = nullptr, *seen = nullptr, *n = nullptr;
    VERIFY(sk().is_is_non_empty(atom, r, seen, n));

    TRACE("seq_regex", tout << "propagate nonempty: " << mk_pp(atom, m) << std::endl;);
    STRACE("seq_regex_brief", tout
           << std::endl << "PNE(" << expr_id_str(atom) << "," << state_str(r)
           << "," << expr_id_str(seen) << "," << expr_id_str(n) << ") ";);

    expr_ref is_nullable = is_nullable_wrapper(r);
    if (m.is_true(is_nullable))
        return;

    literal null_lit = th.mk_literal(is_nullable);
    expr_ref hd = mk_first(r, n);
    expr_ref d = derivative_wrapper(hd, r);

    // Clause under construction: ~lit \/ nullable \/ disjuncts...
    literal_vector clause;
    clause.push_back(~lit);
    if (null_lit != false_literal)
        clause.push_back(null_lit);

    expr_ref_pair_vector cofactors(m);
    get_cofactors(d, cofactors);
    for (auto const& p : cofactors) {
        // Derivatives already recorded in the union are not explored again.
        if (is_member(p.second, seen))
            continue;
        expr_ref cond(p.first, m);
        seq_rw().elim_condition(hd, cond);
        rewrite(cond);
        if (m.is_false(cond))
            continue;
        expr_ref disjunct = sk().mk_is_non_empty(p.second, re().mk_union(seen, p.second), n);
        if (!m.is_true(cond))
            disjunct = m.mk_and(cond, disjunct);
        clause.push_back(th.mk_literal(disjunct));
    }

    th.add_axiom(clause);
}
/*
    Given a string s, index i, and a derivative regex r, return an
    expression that is equivalent to
        accept s i r
    but which pushes accept s i r into the leaves (next derivatives to
    explore).

    Input r is of type regex; output is of type bool.

    Example:
        mk_deriv_accept(s, i, (ite a r1 r2) u (ite b r3 r4))
        = (or (ite a (accept s i r1) (accept s i r2))
              (ite b (accept s i r3) (accept s i r4)))

    The translation is an iterative DFS over the if-then-else / union
    structure of r with a map from regex nodes to their bool translation,
    so a node shared in the expression DAG is translated only once.
    The result is passed through rewrite() before it is returned.
*/
expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) {
    vector<expr*> stack;
    stack.push_back(r);
    // nullptr value == "first visit done, translation still pending"
    obj_map<expr, expr*> re_to_bool;
    expr_ref_vector owner(m); // temp owner for the expressions we create

    while (!stack.empty()) {
        expr* node = stack.back();
        expr* cond = nullptr, *lhs = nullptr, *rhs = nullptr;

        if (!re_to_bool.contains(node)) {
            // First visit: schedule the children, keep node on the stack
            STRACE("seq_regex_verbose", tout << "1";);
            if (m.is_ite(node, cond, lhs, rhs) ||
                re().is_union(node, lhs, rhs)) {
                stack.push_back(lhs);
                stack.push_back(rhs);
            }
            // Mark first visit by adding nullptr to the map
            re_to_bool.insert(node, nullptr);
            continue;
        }

        if (re_to_bool.find(node) != nullptr) {
            // Remaining visits: already translated, skip
            STRACE("seq_regex_verbose", tout << "3";);
            stack.pop_back();
            continue;
        }

        // Second visit: the children are translated, set the value
        STRACE("seq_regex_verbose", tout << "2";);
        stack.pop_back();
        expr* translated = nullptr;
        if (m.is_ite(node, cond, lhs, rhs)) {
            expr* b1 = re_to_bool.find(lhs);
            expr* b2 = re_to_bool.find(rhs);
            translated = m.mk_ite(cond, b1, b2);
            owner.push_back(translated);
        }
        else if (re().is_union(node, lhs, rhs)) {
            expr* b1 = re_to_bool.find(lhs);
            expr* b2 = re_to_bool.find(rhs);
            translated = m.mk_or(b1, b2);
            owner.push_back(translated);
        }
        else {
            // Leaf: an accept atom at index i for this regex
            expr* index = a().mk_int(i);
            owner.push_back(index);
            expr_ref acc_leaf = sk().mk_accept(s, index, node);
            owner.push_back(acc_leaf);
            translated = acc_leaf;
            STRACE("seq_regex_verbose", tout
                   << "mk_deriv_accept: added accept leaf: "
                   << mk_pp(acc_leaf, m) << std::endl;);
        }
        re_to_bool.find(node) = translated;
    }

    // Finalize: result takes a reference to the root translation before
    // the temporary owner goes out of scope.
    expr_ref result(m);
    result = re_to_bool.find(r);
    rewrite(result);
    return result;
}
/*
    Append to results all non-empty leaves in the derivative of a regex r,
    ignoring the conditions along each path. The derivative is taken with
    respect to a "re.first" term built from a fresh constant.

    Warning: Although the derivative
    normal form tries to eliminate unsat condition paths, one cannot
    assume that the path to each leaf is satisfiable in general
    (e.g. when regexes are created using re.pred).
    So not all results may correspond to satisfiable predicates.
    It is OK to rely on the results being satisfiable for completeness,
    but not soundness.
*/
void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) {
    // Get derivative
    sort* seq_sort = nullptr;
    VERIFY(u().is_re(r, seq_sort));
    expr_ref fresh(m.mk_fresh_const("re.char", seq_sort), m);
    expr_ref head = mk_first(r, fresh);
    expr_ref deriv = derivative_wrapper(head, r);

    // DFS over the if-then-else / union structure of the derivative
    vector<expr*> pending;
    pending.push_back(deriv);
    obj_map<expr, bool> seen; // set<expr> (bool is used as a unit type)
    while (!pending.empty()) {
        expr* node = pending.back();
        pending.pop_back();
        if (seen.contains(node))
            continue;
        seen.insert(node, true);

        expr* cond = nullptr, *lhs = nullptr, *rhs = nullptr;
        if (m.is_ite(node, cond, lhs, rhs) ||
            re().is_union(node, lhs, rhs)) {
            pending.push_back(lhs);
            pending.push_back(rhs);
            continue;
        }
        // Leaf: empty regexes are not reported
        if (re().is_empty(node))
            continue;
        results.push_back(node);
        STRACE("seq_regex_verbose", tout
               << "get_all_derivatives: added deriv: "
               << mk_pp(node, m) << std::endl;);
    }

    STRACE("seq_regex", tout << "Number of derivatives: "
                             << results.size() << std::endl;);
    STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";);
}
/*
    Collect into result all (cond, leaf) pairs of a given derivative
    expression r, starting the recursion with an empty list of path
    conditions.

    Note: this recursive implementation is inefficient, since if nodes
    are repeated often in the expression DAG, they may be visited
    many times. For this reason, prefer mk_deriv_accept and
    get_all_derivatives when possible.

    This method is still used by:
        propagate_is_empty
        propagate_is_non_empty
*/
void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) {
    expr_ref_vector path_conds(m);
    get_cofactors_rec(r, path_conds, result);

    STRACE("seq_regex", tout << "Number of derivatives: "
                             << result.size() << std::endl;);
    STRACE("seq_regex_brief", tout << "#derivs=" << result.size() << " ";);
}
// Recursive worker of get_cofactors.
//   r      - the (sub)expression of the derivative being explored
//   conds  - conditions collected on the path from the root to r; restored
//            to its incoming contents before returning
//   result - receives a pair (conjunction of conds, leaf) for every leaf
//            that is not the empty regex and whose conjunction is not false
void seq_regex::get_cofactors_rec(expr* r, expr_ref_vector& conds,
                                  expr_ref_pair_vector& result) {
    expr* guard = nullptr, *lhs = nullptr, *rhs = nullptr;

    if (m.is_ite(r, guard, lhs, rhs)) {
        // then-branch under guard, else-branch under its negation
        conds.push_back(guard);
        get_cofactors_rec(lhs, conds, result);
        conds.pop_back();
        conds.push_back(mk_not(m, guard));
        get_cofactors_rec(rhs, conds, result);
        conds.pop_back();
        return;
    }

    if (re().is_union(r, lhs, rhs)) {
        // both operands are explored under the same path conditions
        get_cofactors_rec(lhs, conds, result);
        get_cofactors_rec(rhs, conds, result);
        return;
    }

    // Leaf
    expr_ref conj = mk_and(conds);
    if (m.is_false(conj) || re().is_empty(r))
        return;
    result.push_back(conj, r);
}
/*
    is_empty(r, u) => ~is_nullable(r)
    is_empty(r, u) => (forall x . ~cond(x)) or is_empty(r1, u union r1)
                      for (cond, r1) in min-terms(D(x,r))

    A min-term whose regex r1 is already a member of u is skipped, as is
    one whose condition rewrites to false; the quantified disjunct is
    omitted when the condition rewrites to true.
*/
void seq_regex::propagate_is_empty(literal lit) {
    expr* atom = ctx.bool_var2expr(lit.var());
    expr* r = nullptr, *seen = nullptr, *n = nullptr;
    VERIFY(sk().is_is_empty(atom, r, seen, n));
    expr_ref is_nullable = is_nullable_wrapper(r);

    TRACE("seq_regex", tout << "propagate empty: " << mk_pp(atom, m) << std::endl;);
    STRACE("seq_regex_brief", tout
           << std::endl << "PE(" << expr_id_str(atom) << "," << state_str(r)
           << "," << expr_id_str(seen) << "," << expr_id_str(n) << ") ";);

    // A nullable regex is not empty.
    if (m.is_true(is_nullable)) {
        th.add_axiom(~lit);
        return;
    }
    th.add_axiom(~lit, ~th.mk_literal(is_nullable));

    expr_ref hd = mk_first(r, n);
    expr_ref d = derivative_wrapper(hd, r);
    literal_vector clause;
    expr_ref_pair_vector cofactors(m);
    get_cofactors(d, cofactors);

    // One axiom per remaining cofactor.
    for (auto const& p : cofactors) {
        if (is_member(p.second, seen))
            continue;
        expr_ref cond(p.first, m);
        seq_rw().elim_condition(hd, cond);
        rewrite(cond);
        if (m.is_false(cond))
            continue;

        clause.reset();
        clause.push_back(~lit);
        if (!m.is_true(cond)) {
            expr_ref ncond(mk_not(m, cond), m);
            clause.push_back(th.mk_literal(mk_forall(m, hd, ncond)));
        }
        expr_ref is_empty1 = sk().mk_is_empty(p.second, re().mk_union(seen, p.second), n);
        clause.push_back(th.mk_literal(is_empty1));
        th.add_axiom(clause);
    }
}
// Build the Skolem term "re.first"(n, id(r)) whose sort is the element
// sort of the sequences matched by the regex r. The id of r is passed as
// an integer numeral.
expr_ref seq_regex::mk_first(expr* r, expr* n) {
    sort* seq_sort = nullptr;
    sort* elem_sort = nullptr;
    VERIFY(u().is_re(r, seq_sort));
    VERIFY(u().is_seq(seq_sort, elem_sort));
    expr* regex_id = a().mk_int(r->get_id());
    return sk().mk("re.first", n, regex_id, elem_sort);
}
/**
* Dead state elimination using the state_graph class
*/
// Return the state id of e, registering e first if it has none yet.
// Ids are assigned in increasing order starting from 1: a newly registered
// expression is appended to m_state_to_expr and receives the new size of
// that vector as its id.
unsigned seq_regex::get_state_id(expr* e) {
    if (m_expr_to_state.contains(e))
        return m_expr_to_state.find(e);

    m_state_to_expr.push_back(e);
    unsigned new_id = m_state_to_expr.size();
    m_expr_to_state.insert(e, new_id);

    STRACE("seq_regex_brief", tout << "new(" << expr_id_str(e)
                                   << ")=" << state_str(e) << " ";);
    STRACE("seq_regex", tout
           << "New state ID: " << new_id
           << " = " << mk_pp(e, m) << std::endl;);

    return new_id;
}
// Inverse of get_state_id: return the expression registered under state
// id `id`. Valid ids are 1 .. m_state_to_expr.size().
expr* seq_regex::get_expr_from_id(unsigned id) {
    SASSERT(id >= 1);
    SASSERT(id <= m_state_to_expr.size());
    // get_state_id hands out ids starting from 1 (the vector size right
    // after the push_back), so the expression with id k is stored at index
    // k - 1. Indexing with `id` itself would return the wrong expression
    // and read past the end for the largest id.
    return m_state_to_expr.get(id - 1);
}
// Tell whether the edge r1 -> r2 may be part of a cycle in the state graph.
// Currently always answers true; neither argument is inspected.
bool seq_regex::can_be_in_cycle(expr *r1, expr *r2) {
    // TBD: This can be used to optimize the state graph:
    //   return false here if it is known that r1 -> r2 can never be
    //   in a cycle. There are various easy syntactic checks on r1 and r2
    //   that can be used to infer this (e.g. star height, or length if
    //   both are star-free).
    // This check need not be sound, but if it is not, some dead states
    // will be missed.
    return true;
}
/*
    Update the state graph with expression r and all its derivatives.

    Returns false, leaving the graph untouched, when the state of r is
    already done or when the graph has reached m_max_state_graph_size.
    Otherwise the state of r is added and true is returned: a state whose
    regex simplifies to nullable is marked live; any other state gets an
    edge to every derivative returned by get_all_derivatives and is then
    marked done.
*/
bool seq_regex::update_state_graph(expr* r) {
    unsigned r_id = get_state_id(r);
    if (m_state_graph.is_done(r_id))
        return false;
    if (m_state_graph.get_size() >= m_max_state_graph_size) {
        STRACE("seq_regex", tout << "Warning: ignored state graph update -- max size of seen states reached!" << std::endl;);
        STRACE("seq_regex_brief", tout << "(MAX SIZE REACHED) ";);
        return false;
    }

    // Add state
    m_state_graph.add_state(r_id);
    STRACE("state_graph", tout << "regex(" << r_id << ") = " << mk_pp(r, m) << std::endl;);
    STRACE("seq_regex", tout << "Updating state graph for regex "
                             << mk_pp(r, m) << ") " << std::endl;);
    STRACE("seq_regex_brief", tout << std::endl << "USG("
                                   << state_str(r) << ") ";);

    expr_ref r_nullable = is_nullable_wrapper(r);
    if (!m.is_true(r_nullable)) {
        // Add edges to all derivatives
        expr_ref_vector derivatives(m);
        STRACE("seq_regex_verbose", tout
               << "getting all derivs: " << r_id << " " << std::endl;);
        get_all_derivatives(r, derivatives);
        for (auto const& dr : derivatives) {
            unsigned dr_id = get_state_id(dr);
            STRACE("seq_regex_verbose", tout
                   << "  traversing deriv: " << dr_id << " " << std::endl;);
            m_state_graph.add_state(dr_id);
            STRACE("state_graph", tout << "regex(" << dr_id << ") = " << mk_pp(dr, m) << std::endl;);
            bool maybecycle = can_be_in_cycle(r, dr);
            m_state_graph.add_edge(r_id, dr_id, maybecycle);
        }
        m_state_graph.mark_done(r_id);
    }
    else {
        m_state_graph.mark_live(r_id);
    }

    STRACE("seq_regex", m_state_graph.display(tout););
    STRACE("seq_regex_brief", tout << std::endl;);
    STRACE("seq_regex_brief", m_state_graph.display(tout););
    return true;
}
// Short printable name of e for tracing: the decimal state id when e is
// already registered as a state, otherwise the "id<N>" form produced by
// expr_id_str. An unregistered e is not registered by this call.
std::string seq_regex::state_str(expr* e) {
    return m_expr_to_state.contains(e)
        ? std::to_string(get_state_id(e))
        : expr_id_str(e);
}
// Printable name of e for tracing: "id" followed by e's AST id in decimal.
std::string seq_regex::expr_id_str(expr* e) {
    std::string name("id");
    name += std::to_string(e->get_id());
    return name;
}
}