mirror of
https://github.com/Z3Prover/z3
synced 2025-04-06 17:44:08 +00:00
933 lines
35 KiB
C++
933 lines
35 KiB
C++
/*++
|
|
Copyright (c) 2020 Microsoft Corporation
|
|
|
|
Module Name:
|
|
|
|
seq_regex.cpp
|
|
|
|
Abstract:
|
|
|
|
Solver for regexes
|
|
|
|
Author:
|
|
|
|
Nikolaj Bjorner (nbjorner) 2020-5-22
|
|
|
|
--*/
|
|
|
|
#include "smt/seq_regex.h"
|
|
#include "smt/theory_seq.h"
|
|
#include "ast/expr_abstract.h"
|
|
|
|
namespace smt {
|
|
|
|
seq_regex::seq_regex(theory_seq& th):
|
|
th(th),
|
|
ctx(th.get_context()),
|
|
m(th.get_manager()),
|
|
m_state_to_expr(m),
|
|
m_state_graph(state_graph::state_pp(this, pp_state)) { }
|
|
|
|
seq_util& seq_regex::u() { return th.m_util; }
|
|
class seq_util::rex& seq_regex::re() { return th.m_util.re; }
|
|
class seq_util::str& seq_regex::str() { return th.m_util.str; }
|
|
seq_rewriter& seq_regex::seq_rw() { return th.m_seq_rewrite; }
|
|
seq::skolem& seq_regex::sk() { return th.m_sk; }
|
|
arith_util& seq_regex::a() { return th.m_autil; }
|
|
void seq_regex::rewrite(expr_ref& e) { th.m_rewrite(e); }
|
|
|
|
/**
|
|
* is_string_equality holds of str.in_re s R,
|
|
*
|
|
* s in (all ++ x ++ all ++ y ++ all)
|
|
* =>
|
|
* s = fresh1 ++ x ++ fresh2 ++ y ++ fresh3
|
|
*
|
|
* TBD General rewrite possible:
|
|
*
|
|
* s in (R ++ Q)
|
|
* =>
|
|
* s = x ++ y and x in R and y in Q
|
|
*/
|
|
|
|
bool seq_regex::is_string_equality(literal lit) {
|
|
expr* s = nullptr, *r = nullptr;
|
|
expr* e = ctx.bool_var2expr(lit.var());
|
|
expr_ref id(a().mk_int(e->get_id()), m);
|
|
VERIFY(str().is_in_re(e, s, r));
|
|
sort* seq_sort = s->get_sort();
|
|
vector<expr_ref_vector> patterns;
|
|
auto mk_cont = [&](unsigned idx) {
|
|
return sk().mk("seq.cont", id, a().mk_int(idx), seq_sort);
|
|
};
|
|
unsigned idx = 0;
|
|
if (seq_rw().is_re_contains_pattern(r, patterns)) {
|
|
expr_ref_vector ts(m);
|
|
ts.push_back(mk_cont(idx));
|
|
for (auto const& p : patterns) {
|
|
ts.append(p);
|
|
ts.push_back(mk_cont(++idx));
|
|
}
|
|
expr_ref t = th.mk_concat(ts, seq_sort);
|
|
th.propagate_eq(lit, s, t, true);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Propagate the atom (str.in_re s r)
|
|
*
|
|
* Propagation implements the following inference rules
|
|
*
|
|
* (not (str.in_re s r)) => (str.in_re s (complement r))
|
|
* (str.in_re s r) => r != {}
|
|
*
|
|
* (str.in_re s r) => (accept s 0 r)
|
|
*/
|
|
|
|
void seq_regex::propagate_in_re(literal lit) {
|
|
expr* s = nullptr, *r = nullptr;
|
|
expr* e = ctx.bool_var2expr(lit.var());
|
|
VERIFY(str().is_in_re(e, s, r));
|
|
|
|
TRACE("seq_regex", tout << "propagate in RE: " << lit.sign() << " " << mk_pp(e, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "PIR(" << mk_pp(s, m) << ","
|
|
<< state_str(r) << ") ";);
|
|
|
|
// convert negative negative membership literals to positive
|
|
// ~(s in R) => s in C(R)
|
|
if (lit.sign()) {
|
|
expr_ref fml(re().mk_in_re(s, re().mk_complement(r)), m);
|
|
rewrite(fml);
|
|
literal nlit = th.mk_literal(fml);
|
|
if (lit == nlit) {
|
|
// is-nullable doesn't simplify for regexes with uninterpreted subterms
|
|
th.add_unhandled_expr(fml);
|
|
}
|
|
th.propagate_lit(nullptr, 1, &lit, nlit);
|
|
return;
|
|
}
|
|
|
|
if (coallesce_in_re(lit)) {
|
|
TRACE("seq_regex", tout
|
|
<< "simplified conjunctions to an intersection" << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "coallesce_in_re ";);
|
|
return;
|
|
}
|
|
|
|
if (is_string_equality(lit)) {
|
|
TRACE("seq_regex", tout
|
|
<< "simplified regex using string equality" << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "string_eq ";);
|
|
return;
|
|
}
|
|
|
|
// Convert a non-ground sequence into an additional regex and
|
|
// strengthen the original regex constraint into an intersection
|
|
// for example:
|
|
// (x ++ "a" ++ y) in b*
|
|
// is coverted to
|
|
// (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*)
|
|
expr_ref _r_temp_owner(m);
|
|
if (!m.is_value(s)) {
|
|
expr_ref s_approx = get_overapprox_regex(s);
|
|
if (!re().is_full_seq(s_approx)) {
|
|
r = re().mk_inter(r, s_approx);
|
|
_r_temp_owner = r;
|
|
TRACE("seq_regex", tout
|
|
<< "get_overapprox_regex(" << mk_pp(s, m)
|
|
<< ") = " << mk_pp(s_approx, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout
|
|
<< "overapprox=" << state_str(r) << " ";);
|
|
}
|
|
}
|
|
|
|
/*
|
|
//if r is uninterpreted then taking a derivative may diverge try to obtain the
|
|
//value from equations providing r a definition
|
|
if (is_uninterp(r)) {
|
|
if (m_const_to_expr.contains(r)) {
|
|
proof* _not_used = nullptr;
|
|
m_const_to_expr.get(r, r, _not_used);
|
|
if (is_uninterp(r)) {
|
|
if (m_const_to_expr.contains(r)) {
|
|
m_const_to_expr.get(r, r, _not_used);
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
//add the literal back
|
|
expr_ref r_alias(m.mk_fresh_const(symbol(r->get_id()), r->get_sort(), false), m);
|
|
expr_ref s_in_r_alias(re().mk_in_re(s, r_alias), m);
|
|
literal s_in_r_alias_lit = th.mk_literal(s_in_r_alias);
|
|
m_const_to_expr.insert(r_alias, r, nullptr);
|
|
th.add_axiom(s_in_r_alias_lit);
|
|
return;
|
|
}
|
|
}
|
|
*/
|
|
|
|
/*
|
|
if (is_uninterp(r)) {
|
|
th.add_unhandled_expr(e);
|
|
return;
|
|
}
|
|
*/
|
|
|
|
expr_ref zero(a().mk_int(0), m);
|
|
expr_ref acc(sk().mk_accept(s, zero, r), m);
|
|
literal acc_lit = th.mk_literal(acc);
|
|
|
|
TRACE("seq", tout << "propagate " << acc << "\n";);
|
|
|
|
//th.propagate_lit(nullptr, 1, &lit, acc_lit);
|
|
th.add_axiom(~lit, acc_lit);
|
|
}
|
|
|
|
/**
|
|
* Gets an overapproximating regex s_approx for the input string expression s.
|
|
* such that for any valuation v(s) of s, v(s) in L(s_approx).
|
|
* If the overapproximation is trivial then dotstar is returned.
|
|
*/
|
|
expr_ref seq_regex::get_overapprox_regex(expr* s) {
|
|
expr_ref s_to_re(re().mk_to_re(s), m);
|
|
expr_ref dotstar(re().mk_full_seq(s_to_re->get_sort()), m);
|
|
if (m.is_value(s))
|
|
return s_to_re;
|
|
|
|
if (str().is_concat(s)) {
|
|
expr_ref_vector es(m);
|
|
str().get_concat(s, es);
|
|
expr_ref s_approx(m), e_approx(m), last(m);
|
|
for (expr* e : es) {
|
|
e_approx = get_overapprox_regex(e);
|
|
if (!s_approx)
|
|
s_approx = e_approx;
|
|
else if (last != dotstar || e_approx != dotstar)
|
|
s_approx = re().mk_concat(s_approx, e_approx);
|
|
last = e_approx;
|
|
}
|
|
if (!s_approx)
|
|
s_approx = re().mk_epsilon(s->get_sort());
|
|
|
|
return s_approx;
|
|
}
|
|
|
|
expr* c = nullptr, *r1 = nullptr, *r2 = nullptr;
|
|
if (m.is_ite(s, c, r1, r2)) {
|
|
// if either branch approximates to .* then the result is also .*
|
|
|
|
expr_ref s_approx1 = get_overapprox_regex(r1);
|
|
if (re().is_full_seq(s_approx1))
|
|
return s_approx1;
|
|
|
|
expr_ref s_approx2 = get_overapprox_regex(r2);
|
|
if (re().is_full_seq(s_approx2))
|
|
return s_approx2;
|
|
|
|
return expr_ref(re().mk_union(s_approx1, s_approx2), m);
|
|
}
|
|
|
|
// TBD: other app expressions that can be approximated
|
|
return dotstar;
|
|
|
|
}
|
|
|
|
/**
|
|
* Propagate the atom (accept s i r)
|
|
*
|
|
* Propagation triggers updating the state graph for dead state detection:
|
|
* (accept s i r) => update_state_graph(r)
|
|
* (accept s i r) & dead(r) => false
|
|
*
|
|
* Propagation is also blocked under certain conditions to throttle
|
|
* state space exploration past a certain point: see block_unfolding
|
|
*
|
|
* Otherwise, propagation implements the following inference rules:
|
|
*
|
|
* Rule 1. (accept s i r) => len(s) >= i + min_len(r)
|
|
* Rule 2. (accept s i r) & len(s) <= i => nullable(r)
|
|
* (only necessary if min_len fails and returns 0 for non-nullable r)
|
|
* Rule 3. (accept s i r) and len(s) > i =>
|
|
* (accept s (i + 1) (derivative s[i] r)
|
|
*
|
|
* Acceptance of a derivative is unfolded into a disjunction over
|
|
* all derivatives. Effectively, this implements the following rule,
|
|
* but all in one step:
|
|
* (accept s i (ite c r1 r2)) =>
|
|
* c & (accept s i r1) \/ ~c & (accept s i r2)
|
|
*/
|
|
void seq_regex::propagate_accept(literal lit) {
|
|
SASSERT(!lit.sign());
|
|
|
|
expr* s = nullptr, *i = nullptr, *r = nullptr;
|
|
expr* e = ctx.bool_var2expr(lit.var());
|
|
unsigned idx = 0;
|
|
VERIFY(sk().is_accept(e, s, i, idx, r));
|
|
|
|
TRACE("seq_regex", tout << "propagate accept: "
|
|
<< mk_pp(e, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << std::endl
|
|
<< "PA(" << mk_pp(s, m) << "@" << idx
|
|
<< "," << state_str(r) << ") ";);
|
|
|
|
if (re().is_empty(r)) {
|
|
STRACE("seq_regex_brief", tout << "(empty) ";);
|
|
th.add_axiom(~lit);
|
|
return;
|
|
}
|
|
|
|
auto info = re().get_info(r);
|
|
if (info.interpreted) {
|
|
update_state_graph(r);
|
|
|
|
if (m_state_graph.is_dead(get_state_id(r))) {
|
|
STRACE("seq_regex_brief", tout << "(dead) ";);
|
|
th.add_axiom(~lit);
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (block_unfolding(lit, idx)) {
|
|
STRACE("seq_regex_brief", tout << "(blocked) ";);
|
|
return;
|
|
}
|
|
|
|
STRACE("seq_regex_brief", tout << "(unfold) ";);
|
|
|
|
// Rule 1: use min_length to prune search
|
|
unsigned min_len = re().min_length(r);
|
|
unsigned min_len_plus_i = u().max_plus(min_len, idx);
|
|
literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len_plus_i);
|
|
th.propagate_lit(nullptr, 1, &lit, len_s_ge_min);
|
|
// Axiom equivalent to the above: th.add_axiom(~lit, len_s_ge_min);
|
|
|
|
// Rule 2: nullable check
|
|
literal len_s_le_i = th.m_ax.mk_le(th.mk_len(s), idx);
|
|
if (min_len == 0) {
|
|
expr_ref is_nullable = is_nullable_wrapper(r);
|
|
if (m.is_false(is_nullable)) {
|
|
STRACE("seq_regex", tout
|
|
<< "Warning: min_length returned 0 for non-nullable regex"
|
|
<< std::endl;);
|
|
STRACE("seq_regex_brief", tout
|
|
<< " (Warning: min_length returned 0 for"
|
|
<< " non-nullable regex)";);
|
|
th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i);
|
|
}
|
|
else if (!m.is_true(is_nullable)) {
|
|
// is_nullable did not simplify
|
|
STRACE("seq_regex", tout
|
|
<< "Warning: is_nullable did not simplify to true or false"
|
|
<< std::endl;);
|
|
STRACE("seq_regex_brief", tout
|
|
<< " (Warning: is_nullable did not simplify)";);
|
|
literal is_nullable_lit = th.mk_literal(is_nullable);
|
|
ctx.mark_as_relevant(is_nullable_lit);
|
|
th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit);
|
|
if (str().is_in_re(is_nullable))
|
|
th.add_unhandled_expr(is_nullable);
|
|
}
|
|
}
|
|
|
|
// Rule 3: derivative unfolding
|
|
literal_vector accept_next;
|
|
expr_ref hd = th.mk_nth(s, i);
|
|
expr_ref deriv(m);
|
|
deriv = derivative_wrapper(hd, r);
|
|
expr_ref accept_deriv(m);
|
|
accept_deriv = mk_deriv_accept(s, idx + 1, deriv);
|
|
accept_next.push_back(~lit);
|
|
accept_next.push_back(len_s_le_i);
|
|
accept_next.push_back(th.mk_literal(accept_deriv));
|
|
th.add_axiom(accept_next);
|
|
}
|
|
|
|
/**
|
|
* Put a limit to the unfolding of s.
|
|
*/
|
|
bool seq_regex::block_unfolding(literal lit, unsigned i) {
|
|
return
|
|
i > th.m_max_unfolding_depth &&
|
|
th.m_max_unfolding_lit != null_literal &&
|
|
ctx.get_assignment(th.m_max_unfolding_lit) == l_true &&
|
|
!ctx.at_base_level() &&
|
|
(th.propagate_lit(nullptr, 1, &lit, ~th.m_max_unfolding_lit),
|
|
true);
|
|
}
|
|
|
|
/**
|
|
* Combine a conjunction of membership relations for the same string
|
|
* within the same Regex.
|
|
*/
|
|
bool seq_regex::coallesce_in_re(literal lit) {
|
|
return false; // disabled
|
|
expr* s = nullptr, *r = nullptr;
|
|
expr* e = ctx.bool_var2expr(lit.var());
|
|
VERIFY(str().is_in_re(e, s, r));
|
|
expr_ref regex(r, m);
|
|
literal_vector lits;
|
|
for (unsigned i = 0; i < m_s_in_re.size(); ++i) {
|
|
auto const& entry = m_s_in_re[i];
|
|
if (!entry.m_active)
|
|
continue;
|
|
enode* n1 = th.ensure_enode(entry.m_s);
|
|
enode* n2 = th.ensure_enode(s);
|
|
if (n1->get_root() != n2->get_root())
|
|
continue;
|
|
if (entry.m_re == regex)
|
|
continue;
|
|
|
|
th.m_trail_stack.push(vector_value_trail<s_in_re, true>(m_s_in_re, i));
|
|
m_s_in_re[i].m_active = false;
|
|
IF_VERBOSE(11, verbose_stream() << "Intersect " << regex << " " <<
|
|
mk_pp(entry.m_re, m) << " " << mk_pp(s, m) << " " << mk_pp(entry.m_s, m) << std::endl;);
|
|
regex = re().mk_inter(entry.m_re, regex);
|
|
rewrite(regex);
|
|
lits.push_back(~entry.m_lit);
|
|
if (n1 != n2)
|
|
lits.push_back(~th.mk_eq(n1->get_owner(), n2->get_owner(), false));
|
|
}
|
|
m_s_in_re.push_back(s_in_re(lit, s, regex));
|
|
th.get_trail_stack().push(push_back_vector<vector<s_in_re>>(m_s_in_re));
|
|
if (lits.empty())
|
|
return false;
|
|
lits.push_back(~lit);
|
|
lits.push_back(th.mk_literal(re().mk_in_re(s, regex)));
|
|
th.add_axiom(lits);
|
|
return true;
|
|
}
|
|
|
|
expr_ref seq_regex::symmetric_diff(expr* r1, expr* r2) {
|
|
expr_ref r(m);
|
|
if (r1 == r2)
|
|
r = re().mk_empty(r1->get_sort());
|
|
else if (re().is_empty(r1))
|
|
r = r2;
|
|
else if (re().is_empty(r2))
|
|
r = r1;
|
|
else
|
|
r = re().mk_union(re().mk_diff(r1, r2), re().mk_diff(r2, r1));
|
|
rewrite(r);
|
|
return r;
|
|
}
|
|
|
|
/*
|
|
Wrapper around calls to is_nullable from the seq rewriter.
|
|
|
|
Note: the nullable wrapper and derivative wrapper actually use
|
|
different sequence rewriters; these are at:
|
|
m_seq_rewrite
|
|
(returned by seq_rw())
|
|
th.m_rewrite.m_imp->m_cfg.m_seq_rw
|
|
(private, can't be accessed directly)
|
|
As a result operations are cached separately for the nullable
|
|
and derivative calls. TBD if caching them using the same rewriter
|
|
makes any difference.
|
|
*/
|
|
expr_ref seq_regex::is_nullable_wrapper(expr* r) {
|
|
STRACE("seq_regex", tout << "nullable: " << mk_pp(r, m) << std::endl;);
|
|
|
|
expr_ref result = seq_rw().is_nullable(r);
|
|
rewrite(result);
|
|
|
|
STRACE("seq_regex", tout << "nullable result: " << mk_pp(result, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "n(" << state_str(r) << ")="
|
|
<< mk_pp(result, m) << " ";);
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
Wrapper around the regex symbolic derivative from the seq rewriter.
|
|
Ensures that the derivative is written in a normalized BDD form
|
|
with optimizations for if-then-else expressions involving the head.
|
|
|
|
Note: the nullable wrapper and derivative wrapper actually use
|
|
different sequence rewriters; these are at:
|
|
m_seq_rewrite
|
|
(returned by seq_rw())
|
|
th.m_rewrite.m_imp->m_cfg.m_seq_rw
|
|
(private, can't be accessed directly)
|
|
As a result operations are cached separately for the nullable
|
|
and derivative calls. TBD if caching them using the same rewriter
|
|
makes any difference.
|
|
*/
|
|
expr_ref seq_regex::derivative_wrapper(expr* hd, expr* r) {
|
|
STRACE("seq_regex", tout << "derivative(" << mk_pp(hd, m) << "): " << mk_pp(r, m) << std::endl;);
|
|
|
|
// Use canonical variable for head
|
|
expr_ref hd_canon(m.mk_var(0, hd->get_sort()), m);
|
|
expr_ref result(re().mk_derivative(hd_canon, r), m);
|
|
rewrite(result);
|
|
|
|
// Substitute with real head
|
|
var_subst subst(m);
|
|
expr_ref_vector sub(m);
|
|
sub.push_back(hd);
|
|
result = subst(result, sub);
|
|
|
|
STRACE("seq_regex", tout << "derivative result: " << mk_pp(result, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "d(" << state_str(r) << ")="
|
|
<< state_str(result) << " ";);
|
|
|
|
return result;
|
|
}
|
|
|
|
void seq_regex::propagate_eq(expr* r1, expr* r2) {
|
|
TRACE("seq_regex", tout << "propagate EQ: " << mk_pp(r1, m) << ", " << mk_pp(r2, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "PEQ ";);
|
|
|
|
/*
|
|
if (is_uninterp(r1) || is_uninterp(r2)) {
|
|
th.add_axiom(th.mk_eq(r1, r2, false));
|
|
if (is_uninterp(r1))
|
|
m_const_to_expr.insert(r1, r2, nullptr);
|
|
else
|
|
m_const_to_expr.insert(r2, r1, nullptr);
|
|
|
|
}
|
|
*/
|
|
|
|
sort* seq_sort = nullptr;
|
|
VERIFY(u().is_re(r1, seq_sort));
|
|
expr_ref r = symmetric_diff(r1, r2);
|
|
if (re().is_empty(r))
|
|
//trivially true
|
|
return;
|
|
expr_ref emp(re().mk_empty(r->get_sort()), m);
|
|
expr_ref f(m.mk_fresh_const("re.char", seq_sort), m);
|
|
expr_ref is_empty = sk().mk_is_empty(r, r, f);
|
|
// is_empty : (re,re,seq) -> Bool is a Skolem function
|
|
// f is a fresh internal Skolem constant of sort seq
|
|
// the literal is satisfiable when emptiness check succeeds
|
|
// meaning that r is not nullable and
|
|
// that all derivatives of r (if any) are also empty
|
|
// TBD: rewrite to use state_graph
|
|
th.add_axiom(~th.mk_eq(r1, r2, false), th.mk_literal(is_empty));
|
|
}
|
|
|
|
void seq_regex::propagate_ne(expr* r1, expr* r2) {
|
|
TRACE("seq_regex", tout << "propagate NEQ: " << mk_pp(r1, m) << ", " << mk_pp(r2, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "PNEQ ";);
|
|
// TBD: rewrite to use state_graph
|
|
// why is is_non_empty even needed, why not just not(in_empty)
|
|
sort* seq_sort = nullptr;
|
|
VERIFY(u().is_re(r1, seq_sort));
|
|
expr_ref r = symmetric_diff(r1, r2);
|
|
expr_ref emp(re().mk_empty(r->get_sort()), m);
|
|
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
|
|
expr_ref is_non_empty = sk().mk_is_non_empty(r, r, n);
|
|
th.add_axiom(th.mk_eq(r1, r2, false), th.mk_literal(is_non_empty));
|
|
}
|
|
|
|
bool seq_regex::is_member(expr* r, expr* u) {
|
|
expr* u2 = nullptr;
|
|
while (re().is_union(u, u, u2)) {
|
|
if (r == u2)
|
|
return true;
|
|
}
|
|
return r == u;
|
|
}
|
|
|
|
/**
|
|
* is_non_empty(r, u) => nullable or \/_i (c_i and is_non_empty(r_i, u union r))
|
|
*
|
|
* for each (c_i, r_i) in cofactors (min-terms)
|
|
*
|
|
* is_non_empty(r_i, u union r) := false if r_i in u
|
|
*
|
|
*/
|
|
void seq_regex::propagate_is_non_empty(literal lit) {
|
|
expr* e = ctx.bool_var2expr(lit.var()), *r = nullptr, *u = nullptr, *n = nullptr;
|
|
VERIFY(sk().is_is_non_empty(e, r, u, n));
|
|
|
|
TRACE("seq_regex", tout << "propagate nonempty: " << mk_pp(e, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout
|
|
<< std::endl << "PNE(" << expr_id_str(e) << "," << state_str(r)
|
|
<< "," << expr_id_str(u) << "," << expr_id_str(n) << ") ";);
|
|
|
|
expr_ref is_nullable = is_nullable_wrapper(r);
|
|
if (m.is_true(is_nullable))
|
|
return;
|
|
|
|
literal null_lit = th.mk_literal(is_nullable);
|
|
expr_ref hd = mk_first(r, n);
|
|
expr_ref d(m);
|
|
d = derivative_wrapper(hd, r);
|
|
|
|
literal_vector lits;
|
|
lits.push_back(~lit);
|
|
if (null_lit != false_literal)
|
|
lits.push_back(null_lit);
|
|
|
|
expr_ref_pair_vector cofactors(m);
|
|
get_cofactors(d, cofactors);
|
|
for (auto const& p : cofactors) {
|
|
if (is_member(p.second, u))
|
|
continue;
|
|
expr_ref cond(p.first, m);
|
|
seq_rw().elim_condition(hd, cond);
|
|
rewrite(cond);
|
|
if (m.is_false(cond))
|
|
continue;
|
|
expr_ref next_non_empty = sk().mk_is_non_empty(p.second, re().mk_union(u, p.second), n);
|
|
if (!m.is_true(cond))
|
|
next_non_empty = m.mk_and(cond, next_non_empty);
|
|
lits.push_back(th.mk_literal(next_non_empty));
|
|
}
|
|
|
|
th.add_axiom(lits);
|
|
}
|
|
|
|
/*
|
|
Given a string s, index i, and a derivative regex d, return an
|
|
expression that is equivalent to
|
|
accept s i r
|
|
but which pushes accept s i r into the leaves (next derivatives to
|
|
explore).
|
|
|
|
Input r is of type regex; output is of type bool.
|
|
|
|
Example:
|
|
mk_deriv_accept(s, i, (ite a r1 r2) u (ite b r3 r4))
|
|
= (or (ite a (accept s i r1) (accept s i r2))
|
|
(ite b (accept s i r3) (accept s i r4)))
|
|
*/
|
|
expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) {
|
|
vector<expr*> to_visit;
|
|
to_visit.push_back(r);
|
|
obj_map<expr, expr*> re_to_bool;
|
|
expr_ref_vector _temp_bool_owner(m); // temp owner for bools we create
|
|
|
|
// DFS
|
|
while (to_visit.size() > 0) {
|
|
expr* e = to_visit.back();
|
|
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
|
|
if (!re_to_bool.contains(e)) {
|
|
// First visit: add children
|
|
STRACE("seq_regex_verbose", tout << "1";);
|
|
if (m.is_ite(e, econd, e1, e2) ||
|
|
re().is_union(e, e1, e2)) {
|
|
to_visit.push_back(e1);
|
|
to_visit.push_back(e2);
|
|
}
|
|
// Mark first visit by adding nullptr to the map
|
|
re_to_bool.insert(e, nullptr);
|
|
}
|
|
else if (re_to_bool.find(e) == nullptr) {
|
|
// Second visit: set value
|
|
STRACE("seq_regex_verbose", tout << "2";);
|
|
to_visit.pop_back();
|
|
if (m.is_ite(e, econd, e1, e2)) {
|
|
expr* b1 = re_to_bool.find(e1);
|
|
expr* b2 = re_to_bool.find(e2);
|
|
expr* b = m.mk_ite(econd, b1, b2);
|
|
_temp_bool_owner.push_back(b);
|
|
re_to_bool.find(e) = b;
|
|
}
|
|
/*
|
|
else if (re().is_empty(e))
|
|
{
|
|
re_to_bool.find(e) = m.mk_false();
|
|
}
|
|
else if (re().is_epsilon(e))
|
|
{
|
|
expr* iplus1 = a().mk_int(i);
|
|
expr* one = a().mk_int(1);
|
|
_temp_bool_owner.push_back(iplus1);
|
|
_temp_bool_owner.push_back(one);
|
|
//the substring starting after position iplus1 must be empty
|
|
expr* s_end = str().mk_substr(s, iplus1, one);
|
|
expr* s_end_is_epsilon = m.mk_eq(s_end, str().mk_empty(m.get_sort(s)));
|
|
|
|
_temp_bool_owner.push_back(s_end_is_epsilon);
|
|
re_to_bool.find(e) = s_end_is_epsilon;
|
|
|
|
STRACE("seq_regex_verbose", tout
|
|
<< "added empty sequence leaf: "
|
|
<< mk_pp(s_end_is_epsilon, m) << std::endl;);
|
|
}
|
|
*/
|
|
else if (re().is_union(e, e1, e2)) {
|
|
expr* b1 = re_to_bool.find(e1);
|
|
expr* b2 = re_to_bool.find(e2);
|
|
expr* b = m.mk_or(b1, b2);
|
|
_temp_bool_owner.push_back(b);
|
|
re_to_bool.find(e) = b;
|
|
}
|
|
else {
|
|
expr* iplus1 = a().mk_int(i);
|
|
_temp_bool_owner.push_back(iplus1);
|
|
expr_ref acc_leaf = sk().mk_accept(s, iplus1, e);
|
|
_temp_bool_owner.push_back(acc_leaf);
|
|
re_to_bool.find(e) = acc_leaf;
|
|
|
|
STRACE("seq_regex_verbose", tout
|
|
<< "mk_deriv_accept: added accept leaf: "
|
|
<< mk_pp(acc_leaf, m) << std::endl;);
|
|
}
|
|
}
|
|
else {
|
|
STRACE("seq_regex_verbose", tout << "3";);
|
|
// Remaining visits: skip
|
|
to_visit.pop_back();
|
|
}
|
|
}
|
|
|
|
// Finalize
|
|
expr_ref result(m);
|
|
result = re_to_bool.find(r); // Assigns ownership of all exprs in
|
|
// re_to_bool for after this completes
|
|
rewrite(result);
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
Return a list of all leaves in the derivative of a regex r,
|
|
ignoring the conditions along each path.
|
|
|
|
Warning: Although the derivative
|
|
normal form tries to eliminate unsat condition paths, one cannot
|
|
assume that the path to each leaf is satisfiable in general
|
|
(e.g. when regexes are created using re.pred).
|
|
So not all results may correspond to satisfiable predicates.
|
|
It is OK to rely on the results being satisfiable for completeness,
|
|
but not soundness.
|
|
*/
|
|
void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) {
|
|
// Get derivative
|
|
sort* seq_sort = nullptr;
|
|
VERIFY(u().is_re(r, seq_sort));
|
|
expr_ref n(m.mk_fresh_const("re.char", seq_sort), m);
|
|
expr_ref hd = mk_first(r, n);
|
|
expr_ref d(m);
|
|
d = derivative_wrapper(hd, r);
|
|
|
|
// DFS
|
|
vector<expr*> to_visit;
|
|
to_visit.push_back(d);
|
|
obj_map<expr, bool> visited; // set<expr> (bool is used as a unit type)
|
|
while (to_visit.size() > 0) {
|
|
expr* e = to_visit.back();
|
|
to_visit.pop_back();
|
|
if (visited.contains(e)) continue;
|
|
visited.insert(e, true);
|
|
expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr;
|
|
if (m.is_ite(e, econd, e1, e2) ||
|
|
re().is_union(e, e1, e2)) {
|
|
to_visit.push_back(e1);
|
|
to_visit.push_back(e2);
|
|
}
|
|
else if (!re().is_empty(e)) {
|
|
results.push_back(e);
|
|
STRACE("seq_regex_verbose", tout
|
|
<< "get_all_derivatives: added deriv: "
|
|
<< mk_pp(e, m) << std::endl;);
|
|
}
|
|
}
|
|
|
|
STRACE("seq_regex", tout << "Number of derivatives: "
|
|
<< results.size() << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";);
|
|
}
|
|
|
|
/*
|
|
Return a list of all (cond, leaf) pairs in a given derivative
|
|
expression r.
|
|
|
|
Note: this recursive implementation is inefficient, since if nodes
|
|
are repeated often in the expression DAG, they may be visited
|
|
many times. For this reason, prefer mk_deriv_accept and
|
|
get_all_derivatives when possible.
|
|
|
|
This method is still used by:
|
|
propagate_is_empty
|
|
propagate_is_non_empty
|
|
*/
|
|
void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) {
|
|
expr_ref_vector conds(m);
|
|
get_cofactors_rec(r, conds, result);
|
|
STRACE("seq_regex", tout << "Number of derivatives: "
|
|
<< result.size() << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "#derivs=" << result.size() << " ";);
|
|
}
|
|
void seq_regex::get_cofactors_rec(expr* r, expr_ref_vector& conds,
|
|
expr_ref_pair_vector& result) {
|
|
expr* cond = nullptr, *r1 = nullptr, *r2 = nullptr;
|
|
if (m.is_ite(r, cond, r1, r2)) {
|
|
conds.push_back(cond);
|
|
get_cofactors_rec(r1, conds, result);
|
|
conds.pop_back();
|
|
conds.push_back(mk_not(m, cond));
|
|
get_cofactors_rec(r2, conds, result);
|
|
conds.pop_back();
|
|
}
|
|
else if (re().is_union(r, r1, r2)) {
|
|
get_cofactors_rec(r1, conds, result);
|
|
get_cofactors_rec(r2, conds, result);
|
|
}
|
|
else {
|
|
expr_ref conj = mk_and(conds);
|
|
if (!m.is_false(conj) && !re().is_empty(r))
|
|
result.push_back(conj, r);
|
|
}
|
|
}
|
|
|
|
/*
|
|
is_empty(r, u) => ~is_nullable(r)
|
|
is_empty(r, u) => (forall x . ~cond(x)) or is_empty(r1, u union r) for (cond, r) in min-terms(D(x,r))
|
|
|
|
is_empty(r, u) is true if r is a member of u
|
|
*/
|
|
void seq_regex::propagate_is_empty(literal lit) {
|
|
expr* e = ctx.bool_var2expr(lit.var()), *r = nullptr, *u = nullptr, *n = nullptr;
|
|
VERIFY(sk().is_is_empty(e, r, u, n));
|
|
expr_ref is_nullable = is_nullable_wrapper(r);
|
|
|
|
TRACE("seq_regex", tout << "propagate empty: " << mk_pp(e, m) << std::endl;);
|
|
STRACE("seq_regex_brief", tout
|
|
<< std::endl << "PE(" << expr_id_str(e) << "," << state_str(r)
|
|
<< "," << expr_id_str(u) << "," << expr_id_str(n) << ") ";);
|
|
|
|
if (m.is_true(is_nullable)) {
|
|
th.add_axiom(~lit);
|
|
return;
|
|
}
|
|
th.add_axiom(~lit, ~th.mk_literal(is_nullable));
|
|
expr_ref hd = mk_first(r, n);
|
|
expr_ref d(m);
|
|
d = derivative_wrapper(hd, r);
|
|
literal_vector lits;
|
|
expr_ref_pair_vector cofactors(m);
|
|
get_cofactors(d, cofactors);
|
|
for (auto const& p : cofactors) {
|
|
if (is_member(p.second, u))
|
|
continue;
|
|
expr_ref cond(p.first, m);
|
|
seq_rw().elim_condition(hd, cond);
|
|
rewrite(cond);
|
|
if (m.is_false(cond))
|
|
continue;
|
|
lits.reset();
|
|
lits.push_back(~lit);
|
|
if (!m.is_true(cond)) {
|
|
expr_ref ncond(mk_not(m, cond), m);
|
|
lits.push_back(th.mk_literal(mk_forall(m, hd, ncond)));
|
|
}
|
|
expr_ref is_empty1 = sk().mk_is_empty(p.second, re().mk_union(u, p.second), n);
|
|
lits.push_back(th.mk_literal(is_empty1));
|
|
th.add_axiom(lits);
|
|
}
|
|
}
|
|
|
|
expr_ref seq_regex::mk_first(expr* r, expr* n) {
|
|
sort* elem_sort = nullptr, *seq_sort = nullptr;
|
|
VERIFY(u().is_re(r, seq_sort));
|
|
VERIFY(u().is_seq(seq_sort, elem_sort));
|
|
return sk().mk("re.first", n, a().mk_int(r->get_id()), elem_sort);
|
|
}
|
|
|
|
/**
|
|
* Dead state elimination using the state_graph class
|
|
*/
|
|
|
|
unsigned seq_regex::get_state_id(expr* e) {
|
|
// Assign increasing IDs starting from 1
|
|
if (!m_expr_to_state.contains(e)) {
|
|
m_state_to_expr.push_back(e);
|
|
unsigned new_id = m_state_to_expr.size();
|
|
m_expr_to_state.insert(e, new_id);
|
|
STRACE("seq_regex_brief", tout << "new(" << expr_id_str(e)
|
|
<< ")=" << state_str(e) << " ";);
|
|
STRACE("seq_regex", tout
|
|
<< "New state ID: " << new_id
|
|
<< " = " << mk_pp(e, m) << std::endl;);
|
|
SASSERT(get_expr_from_id(new_id) == e);
|
|
}
|
|
return m_expr_to_state.find(e);
|
|
}
|
|
expr* seq_regex::get_expr_from_id(unsigned id) {
|
|
SASSERT(id >= 1);
|
|
SASSERT(id <= m_state_to_expr.size());
|
|
return m_state_to_expr.get(id - 1);
|
|
}
|
|
|
|
|
|
bool seq_regex::can_be_in_cycle(expr *r1, expr *r2) {
|
|
// TBD: This can be used to optimize the state graph:
|
|
// return false here if it is known that r1 -> r2 can never be
|
|
// in a cycle. There are various easy syntactic checks on r1 and r2
|
|
// that can be used to infer this (e.g. star height, or length if
|
|
// both are star-free).
|
|
// This check need not be sound, but if it is not, some dead states
|
|
// will be missed.
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
Update the state graph with expression r and all its derivatives.
|
|
*/
|
|
bool seq_regex::update_state_graph(expr* r) {
|
|
unsigned r_id = get_state_id(r);
|
|
if (m_state_graph.is_done(r_id)) return false;
|
|
if (m_state_graph.get_size() >= m_max_state_graph_size) {
|
|
STRACE("seq_regex", tout << "Warning: ignored state graph update -- max size of seen states reached!" << std::endl;);
|
|
STRACE("seq_regex_brief", tout << "(MAX SIZE REACHED) ";);
|
|
return false;
|
|
}
|
|
STRACE("seq_regex", tout << "Updating state graph for regex "
|
|
<< mk_pp(r, m) << ") ";);
|
|
|
|
STRACE("state_graph",
|
|
if (!m_state_graph.is_seen(r_id))
|
|
tout << std::endl << "state(" << r_id << ") = " << seq_util::rex::pp(re(), r) << std::endl << "info(" << r_id << ") = " << re().get_info(r) << std::endl;);
|
|
// Add state
|
|
m_state_graph.add_state(r_id);
|
|
STRACE("seq_regex", tout << "Updating state graph for regex "
|
|
<< mk_pp(r, m) << ") " << std::endl;);
|
|
STRACE("seq_regex_brief", tout << std::endl << "USG("
|
|
<< state_str(r) << ") ";);
|
|
expr_ref r_nullable = is_nullable_wrapper(r);
|
|
if (m.is_true(r_nullable)) {
|
|
m_state_graph.mark_live(r_id);
|
|
}
|
|
else {
|
|
// Add edges to all derivatives
|
|
expr_ref_vector derivatives(m);
|
|
STRACE("seq_regex_verbose", tout
|
|
<< "getting all derivs: " << r_id << " " << std::endl;);
|
|
get_all_derivatives(r, derivatives);
|
|
for (auto const& dr: derivatives) {
|
|
unsigned dr_id = get_state_id(dr);
|
|
STRACE("seq_regex_verbose", tout
|
|
<< std::endl << " traversing deriv: " << dr_id << " ";);
|
|
STRACE("state_graph",
|
|
if (!m_state_graph.is_seen(dr_id))
|
|
tout << "state(" << dr_id << ") = " << seq_util::rex::pp(re(), dr) << std::endl << "info(" << dr_id << ") = " << re().get_info(dr) << std::endl;);
|
|
// Add state
|
|
m_state_graph.add_state(dr_id);
|
|
bool maybecycle = can_be_in_cycle(r, dr);
|
|
m_state_graph.add_edge(r_id, dr_id, maybecycle);
|
|
}
|
|
m_state_graph.mark_done(r_id);
|
|
}
|
|
STRACE("seq_regex", m_state_graph.display(tout););
|
|
STRACE("seq_regex_brief", tout << std::endl;);
|
|
STRACE("seq_regex_brief", m_state_graph.display(tout););
|
|
return true;
|
|
}
|
|
|
|
std::string seq_regex::state_str(expr* e) {
|
|
if (m_expr_to_state.contains(e))
|
|
return std::to_string(get_state_id(e));
|
|
else
|
|
return expr_id_str(e);
|
|
}
|
|
std::string seq_regex::expr_id_str(expr* e) {
|
|
return std::string("id") + std::to_string(e->get_id());
|
|
}
|
|
|
|
}
|