From 2c02264a94d2377ebf5f64162813b7e379446619 Mon Sep 17 00:00:00 2001 From: Caleb Stanford Date: Thu, 13 Aug 2020 15:47:36 -0400 Subject: [PATCH] Regex solver updates (#4636) * std::cout debugging statements * comment out std::cout debugging as this is now a shared fork * convert std::cout to TRACE statements for seq_rewriter and seq_regex * add cases to min_length and max_length for regexes * bug fix * update min_length and max_length functions for REs * initial pass on simplifying derivative normal forms by eliminating redundant predicates locally * add seq_regex_brief trace statements * working on debugging ref count issue * fix ref count bug and convert trace statements to seq_regex_brief * add compact tracing for cache hits/misses * seq_regex fix cache hit/miss tracing and wrapper around is_nullable * minor * label and disable more experimental changes for testing * minor documentation / tracing * a few more @EXP annotations * dead state elimination skeleton code * progress on dead state elimination * more progress on dead state elimination * refactor dead state class to separate self-contained state_graph class * finish factoring state_graph to only work with unsigned values, and implement separate functionality for expr* logic * implement get_all_derivatives, add debug tracing * trace statements for debugging is_nullable loop bug * fix is_nullable loop bug * comment out local nullable change and mark experimental * pretty printing for state_graph * rewrite state graph to remove the fragile assumption that all edges from a state are added at a time * start of general cycle detection check + fix some comments * implement full cycle detection procedure * normalize derivative conditions to form 'ele <= a' * order derivative conditions by character code * fix confusing names m_to and m_from * assign increasing state IDs from 1 instead of using get_id on AST node * remove elim_condition call in get_dall_derivatives * use u_map instead of uint_map to avoid memory leak * remove unnecessary call to is_ground * debugging * small improvements to seq_regex_brief tracing * fix bug on evil2 example * save work * new propagate code * work in progress on using same seq sort for deriv calls * avoid re-computing derivatives: use same head var for every derivative call * use min_length on regexes to prune search * simple implementation of can_be_in_cycle using rank function idea * add a disabled experimental change * minor cleanup comments, etc. * seq_rewriter cleanup for PR * typo noticed by Nikolaj * move state graph to util/state_graph * re-add accidentally removed line * clean up seq_regex code removing obsolete functions and comments * a few more cleanup items * oops, missed merge change to fix compilation * disabled change to lift unions to the top level and treat them seperately in seq_regex solver * added get_overapprox_regex to over-approximate regex membership constraints * replace calls to is_epsilon with a centrally available method in seq_decl_plugin * simplifications and modifications in get_overapprox_regex and related * added approximation support for sequence expressions that use ite * removed is_app check that was redundant * tweak differences with upstream * rewrite derivative leaves * enable Antimorov-style derivatives via lifting unions in the solver * TODO placeholders for outputting state graph * change order in seq_regex propagate_in_re * implement a more restricted form of Antimorov derivatives via a special op code to indicate lifting unions * minor * new Antimorov optimizations based on BDD compatibility checking * seq regex tracing for # of derivatives * fix get_cofactors (currently this fix is buggy) * partially revert get_cofactors buggy change * re-implement get_cofactors to more efficiently explore nodes in the derivative expression * dgml generation for state graph * fix release build * improved dgml output * bug fixes in dgml generation * dot output support for state_graph and moved dgml and dot output under CASSERT * updated tracing of what regex corresponds to what state id with /tr:state_graph * clean up & document Antimorov derivative support * remove op cache tracing * remove re_rank experimental idea * small fix * fix Antimorov derivative (important change for the good performance) * remove unused and unnecessary code * implemented simpler efficient get_cofactors alternative mk_deriv_accept * simplifications in propagate_accept, and trace unusual cases * document the various seq_regex tracing & debugging command-line options * fix debug build (broken tracing) * guard eager Antimorov lifting for possible disabling * fix bug in propagate_accept Rule 1 * disable eager version of Antimorov lifting for performance reasons * remove some remaining obsolete comments Co-authored-by: calebstanford-msr Co-authored-by: Margus Veanes --- src/ast/rewriter/seq_rewriter.cpp | 252 ++++++++++++++++++++++--- src/ast/rewriter/seq_rewriter.h | 6 +- src/ast/seq_decl_plugin.cpp | 2 + src/ast/seq_decl_plugin.h | 7 +- src/smt/seq_regex.cpp | 293 ++++++++++++++++++++++-------- src/smt/seq_regex.h | 76 +++++++- src/util/state_graph.cpp | 60 +++--- 7 files changed, 556 insertions(+), 140 deletions(-) diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 5dae6921d..9cd6fe00d 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -557,6 +557,12 @@ br_status seq_rewriter::mk_app_core(func_decl * f, unsigned num_args, expr * con st = mk_re_concat(args[0], args[1], result); } break; + case _OP_RE_ANTIMOROV_UNION: + SASSERT(num_args == 2); + // Rewrite Antimorov union to real union + result = re().mk_union(args[0], args[1]); + st = BR_REWRITE1; + break; case OP_RE_UNION: if (num_args == 1) { result = args[0]; @@ -2365,11 +2371,26 @@ br_status seq_rewriter::mk_re_reverse(expr* r, expr_ref& result) { } } +/*************************************************** + ***** Begin Derivative Code ***** + ***************************************************/ + /* Symbolic derivative: seq -> regex -> regex seq should be single char -*/ + This is the rewriter entrypoint for computing a derivative. + Use mk_derivative from seq_decl_plugin instead to create a derivative + expression without computing it (simplifying). + + This calls mk_derivative, the main logic which builds a derivative + recursively, but mk_derivative doesn't guarantee full simplification. + Once the derivative is built, we return BR_REWRITE_FULL so that + any remaining possible simplification is performed from the bottom up. + + Rewriting also replaces _OP_RE_ANTIMOROV_UNION, which is produced + by is_derivative, with real union. +*/ br_status seq_rewriter::mk_re_derivative(expr* ele, expr* r, expr_ref& result) { result = mk_derivative(ele, r); // TBD: we may even declare BR_DONE here and potentially miss some simplifications @@ -2377,14 +2398,105 @@ br_status seq_rewriter::mk_re_derivative(expr* ele, expr* r, expr_ref& result) { } /* - Memoized, recursive implementation of the symbolic derivative such that - the result is in an optimized BDD form. + Note: Derivative Normal Form - Definition of BDD form: - if-then-elses are pushed outwards - and sorted by condition ID (cond->get_id()), from largest on - the outside to smallest on the inside. - Duplicate nested conditions are eliminated. + When computing derivatives recursively, we preserve the following + BDD normal form: + + - At the top level, the derivative is a union of Antimorov derivatives + (Conceptually each element of the union is a different derivative). + We currently express this derivative using an internal op code: + _OP_RE_ANTIMOROV_UNION + - An Antimorov derivative is a nested if-then-else term. + if-then-elses are pushed outwards and sorted by condition ID + (cond->get_id()), from largest on the outside to smallest on the + inside. Duplicate nested conditions are eliminated. + - The leaves of the if-then-else BDD can have unions themselves, + but these are interpreted as Regex union, not as separate Antimorov + derivatives. + + To debug the normal form, call Z3 with -dbg:seq_regex: + this calls check_deriv_normal_form (below) periodically. + + The main logic is in mk_der_op_rec for combining normal forms + (some also in mk_der_compl_rec). +*/ + +#ifdef Z3DEBUG +/* + Debugging to check the derivative normal form that we assume + (see definition above). + + This may fail on unusual/unexpected REs, such as those containing + regex variables, but this is by design as this is only checked + during debugging, and we have not considered how normal form + should apply in such cases. +*/ +bool seq_rewriter::check_deriv_normal_form(expr* r, int level) { + if (level == 3) { // top level + STRACE("seq_verbose", tout + << "Checking derivative normal form invariant...";); + } + expr *r1 = nullptr, *r2 = nullptr, *p = nullptr, *s = nullptr; + unsigned lo = 0, hi = 0; + STRACE("seq_verbose", tout << " (level " << level << ")";); + int new_level = 0; + if (re().is_antimorov_union(r)) { + SASSERT(level >= 2); + new_level = 2; + } + else if (m().is_ite(r)) { + SASSERT(level >= 1); + new_level = 1; + } + + SASSERT(!re().is_diff(r)); + SASSERT(!re().is_opt(r)); + SASSERT(!re().is_plus(r)); + + if (re().is_antimorov_union(r, r1, r2) || + re().is_concat(r, r1, r2) || + re().is_union(r, r1, r2) || + re().is_intersection(r, r1, r2) || + m().is_ite(r, p, r1, r2)) { + check_deriv_normal_form(r1, new_level); + check_deriv_normal_form(r2, new_level); + } + else if (re().is_star(r, r1) || + re().is_complement(r, r1) || + re().is_loop(r, r1, lo) || + re().is_loop(r, r1, lo, hi)) { + check_deriv_normal_form(r1, new_level); + } + else if (re().is_reverse(r, r1)) { + SASSERT(re().is_to_re(r1)); + } + else if (re().is_full_seq(r) || + re().is_empty(r) || + re().is_range(r) || + re().is_full_char(r) || + re().is_of_pred(r) || + re().is_to_re(r, s)) { + // OK + } + else { + SASSERT(false); + } + if (level == 3) { + STRACE("seq_verbose", tout << " passed!" << std::endl;); + } + return true; +} +#endif + +/* + Memoized, recursive implementation of the symbolic derivative such that + the result is in normal form. + + Functions without _rec are memoized wrappers, which call the _rec + version if lookup fails. + + The main logic is in mk_der_op_rec for combining normal forms. */ expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) { STRACE("seq_verbose", tout << "derivative: " << mk_pp(ele, m()) @@ -2396,9 +2508,14 @@ expr_ref seq_rewriter::mk_derivative(expr* ele, expr* r) { } STRACE("seq_verbose", tout << "derivative result: " << mk_pp(result, m()) << std::endl;); + CASSERT("seq_regex", check_deriv_normal_form(r)); return result; } +expr_ref seq_rewriter::mk_der_antimorov_union(expr* r1, expr* r2) { + return mk_der_op(_OP_RE_ANTIMOROV_UNION, r1, r2); +} + expr_ref seq_rewriter::mk_der_union(expr* r1, expr* r2) { return mk_der_op(OP_RE_UNION, r1, r2); } @@ -2466,15 +2583,37 @@ bool seq_rewriter::pred_implies(expr* a, expr* b) { } /* - Apply a binary operation, preserving BDD normal form on derivative expressions. + Utility function to decide if two BDDs (nested if-then-else terms) + have exactly the same structure and conditions. +*/ +bool seq_rewriter::ite_bdds_compatabile(expr* a, expr* b) { + expr* ca = nullptr, *a1 = nullptr, *a2 = nullptr; + expr* cb = nullptr, *b1 = nullptr, *b2 = nullptr; + if (m().is_ite(a, ca, a1, a2) && m().is_ite(b, cb, b1, b2)) { + return (ca == cb) && ite_bdds_compatabile(a1, b1) + && ite_bdds_compatabile(a2, b2); + } + else if (m().is_ite(a) || m().is_ite(b)) { + return false; + } + else { + return true; + } +} + +/* + Apply a binary operation, preserving normal form on derivative expressions. Preconditions: - - k is a binary op code on REs: one of concat, intersection, or union - (not difference) - - a and b are in BDD form + - k is one of the following binary op codes on REs: + OP_RE_INTERSECT + OP_RE_UNION + OP_RE_CONCAT + _OP_RE_ANTIMOROV_UNION + - a and b are in normal form (check_deriv_normal_form) Postcondition: - - result is in BDD form + - result is in normal form (check_deriv_normal_form) */ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) { STRACE("seq_verbose", tout << "mk_der_op_rec: " << k @@ -2483,6 +2622,7 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) { expr* ca = nullptr, *a1 = nullptr, *a2 = nullptr; expr* cb = nullptr, *b1 = nullptr, *b2 = nullptr; expr_ref result(m()); + // Simplify if-then-elses whenever possible auto mk_ite = [&](expr* c, expr* a, expr* b) { return (a == b) ? a : m().mk_ite(c, a, b); @@ -2497,6 +2637,46 @@ expr_ref seq_rewriter::mk_der_op_rec(decl_kind k, expr* a, expr* b) { m().is_not(e, e); return e->get_id(); }; + + // Choose when to lift a union to the top level, by converting + // it to an Antimorov union + // This implements a restricted form of Antimorov derivatives + if (k == OP_RE_UNION) { + if (re().is_antimorov_union(a) || re().is_antimorov_union(b)) { + k = _OP_RE_ANTIMOROV_UNION; + } + #if 0 + // Disabled: eager Antimorov lifting unless BDDs are compatible + // Note: the check for BDD compatibility could be made more + // sophisticated: in an Antimorov union of n terms, we really + // want to check if any pair of them is compatible. + else if (m().is_ite(a) && m().is_ite(b) && + !ite_bdds_compatabile(a, b)) { + k = _OP_RE_ANTIMOROV_UNION; + } + #endif + } + if (k == _OP_RE_ANTIMOROV_UNION) { + result = re().mk_antimorov_union(a, b); + return result; + } + if (re().is_antimorov_union(a, a1, a2)) { + expr_ref r1(m()), r2(m()); + r1 = mk_der_op(k, a1, b); + r2 = mk_der_op(k, a2, b); + result = re().mk_antimorov_union(r1, r2); + return result; + } + if (re().is_antimorov_union(b, b1, b2)) { + expr_ref r1(m()), r2(m()); + r1 = mk_der_op(k, a, b1); + r2 = mk_der_op(k, a, b2); + result = re().mk_antimorov_union(r1, r2); + return result; + } + + // Remaining non-union case: combine two if-then-else BDDs + // (underneath top-level Antimorov unions) if (m().is_ite(a, ca, a1, a2)) { expr_ref r1(m()), r2(m()); expr_ref notca(m().mk_not(ca), m()); @@ -2594,6 +2774,7 @@ expr_ref seq_rewriter::mk_der_op(decl_kind k, expr* a, expr* b) { result = mk_der_op_rec(k, a, b); m_op_cache.insert(k, a, b, result); } + CASSERT("seq_regex", check_deriv_normal_form(result)); return result; } @@ -2603,13 +2784,22 @@ expr_ref seq_rewriter::mk_der_compl(expr* r) { expr_ref result(m_op_cache.find(OP_RE_COMPLEMENT, r, nullptr), m()); if (!result) { expr* c = nullptr, * r1 = nullptr, * r2 = nullptr; - if (m().is_ite(r, c, r1, r2)) { + if (re().is_antimorov_union(r, r1, r2)) { + // Convert union to intersection + // Result: Antimorov union at top level is lost, pushed inside ITEs + expr_ref res1(m()), res2(m()); + res1 = mk_der_compl(r1); + res2 = mk_der_compl(r2); + result = mk_der_inter(res1, res2); + } + else if (m().is_ite(r, c, r1, r2)) { result = m().mk_ite(c, mk_der_compl(r1), mk_der_compl(r2)); } else if (BR_FAILED == mk_re_complement(r, result)) result = re().mk_complement(r); m_op_cache.insert(OP_RE_COMPLEMENT, r, nullptr, result); } + CASSERT("seq_regex", check_deriv_normal_form(result)); return result; } @@ -2675,6 +2865,7 @@ expr_ref seq_rewriter::mk_der_cond(expr* cond, expr* ele, sort* seq_sort) { } STRACE("seq_verbose", tout << "mk_der_cond result: " << mk_pp(result, m()) << std::endl;); + CASSERT("seq_regex", check_deriv_normal_form(result)); return result; } @@ -2696,7 +2887,11 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { } expr_ref dr2 = mk_derivative(ele, r2); is_n = re_predicate(is_n, seq_sort); - return mk_der_union(result, mk_der_concat(is_n, dr2)); + // Instead of mk_der_union here, we use mk_der_antimorov_union to + // force the two cases to be considered separately and lifted to + // the top level. This avoids blowup in cases where determinization + // is expensive. + return mk_der_antimorov_union(result, mk_der_concat(is_n, dr2)); } else if (re().is_star(r, r1)) { return mk_der_concat(mk_derivative(ele, r1), r); @@ -2843,6 +3038,11 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { return expr_ref(re().mk_derivative(ele, r), m()); } +/************************************************* + ***** End Derivative Code ***** + *************************************************/ + + /* * pattern match against all ++ "abc" ++ all ++ "def" ++ all regexes. */ @@ -3128,11 +3328,11 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { result = b; return BR_DONE; } - if (is_epsilon(a)) { + if (re().is_epsilon(a)) { result = b; return BR_DONE; } - if (is_epsilon(b)) { + if (re().is_epsilon(b)) { result = a; return BR_DONE; } @@ -3249,11 +3449,11 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { result = b; return BR_DONE; } - if (re().is_star(a) && is_epsilon(b)) { + if (re().is_star(a) && re().is_epsilon(b)) { result = a; return BR_DONE; } - if (re().is_star(b) && is_epsilon(a)) { + if (re().is_star(b) && re().is_epsilon(a)) { result = b; return BR_DONE; } @@ -3558,11 +3758,11 @@ br_status seq_rewriter::mk_re_star(expr* a, expr_ref& result) { result = re().mk_star(re().mk_union(b, c1)); return BR_REWRITE2; } - if (is_epsilon(b)) { + if (re().is_epsilon(b)) { result = re().mk_star(c); return BR_REWRITE2; } - if (is_epsilon(c)) { + if (re().is_epsilon(c)) { result = re().mk_star(b); return BR_REWRITE2; } @@ -3599,7 +3799,7 @@ br_status seq_rewriter::mk_re_plus(expr* a, expr_ref& result) { result = a; return BR_DONE; } - if (is_epsilon(a)) { + if (re().is_epsilon(a)) { result = a; return BR_DONE; } @@ -4252,12 +4452,6 @@ bool seq_rewriter::reduce_by_length(expr_ref_vector& ls, expr_ref_vector& rs, return true; } - -bool seq_rewriter::is_epsilon(expr* e) const { - expr* e1; - return re().is_to_re(e, e1) && str().is_empty(e1); -} - /** reduce for the case where rs = a constant string, ls contains a substring that matches no substring of rs. @@ -4357,6 +4551,8 @@ void seq_rewriter::op_cache::cleanup() { if (m_table.size() >= m_max_cache_size) { m_trail.reset(); m_table.reset(); + STRACE("seq_regex", tout << "Op cache reset!" << std::endl;); + STRACE("seq_regex_brief", tout << "(OP CACHE RESET) ";); STRACE("seq_verbose", tout << "Derivative op cache reset" << std::endl;); } } diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index b9cc2707b..634eece7d 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -191,6 +191,11 @@ class seq_rewriter { expr_ref mk_der_inter(expr* a, expr* b); expr_ref mk_der_compl(expr* a); expr_ref mk_der_cond(expr* cond, expr* ele, sort* seq_sort); + expr_ref mk_der_antimorov_union(expr* r1, expr* r2); + bool ite_bdds_compatabile(expr* a, expr* b); + #ifdef Z3DEBUG + bool check_deriv_normal_form(expr* r, int level = 3); + #endif bool lt_char(expr* ch1, expr* ch2); bool eq_char(expr* ch1, expr* ch2); @@ -277,7 +282,6 @@ class seq_rewriter { void add_next(u_map& next, expr_ref_vector& trail, unsigned idx, expr* cond); bool is_sequence(expr* e, expr_ref_vector& seq); bool is_sequence(eautomaton& aut, expr_ref_vector& seq); - bool is_epsilon(expr* e) const; bool get_lengths(expr* e, expr_ref_vector& lens, rational& pos); bool reduce_back(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& new_eqs); bool reduce_front(expr_ref_vector& ls, expr_ref_vector& rs, expr_ref_pair_vector& new_eqs); diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp index d359f010c..afa3a023f 100644 --- a/src/ast/seq_decl_plugin.cpp +++ b/src/ast/seq_decl_plugin.cpp @@ -616,6 +616,7 @@ void seq_decl_plugin::init() { m_sigs[OP_RE_OF_PRED] = alloc(psig, m, "re.of.pred", 1, 1, &predA, reA); m_sigs[OP_RE_REVERSE] = alloc(psig, m, "re.reverse", 1, 1, &reA, reA); m_sigs[OP_RE_DERIVATIVE] = alloc(psig, m, "re.derivative", 1, 2, AreA, reA); + m_sigs[_OP_RE_ANTIMOROV_UNION] = alloc(psig, m, "re.union", 1, 2, reAreA, reA); m_sigs[OP_SEQ_TO_RE] = alloc(psig, m, "seq.to.re", 1, 1, &seqA, reA); m_sigs[OP_SEQ_IN_RE] = alloc(psig, m, "seq.in.re", 1, 2, seqAreA, boolT); m_sigs[OP_SEQ_REPLACE_RE_ALL] = alloc(psig, m, "str.replace_re_all", 1, 3, seqAreAseqA, seqA); @@ -760,6 +761,7 @@ func_decl * seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, case OP_RE_COMPLEMENT: case OP_RE_REVERSE: case OP_RE_DERIVATIVE: + case _OP_RE_ANTIMOROV_UNION: m_has_re = true; // fall-through case OP_SEQ_UNIT: diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h index 78286fac0..eb2feba7a 100644 --- a/src/ast/seq_decl_plugin.h +++ b/src/ast/seq_decl_plugin.h @@ -109,6 +109,7 @@ enum seq_op_kind { _OP_REGEXP_EMPTY, _OP_REGEXP_FULL_CHAR, _OP_RE_IS_NULLABLE, + _OP_RE_ANTIMOROV_UNION, // Lifted union for antimorov-style derivatives _OP_SEQ_SKOLEM, LAST_SEQ_OP }; @@ -237,9 +238,10 @@ class seq_util { mutable scoped_ptr m_bv; bv_util& bv() const; +public: + unsigned max_plus(unsigned x, unsigned y) const; unsigned max_mul(unsigned x, unsigned y) const; -public: ast_manager& get_manager() const { return m; } @@ -437,6 +439,7 @@ public: app* mk_of_pred(expr* p); app* mk_reverse(expr* r) { return m.mk_app(m_fid, OP_RE_REVERSE, r); } app* mk_derivative(expr* ele, expr* r) { return m.mk_app(m_fid, OP_RE_DERIVATIVE, ele, r); } + app* mk_antimorov_union(expr* r1, expr* r2) { return m.mk_app(m_fid, _OP_RE_ANTIMOROV_UNION, r1, r2); } bool is_to_re(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_TO_RE); } bool is_concat(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CONCAT); } @@ -455,6 +458,7 @@ public: bool is_of_pred(expr const* n) const { return is_app_of(n, m_fid, OP_RE_OF_PRED); } bool is_reverse(expr const* n) const { return is_app_of(n, m_fid, OP_RE_REVERSE); } bool is_derivative(expr const* n) const { return is_app_of(n, m_fid, OP_RE_DERIVATIVE); } + bool is_antimorov_union(expr const* n) const { return is_app_of(n, m_fid, _OP_RE_ANTIMOROV_UNION); } MATCH_UNARY(is_to_re); MATCH_BINARY(is_concat); MATCH_BINARY(is_union); @@ -468,6 +472,7 @@ public: MATCH_UNARY(is_of_pred); MATCH_UNARY(is_reverse); MATCH_BINARY(is_derivative); + MATCH_BINARY(is_antimorov_union); bool is_loop(expr const* n, expr*& body, unsigned& lo, unsigned& hi) const; bool is_loop(expr const* n, expr*& body, unsigned& lo) const; bool is_loop(expr const* n, expr*& body, expr*& lo, expr*& hi) const; diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 1803f5537..06b5e7f89 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -109,24 +109,6 @@ namespace smt { return; } - // Convert a non-ground sequence into an additional regex and - // strengthen the original regex constraint into an intersection - // for example: - // (x ++ "a" ++ y) in b* - // is coverted to - // (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*) - if (!m.is_value(s)) { - expr_ref s_approx = get_overapprox_regex(s); - if (!re().is_full_seq(s_approx)) { - r = re().mk_inter(r, s_approx); - TRACE("seq_regex", tout - << "get_overapprox_regex(" << mk_pp(s, m) - << ") = " << mk_pp(s_approx, m) << std::endl;); - STRACE("seq_regex_brief", tout - << "overapprox=" << state_str(r) << " ";); - } - } - if (coallesce_in_re(lit)) { TRACE("seq_regex", tout << "simplified conjunctions to an intersection" << std::endl;); @@ -141,6 +123,26 @@ namespace smt { return; } + // Convert a non-ground sequence into an additional regex and + // strengthen the original regex constraint into an intersection + // for example: + // (x ++ "a" ++ y) in b* + // is coverted to + // (x ++ "a" ++ y) in intersect((.* ++ "a" ++ .*), b*) + expr_ref _r_temp_owner(m); + if (!m.is_value(s)) { + expr_ref s_approx = get_overapprox_regex(s); + if (!re().is_full_seq(s_approx)) { + r = re().mk_inter(r, s_approx); + _r_temp_owner = r; + TRACE("seq_regex", tout + << "get_overapprox_regex(" << mk_pp(s, m) + << ") = " << mk_pp(s_approx, m) << std::endl;); + STRACE("seq_regex_brief", tout + << "overapprox=" << state_str(r) << " ";); + } + } + expr_ref zero(a().mk_int(0), m); expr_ref acc = sk().mk_accept(s, zero, r); literal acc_lit = th.mk_literal(acc); @@ -213,6 +215,7 @@ namespace smt { * * Rule 1. (accept s i r) => len(s) >= i + min_len(r) * Rule 2. (accept s i r) & len(s) <= i => nullable(r) + * (only necessary if min_len fails and returns 0 for non-nullable r) * Rule 3. (accept s i r) and len(s) > i => * (accept s (i + 1) (derivative s[i] r) * @@ -258,24 +261,36 @@ namespace smt { STRACE("seq_regex_brief", tout << "(unfold) ";); // Rule 1: use min_length to prune search - expr_ref s_to_re(re().mk_to_re(s), m); - expr_ref s_plus_r(re().mk_concat(s_to_re, r), m); - unsigned min_len = re().min_length(s_plus_r); - literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len); + unsigned min_len = re().min_length(r); + unsigned min_len_plus_i = u().max_plus(min_len, idx); + literal len_s_ge_min = th.m_ax.mk_ge(th.mk_len(s), min_len_plus_i); th.propagate_lit(nullptr, 1, &lit, len_s_ge_min); // Axiom equivalent to the above: th.add_axiom(~lit, len_s_ge_min); // Rule 2: nullable check literal len_s_le_i = th.m_ax.mk_le(th.mk_len(s), idx); - expr_ref is_nullable = is_nullable_wrapper(r); - if (m.is_false(is_nullable)) { - th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i); - } - else if (!m.is_true(is_nullable)) { - // is_nullable did not simplify - literal is_nullable_lit = th.mk_literal(is_nullable_wrapper(r)); - ctx.mark_as_relevant(is_nullable_lit); - th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit); + if (min_len == 0) { + expr_ref is_nullable = is_nullable_wrapper(r); + if (m.is_false(is_nullable)) { + STRACE("seq_regex", tout + << "Warning: min_length returned 0 for non-nullable regex" + << std::endl;); + STRACE("seq_regex_brief", tout + << " (Warning: min_length returned 0 for" + << " non-nullable regex)";); + th.propagate_lit(nullptr, 1, &lit, ~len_s_le_i); + } + else if (!m.is_true(is_nullable)) { + // is_nullable did not simplify + STRACE("seq_regex", tout + << "Warning: is_nullable did not simplify to true or false" + << std::endl;); + STRACE("seq_regex_brief", tout + << " (Warning: is_nullable did not simplify)";); + literal is_nullable_lit = th.mk_literal(is_nullable); + ctx.mark_as_relevant(is_nullable_lit); + th.add_axiom(~lit, ~len_s_le_i, is_nullable_lit); + } } // Rule 3: derivative unfolding @@ -283,24 +298,11 @@ namespace smt { expr_ref hd = th.mk_nth(s, i); expr_ref deriv(m); deriv = derivative_wrapper(hd, r); + expr_ref accept_deriv(m); + accept_deriv = mk_deriv_accept(s, idx + 1, deriv); accept_next.push_back(~lit); accept_next.push_back(len_s_le_i); - expr_ref_pair_vector cofactors(m); - get_cofactors(deriv, cofactors); - for (auto const& p : cofactors) { - if (m.is_false(p.first) || re().is_empty(p.second)) continue; - expr_ref cond(p.first, m); - expr_ref deriv_leaf(p.second, m); - - expr_ref acc = sk().mk_accept(s, a().mk_int(idx + 1), deriv_leaf); - expr_ref choice(m.mk_and(cond, acc), m); - literal choice_lit = th.mk_literal(choice); - accept_next.push_back(choice_lit); - // TBD: try prioritizing unvisited states here over visited - // ones (in the state graph), to improve performance - STRACE("seq_regex_verbose", tout << "added choice: " - << mk_pp(choice, m) << std::endl;); - } + accept_next.push_back(th.mk_literal(accept_deriv)); th.add_axiom(accept_next); } @@ -442,7 +444,7 @@ namespace smt { expr_ref r = symmetric_diff(r1, r2); expr_ref emp(re().mk_empty(m.get_sort(r)), m); expr_ref n(m.mk_fresh_const("re.char", seq_sort), m); - expr_ref is_empty = sk().mk_is_empty(r, emp, n); + expr_ref is_empty = sk().mk_is_empty(r, r, n); th.add_axiom(~th.mk_eq(r1, r2, false), th.mk_literal(is_empty)); } @@ -455,7 +457,7 @@ namespace smt { expr_ref r = symmetric_diff(r1, r2); expr_ref emp(re().mk_empty(m.get_sort(r)), m); expr_ref n(m.mk_fresh_const("re.char", seq_sort), m); - expr_ref is_non_empty = sk().mk_is_non_empty(r, emp, n); + expr_ref is_non_empty = sk().mk_is_non_empty(r, r, n); th.add_axiom(th.mk_eq(r1, r2, false), th.mk_literal(is_non_empty)); } @@ -517,22 +519,98 @@ namespace smt { th.add_axiom(lits); } - void seq_regex::get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result) { - expr* cond = nullptr, *th = nullptr, *el = nullptr; - if (m.is_ite(r, cond, th, el)) { - conds.push_back(cond); - get_cofactors(th, conds, result); - conds.pop_back(); - conds.push_back(mk_not(m, cond)); - get_cofactors(el, conds, result); - conds.pop_back(); - } - else { - expr_ref conj = mk_and(conds); - result.push_back(conj, r); + /* + Given a string s, index i, and a derivative regex d, return an + expression that is equivalent to + accept s i r + but which pushes accept s i r into the leaves (next derivatives to + explore). + + Input r is of type regex; output is of type bool. + + Example: + mk_deriv_accept(s, i, (ite a r1 r2) u (ite b r3 r4)) + = (or (ite a (accept s i r1) (accept s i r2)) + (ite b (accept s i r3) (accept s i r4))) + */ + expr_ref seq_regex::mk_deriv_accept(expr* s, unsigned i, expr* r) { + vector to_visit; + to_visit.push_back(r); + obj_map re_to_bool; + expr_ref_vector _temp_bool_owner(m); // temp owner for bools we create + + // DFS + while (to_visit.size() > 0) { + expr* e = to_visit.back(); + expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr; + if (!re_to_bool.contains(e)) { + // First visit: add children + STRACE("seq_regex_verbose", tout << "1";); + if (m.is_ite(e, econd, e1, e2) || + re().is_union(e, e1, e2)) { + to_visit.push_back(e1); + to_visit.push_back(e2); + } + // Mark first visit by adding nullptr to the map + re_to_bool.insert(e, nullptr); + } + else if (re_to_bool.find(e) == nullptr) { + // Second visit: set value + STRACE("seq_regex_verbose", tout << "2";); + to_visit.pop_back(); + if (m.is_ite(e, econd, e1, e2)) { + expr* b1 = re_to_bool.find(e1); + expr* b2 = re_to_bool.find(e2); + expr* b = m.mk_ite(econd, b1, b2); + _temp_bool_owner.push_back(b); + re_to_bool.find(e) = b; + } + else if (re().is_union(e, e1, e2)) { + expr* b1 = re_to_bool.find(e1); + expr* b2 = re_to_bool.find(e2); + expr* b = m.mk_or(b1, b2); + _temp_bool_owner.push_back(b); + re_to_bool.find(e) = b; + } + else { + expr* iplus1 = a().mk_int(i); + _temp_bool_owner.push_back(iplus1); + expr_ref acc_leaf = sk().mk_accept(s, iplus1, e); + _temp_bool_owner.push_back(acc_leaf); + re_to_bool.find(e) = acc_leaf; + + STRACE("seq_regex_verbose", tout + << "mk_deriv_accept: added accept leaf: " + << mk_pp(acc_leaf, m) << std::endl;); + } + } + else { + STRACE("seq_regex_verbose", tout << "3";); + // Remaining visits: skip + to_visit.pop_back(); + } } + + // Finalize + expr_ref result(m); + result = re_to_bool.find(r); // Assigns ownership of all exprs in + // re_to_bool for after this completes + rewrite(result); + return result; } + /* + Return a list of all leaves in the derivative of a regex r, + ignoring the conditions along each path. + + Warning: Although the derivative + normal form tries to eliminate unsat condition paths, one cannot + assume that the path to each leaf is satisfiable in general + (e.g. when regexes are created using re.pred). + So not all results may correspond to satisfiable predicates. + It is OK to rely on the results being satisfiable for completeness, + but not soundness. + */ void seq_regex::get_all_derivatives(expr* r, expr_ref_vector& results) { // Get derivative sort* seq_sort = nullptr; @@ -541,14 +619,74 @@ namespace smt { expr_ref hd = mk_first(r, n); expr_ref d(m); d = derivative_wrapper(hd, r); - // Use get_cofactors method and try to filter out unsatisfiable conds - expr_ref_pair_vector cofactors(m); - get_cofactors(d, cofactors); - STRACE("seq_regex_verbose", tout << "getting all derivatives of: " << mk_pp(r, m) << std::endl;); - for (auto const& p : cofactors) { - if (m.is_false(p.first) || re().is_empty(p.second)) continue; - STRACE("seq_regex_verbose", tout << "adding derivative: " << mk_pp(p.second, m) << std::endl;); - results.push_back(p.second); + + // DFS + vector to_visit; + to_visit.push_back(d); + obj_map visited; // set (bool is used as a unit type) + while (to_visit.size() > 0) { + expr* e = to_visit.back(); + to_visit.pop_back(); + if (visited.contains(e)) continue; + visited.insert(e, true); + expr* econd = nullptr, *e1 = nullptr, *e2 = nullptr; + if (m.is_ite(e, econd, e1, e2) || + re().is_union(e, e1, e2)) { + to_visit.push_back(e1); + to_visit.push_back(e2); + } + else if (!re().is_empty(e)) { + results.push_back(e); + STRACE("seq_regex_verbose", tout + << "get_all_derivatives: added deriv: " + << mk_pp(e, m) << std::endl;); + } + } + + STRACE("seq_regex", tout << "Number of derivatives: " + << results.size() << std::endl;); + STRACE("seq_regex_brief", tout << "#derivs=" << results.size() << " ";); + } + + /* + Return a list of all (cond, leaf) pairs in a given derivative + expression r. + + Note: this recursive implementation is inefficient, since if nodes + are repeated often in the expression DAG, they may be visited + many times. For this reason, prefer mk_deriv_accept and + get_all_derivatives when possible. + + This method is still used by: + propagate_is_empty + propagate_is_non_empty + */ + void seq_regex::get_cofactors(expr* r, expr_ref_pair_vector& result) { + expr_ref_vector conds(m); + get_cofactors_rec(r, conds, result); + STRACE("seq_regex", tout << "Number of derivatives: " + << result.size() << std::endl;); + STRACE("seq_regex_brief", tout << "#derivs=" << result.size() << " ";); + } + void seq_regex::get_cofactors_rec(expr* r, expr_ref_vector& conds, + expr_ref_pair_vector& result) { + expr* cond = nullptr, *r1 = nullptr, *r2 = nullptr; + if (m.is_ite(r, cond, r1, r2)) { + conds.push_back(cond); + get_cofactors_rec(r1, conds, result); + conds.pop_back(); + conds.push_back(mk_not(m, cond)); + get_cofactors_rec(r2, conds, result); + conds.pop_back(); + } + else if (re().is_union(r, r1, r2)) { + get_cofactors_rec(r1, conds, result); + get_cofactors_rec(r2, conds, result); + } + else { + expr_ref conj = mk_and(conds); + if (!m.is_false(conj) && !re().is_empty(r)) + result.push_back(conj, r); } } @@ -618,6 +756,9 @@ namespace smt { m_expr_to_state.insert(e, new_id); STRACE("seq_regex_brief", tout << "new(" << expr_id_str(e) << ")=" << state_str(e) << " ";); + STRACE("seq_regex", tout + << "New state ID: " << new_id + << " = " << mk_pp(e, m) << std::endl;); } return m_expr_to_state.find(e); } @@ -627,6 +768,7 @@ namespace smt { return m_state_to_expr.get(id); } + bool seq_regex::can_be_in_cycle(expr *r1, expr *r2) { // TBD: This can be used to optimize the state graph: // return false here if it is known that r1 -> r2 can never be @@ -649,10 +791,11 @@ namespace smt { STRACE("seq_regex_brief", tout << "(MAX SIZE REACHED) ";); return false; } - STRACE("seq_regex", tout << "Updating state graph for regex " - << mk_pp(r, m) << ") ";); // Add state m_state_graph.add_state(r_id); + STRACE("state_graph", tout << "regex(" << r_id << ") = " << mk_pp(r, m) << std::endl;); + STRACE("seq_regex", tout << "Updating state graph for regex " + << mk_pp(r, m) << ") " << std::endl;); STRACE("seq_regex_brief", tout << std::endl << "USG(" << state_str(r) << ") ";); expr_ref r_nullable = is_nullable_wrapper(r); @@ -663,18 +806,20 @@ namespace smt { // Add edges to all derivatives expr_ref_vector derivatives(m); STRACE("seq_regex_verbose", tout - << std::endl << " getting all derivs: " << r_id << " ";); + << "getting all derivs: " << r_id << " " << std::endl;); get_all_derivatives(r, derivatives); for (auto const& dr: derivatives) { unsigned dr_id = get_state_id(dr); STRACE("seq_regex_verbose", tout - << std::endl << " traversing deriv: " << dr_id << " ";); + << " traversing deriv: " << dr_id << " " << std::endl;); m_state_graph.add_state(dr_id); + STRACE("state_graph", tout << "regex(" << dr_id << ") = " << mk_pp(dr, m) << std::endl;); bool maybecycle = can_be_in_cycle(r, dr); m_state_graph.add_edge(r_id, dr_id, maybecycle); } m_state_graph.mark_done(r_id); } + STRACE("seq_regex", m_state_graph.display(tout);); STRACE("seq_regex_brief", tout << std::endl;); STRACE("seq_regex_brief", m_state_graph.display(tout);); return true; diff --git a/src/smt/seq_regex.h b/src/smt/seq_regex.h index 1f49d7003..e5782a94f 100644 --- a/src/smt/seq_regex.h +++ b/src/smt/seq_regex.h @@ -23,6 +23,71 @@ Author: #include "smt/smt_context.h" #include "smt/seq_skolem.h" +/* + *** Tracing and debugging in this module and related modules *** + + Tracing and debugging for the regex solver are split across several + command-line flags. + + TRACING + + -tr:seq_regex and -tr:seq_regex_brief + These are the main tags to trace what the regex solver is doing. + They mostly trace the same things, except that seq_regex_brief + avoids printing out expressions and tries to abbreviate the output + as much as possible. seq_regex_brief shows the following output: + Top-level propagations: + PIR: Propagating an in_re constraint + PE/PNE: Propagating an empty/non-empty constraint + PEQ/PNEQ: Propagating a not-equal constraint + PA: Propagating an accept constraint + In tracing, arguments are generally put in parentheses. + To achieve abbreviated output, expressions are traced in one of two + ways: + id243 (expr ID): the regex or expression with id 243 + 3 (state ID): the regex with state ID 3 + When a regex is newly assigned to a state ID, we print this: + new(id606)=4 + Of these, PA is the most important, and traces as follows: + PA(x@i,r): propagate accept for string x at index i, regex r. + (empty), (dead), (blocked), (unfold): info about whether this + PA was cut off early, or unfolded into the derivatives + (next states) + d(r1)=r2: r2 is the derivative of r1 + n(r1)=b: b = whether r1 is nullable or not + USG(r): updating state graph for regex r (add all derivatives) + + -tr:state_graph + This is the tracing done by util/state_graph, the data structure + that seq_regex uses to track live and dead regexes, which can + altneratively be used to get a high-level picture of what states + are being explored and updated as the solver progresses. + + -tr:seq_regex_verbose + Used for some more frequent tracing (in the style of seq_regex, + not in the style of seq_regex_brief) + + -tr:seq and -tr:seq_verbose + These are the underlying sequence theory tracing, often used by + the rewriter. + + DEBUGGING AND VIEWING STATE GRAPH GRAPHICAL OUTPUT + + -dbg:seq_regex + Debugging that checks invariants. Currently, checks that derivative + normal form is correctly preserved in the rewriter. + + -dbg:state_graph + Debugging for the state graph, which + 1. Checks state graph invariants, and + 2. Generates the files .z3-state-graph.dgml and .z3-state-graph.dot + which can be used to visually view the state graph being explored, + during or after executing Z3. + The output can be viewed: + - Using Visual Studio for .dgml + - Using a tool such as xdot (`xdot .z3-state-graph.dot`) for .dot +*/ + namespace smt { class theory_seq; @@ -93,12 +158,13 @@ namespace smt { expr_ref is_nullable_wrapper(expr* r); expr_ref derivative_wrapper(expr* hd, expr* r); - void get_cofactors(expr* r, expr_ref_vector& conds, expr_ref_pair_vector& result); - void get_cofactors(expr* r, expr_ref_pair_vector& result) { - expr_ref_vector conds(m); - get_cofactors(r, conds, result); - } + // Various support for unfolding derivative expressions that are + // returned by derivative_wrapper + expr_ref mk_deriv_accept(expr* s, unsigned i, expr* r); void get_all_derivatives(expr* r, expr_ref_vector& results); + void get_cofactors(expr* r, expr_ref_pair_vector& result); + void get_cofactors_rec(expr* r, expr_ref_vector& conds, + expr_ref_pair_vector& result); public: diff --git a/src/util/state_graph.cpp b/src/util/state_graph.cpp index b5f621a00..2b870331e 100644 --- a/src/util/state_graph.cpp +++ b/src/util/state_graph.cpp @@ -324,13 +324,41 @@ bool state_graph::is_done(state s) const { return m_seen.contains(s) && !m_unexplored.contains(m_state_ufind.find(s)); } +/* + Pretty printing +*/ +std::ostream& state_graph::display(std::ostream& o) const { + o << "---------- State Graph ----------" << std::endl + << "Seen:"; + for (auto s : m_seen) { + o << " " << s; + state s_root = m_state_ufind.find(s); + if (s_root != s) + o << "(=" << s_root << ")"; + } + o << std::endl + << "Live:" << m_live << std::endl + << "Dead:" << m_dead << std::endl + << "Unknown:" << m_unknown << std::endl + << "Unexplored:" << m_unexplored << std::endl + << "Edges:" << std::endl; + for (auto s1 : m_seen) { + if (m_state_ufind.is_root(s1)) { + o << " " << s1 << " -> " << m_targets[s1] << std::endl; + } + } + o << "---------------------------------" << std::endl; + + return o; +} + +#ifdef Z3DEBUG /* Class invariants check (and associated auxiliary functions) check_invariant performs a sequence of SASSERT assertions, then always returns true. */ -#ifdef Z3DEBUG bool state_graph::is_subset(state_set set1, state_set set2) const { for (auto s1: set1) { if (!set2.contains(s1)) return false; @@ -387,37 +415,7 @@ bool state_graph::check_invariant() const { STRACE("state_graph", tout << "(invariant passed) ";); return true; } -#endif -/* - Pretty printing -*/ -std::ostream& state_graph::display(std::ostream& o) const { - o << "---------- State Graph ----------" << std::endl - << "Seen:"; - for (auto s: m_seen) { - o << " " << s; - state s_root = m_state_ufind.find(s); - if (s_root != s) - o << "(=" << s_root << ")"; - } - o << std::endl - << "Live:" << m_live << std::endl - << "Dead:" << m_dead << std::endl - << "Unknown:" << m_unknown << std::endl - << "Unexplored:" << m_unexplored << std::endl - << "Edges:" << std::endl; - for (auto s1: m_seen) { - if (m_state_ufind.is_root(s1)) { - o << " " << s1 << " -> " << m_targets[s1] << std::endl; - } - } - o << "---------------------------------" << std::endl; - - return o; -} - -#ifdef Z3DEBUG /* Output the whole state graph in dgml format into the file '.z3-state-graph.dgml' */