From a288f9048ad48275cb78ccf87412bb1fc3c06638 Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Thu, 16 Dec 2021 19:19:36 -0800 Subject: [PATCH] Update regex union and intersection to maintain ANF (#5717) * added merge for unions and intersections * added normalization rules to ensure ANF * fixing PR comments related to merge --- src/ast/rewriter/seq_rewriter.cpp | 286 +++++++++++++++++++++++++----- src/ast/rewriter/seq_rewriter.h | 10 +- 2 files changed, 255 insertions(+), 41 deletions(-) diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index d399df1e6..045642019 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -3112,7 +3112,7 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref else // D(e,r1)r2|(ite (r1nullable) (D(e,r2)) []) // observe that (mk_ite_simplify(true, D(e,r2), []) = D(e,r2) - result = mk_antimirov_deriv_union(c1, re().mk_ite_simplify(r1nullable, mk_antimirov_deriv(e, r2, path), nothing())); + result = mk_regex_union_normalize(c1, re().mk_ite_simplify(r1nullable, mk_antimirov_deriv(e, r2, path), nothing())); } else if (m().is_ite(r, c, r1, r2)) { c1 = simplify_path(e, m().mk_and(c, path)); @@ -3171,7 +3171,7 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref result = re().mk_ite_simplify(range, epsilon(), nothing()); } else if (re().is_union(r, r1, r2)) - result = mk_antimirov_deriv_union(mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv(e, r2, path)); + result = mk_regex_union_normalize(mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv(e, r2, path)); else if (re().is_intersection(r, r1, r2)) result = mk_antimirov_deriv_intersection(e, mk_antimirov_deriv(e, r1, path), @@ -3213,11 +3213,6 @@ expr_ref seq_rewriter::mk_antimirov_deriv_intersection(expr* e, expr* d1, expr* VERIFY(m_util.is_seq(seq_sort, ele_sort)); expr_ref result(m()); expr* c, * a, * b; - //if (d1 == d2 || re().is_full_seq(d2) || re().is_empty(d1)) - // result = d1; - //else if (re().is_full_seq(d1) || re().is_empty(d2)) - // result = d2; - //else if (re().is_empty(d1)) result = d1; else if (re().is_empty(d2)) @@ -3242,16 +3237,14 @@ expr_ref seq_rewriter::mk_antimirov_deriv_intersection(expr* e, expr* d1, expr* result = mk_antimirov_deriv_restrict(e, d2, path); else if (re().is_union(d1, a, b)) // distribute intersection over the union in d1 - result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, a, d2, path), + result = mk_regex_union_normalize(mk_antimirov_deriv_intersection(e, a, d2, path), mk_antimirov_deriv_intersection(e, b, d2, path)); else if (re().is_union(d2, a, b)) // distribute intersection over the union in d2 - result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, d1, a, path), + result = mk_regex_union_normalize(mk_antimirov_deriv_intersection(e, d1, a, path), mk_antimirov_deriv_intersection(e, d1, b, path)); else - // in all other cases create the intersection regex - // TODO: flatten, order and merge d1 and d2 to maintain equality under similarity - result = (d1->get_id() <= d2->get_id() ? re().mk_inter(d1, d2) : re().mk_inter(d2, d1)); + result = mk_regex_inter_normalize(d1, d2); return result; } @@ -3263,7 +3256,7 @@ expr_ref seq_rewriter::mk_antimirov_deriv_concat(expr* d, expr* r) { if (m().is_ite(d, c, t, e)) result = m().mk_ite(c, mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); else if (re().is_union(d, t, e)) - result = re().mk_union(mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); + result = mk_regex_union_normalize(mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); else result = mk_re_append(d, r); return result; @@ -3289,10 +3282,9 @@ expr_ref seq_rewriter::mk_antimirov_deriv_negate(expr* elem, expr* d) { else if (m().is_ite(d, c, t, e)) result = m().mk_ite(c, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); else if (re().is_union(d, t, e)) - //result = re().mk_inter(mk_antimirov_deriv_negate(t), mk_antimirov_deriv_negate(e)); result = mk_antimirov_deriv_intersection(elem, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e), m().mk_true()); else if (re().is_intersection(d, t, e)) - result = re().mk_union(mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); + result = mk_regex_union_normalize(mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); else if (re().is_complement(d, t)) result = t; else @@ -3300,22 +3292,6 @@ expr_ref seq_rewriter::mk_antimirov_deriv_negate(expr* elem, expr* d) { return result; } -expr_ref seq_rewriter::mk_antimirov_deriv_union(expr* d1, expr* d2) { - expr_ref result(m()); - if (re().is_empty(d1) || re().is_full_seq(d2)) - result = d2; - else if (re().is_empty(d2) || re().is_full_seq(d1)) - result = d1; - else if (re().is_dot_plus(d1) && re().get_info(d2).min_length > 0) - result = d1; - else if (re().is_dot_plus(d2) && re().get_info(d1).min_length > 0) - result = d2; - else - // TODO: flatten, order and merge d1 and d2 to maintain equality under similarity - result = (d1->get_id() <= d2->get_id() ? re().mk_union(d1, d2) : re().mk_union(d2, d1)); - return result; -} - // restrict the guards of all conditionals id d and simplify the resulting derivative // restrict(if(c, a, b), cond) = if(c, restrict(a, cond & c), restrict(b, cond & ~c)) // restrict(a U b, cond) = restrict(a, cond) U restrict(b, cond) @@ -3344,17 +3320,212 @@ expr_ref seq_rewriter::mk_antimirov_deriv_restrict(expr* e, expr* d, expr* cond) else if (re().is_union(d, a, b)) { expr_ref a1(mk_antimirov_deriv_restrict(e, a, cond), m()); expr_ref b1(mk_antimirov_deriv_restrict(e, b, cond), m()); - if (a1 == b1 || re().is_empty(b1) || re().is_full_seq(a1)) - result = a1; - else if (re().is_empty(a1) || re().is_full_seq(b1)) - result = b1; - else - //TODO: merge - result = (a1->get_id() <= b1->get_id() ? re().mk_union(a1, b1) : re().mk_union(b1, a1)); + result = mk_regex_union_normalize(a1, b1); } return result; } +expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) { + VERIFY(m_util.is_re(r1)); + VERIFY(m_util.is_re(r2)); + expr_ref result(m()); + std::function test = [&](expr* t, expr*& a, expr*& b) { return re().is_union(t, a, b); }; + std::function compose = [&](expr* r1, expr* r2) { return re().mk_union(r1, r2); }; + if (r1 == r2 || re().is_empty(r2) || re().is_full_seq(r1)) + result = r1; + else if (re().is_empty(r1) || re().is_full_seq(r2)) + result = r2; + else if (re().is_dot_plus(r1) && re().get_info(r2).min_length > 0) + result = r1; + else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0) + result = r2; + else + result = merge_regex_sets(r1, r2, re().mk_full_seq(r1->get_sort()), test, compose); + return result; +} + +expr_ref seq_rewriter::mk_regex_inter_normalize(expr* r1, expr* r2) { + VERIFY(m_util.is_re(r1)); + VERIFY(m_util.is_re(r2)); + expr_ref result(m()); + std::function test = [&](expr* t, expr*& a, expr*& b) { return re().is_intersection(t, a, b); }; + std::function compose = [&](expr* r1, expr* r2) { return re().mk_inter(r1, r2); }; + if (r1 == r2 || re().is_empty(r1) || re().is_full_seq(r2)) + result = r1; + else if (re().is_empty(r2) || re().is_full_seq(r1)) + result = r2; + else if (re().is_epsilon(r1)) { + if (re().get_info(r2).nullable == l_true) + result = r1; + else if (re().get_info(r2).nullable == l_false) + result = re().mk_empty(r1->get_sort()); + else + result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose); + } + else if (re().is_dot_plus(r1) && re().get_info(r2).min_length > 0) + result = r2; + else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0) + result = r1; + else { + result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose); + } + return result; +} + +expr_ref seq_rewriter::merge_regex_sets(expr* r1, expr* r2, expr* unit, + std::function& test, + std::function& compose) { + sort* seq_sort; + expr_ref result(unit, m()); + expr_ref_vector prefix(m()); + expr* a, * ar, * ar1, * b, * br, * br1; + VERIFY(m_util.is_re(r1, seq_sort)); + VERIFY(m_util.is_re(r2)); + SASSERT(r2->get_sort() == r1->get_sort()); + // Ordering of expressions used by merging, 0 means unordered, -1 means e1 < e2, 1 means e2 < e1 + auto compare = [&](expr* x, expr* y) { + expr* z; + unsigned int xid, yid; + // TODO: consider also the case of A{0,l}++B having the same id as A*++B + // in which case return 0 + if (x == y) + return 0; + + xid = (re().is_complement(x, z) ? z->get_id() : x->get_id()); + yid = (re().is_complement(y, z) ? z->get_id() : y->get_id()); + VERIFY(xid != yid); + return (xid < yid ? -1 : 1); + }; + auto composeresult = [&](expr* suffix) { + result = suffix; + while (!prefix.empty()) { + result = compose(prefix.back(), result); + prefix.pop_back(); + } + }; + if (r1 == r2) + result = r1; + else if (are_complements(r1, r2)) + // TODO: loops + result = unit; + else { + signed int k; + ar = r1; + br = r2; + while (true) {; + if (test(ar, a, ar1)) { + if (test(br, b, br1)) { + if (a == b) { + prefix.push_back(a); + ar = ar1; + br = br1; + } + else if (are_complements(a, b)) { + result = unit; + break; + } + else { + k = compare(a, b); + if (k == -1) { + // a < b + prefix.push_back(a); + ar = ar1; + } + else { + // b < a + prefix.push_back(b); + br = br1; + } + } + } + else { + // br is not decomposable + if (a == br) { + // result = prefix ++ ar + composeresult(ar); + break; + } + else if (are_complements(a, br)) { + result = unit; + break; + } + else { + k = compare(a, br); + if (k == -1) { + // a < br + prefix.push_back(a); + ar = ar1; + } + else { + // br < a, result = prefix ++ (br) ++ ar + prefix.push_back(br); + composeresult(ar); + break; + } + } + } + } + else { + // ar is not decomposable + if (test(br, b, br1)) { + if (ar == b) { + // result = prefix ++ br + composeresult(br); + break; + } + else if (are_complements(ar, b)) { + result = unit; + break; + } + else { + k = compare(ar, b); + if (k == -1) { + // ar < b, result = prefix ++ (ar) ++ br + prefix.push_back(ar); + composeresult(br); + break; + } + else { + // b < ar + prefix.push_back(b); + br = br1; + } + } + } + else { + // neither ar nor br is decomposable + if (ar == br) { + // result = prefix ++ ar + composeresult(ar); + break; + } + else if (are_complements(ar, br)) { + result = unit; + break; + } + else { + k = compare(ar, br); + if (k == -1) { + // ar < br, result = prefix ++ (ar) ++ (br) + prefix.push_back(ar); + composeresult(br); + break; + } + else { + // br < ar, result = prefix ++ (br) ++ (ar) + prefix.push_back(br); + composeresult(ar); + break; + } + } + } + } + } + } + return result; +} + + expr_ref seq_rewriter::mk_regex_reverse(expr* r) { expr* r1 = nullptr, * r2 = nullptr, * c = nullptr; unsigned lo = 0, hi = 0; @@ -4344,6 +4515,7 @@ br_status seq_rewriter::mk_str_to_regexp(expr* a, expr_ref& result) { r ++ [] -> [] r ++ "" -> r "" ++ r -> r + . ++ .* -> .+ to_re and star: (str.to_re s1) ++ (str.to_re s2) -> (str.to_re (s1 ++ s2)) @@ -4371,6 +4543,14 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { result = a; return BR_DONE; } + if (re().is_full_char(a) && re().is_full_seq(b)) { + result = re().mk_plus(a); + return BR_DONE; + } + if (re().is_full_char(b) && re().is_full_seq(a)) { + result = re().mk_plus(b); + return BR_DONE; + } expr_ref a_str(m()); expr_ref b_str(m()); if (lift_str_from_to_re(a, a_str) && lift_str_from_to_re(b, b_str)) { @@ -4515,6 +4695,11 @@ br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) { result = mk_full(); return BR_DONE; } + + //just keep the union normalized + result = mk_regex_union_normalize(a, b); + return BR_DONE; + expr* a1 = nullptr, *a2 = nullptr; expr* b1 = nullptr, *b2 = nullptr; @@ -4643,9 +4828,17 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { result = mk_empty(); return BR_DONE; } + + // intersect and normalize + result = mk_regex_inter_normalize(a, b); + return BR_DONE; + expr* a1 = nullptr, *a2 = nullptr; expr* b1 = nullptr, *b2 = nullptr; + // the following rewrite rules do not seem to + // do the right thing when it comes to normalizing + // ensure intersection is right-associative // and swap-sort entries if (re().is_intersection(a, a1, a2)) { @@ -4696,7 +4889,7 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { } br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { - result = re().mk_inter(a, re().mk_complement(b)); + result = mk_regex_inter_normalize(a, re().mk_complement(b)); return BR_REWRITE2; } @@ -4939,7 +5132,7 @@ void seq_rewriter::elim_condition(expr* elem, expr_ref& cond) { flatten_and(cond, conds); expr* lhs = nullptr, *rhs = nullptr, *e1 = nullptr; if (u().is_char(elem)) { - unsigned ch = 0; + unsigned ch = 0, ch2 = 0; svector> ranges, ranges1; ranges.push_back(std::make_pair(0, u().max_char())); auto exclude_char = [&](unsigned ch) { @@ -4991,6 +5184,19 @@ void seq_rewriter::elim_condition(expr* elem, expr_ref& cond) { else intersect(0, ch-1, ranges); } + else if (m().is_true(e) || (m().is_eq(e, lhs, rhs) && lhs == rhs)) { + // trivially true + continue; + } + else if (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && u().is_const_char(lhs, ch) && u().is_const_char(rhs, ch2) && ch != ch2) { + // trivially true + continue; + } + else if (m().is_false(e) || (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && lhs == rhs)) { + // trivially false + cond = m().mk_false(); + return; + } else { all_ranges = false; break; diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 9687797a0..c944e8f77 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -217,11 +217,19 @@ class seq_rewriter { expr_ref mk_antimirov_deriv_intersection(expr* elem, expr* d1, expr* d2, expr* path); expr_ref mk_antimirov_deriv_concat(expr* d, expr* r); expr_ref mk_antimirov_deriv_negate(expr* elem, expr* d); - expr_ref mk_antimirov_deriv_union(expr* d1, expr* d2); expr_ref mk_antimirov_deriv_restrict(expr* elem, expr* d1, expr* cond); expr_ref mk_regex_reverse(expr* r); expr_ref mk_regex_concat(expr* r1, expr* r2); + expr_ref merge_regex_sets(expr* r1, expr* r2, expr* unit, std::function& decompose, std::function& compose); + + // Apply simplifications and keep the representation normalized + // Assuming r1 and r2 are normalized + expr_ref mk_regex_union_normalize(expr* r1, expr* r2); + expr_ref mk_regex_inter_normalize(expr* r1, expr* r2); + + // elem is (:var 0) and path a condition that may have (:var 0) as a free variable + // simplify path, e.g., (:var 0) = 'a' & (:var 0) = 'b' is simplified to false expr_ref simplify_path(expr* elem, expr* path); bool lt_char(expr* ch1, expr* ch2);