mirror of
				https://github.com/Z3Prover/z3
				synced 2025-10-31 11:42:28 +00:00 
			
		
		
		
	Update regex union and intersection to maintain ANF (#5717)
* added merge for unions and intersections * added normalization rules to ensure ANF * fixing PR comments related to merge
This commit is contained in:
		
							parent
							
								
									db62038845
								
							
						
					
					
						commit
						a288f9048a
					
				
					 2 changed files with 255 additions and 41 deletions
				
			
		|  | @ -3112,7 +3112,7 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref | |||
|         else | ||||
|             // D(e,r1)r2|(ite (r1nullable) (D(e,r2)) [])
 | ||||
|             // observe that (mk_ite_simplify(true, D(e,r2), []) = D(e,r2)
 | ||||
|             result = mk_antimirov_deriv_union(c1, re().mk_ite_simplify(r1nullable, mk_antimirov_deriv(e, r2, path), nothing())); | ||||
|             result = mk_regex_union_normalize(c1, re().mk_ite_simplify(r1nullable, mk_antimirov_deriv(e, r2, path), nothing())); | ||||
|     } | ||||
|     else if (m().is_ite(r, c, r1, r2)) { | ||||
|         c1 = simplify_path(e, m().mk_and(c, path)); | ||||
|  | @ -3171,7 +3171,7 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref | |||
|             result = re().mk_ite_simplify(range, epsilon(), nothing()); | ||||
|     } | ||||
|     else if (re().is_union(r, r1, r2)) | ||||
|         result = mk_antimirov_deriv_union(mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv(e, r2, path)); | ||||
|         result = mk_regex_union_normalize(mk_antimirov_deriv(e, r1, path), mk_antimirov_deriv(e, r2, path)); | ||||
|     else if (re().is_intersection(r, r1, r2)) | ||||
|         result = mk_antimirov_deriv_intersection(e,  | ||||
|             mk_antimirov_deriv(e, r1, path), | ||||
|  | @ -3213,11 +3213,6 @@ expr_ref seq_rewriter::mk_antimirov_deriv_intersection(expr* e, expr* d1, expr* | |||
|     VERIFY(m_util.is_seq(seq_sort, ele_sort)); | ||||
|     expr_ref result(m()); | ||||
|     expr* c, * a, * b; | ||||
|     //if (d1 == d2 || re().is_full_seq(d2) || re().is_empty(d1))
 | ||||
|     //    result = d1;
 | ||||
|     //else if (re().is_full_seq(d1) || re().is_empty(d2))
 | ||||
|     //    result = d2;
 | ||||
|     //else 
 | ||||
|     if (re().is_empty(d1)) | ||||
|         result = d1; | ||||
|     else if (re().is_empty(d2)) | ||||
|  | @ -3242,16 +3237,14 @@ expr_ref seq_rewriter::mk_antimirov_deriv_intersection(expr* e, expr* d1, expr* | |||
|         result = mk_antimirov_deriv_restrict(e, d2, path); | ||||
|     else if (re().is_union(d1, a, b)) | ||||
|         // distribute intersection over the union in d1
 | ||||
|         result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, a, d2, path),  | ||||
|         result = mk_regex_union_normalize(mk_antimirov_deriv_intersection(e, a, d2, path), | ||||
|             mk_antimirov_deriv_intersection(e, b, d2, path)); | ||||
|     else if (re().is_union(d2, a, b)) | ||||
|         // distribute intersection over the union in d2
 | ||||
|         result = mk_antimirov_deriv_union(mk_antimirov_deriv_intersection(e, d1, a, path),  | ||||
|         result = mk_regex_union_normalize(mk_antimirov_deriv_intersection(e, d1, a, path), | ||||
|             mk_antimirov_deriv_intersection(e, d1, b, path)); | ||||
|     else | ||||
|         // in all other cases create the intersection regex
 | ||||
|         // TODO: flatten, order and merge d1 and d2 to maintain equality under similarity
 | ||||
|         result = (d1->get_id() <= d2->get_id() ? re().mk_inter(d1, d2) : re().mk_inter(d2, d1)); | ||||
|         result = mk_regex_inter_normalize(d1, d2); | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
|  | @ -3263,7 +3256,7 @@ expr_ref seq_rewriter::mk_antimirov_deriv_concat(expr* d, expr* r) { | |||
|     if (m().is_ite(d, c, t, e)) | ||||
|         result = m().mk_ite(c, mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); | ||||
|     else if (re().is_union(d, t, e)) | ||||
|         result = re().mk_union(mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); | ||||
|         result = mk_regex_union_normalize(mk_antimirov_deriv_concat(t, r), mk_antimirov_deriv_concat(e, r)); | ||||
|     else | ||||
|         result = mk_re_append(d, r); | ||||
|     return result; | ||||
|  | @ -3289,10 +3282,9 @@ expr_ref seq_rewriter::mk_antimirov_deriv_negate(expr* elem, expr* d) { | |||
|     else if (m().is_ite(d, c, t, e)) | ||||
|         result = m().mk_ite(c, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); | ||||
|     else if (re().is_union(d, t, e)) | ||||
|         //result = re().mk_inter(mk_antimirov_deriv_negate(t), mk_antimirov_deriv_negate(e));
 | ||||
|         result = mk_antimirov_deriv_intersection(elem, mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e), m().mk_true()); | ||||
|     else if (re().is_intersection(d, t, e)) | ||||
|         result = re().mk_union(mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); | ||||
|         result = mk_regex_union_normalize(mk_antimirov_deriv_negate(elem, t), mk_antimirov_deriv_negate(elem, e)); | ||||
|     else if (re().is_complement(d, t)) | ||||
|         result = t; | ||||
|     else | ||||
|  | @ -3300,22 +3292,6 @@ expr_ref seq_rewriter::mk_antimirov_deriv_negate(expr* elem, expr* d) { | |||
|     return result; | ||||
| } | ||||
| 
 | ||||
| expr_ref seq_rewriter::mk_antimirov_deriv_union(expr* d1, expr* d2) { | ||||
|     expr_ref result(m()); | ||||
|     if (re().is_empty(d1) || re().is_full_seq(d2)) | ||||
|         result = d2; | ||||
|     else if (re().is_empty(d2) || re().is_full_seq(d1)) | ||||
|         result = d1; | ||||
|     else if (re().is_dot_plus(d1) && re().get_info(d2).min_length > 0) | ||||
|         result = d1; | ||||
|     else if (re().is_dot_plus(d2) && re().get_info(d1).min_length > 0) | ||||
|         result = d2; | ||||
|     else | ||||
|         // TODO: flatten, order and merge d1 and d2 to maintain equality under similarity
 | ||||
|         result = (d1->get_id() <= d2->get_id() ? re().mk_union(d1, d2) : re().mk_union(d2, d1)); | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| // restrict the guards of all conditionals id d and simplify the resulting derivative
 | ||||
| // restrict(if(c, a, b), cond) = if(c, restrict(a, cond & c), restrict(b, cond & ~c))
 | ||||
| // restrict(a U b, cond) = restrict(a, cond) U restrict(b, cond)
 | ||||
|  | @ -3344,17 +3320,212 @@ expr_ref seq_rewriter::mk_antimirov_deriv_restrict(expr* e, expr* d, expr* cond) | |||
|     else if (re().is_union(d, a, b)) { | ||||
|         expr_ref a1(mk_antimirov_deriv_restrict(e, a, cond), m()); | ||||
|         expr_ref b1(mk_antimirov_deriv_restrict(e, b, cond), m()); | ||||
|         if (a1 == b1 || re().is_empty(b1) || re().is_full_seq(a1)) | ||||
|             result = a1; | ||||
|         else if (re().is_empty(a1) || re().is_full_seq(b1)) | ||||
|             result = b1; | ||||
|         else | ||||
|             //TODO: merge
 | ||||
|             result = (a1->get_id() <= b1->get_id() ? re().mk_union(a1, b1) : re().mk_union(b1, a1)); | ||||
|         result = mk_regex_union_normalize(a1, b1); | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) { | ||||
|     VERIFY(m_util.is_re(r1)); | ||||
|     VERIFY(m_util.is_re(r2)); | ||||
|     expr_ref result(m()); | ||||
|     std::function<bool(expr*, expr*&, expr*&)> test = [&](expr* t, expr*& a, expr*& b) { return re().is_union(t, a, b); }; | ||||
|     std::function<expr* (expr*, expr*)> compose = [&](expr* r1, expr* r2) { return re().mk_union(r1, r2); }; | ||||
|     if (r1 == r2 || re().is_empty(r2) || re().is_full_seq(r1)) | ||||
|         result = r1; | ||||
|     else if (re().is_empty(r1) || re().is_full_seq(r2)) | ||||
|         result = r2; | ||||
|     else if (re().is_dot_plus(r1) && re().get_info(r2).min_length > 0) | ||||
|         result = r1; | ||||
|     else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0) | ||||
|         result = r2; | ||||
|     else | ||||
|         result = merge_regex_sets(r1, r2, re().mk_full_seq(r1->get_sort()), test, compose); | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| expr_ref seq_rewriter::mk_regex_inter_normalize(expr* r1, expr* r2) { | ||||
|     VERIFY(m_util.is_re(r1)); | ||||
|     VERIFY(m_util.is_re(r2)); | ||||
|     expr_ref result(m()); | ||||
|     std::function<bool(expr*, expr*&, expr*&)> test = [&](expr* t, expr*& a, expr*& b) { return re().is_intersection(t, a, b); }; | ||||
|     std::function<expr* (expr*, expr*)> compose = [&](expr* r1, expr* r2) { return re().mk_inter(r1, r2); }; | ||||
|     if (r1 == r2 || re().is_empty(r1) || re().is_full_seq(r2)) | ||||
|         result = r1; | ||||
|     else if (re().is_empty(r2) || re().is_full_seq(r1)) | ||||
|         result = r2; | ||||
|     else if (re().is_epsilon(r1)) { | ||||
|         if (re().get_info(r2).nullable == l_true) | ||||
|             result = r1; | ||||
|         else if (re().get_info(r2).nullable == l_false) | ||||
|             result = re().mk_empty(r1->get_sort()); | ||||
|         else | ||||
|             result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose); | ||||
|     } | ||||
|     else if (re().is_dot_plus(r1) && re().get_info(r2).min_length > 0) | ||||
|         result = r2; | ||||
|     else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0) | ||||
|         result = r1; | ||||
|     else { | ||||
|         result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose); | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| expr_ref seq_rewriter::merge_regex_sets(expr* r1, expr* r2, expr* unit, | ||||
|     std::function<bool(expr*, expr*&, expr*&)>& test,  | ||||
|     std::function<expr* (expr*, expr*)>& compose) { | ||||
|     sort* seq_sort; | ||||
|     expr_ref result(unit, m()); | ||||
|     expr_ref_vector prefix(m()); | ||||
|     expr* a, * ar, * ar1, * b, * br, * br1; | ||||
|     VERIFY(m_util.is_re(r1, seq_sort)); | ||||
|     VERIFY(m_util.is_re(r2)); | ||||
|     SASSERT(r2->get_sort() == r1->get_sort()); | ||||
|     // Ordering of expressions used by merging, 0 means unordered, -1 means e1 < e2, 1 means e2 < e1
 | ||||
|     auto compare = [&](expr* x, expr* y) { | ||||
|         expr* z; | ||||
|         unsigned int xid, yid; | ||||
|         // TODO: consider also the case of A{0,l}++B having the same id as A*++B
 | ||||
|         // in which case return 0
 | ||||
|         if (x == y) | ||||
|             return 0; | ||||
| 
 | ||||
|         xid = (re().is_complement(x, z) ? z->get_id() : x->get_id()); | ||||
|         yid = (re().is_complement(y, z) ? z->get_id() : y->get_id()); | ||||
|         VERIFY(xid != yid); | ||||
|         return (xid < yid ? -1 : 1); | ||||
|     }; | ||||
|     auto composeresult = [&](expr* suffix) { | ||||
|         result = suffix; | ||||
|         while (!prefix.empty()) { | ||||
|             result = compose(prefix.back(), result); | ||||
|             prefix.pop_back(); | ||||
|         } | ||||
|     }; | ||||
|     if (r1 == r2) | ||||
|         result = r1; | ||||
|     else if (are_complements(r1, r2)) | ||||
|         // TODO: loops
 | ||||
|         result = unit; | ||||
|     else { | ||||
|         signed int k; | ||||
|         ar = r1; | ||||
|         br = r2; | ||||
|         while (true) {; | ||||
|             if (test(ar, a, ar1)) { | ||||
|                 if (test(br, b, br1)) { | ||||
|                     if (a == b) { | ||||
|                         prefix.push_back(a); | ||||
|                         ar = ar1; | ||||
|                         br = br1; | ||||
|                     } | ||||
|                     else if (are_complements(a, b)) { | ||||
|                         result = unit; | ||||
|                         break; | ||||
|                     } | ||||
|                     else { | ||||
|                         k = compare(a, b); | ||||
|                         if (k == -1) { | ||||
|                             // a < b
 | ||||
|                             prefix.push_back(a); | ||||
|                             ar = ar1; | ||||
|                         } | ||||
|                         else { | ||||
|                             // b < a
 | ||||
|                             prefix.push_back(b); | ||||
|                             br = br1; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 else {     | ||||
|                     // br is not decomposable
 | ||||
|                     if (a == br) { | ||||
|                         // result = prefix ++ ar
 | ||||
|                         composeresult(ar); | ||||
|                         break; | ||||
|                     } | ||||
|                     else if (are_complements(a, br)) { | ||||
|                         result = unit; | ||||
|                         break; | ||||
|                     } | ||||
|                     else { | ||||
|                         k = compare(a, br); | ||||
|                         if (k == -1) { | ||||
|                             // a < br
 | ||||
|                             prefix.push_back(a); | ||||
|                             ar = ar1; | ||||
|                         } | ||||
|                         else { | ||||
|                             // br < a, result = prefix ++ (br) ++ ar
 | ||||
|                             prefix.push_back(br); | ||||
|                             composeresult(ar); | ||||
|                             break; | ||||
|                         } | ||||
|                     }        | ||||
|                 } | ||||
|             } | ||||
|             else { | ||||
|                 // ar is not decomposable
 | ||||
|                 if (test(br, b, br1)) { | ||||
|                     if (ar == b) { | ||||
|                         // result = prefix ++ br
 | ||||
|                         composeresult(br); | ||||
|                         break; | ||||
|                     } | ||||
|                     else if (are_complements(ar, b)) { | ||||
|                         result = unit; | ||||
|                         break; | ||||
|                     } | ||||
|                     else { | ||||
|                         k = compare(ar, b); | ||||
|                         if (k == -1) { | ||||
|                             // ar < b, result = prefix ++ (ar) ++ br
 | ||||
|                             prefix.push_back(ar); | ||||
|                             composeresult(br); | ||||
|                             break; | ||||
|                         } | ||||
|                         else { | ||||
|                             // b < ar
 | ||||
|                             prefix.push_back(b); | ||||
|                             br = br1; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 else { | ||||
|                     // neither ar nor br is decomposable
 | ||||
|                     if (ar == br) { | ||||
|                         // result = prefix ++ ar
 | ||||
|                         composeresult(ar); | ||||
|                         break; | ||||
|                     } | ||||
|                     else if (are_complements(ar, br)) { | ||||
|                         result = unit; | ||||
|                         break; | ||||
|                     } | ||||
|                     else { | ||||
|                         k = compare(ar, br); | ||||
|                         if (k == -1) { | ||||
|                             // ar < br,  result = prefix ++ (ar) ++ (br)
 | ||||
|                             prefix.push_back(ar); | ||||
|                             composeresult(br); | ||||
|                             break; | ||||
|                         } | ||||
|                         else { | ||||
|                             // br < ar, result = prefix ++ (br) ++ (ar) 
 | ||||
|                             prefix.push_back(br); | ||||
|                             composeresult(ar); | ||||
|                             break; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| expr_ref seq_rewriter::mk_regex_reverse(expr* r) { | ||||
|     expr* r1 = nullptr, * r2 = nullptr, * c = nullptr; | ||||
|     unsigned lo = 0, hi = 0; | ||||
|  | @ -4344,6 +4515,7 @@ br_status seq_rewriter::mk_str_to_regexp(expr* a, expr_ref& result) { | |||
|     r ++ [] -> [] | ||||
|     r ++ "" -> r | ||||
|     "" ++ r -> r | ||||
|     . ++ .* -> .+ | ||||
| 
 | ||||
|     to_re and star: | ||||
|     (str.to_re s1) ++ (str.to_re s2) -> (str.to_re (s1 ++ s2)) | ||||
|  | @ -4371,6 +4543,14 @@ br_status seq_rewriter::mk_re_concat(expr* a, expr* b, expr_ref& result) { | |||
|         result = a; | ||||
|         return BR_DONE; | ||||
|     } | ||||
|     if (re().is_full_char(a) && re().is_full_seq(b)) { | ||||
|         result = re().mk_plus(a); | ||||
|         return BR_DONE; | ||||
|     } | ||||
|     if (re().is_full_char(b) && re().is_full_seq(a)) { | ||||
|         result = re().mk_plus(b); | ||||
|         return BR_DONE; | ||||
|     } | ||||
|     expr_ref a_str(m()); | ||||
|     expr_ref b_str(m()); | ||||
|     if (lift_str_from_to_re(a, a_str) && lift_str_from_to_re(b, b_str)) { | ||||
|  | @ -4515,6 +4695,11 @@ br_status seq_rewriter::mk_re_union(expr* a, expr* b, expr_ref& result) { | |||
|         result = mk_full(); | ||||
|         return BR_DONE; | ||||
|     } | ||||
| 
 | ||||
|     //just keep the union normalized
 | ||||
|     result = mk_regex_union_normalize(a, b); | ||||
|     return BR_DONE; | ||||
| 
 | ||||
|          | ||||
|     expr* a1 = nullptr, *a2 = nullptr; | ||||
|     expr* b1 = nullptr, *b2 = nullptr; | ||||
|  | @ -4643,9 +4828,17 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { | |||
|         result = mk_empty(); | ||||
|         return BR_DONE; | ||||
|     } | ||||
| 
 | ||||
|     // intersect and normalize
 | ||||
|     result = mk_regex_inter_normalize(a, b); | ||||
|     return BR_DONE; | ||||
| 
 | ||||
|     expr* a1 = nullptr, *a2 = nullptr; | ||||
|     expr* b1 = nullptr, *b2 = nullptr; | ||||
| 
 | ||||
|     // the following rewrite rules do not seem to 
 | ||||
|     // do the right thing when it comes to normalizing
 | ||||
| 
 | ||||
|     // ensure intersection is right-associative
 | ||||
|     // and swap-sort entries 
 | ||||
|     if (re().is_intersection(a, a1, a2)) { | ||||
|  | @ -4696,7 +4889,7 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { | |||
| } | ||||
| 
 | ||||
| br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { | ||||
|     result = re().mk_inter(a, re().mk_complement(b)); | ||||
|     result = mk_regex_inter_normalize(a, re().mk_complement(b)); | ||||
|     return BR_REWRITE2; | ||||
| } | ||||
| 
 | ||||
|  | @ -4939,7 +5132,7 @@ void seq_rewriter::elim_condition(expr* elem, expr_ref& cond) { | |||
|     flatten_and(cond, conds); | ||||
|     expr* lhs = nullptr, *rhs = nullptr, *e1 = nullptr;  | ||||
|     if (u().is_char(elem)) { | ||||
|         unsigned ch = 0; | ||||
|         unsigned ch = 0, ch2 = 0; | ||||
|         svector<std::pair<unsigned, unsigned>> ranges, ranges1; | ||||
|         ranges.push_back(std::make_pair(0, u().max_char())); | ||||
|         auto exclude_char = [&](unsigned ch) { | ||||
|  | @ -4991,6 +5184,19 @@ void seq_rewriter::elim_condition(expr* elem, expr_ref& cond) { | |||
|                 else                  | ||||
|                     intersect(0, ch-1, ranges); | ||||
|             } | ||||
|             else if (m().is_true(e) || (m().is_eq(e, lhs, rhs) && lhs == rhs)) { | ||||
|                 // trivially true
 | ||||
|                 continue; | ||||
|             } | ||||
|             else if (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && u().is_const_char(lhs, ch) && u().is_const_char(rhs, ch2) && ch != ch2) { | ||||
|                 // trivially true
 | ||||
|                 continue; | ||||
|             } | ||||
|             else if (m().is_false(e) || (m().is_not(e, e1) && m().is_eq(e1, lhs, rhs) && lhs == rhs)) { | ||||
|                 // trivially false
 | ||||
|                 cond = m().mk_false(); | ||||
|                 return; | ||||
|             } | ||||
|             else { | ||||
|                 all_ranges = false; | ||||
|                 break; | ||||
|  |  | |||
|  | @ -217,11 +217,19 @@ class seq_rewriter { | |||
|     expr_ref mk_antimirov_deriv_intersection(expr* elem, expr* d1, expr* d2, expr* path); | ||||
|     expr_ref mk_antimirov_deriv_concat(expr* d, expr* r); | ||||
|     expr_ref mk_antimirov_deriv_negate(expr* elem, expr* d); | ||||
|     expr_ref mk_antimirov_deriv_union(expr* d1, expr* d2); | ||||
|     expr_ref mk_antimirov_deriv_restrict(expr* elem, expr* d1, expr* cond); | ||||
|     expr_ref mk_regex_reverse(expr* r); | ||||
|     expr_ref mk_regex_concat(expr* r1, expr* r2); | ||||
| 
 | ||||
|     expr_ref merge_regex_sets(expr* r1, expr* r2, expr* unit, std::function<bool(expr*, expr*&, expr*&)>& decompose, std::function<expr* (expr*, expr*)>& compose); | ||||
| 
 | ||||
|     // Apply simplifications and keep the representation normalized
 | ||||
|     // Assuming r1 and r2 are normalized
 | ||||
|     expr_ref mk_regex_union_normalize(expr* r1, expr* r2); | ||||
|     expr_ref mk_regex_inter_normalize(expr* r1, expr* r2); | ||||
| 
 | ||||
|     // elem is (:var 0) and path a condition that may have (:var 0) as a free variable
 | ||||
|     // simplify path, e.g., (:var 0) = 'a' & (:var 0) = 'b' is simplified to false
 | ||||
|     expr_ref simplify_path(expr* elem, expr* path); | ||||
| 
 | ||||
|     bool lt_char(expr* ch1, expr* ch2); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue