From c572fc2e4f7442aa119dd1825042663fc13756e5 Mon Sep 17 00:00:00 2001 From: Clemens Eisenhofer <56730610+CEisenhofer@users.noreply.github.com> Date: Sun, 12 Jan 2025 02:41:37 +0100 Subject: [PATCH] Regex membership (#7506) * Make finding a word in the regex iterative * Fixed gc problem --- src/ast/converters/expr_inverter.cpp | 7 +- src/ast/rewriter/seq_rewriter.cpp | 168 +++++++++++++++------------ src/ast/rewriter/seq_rewriter.h | 4 +- src/ast/sls/sls_seq_plugin.cpp | 90 +------------- src/ast/sls/sls_seq_plugin.h | 3 - 5 files changed, 103 insertions(+), 169 deletions(-) diff --git a/src/ast/converters/expr_inverter.cpp b/src/ast/converters/expr_inverter.cpp index 40afd7cb1..886b99e5c 100644 --- a/src/ast/converters/expr_inverter.cpp +++ b/src/ast/converters/expr_inverter.cpp @@ -883,10 +883,13 @@ public: case OP_SEQ_IN_RE: if (uncnstr(args[0]) && seq.re.is_ground(args[1]) && seq.is_string(args[0]->get_sort())) { #if 1 - zstring s1, s2; + zstring s1; expr* re = args[1]; + if (!rw.some_string_in_re(re, s1)) + return false; + zstring s2; expr_ref not_re(seq.re.mk_complement(re), m); - if (!rw.some_string_in_re(re, s1) || !rw.some_string_in_re(not_re, s2)) + if (!rw.some_string_in_re(not_re, s2)) return false; mk_fresh_uncnstr_var_for(f, r); diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 765131426..f59d32c83 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -6152,91 +6152,115 @@ bool seq_rewriter::some_string_in_re(expr* r, zstring& s) { // SASSERT(re().is_re(r, rs) && m_util.is_string(rs)); expr_mark visited; unsigned_vector str; - expr_ref_vector pinned(m()); - if (!some_string_in_re(pinned, visited, r, str)) + if (!some_string_in_re(visited, r, str)) return false; s = zstring(str.size(), str.data()); return true; } -bool seq_rewriter::some_string_in_re(expr_ref_vector& pinned, expr_mark& visited, expr* r, unsigned_vector& str) { - if (visited.is_marked(r)) - return false; - if (re().is_empty(r)) - return false; - auto info = re().get_info(r); - if (info.nullable == l_true) - return true; - visited.mark(r); - pinned.push_back(r); - if (re().is_union(r)) - return any_of(*to_app(r), [&](expr* arg) { return some_string_in_re(pinned, visited, arg, str); }); - - expr_ref r2 = mk_derivative(r); +struct re_eval_pos { + expr_ref e; // use reference to avoid gc + unsigned str_len; buffer> exclude; - return append_char(pinned, visited, exclude, r2, str); -} + bool needs_derivation; +}; -bool seq_rewriter::append_char(expr_ref_vector& pinned, expr_mark& visited, buffer>& exclude, expr* r, unsigned_vector& str) { - expr* c, * th, * el; - if (re().is_empty(r)) - return false; - if (re().is_union(r)) - return any_of(*to_app(r), [&](expr* arg) { return append_char(pinned, visited, exclude, arg, str); }); - if (m().is_ite(r, c, th, el)) { - unsigned low = 0, high = zstring::unicode_max_char(); - if (get_bounds(c, low, high)) { - SASSERT(low <= high); - str.push_back(low); // ASSERT: low .. high does not intersect with exclude - if (some_string_in_re(pinned, visited, th, str)) - return true; - str.pop_back(); - } - if (re().is_empty(el)) - return false; - exclude.push_back({ low, high }); - if (append_char(pinned, visited, exclude, el, str)) - return true; - exclude.pop_back(); - return false; - } - - if (is_ground(r)) { - // ensure selected character is not in exclude - unsigned ch = 'a'; - bool wrapped = false; - while (true) { - bool found = false; - for (auto [l, h] : exclude) { - if (l <= ch && ch <= h) { - found = true; - ch = h + 1; - } - } - if (!found) - break; - if (ch != zstring::unicode_max_char() + 1) +bool seq_rewriter::some_string_in_re(expr_mark& visited, expr* r, unsigned_vector& str) { + SASSERT(str.empty()); + vector todo; + todo.push_back({ expr_ref(r, m()), 0, {}, true }); + while (!todo.empty()) { + re_eval_pos current = todo.back(); + todo.pop_back(); + r = current.e; + str.resize(current.str_len); + if (current.needs_derivation) { + SASSERT(current.exclude.empty()); + // We are looking for the next character => generate derivation + if (visited.is_marked(r)) continue; - if (wrapped) - return false; - ch = 0; - wrapped = true; - } - str.push_back(ch); - if (some_string_in_re(pinned, visited, r, str)) - return true; - str.pop_back(); - return false; - } + if (re().is_empty(r)) + continue; + auto info = re().get_info(r); + if (info.nullable == l_true) + return true; + visited.mark(r); + if (re().is_union(r)) { + for (expr* arg : *to_app(r)) { + todo.push_back({ expr_ref(arg, m()), str.size(), {}, true }); + } + continue; + } - verbose_stream() << "todo append_char " << mk_pp(r, m()) << "\n"; - // TODO: select characters from m_chars[ctx.rand(m_chars.size())]); - str.push_back('a'); + r = mk_derivative(r); + } + // otw. we are still in the process of deciding case of the derivation to take + + buffer> exclude = std::move(current.exclude); + + expr* c, * th, * el; + if (re().is_empty(r)) + continue; + if (re().is_union(r)) { + for (expr* arg : *to_app(r)) { + todo.push_back({ expr_ref(arg, m()), str.size(), exclude, false }); + } + continue; + } + if (m().is_ite(r, c, th, el)) { + unsigned low = 0, high = zstring::unicode_max_char(); + bool hasBounds = get_bounds(c, low, high); + if (!re().is_empty(el)) { + exclude.push_back({ low, high }); + todo.push_back({ expr_ref(el, m()), str.size(), std::move(exclude), false }); + } + if (hasBounds) { + // I want this case to be processed first => push it last + // reason: current string is only pruned + SASSERT(low <= high); + str.push_back(low); // ASSERT: low .. high does not intersect with exclude + todo.push_back({ expr_ref(th, m()), str.size(), {}, true }); + } + continue; + } + + if (is_ground(r)) { + // ensure selected character is not in exclude + unsigned ch = 'a'; + bool wrapped = false; + bool failed = false; + while (true) { + bool found = false; + for (auto [l, h] : exclude) { + if (l <= ch && ch <= h) { + found = true; + ch = h + 1; + } + } + if (!found) + break; + if (ch != zstring::unicode_max_char() + 1) + continue; + if (wrapped) { + failed = true; + break; + } + ch = 0; + wrapped = true; + } + if (failed) + continue; + str.push_back(ch); + todo.push_back({ expr_ref(r, m()), str.size(), {}, true }); + continue; + } + + UNREACHABLE(); + } return false; } - bool seq_rewriter::get_bounds(expr* e, unsigned& low, unsigned& high) { low = 0; high = zstring::unicode_max_char(); diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index fc258ff50..5e3cf35f8 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -354,9 +354,7 @@ class seq_rewriter { void intersect(unsigned lo, unsigned hi, svector>& ranges); bool get_bounds(expr* e, unsigned& low, unsigned& high); - bool append_char(expr_ref_vector& pinned, expr_mark& visited, buffer>& exclude, expr* r, unsigned_vector& str); - bool some_string_in_re(expr_ref_vector& pinned, expr_mark& visited, expr* r, unsigned_vector& str); - + bool some_string_in_re(expr_mark& visited, expr* r, unsigned_vector& str); public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): diff --git a/src/ast/sls/sls_seq_plugin.cpp b/src/ast/sls/sls_seq_plugin.cpp index 9288b81c0..286eed03b 100644 --- a/src/ast/sls/sls_seq_plugin.cpp +++ b/src/ast/sls/sls_seq_plugin.cpp @@ -1847,7 +1847,7 @@ namespace sls { { zstring s1; - if (ctx.is_true(e) && some_string_in_re(y, s1)) { + if (ctx.is_true(e) && rw.some_string_in_re(y, s1)) { m_str_updates.push_back({ x, s1, 1 }); return apply_update(); } @@ -2001,92 +2001,4 @@ namespace sls { choose(r2, k - 1, prefix2, result); } } - - // - // Find some string that is a member of regex - // - bool seq_plugin::some_string_in_re(expr* _r, zstring& s) { - expr_ref r(_r, m); - seq_rewriter rw(m); - while (true) { - if (seq.re.is_empty(r)) - return false; - auto info = seq.re.get_info(r); - if (info.nullable == l_true) - return true; - expr_ref r2 = rw.mk_derivative(r); - if (!append_char(r, r2, s)) - return false; - r = r2; - } - } - - bool seq_plugin::get_bounds(expr* e, unsigned& low, unsigned& high) { - // TODO: remove recursion (though it is probably not deep anyway) - expr* x, * y; - unsigned ch = 0; - if (m.is_and(e, x, y)) { - if (!get_bounds(x, low, high)) - return false; - return get_bounds(y, low, high); - } - if (m.is_eq(e, x, y)) { - if ((is_var(x) && seq.is_const_char(y, ch)) || (is_var(y) && seq.is_const_char(x, ch))) { - if (ch < low || ch > high) - return false; - low = high = ch; - return true; - } - return false; - } - if (seq.is_char_le(e, x, y)) { - if (seq.is_const_char(x, ch)) { - low = std::max(ch, low); - return high >= low; - } - if (seq.is_const_char(y, ch)) { - high = std::min(ch, high); - return high >= low; - } - return false; - } - return false; - } - - bool seq_plugin::append_char(expr* r0, expr_ref& r, zstring& s) { - expr* c, * th, * el; - if (seq.re.is_union(r)) { - auto info0 = seq.re.get_info(r0); - for (expr* r1 : *to_app(r)) { - auto info1 = seq.re.get_info(r1); - if (r0 == r1) - continue; - if (info1.min_length < info0.min_length) { - r = r1; - return append_char(r0, r, s); - } - } - expr* r2; - do { - r2 = to_app(r)->get_arg(ctx.rand(to_app(r)->get_num_args())); - } while (r2 == r0); - r = r2; - // Just take one that is not a self loop (there is always such one element) - return append_char(r0, r, s); - } - if (m.is_ite(r, c, th, el)) { - unsigned low = 0, high = UINT_MAX; - if (get_bounds(c, low, high)) { - SASSERT(low <= high); - s += zstring(low); - r = th; - return true; - } - verbose_stream() << "nyi append_char " << mk_bounded_pp(r, m) << "\n"; - NOT_IMPLEMENTED_YET(); - return false; - } - s += zstring(m_chars[ctx.rand(m_chars.size())]); - return true; - } } diff --git a/src/ast/sls/sls_seq_plugin.h b/src/ast/sls/sls_seq_plugin.h index 6e571e61f..7c7fc16ff 100644 --- a/src/ast/sls/sls_seq_plugin.h +++ b/src/ast/sls/sls_seq_plugin.h @@ -147,9 +147,6 @@ namespace sls { void next_char(expr* r, unsigned_vector& chars); bool is_in_re(zstring const& s, expr* r); - bool some_string_in_re(expr* r, zstring& s); - bool get_bounds(expr* e, unsigned& low, unsigned& high); - bool append_char(expr* r0, expr_ref& r, zstring& s); // access evaluation bool is_seq_predicate(expr* e);