From 2dd4faf59889fd35d8dbc5761cd595f8ed992ddf Mon Sep 17 00:00:00 2001 From: Nikolaj Bjorner Date: Tue, 7 Jan 2025 16:53:28 -0800 Subject: [PATCH] sketch expr_inverter approach for eliminating unconstrained regex containment. --- src/ast/converters/expr_inverter.cpp | 46 +++++------- src/ast/rewriter/seq_rewriter.cpp | 103 +++++++++++++++++++++++++++ src/ast/rewriter/seq_rewriter.h | 12 ++++ 3 files changed, 134 insertions(+), 27 deletions(-) diff --git a/src/ast/converters/expr_inverter.cpp b/src/ast/converters/expr_inverter.cpp index f7d59c9e3..6b5045e2b 100644 --- a/src/ast/converters/expr_inverter.cpp +++ b/src/ast/converters/expr_inverter.cpp @@ -881,36 +881,28 @@ public: } return false; case OP_SEQ_IN_RE: - if (uncnstr(args[0]) && seq.re.is_ground(args[1])) { + if (uncnstr(args[0]) && seq.re.is_ground(args[1]) && seq.is_string(args[0]->get_sort())) { #if 0 + // // requires auxiliary functions - // is_empty - // find_member - expr* r = args[1]; - expr_ref not_r(seq.re.mk_complement(r), m); - lbool emp_r = rw.is_empty(r); - - if (emp_r == l_undef) + // some_string_in_re. + // A preliminary implementation exists in sls_seq_plugin.cpp + // it should be moved to seq_rewriter and made agnostic to m_chars. + // maybe use backtracking for better covereage: when some_string_in_re + // fails it doesn't necessarily mean that the regex is empty. + // + zstring s1, s2; + expr* re = args[1]; + expr_ref not_re(seq.re.mk_complement(re), m); + if (!rw.some_string_in_re(re, s1) || !rw.some_string_in_re(not_re, s2)) return false; - if (emp_r == l_true) { - r = m.mk_false(); - return true; - } - lbool emp_not_r = rw.is_empty(not_r); - if (emp_not_r == l_true) { - r = m.mk_true(); - return true; - } - if (emp_not_r == l_false) { - mk_fresh_uncnstr_var_for(f, r); - expr_ref witness1 = rw.find_member(r); - expr_ref witness2 = rw.find_member(not_r); - if (!witness1 || !witness2) - return false; - if (m_mc) - add_def(args[0], m.mk_ite(r, witness1, witness2)); - return true; - } + + mk_fresh_uncnstr_var_for(f, r); + expr_ref witness1 = expr_ref(seq.str.mk_string(s1), m); + expr_ref witness2 = expr_ref(seq.str.mk_string(s2), m); + if (m_mc) + add_def(args[0], m.mk_ite(r, witness1, witness2)); + return true; #endif } return false; diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index 7675e0d23..20f3a9efa 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -6145,3 +6145,106 @@ void seq_rewriter::op_cache::cleanup() { STRACE("seq_verbose", tout << "Derivative op cache reset" << std::endl;); } } + +bool seq_rewriter::some_string_in_re(expr* r, zstring& s) { + sort* rs; + (void)rs; + SASSERT(re().is_re(r, rs) && m_util.is_string(rs)); + expr_mark visited; + unsigned_vector str; + expr_ref_vector pinned(m()); + if (!some_string_in_re(pinned, visited, r, str)) + return false; + s = zstring(str.size(), str.data()); + return true; +} + +bool seq_rewriter::some_string_in_re(expr_ref_vector& pinned, expr_mark& visited, expr* r, unsigned_vector& str) { + if (visited.is_marked(r)) + return false; + if (re().is_empty(r)) + return false; + auto info = re().get_info(r); + if (info.nullable == l_true) + return true; + visited.mark(r); + pinned.push_back(r); + if (re().is_union(r)) + return any_of(*to_app(r), [&](expr* arg) { return some_string_in_re(pinned, visited, arg, str); }); + + expr_ref r2 = mk_derivative(r); + buffer> exclude; + return append_char(pinned, visited, exclude, r2, str); +} + +bool seq_rewriter::append_char(expr_ref_vector& pinned, expr_mark& visited, buffer>& exclude, expr* r, unsigned_vector& str) { + expr* c, * th, * el; + if (re().is_empty(r)) + return false; + if (re().is_union(r)) + return any_of(*to_app(r), [&](expr* arg) { return append_char(pinned, visited, exclude, arg, str); }); + if (m().is_ite(r, c, th, el)) { + unsigned low = 0, high = zstring::unicode_max_char(); + if (get_bounds(c, low, high)) { + SASSERT(low <= high); + unsigned sz = str.size(); + str.push_back(low); // TODO: check that low is not in exclude + if (some_string_in_re(pinned, visited, th, str)) + return true; + str.shrink(sz); + } + if (re().is_empty(el)) + return false; + exclude.push_back({ low, high }); + if (append_char(pinned, visited, exclude, el, str)) + return true; + exclude.pop_back(); + } + + if (is_ground(r)) { + for (auto [l, h] : exclude) + verbose_stream() << "exclude " << l << " " << h << "\n"; + str.push_back('a'); + if (some_string_in_re(pinned, visited, r, str)) + return true; + str.pop_back(); + return false; + } + + verbose_stream() << "todo append_char " << mk_pp(r, m()) << "\n"; + // TODO: select characters from m_chars[ctx.rand(m_chars.size())]); + str.push_back('a'); + return false; +} + + +bool seq_rewriter::get_bounds(expr* e, unsigned& low, unsigned& high) { + low = 0; + high = zstring::unicode_max_char(); + ptr_buffer todo; + todo.push_back(e); + expr* x, * y; + unsigned ch = 0; + while (!todo.empty()) { + e = todo.back(); + todo.pop_back(); + if (m().is_and(e)) + todo.append(to_app(e)->get_num_args(), to_app(e)->get_args()); + else if (m_util.is_char_le(e, x, y) && m_util.is_const_char(x, ch) && is_var(y)) + low = std::max(ch, low); + else if (m_util.is_char_le(e, x, y) && m_util.is_const_char(y, ch) && is_var(x)) + high = std::min(ch, high); + else if (m().is_eq(e, x, y) && is_var(x) && m_util.is_const_char(y, ch)) { + low = std::max(ch, low); + high = std::min(ch, high); + } + else if (m().is_eq(e, x, y) && is_var(y) && m_util.is_const_char(x, ch)) { + low = std::max(ch, low); + high = std::min(ch, high); + } + else + return false; + } + return low <= high; +} + diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index af4756576..fc258ff50 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -353,6 +353,11 @@ class seq_rewriter { void intersect(unsigned lo, unsigned hi, svector>& ranges); + bool get_bounds(expr* e, unsigned& low, unsigned& high); + bool append_char(expr_ref_vector& pinned, expr_mark& visited, buffer>& exclude, expr* r, unsigned_vector& str); + bool some_string_in_re(expr_ref_vector& pinned, expr_mark& visited, expr* r, unsigned_vector& str); + + public: seq_rewriter(ast_manager & m, params_ref const & p = params_ref()): m_util(m), m_autil(m), m_br(m, p), m_re2aut(m), m_op_cache(m), m_es(m), @@ -432,5 +437,12 @@ public: expr_ref mk_regex_union_normalize(expr* r1, expr* r2); /* Apply simplifications to the intersection to keep it normalized (r1 and r2 are not normalized)*/ expr_ref mk_regex_inter_normalize(expr* r1, expr* r2); + + /* + * Extract some string that is a member of r. + * Return true if a valid string was extracted. + * Return false when giving up or the regular expression is empty. + */ + bool some_string_in_re(expr* r, zstring& s); };