diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp index f7fcfa8e2..23c35336c 100644 --- a/src/ast/euf/euf_sgraph.cpp +++ b/src/ast/euf/euf_sgraph.cpp @@ -379,7 +379,8 @@ namespace euf { } snode* sgraph::find(expr* e) const { - SASSERT(e); + if (!e) + return nullptr; unsigned eid = e->get_id(); if (eid < m_expr2snode.size()) return m_expr2snode[eid]; @@ -453,18 +454,15 @@ namespace euf { } snode* sgraph::mk_concat(snode* a, snode* b) { - if (a->is_empty()) - return b; - if (b->is_empty()) - return a; + if (a->is_empty()) return b; + if (b->is_empty()) return a; if (m_seq.is_re(a->get_expr())) return mk(expr_ref(m_seq.re.mk_concat(a->get_expr(), b->get_expr()), m)); return mk(expr_ref(m_seq.str.mk_concat(a->get_expr(), b->get_expr()), m)); } snode* sgraph::drop_first(snode* n) { - SASSERT(!n->is_empty()); - if (n->is_token()) + if (n->is_empty() || n->is_token()) return mk_empty_seq(n->get_sort()); SASSERT(n->is_concat()); snode* l = n->arg(0); @@ -474,10 +472,8 @@ namespace euf { return mk_concat(drop_first(l), r); } - // TODO: Optimize snode* sgraph::drop_last(snode* n) { - SASSERT(!n->is_empty()); - if (n->is_token()) + if (n->is_empty() || n->is_token()) return mk_empty_seq(n->get_sort()); SASSERT(n->is_concat()); snode* l = n->arg(0); @@ -487,28 +483,19 @@ namespace euf { return mk_concat(l, drop_last(r)); } - // TODO: Optimize snode* sgraph::drop_left(snode* n, unsigned count) { - if (count == 0) - return n; - SASSERT(count <= n->length()); - if (count == n->length()) - return mk_empty_seq(n->get_sort()); - for (unsigned i = 0; i < count; ++i) { + if (count == 0 || n->is_empty()) return n; + if (count >= n->length()) return mk_empty_seq(n->get_sort()); + for (unsigned i = 0; i < count; ++i) n = drop_first(n); - } return n; } snode* sgraph::drop_right(snode* n, unsigned count) { - if (count == 0) - return n; - SASSERT(count <= n->length()); - if (count == n->length()) - return mk_empty_seq(n->get_sort()); - for (unsigned i = 0; i < count; ++i) { + if (count == 0 || n->is_empty()) return n; + if (count >= n->length()) return mk_empty_seq(n->get_sort()); + for (unsigned i = 0; i < count; ++i) n = drop_last(n); - } return n; } @@ -528,7 +515,8 @@ namespace euf { snode* sgraph::brzozowski_deriv(snode* re, snode* elem, snode* allowed_range) { expr* re_expr = re->get_expr(); expr* elem_expr = elem->get_expr(); - SASSERT(re_expr && elem_expr); + if (!re_expr || !elem_expr) + return nullptr; // unwrap str.unit to get the character expression expr* ch = nullptr; @@ -539,60 +527,46 @@ namespace euf { // we extract a representative character (like 'lo') from it, // and evaluate the derivative with respect to that representative character. // This avoids generating massive 'ite' structures for symbolic variables. - sort* seq_sort = nullptr, *ele_sort = nullptr; if (m_seq.is_re(re_expr, seq_sort) && m_seq.is_seq(seq_sort, ele_sort)) { - // Just take one element - they are anyway all assumed to produce the same result - auto extract_rep = [&](expr* e) -> expr* { - expr* lo = nullptr, *hi = nullptr; - expr* r1 = nullptr, *r2 = nullptr; - while (e) { - if (m_seq.re.is_full_char(e)) - return m_seq.mk_char(0); - if (m_seq.re.is_range(e, lo, hi) && lo) { - expr* lo_ch = nullptr; - zstring zs; - if (m_seq.str.is_unit(lo, lo_ch)) - return lo_ch; - if (m_seq.str.is_string(lo, zs) && zs.length() > 0) - return m_seq.str.mk_char(zs[0]); - return lo; - } - if (m_seq.re.is_union(e, r1, r2)) - e = r1; - else - return nullptr; - } - return nullptr; - }; - + sort* seq_sort = nullptr, *ele_sort = nullptr; + if (m_seq.is_re(re_expr, seq_sort) && m_seq.is_seq(seq_sort, ele_sort)) { if (allowed_range && allowed_range->get_expr()) { expr* range_expr = allowed_range->get_expr(); + expr* lo = nullptr, *hi = nullptr; if (m_seq.re.is_full_char(range_expr)) { - // For full char, keep symbolic + // For full char, we can't substitute a representative without losing info. + // Fallback to testing the symbolic character. } - else { - expr* rep = extract_rep(range_expr); - if (rep) - elem_expr = rep; + else if (m_seq.re.is_range(range_expr, lo, hi) && lo) { + expr* lo_ch = nullptr; + zstring zs; + if (m_seq.str.is_unit(lo, lo_ch)) + elem_expr = lo_ch; + else if (m_seq.str.is_string(lo, zs) && zs.length() > 0) + elem_expr = m_seq.str.mk_char(zs[0]); + else + elem_expr = lo; // Use representative to take the derivative } } + // Fallback: If elem itself is a regex predicate, extract representative else if (ele_sort != elem_expr->get_sort()) { - expr* rep = extract_rep(elem_expr); - if (rep) - elem_expr = rep; + expr* lo = nullptr, *hi = nullptr; + if (m_seq.re.is_full_char(elem_expr)) + return nullptr; + if (m_seq.re.is_range(elem_expr, lo, hi) && lo) { + expr* lo_ch = nullptr; + zstring zs; + if (m_seq.str.is_unit(lo, lo_ch)) + elem_expr = lo_ch; + else if (m_seq.str.is_string(lo, zs) && zs.length() > 0) + elem_expr = m_seq.str.mk_char(zs[0]); + else + elem_expr = lo; + } else return nullptr; } } - - SASSERT(elem_expr); - if (elem_expr->get_sort() != ele_sort) { - std::cout << "SORT MISMATCH before mk_derivative\n" - << " elem_expr: " << mk_pp(elem_expr, m) << "\n" - << " elem_sort: " << mk_pp(elem_expr->get_sort(), m) << "\n" - << " ele_sort: " << mk_pp(ele_sort, m) << "\n" - << " re_expr: " << mk_pp(re_expr, m) << std::endl; - } - + expr_ref result = m_rewriter.mk_derivative(elem_expr, re_expr); if (!result) return nullptr; @@ -600,7 +574,8 @@ namespace euf { } void sgraph::collect_re_predicates(snode* re, expr_ref_vector& preds) { - SASSERT(re && re->get_expr()); + if (!re || !re->get_expr()) + return; expr* e = re->get_expr(); expr* lo = nullptr, *hi = nullptr; // leaf regex predicates: character ranges and single characters @@ -616,8 +591,7 @@ namespace euf { if (m_seq.str.is_string(s, zs) && zs.length() > 0) { unsigned c = zs[0]; ch_expr = m_seq.str.mk_char(c); - } - else if (m_seq.str.is_unit(s, ch_expr)) { + } else if (m_seq.str.is_unit(s, ch_expr)) { // ch_expr correctly extracted } @@ -626,8 +600,8 @@ namespace euf { expr_ref re_char(m_seq.re.mk_to_re(unit_str), m); bool dup = false; for (expr* p : preds) { - if (p == re_char) { - dup = true; + if (p == re_char) { + dup = true; break; } } @@ -652,7 +626,7 @@ namespace euf { void sgraph::compute_minterms(snode* re, snode_vector& minterms) { expr_ref_vector preds(m); collect_re_predicates(re, preds); - + unsigned max_c = m_seq.max_char(); if (preds.empty()) { @@ -667,29 +641,37 @@ namespace euf { for (expr* p : preds) { char_set p_set; expr* lo = nullptr, *hi = nullptr; - + if (m_seq.re.is_range(p, lo, hi)) { unsigned vlo = 0, vhi = 0; - if (m_seq.is_const_char(lo, vlo) && m_seq.is_const_char(hi, vhi)) { - if (vlo <= vhi) - p_set = char_set(char_range(vlo, vhi + 1)); + bool has_lo = false, has_hi = false; + + if (lo) { + if (decode_re_char(lo, vlo)) + has_lo = true; } - } + + if (hi) { + if (decode_re_char(hi, vhi)) + has_hi = true; + } + + if (has_lo && has_hi && vlo <= vhi) + p_set = char_set(char_range(vlo, vhi + 1)); + } else if (m_seq.re.is_to_re(p)) { expr* str_arg = nullptr; - expr* ch_expr = nullptr; unsigned char_val = 0; if (m_seq.re.is_to_re(p, str_arg) && - m_seq.str.is_unit(str_arg, ch_expr) && - m_seq.is_const_char(ch_expr, char_val)) { + decode_re_char(str_arg, char_val)) { p_set.add(char_val); } - } + } else if (m_seq.re.is_full_char(p)) p_set = char_set::full(max_c); - else + else continue; - + if (p_set.is_empty() || p_set.is_full(max_c)) continue; @@ -698,13 +680,11 @@ namespace euf { for (auto const& c : classes) { char_set in_c = c.intersect_with(p_set); char_set out_c = c.intersect_with(p_comp); - if (!in_c.is_empty()) - next_classes.push_back(in_c); - if (!out_c.is_empty()) - next_classes.push_back(out_c); + if (!in_c.is_empty()) next_classes.push_back(in_c); + if (!out_c.is_empty()) next_classes.push_back(out_c); } classes = std::move(next_classes); - } + } for (auto const& c : classes) { expr_ref class_expr(m); for (auto const& r : c.ranges()) { @@ -723,6 +703,22 @@ namespace euf { } } + bool sgraph::decode_re_char(expr* ex, unsigned& out) const { + if (!ex) + return false; + if (m_seq.is_const_char(ex, out)) + return true; + expr* ch = nullptr; + if (m_seq.str.is_unit(ex, ch) && m_seq.is_const_char(ch, out)) + return true; + zstring s; + if (m_seq.str.is_string(ex, s) && s.length() == 1) { + out = s[0]; + return true; + } + return false; + } + std::ostream& sgraph::display(std::ostream& out) const { auto kind_str = [](snode_kind k) -> char const* { switch (k) { @@ -779,9 +775,3 @@ namespace euf { } - - - - - - diff --git a/src/ast/euf/euf_sgraph.h b/src/ast/euf/euf_sgraph.h index 5c52efa92..a3d52a403 100644 --- a/src/ast/euf/euf_sgraph.h +++ b/src/ast/euf/euf_sgraph.h @@ -144,6 +144,10 @@ namespace euf { // for deriving symbolic variables. snode* brzozowski_deriv(snode* re, snode* elem, snode* allowed_range = nullptr); + // Decode a character expression that may be represented as a const-char, + // a unit string containing a const-char, or a one-character string literal. + bool decode_re_char(expr* ex, unsigned& out) const; + // compute minterms (character class partition) from a regex void compute_minterms(snode* re, snode_vector& minterms); diff --git a/src/smt/seq/seq_regex.cpp b/src/smt/seq/seq_regex.cpp index b5d994291..c7c22ae4f 100644 --- a/src/smt/seq/seq_regex.cpp +++ b/src/smt/seq/seq_regex.cpp @@ -354,8 +354,8 @@ namespace seq { // ----------------------------------------------------------------------- // BFS regex emptiness check — helper: collect character boundaries + // This is faster than computing the actual minterms but probably not minimal // ----------------------------------------------------------------------- - void seq_regex::collect_char_boundaries(euf::snode* re, unsigned_vector& bounds) const { SASSERT(re && re->get_expr()); @@ -367,11 +367,11 @@ namespace seq { expr* lo_expr = nullptr; expr* hi_expr = nullptr; if (seq.re.is_range(e, lo_expr, hi_expr)) { - zstring s_lo, s_hi; - if (lo_expr && seq.str.is_string(lo_expr, s_lo) && s_lo.length() == 1) - bounds.push_back(s_lo[0]); - if (hi_expr && seq.str.is_string(hi_expr, s_hi) && s_hi.length() == 1 && s_hi[0] < zstring::max_char()) - bounds.push_back(s_hi[0] + 1); + unsigned lo = 0, hi = 0; + if (m_sg.decode_re_char(lo_expr, lo)) + bounds.push_back(lo); + if (m_sg.decode_re_char(hi_expr, hi) && hi < zstring::max_char()) + bounds.push_back(hi + 1); return; } @@ -379,7 +379,13 @@ namespace seq { expr* body = nullptr; if (seq.re.is_to_re(e, body)) { zstring s; - if (seq.str.is_string(body, s) && s.length() > 0) { + unsigned ch = 0; + if (m_sg.decode_re_char(body, ch)) { + bounds.push_back(ch); + if (ch < zstring::max_char()) + bounds.push_back(ch + 1); + } + else if (seq.str.is_string(body, s) && s.length() > 0) { unsigned first_ch = s[0]; bounds.push_back(first_ch); if (first_ch < zstring::max_char()) @@ -400,23 +406,41 @@ namespace seq { // ----------------------------------------------------------------------- // BFS regex emptiness check — helper: alphabet representatives + // Faster alternative of computing all min-terms and taking representatives of them // ----------------------------------------------------------------------- - void seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) { - euf::snode_vector minterms; - m_sg.compute_minterms(re, minterms); + if (!re || !re->get_expr()) + return; - for (euf::snode* mt : minterms) { - SASSERT(mt); - if (mt->is_fail()) - continue; - char_set cs = minterm_to_char_set(mt->get_expr()); - SASSERT(!cs.is_empty()); - - // Pick a concrete character from the character set to act as the representative - unsigned rep_char = cs.ranges()[0].m_lo; - reps.push_back(m_sg.mk_char(rep_char)); + seq_util& seq = m_sg.get_seq_util(); + unsigned max_c = seq.max_char(); + + // Partition the alphabet using boundary points induced by regex + // predicates; one representative per interval is sufficient. + unsigned_vector bounds; + bounds.push_back(0); + if (max_c < UINT_MAX) + bounds.push_back(max_c + 1); + collect_char_boundaries(re, bounds); + + std::sort(bounds.begin(), bounds.end()); + unsigned_vector uniq; + for (unsigned b : bounds) { + if (uniq.empty() || uniq.back() != b) + uniq.push_back(b); } + bounds = uniq; + + for (unsigned i = 0; i + 1 < bounds.size(); ++i) { + unsigned lo = bounds[i]; + unsigned hi = bounds[i + 1]; + if (lo <= max_c && lo < hi) + reps.push_back(m_sg.mk_char(lo)); + } + + // Defensive fallback for degenerate inputs. + if (reps.empty()) + reps.push_back(m_sg.mk_char(0)); } // ----------------------------------------------------------------------- @@ -457,6 +481,8 @@ namespace seq { unsigned states_explored = 0; while (!worklist.empty()) { + if (!m_sg.get_manager().inc()) + return l_undef; if (states_explored >= max_states) return l_undef; @@ -476,6 +502,8 @@ namespace seq { continue; for (euf::snode* ch : reps) { + if (!m_sg.get_manager().inc()) + return l_undef; // std::cout << "Deriving by " << snode_label_html(ch, sg().get_manager()) << std::endl; euf::snode* deriv = m_sg.brzozowski_deriv(current, ch); SASSERT(deriv); @@ -1097,13 +1125,30 @@ namespace seq { return char_set::full(max_c); // range [lo, hi] (hi inclusive in Z3's regex representation) - unsigned lo = 0, hi = 0; - if (seq.re.is_range(re_expr, lo, hi)) { - // lo > hi is a degenerate range; should not arise from well-formed minterms - SASSERT(lo <= hi); - if (lo > hi) return char_set(); - // char_range uses exclusive upper bound; Z3 hi is inclusive - return char_set(char_range(lo, hi + 1)); + expr* lo_expr = nullptr; + expr* hi_expr = nullptr; + if (seq.re.is_range(re_expr, lo_expr, hi_expr)) { + unsigned lo = 0, hi = 0; + bool has_lo = false, has_hi = false; + + if (lo_expr) { + if (m_sg.decode_re_char(lo_expr, lo)) { + has_lo = true; + } + } + if (hi_expr) { + if (m_sg.decode_re_char(hi_expr, hi)) { + has_hi = true; + } + } + + if (has_lo && has_hi) { + SASSERT(lo <= hi); + if (lo > hi) + return char_set(); + // char_range uses exclusive upper bound; Z3 hi is inclusive + return char_set(char_range(lo, hi + 1)); + } } // complement: alphabet minus the inner set @@ -1130,11 +1175,8 @@ namespace seq { // to_re(str.unit(c)): singleton character set expr* str_arg = nullptr; - expr* ch_expr = nullptr; unsigned char_val = 0; - if (seq.re.is_to_re(re_expr, str_arg) && - seq.str.is_unit(str_arg, ch_expr) && - seq.is_const_char(ch_expr, char_val)) { + if (seq.re.is_to_re(re_expr, str_arg) && m_sg.decode_re_char(str_arg, char_val)) { char_set cs; cs.add(char_val); return cs; @@ -1150,3 +1192,5 @@ namespace seq { } } + + diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index ad2fc99ca..d26fa69a0 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -444,6 +444,7 @@ namespace smt { << (m_nielsen.sat_node() ? "set" : "null") << "\n";); // Nielsen found a consistent assignment for positive constraints. // If there are disequalities we haven't verified, we cannot soundly declare sat. + SASSERT(!m_state.empty()); // we should have axiomatized them if (!m_state.diseqs().empty()) return FC_GIVEUP; if (!has_unhandled_preds())