From 39080166519a72c0c2ae2f2e02f99c2c1291a157 Mon Sep 17 00:00:00 2001 From: CEisenhofer Date: Tue, 2 Jun 2026 17:04:31 +0200 Subject: [PATCH] Potentially fixed termination problem with projection operators --- src/ast/euf/euf_sgraph.cpp | 56 ++++++++++++++++++++++++++----- src/ast/euf/euf_sgraph.h | 7 ++-- src/smt/seq_model.cpp | 67 +++++++++++++++++++++++++++++++++++++- src/smt/seq_model.h | 8 +++++ 4 files changed, 127 insertions(+), 11 deletions(-) diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp index 96f42bf01..694f1f56f 100644 --- a/src/ast/euf/euf_sgraph.cpp +++ b/src/ast/euf/euf_sgraph.cpp @@ -597,18 +597,40 @@ namespace euf { return expr_ref(m_seq.mk_skolem(re_proj_name(), 3, args, re_sort), m); } + // A symbolic-character derivative is a linear form: a *dispatch* over the + // character built from `ite` (predicate selection) and `union` (the linear + // factors α_i·r_i), bottoming out in ordinary derivative *states* r_i. We + // recognize this dispatch skeleton so that π can be pushed all the way to + // the concrete state leaves (where it is meaningful) rather than wrapped + // around a non-atomic union/ite (which is not a DFA state and breaks + // projection_state_in_Q / nullability). A plain regex state (even one that + // is itself a union, e.g. δ of a union regex) is NOT dispatch. + bool sgraph::is_char_dispatch(expr* e) const { + if (m.is_ite(e)) + return true; + if (m_seq.re.is_empty(e)) + return true; + expr* a = nullptr, *b = nullptr; + if (m_seq.re.is_union(e, a, b)) + return is_char_dispatch(a) && is_char_dispatch(b); + return false; + } + expr_ref sgraph::wrap_proj(expr* e, expr* root, unsigned nu) { - // The symbolic derivative of a regex is an ite-term dispatching on - // character predicates; its leaves are ordinary derivative states. - // π_{Q,F} is propagated to all such leaves (paper §4). Partial-DFA - // states are produced from concrete-character derivatives and are thus - // ite-free, so every ite encountered here is dispatch structure. + // Push π_{Q,F} through the dispatch skeleton (ite / dispatch-union) to + // the concrete state leaves (paper §4). expr* c = nullptr, *th = nullptr, *el = nullptr; if (m.is_ite(e, c, th, el)) { expr_ref t = wrap_proj(th, root, nu); expr_ref f = wrap_proj(el, root, nu); return expr_ref(m.mk_ite(c, t, f), m); } + expr* a = nullptr, *b = nullptr; + if (m_seq.re.is_union(e, a, b) && is_char_dispatch(e)) { + expr_ref wa = wrap_proj(a, root, nu); + expr_ref wb = wrap_proj(b, root, nu); + return expr_ref(m_seq.re.mk_union(wa, wb), m); + } // π(∅) ≡ ∅: a dead leaf stays dead. if (m_seq.re.is_empty(e)) return expr_ref(e, m); @@ -691,10 +713,23 @@ namespace euf { // top-level linear form, exactly as the standard mk_derivative does. // These combinators also fold the trivial regex identities so the // projection skolem and its inner state id are preserved verbatim. + // The symbolic-character derivative is a dispatch over the char built + // from `ite` (predicate selection) AND `union` (the linear factors). + // The combinators below distribute over BOTH so the dispatch skeleton + // stays at the top with concrete (π-wrapped) state leaves — otherwise a + // surrounding operator buries the dispatch and apply_regex_if_split can + // no longer resolve the symbolic char (the multi-cycle-SCC divergence). + // We only distribute over a union that is itself a char-dispatch (so a + // semantic state-union — e.g. δ of a union regex — is left intact and + // not needlessly expanded). All distributions are language-preserving + // (∩ and · distribute over ⊔; ~(A⊔B) = ~A ∩ ~B by De Morgan). auto is_empty = [&](expr* r) { return m_seq.re.is_empty(r); }; auto is_full = [&](expr* r) { return m_seq.re.is_full_seq(r); }; auto is_eps = [&](expr* r) { return m_seq.re.is_epsilon(r); }; auto is_ite = [&](expr* r, expr*& c, expr*& t, expr*& e) { return m.is_ite(r, c, t, e); }; + auto is_disp_union = [&](expr* r, expr*& a, expr*& b) { + return m_seq.re.is_union(r, a, b) && is_char_dispatch(r); + }; std::function mk_union = [&](expr* x, expr* y) -> expr* { expr *c = nullptr, *t = nullptr, *e = nullptr; @@ -707,9 +742,11 @@ namespace euf { return m_seq.re.mk_union(x, y); }; std::function mk_inter = [&](expr* x, expr* y) -> expr* { - expr *c = nullptr, *t = nullptr, *e = nullptr; + expr *c = nullptr, *t = nullptr, *e = nullptr, *a = nullptr, *b = nullptr; if (is_ite(x, c, t, e)) return m.mk_ite(c, mk_inter(t, y), mk_inter(e, y)); if (is_ite(y, c, t, e)) return m.mk_ite(c, mk_inter(x, t), mk_inter(x, e)); + if (is_disp_union(x, a, b)) return mk_union(mk_inter(a, y), mk_inter(b, y)); + if (is_disp_union(y, a, b)) return mk_union(mk_inter(x, a), mk_inter(x, b)); if (is_empty(x) || is_empty(y)) return m_seq.re.mk_empty(re_sort); if (is_full(x)) return y; if (is_full(y)) return x; @@ -717,17 +754,20 @@ namespace euf { return m_seq.re.mk_inter(x, y); }; std::function mk_concat = [&](expr* x, expr* y) -> expr* { - expr *c = nullptr, *t = nullptr, *e = nullptr; + expr *c = nullptr, *t = nullptr, *e = nullptr, *a = nullptr, *b = nullptr; if (is_ite(x, c, t, e)) return m.mk_ite(c, mk_concat(t, y), mk_concat(e, y)); if (is_ite(y, c, t, e)) return m.mk_ite(c, mk_concat(x, t), mk_concat(x, e)); + if (is_disp_union(x, a, b)) return mk_union(mk_concat(a, y), mk_concat(b, y)); + if (is_disp_union(y, a, b)) return mk_union(mk_concat(x, a), mk_concat(x, b)); if (is_empty(x) || is_empty(y)) return m_seq.re.mk_empty(re_sort); if (is_eps(x)) return y; if (is_eps(y)) return x; return m_seq.re.mk_concat(x, y); }; std::function mk_compl = [&](expr* x) -> expr* { - expr *c = nullptr, *t = nullptr, *e = nullptr; + expr *c = nullptr, *t = nullptr, *e = nullptr, *a = nullptr, *b = nullptr; if (is_ite(x, c, t, e)) return m.mk_ite(c, mk_compl(t), mk_compl(e)); + if (is_disp_union(x, a, b)) return mk_inter(mk_compl(a), mk_compl(b)); // De Morgan if (is_empty(x)) return m_seq.re.mk_full_seq(re_sort); if (is_full(x)) return m_seq.re.mk_empty(re_sort); expr* inner = nullptr; diff --git a/src/ast/euf/euf_sgraph.h b/src/ast/euf/euf_sgraph.h index df23eefc6..fb7b3df29 100644 --- a/src/ast/euf/euf_sgraph.h +++ b/src/ast/euf/euf_sgraph.h @@ -182,8 +182,11 @@ namespace euf { bool is_re_proj(expr* e, expr*& state, expr*& root, unsigned& nu) const; // Build the re.proj skolem expression for π_{{root}}(state) at snapshot nu. expr_ref mk_re_proj(expr* state, expr* root, unsigned nu); - // Wrap a (possibly ite-structured) symbolic-derivative result in the - // projection operator, propagating π into every ite leaf (paper §4). + // True if e is a symbolic-character dispatch skeleton (ite / union of + // dispatch, bottoming out in ∅) rather than a concrete regex state. + bool is_char_dispatch(expr* e) const; + // Wrap a (possibly ite/union-structured) symbolic-derivative result in + // the projection operator, propagating π into every dispatch leaf (§4). expr_ref wrap_proj(expr* e, expr* root, unsigned nu); // Projection-aware Brzozowski derivative w.r.t. a character expr // (concrete or symbolic). diff --git a/src/smt/seq_model.cpp b/src/smt/seq_model.cpp index a676f0cd2..baf1146c3 100644 --- a/src/smt/seq_model.cpp +++ b/src/smt/seq_model.cpp @@ -460,13 +460,19 @@ namespace smt { expr_ref witness(m); // We checked non-emptiness during Nielsen already lbool wr = m_rewriter.some_seq_in_re(re_expr, witness); + if (wr != l_true && re->has_projection()) { + // some_seq_in_re cannot extract a witness from a projection + // operator (re.proj). Fall back to a projection-aware BFS over + // the (length-intersected) regex using the sgraph derivative. + wr = projection_witness(m_sg.mk(re_expr), witness); + } if (wr == l_true) { SASSERT(witness); m_trail.push_back(witness); m_factory->register_value(witness); return witness; } - IF_VERBOSE(1, verbose_stream() << "witness extraction failed for " << mk_pp(var->get_expr(), m) + IF_VERBOSE(1, verbose_stream() << "witness extraction failed for " << mk_pp(var->get_expr(), m) << " : " << wr << " with len " << (has_len ? len_val.to_string() : "unknown") << "\n" << mk_pp(re_expr, m) << "\n"); UNREACHABLE(); } @@ -507,6 +513,65 @@ namespace smt { return m_seq.str.mk_empty(srt); } + lbool seq_model::projection_witness(euf::snode* re0, expr_ref& witness) { + if (!re0 || !re0->get_expr()) + return l_undef; + sort* seq_sort = nullptr; + if (!m_seq.is_re(re0->get_expr(), seq_sort)) + return l_undef; + + // Shortest-path BFS over the projection-aware derivative automaton. + // Each frontier item is (state, accepting-word-so-far). A state is + // accepting when re_nullable (projection-aware) reports nullable. + // The regex carries the length intersection (∩ Σ^n), so an accepting + // run has exactly the requested length and the search is finite. + vector> work; + work.push_back({re0, zstring()}); + uint_set visited; + visited.insert(re0->id()); + + unsigned head = 0; + const unsigned MAX_STATES = 100000; + while (head < work.size() && head < MAX_STATES) { + euf::snode* st = work[head].first; + zstring w = work[head].second; + ++head; + + if (m_sg.re_nullable(st) == l_true) { + witness = m_seq.str.mk_string(w); + return l_true; + } + if (st->is_fail()) + continue; + + euf::snode_vector mts; + m_sg.compute_minterms(st, mts); + for (euf::snode* mt : mts) { + euf::snode* d = m_sg.brzozowski_deriv(st, mt); + if (!d || d->is_fail()) + continue; + if (visited.contains(d->id())) + continue; + visited.insert(d->id()); + + // Representative character of the minterm (must lie in mt so + // that δ_ch(st) = d). Minterms over the cycle alphabet are + // ranges / to_re singletons / full_char. + unsigned ch = 0; + expr* me = mt->get_expr(); + expr* lo = nullptr, *hi = nullptr, *s = nullptr; + if (m_seq.re.is_range(me, lo, hi)) + m_sg.decode_re_char(lo, ch); + else if (m_seq.re.is_to_re(me, s)) + m_sg.decode_re_char(s, ch); + // re.full_char and anything else: ch = 0 (a valid representative) + + work.push_back({d, w + zstring(ch)}); + } + } + return l_undef; + } + void seq_model::collect_var_regex_constraints(seq::nielsen_node const* sat_node) { SASSERT(sat_node); for (auto const& mem : sat_node->str_mems()) { diff --git a/src/smt/seq_model.h b/src/smt/seq_model.h index 2d2e75834..8a8e1b5f8 100644 --- a/src/smt/seq_model.h +++ b/src/smt/seq_model.h @@ -117,6 +117,14 @@ namespace smt { // to a plain fresh value from the factory. expr* mk_fresh_value(euf::snode* var); + // Witness extraction for regexes that contain a projection operator + // (re.proj), which the standard seq_rewriter::some_seq_in_re cannot + // handle. Runs a shortest-path BFS over the projection-aware + // derivative automaton (m_sg) for a nullable (accepting) state, + // building the accepting word. Returns l_true and sets `witness` + // on success. + lbool projection_witness(euf::snode* re, expr_ref& witness); + // collect per-variable regex constraints from the state. // For each positive str_mem, records the regex (or intersects // with existing) into m_var_regex keyed by the string snode id.