diff --git a/src/ast/euf/euf_snode.h b/src/ast/euf/euf_snode.h index 2e4f94213..614d9f225 100644 --- a/src/ast/euf/euf_snode.h +++ b/src/ast/euf/euf_snode.h @@ -67,7 +67,7 @@ namespace euf { inline symbol re_proj_name() { return symbol("re.proj"); } class snode { - expr *m_expr = nullptr; + expr *m_expr = nullptr; // assumed to be non-null snode_kind m_kind = snode_kind::s_var; unsigned m_id = UINT_MAX; unsigned m_num_args = 0; @@ -107,7 +107,7 @@ namespace euf { public: expr *get_expr() const { - return m_expr; + return m_expr; // assumed to be non-null } snode_kind kind() const { return m_kind; @@ -115,14 +115,28 @@ namespace euf { unsigned id() const { return m_id; } + unsigned num_args() const { return m_num_args; } - snode *arg(unsigned i) const { + + snode* arg(const unsigned i) const { SASSERT(i < m_num_args); return m_args[i]; } + snode* arg0() const { + return arg(0); + } + + snode * const* begin() const { + return m_args; + } + + snode * const* end() const { + return m_args + m_num_args; + } + // TODO: Track regex being "classical" (no complement, intersection, fail) bool is_ground() const { return m_ground; @@ -214,6 +228,28 @@ namespace euf { bool is_in_re() const { return m_kind == snode_kind::s_in_re; } + bool is_string() const { + // constant string (we assume a flat concatenation) + return is_concat() && std::ranges::all_of(begin(), end(), + [](const snode * const arg) { return arg->is_char(); }); + } + + bool is_string(zstring& str, seq_util& seq) const { + // constant string (we assume a flat concatenation) + // TODO: Optimize + if (!is_concat()) + return false; + str.reset(); + const unsigned cnt = num_args(); + for (unsigned i = 0; i < cnt; i++) { + const snode* const c = arg(i); + unsigned val; + if (!seq.is_const_char(c->get_expr(), val)) + return false; + str += zstring(val); + } + return true; + } bool is_projection() const { return m_kind == snode_kind::s_projection; } diff --git a/src/params/smt_params.cpp b/src/params/smt_params.cpp index 2bca286db..03b4795dd 100644 --- a/src/params/smt_params.cpp +++ b/src/params/smt_params.cpp @@ -59,6 +59,7 @@ void smt_params::updt_local_params(params_ref const & _p) { m_nseq_parikh = p.nseq_parikh(); m_nseq_regex_precheck = p.nseq_regex_precheck(); m_nseq_regex_factorization_threshold = p.nseq_regex_factorization_threshold(); + m_nseq_regex_factorization_eager = p.nseq_regex_factorization_eager(); m_nseq_signature = p.nseq_signature(); m_nseq_axiomatize_diseq = p.nseq_axiomatize_diseq(); m_up_persist_clauses = p.up_persist_clauses(); @@ -175,6 +176,7 @@ void smt_params::display(std::ostream & out) const { DISPLAY_PARAM(m_nseq_parikh); DISPLAY_PARAM(m_nseq_regex_precheck); DISPLAY_PARAM(m_nseq_regex_factorization_threshold); + DISPLAY_PARAM(m_nseq_regex_factorization_eager); DISPLAY_PARAM(m_nseq_axiomatize_diseq); DISPLAY_PARAM(m_profile_res_sub); diff --git a/src/params/smt_params.h b/src/params/smt_params.h index 5fc9744bb..174016bb2 100644 --- a/src/params/smt_params.h +++ b/src/params/smt_params.h @@ -254,6 +254,7 @@ struct smt_params : public preprocessor_params, bool m_nseq_parikh = false; bool m_nseq_regex_precheck = true; unsigned m_nseq_regex_factorization_threshold = 1; + bool m_nseq_regex_factorization_eager = false; bool m_nseq_signature = false; bool m_nseq_axiomatize_diseq = false; diff --git a/src/params/smt_params_helper.pyg b/src/params/smt_params_helper.pyg index eb4c5d4f9..54848eb59 100644 --- a/src/params/smt_params_helper.pyg +++ b/src/params/smt_params_helper.pyg @@ -138,6 +138,7 @@ def_module_params(module_name='smt', ('nseq.parikh', BOOL, False, 'enable Parikh image checks in nseq solver'), ('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'), ('nseq.regex_factorization_threshold', UINT, 1, 'maximum number of cases to factor a classical regex into in a single step (gives completeness on classical regexes)'), + ('nseq.regex_factorization_eager', BOOL, False, 'apply regex factorization (sigma splitting) eagerly in the theory interface (propagate_pos_mem) instead of lazily inside the Nielsen graph'), ('nseq.signature', BOOL, False, 'enable heuristic signature-based string equation splitting in Nielsen solver'), ('nseq.axiomatize_diseq', BOOL, False, 'eagerly axiomatize sequence disequalities'), ('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'), diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 8a81a5c59..13f37e73d 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -379,8 +379,8 @@ namespace seq { ast_manager& m = graph().get_manager(); if (s.is_char_subst()) { - expr* var_c_expr = s.m_var->arg(0)->get_expr(); - expr* repl_c_expr = s.m_replacement->arg(0)->get_expr(); + expr* var_c_expr = s.m_var->arg0()->get_expr(); + expr* repl_c_expr = s.m_replacement->arg0()->get_expr(); add_constraint( constraint(m.mk_eq(var_c_expr, repl_c_expr), s.m_dep, m)); @@ -835,7 +835,7 @@ namespace seq { } if (ub.is_one()) { // base^1 → base - euf::snode* base_sn = tok->arg(0); + euf::snode* base_sn = tok->arg0(); if (base_sn) { dep = node->graph().dep_mgr().mk_join(dep, ub_dep); result.push_back(base_sn); @@ -900,7 +900,7 @@ namespace seq { // Skip at leading position (i == 0) to avoid undoing power unwinding: // unwind produces u · u^(n-1); merging it back to u^n creates an infinite cycle. if (i > 0 && t->is_power()) { - euf::snode* pow_base = t->arg(0); + euf::snode* pow_base = t->arg0(); if (pow_base) { euf::snode_vector pb_tokens; collect_tokens_dir(pow_base, fwd, pb_tokens); @@ -1455,7 +1455,7 @@ namespace seq { euf::snode* end_tok = dir_token(pow_side, fwd); if (!end_tok || !end_tok->is_power()) continue; - euf::snode* base_sn = end_tok->arg(0); + euf::snode* base_sn = end_tok->arg0(); expr* pow_exp = get_power_exp_expr(end_tok, seq); if (!base_sn || !pow_exp) continue; @@ -1659,7 +1659,7 @@ namespace seq { expr_ref d(rw.mk_derivative(mem.m_regex->get_expr()), m); // Extract the inner char expression from seq.unit(?inner) - expr *inner_char = tok->arg(0)->get_expr(); + expr *inner_char = tok->arg0()->get_expr(); // substitute the inner char for the derivative variable in d var_subst vs(m); @@ -2236,7 +2236,7 @@ namespace seq { euf::snode* other_head = (side == 0) ? rh : lh; if (!pow_head || !pow_head->is_power() || !other_head || !other_head->is_char()) continue; - euf::snode* base_sn = pow_head->arg(0); + euf::snode* base_sn = pow_head->arg0(); if (!base_sn) continue; euf::snode* base_head = dir_token(base_sn, fwd); if (!base_head || !base_head->is_char()) continue; @@ -2853,7 +2853,7 @@ namespace seq { if (apply_gpower_intr(node)) return ++m_stats.m_mod_gpower_intr, true; - // Priority 8: Regex Factorization (Boolean Closure) + // Priority 8: Regex Factorization if (apply_regex_factorization(node)) return ++m_stats.m_mod_regex_factorization, true; @@ -3058,7 +3058,7 @@ namespace seq { return false; SASSERT(power->is_power() && power->num_args() >= 1); - euf::snode *base = power->arg(0); + euf::snode *base = power->arg0(); nielsen_node* child; nielsen_edge* e; @@ -3109,7 +3109,7 @@ namespace seq { if (lhead->num_args() < 1 || rhead->num_args() < 1) continue; // same base: compare the two powers - if (lhead->arg(0) != rhead->arg(0)) + if (lhead->arg0() != rhead->arg0()) continue; // Skip if the exponents differ by a constant — simplify_and_init's @@ -3169,7 +3169,7 @@ namespace seq { euf::snode* end_tok = dir_token(pow_side, fwd); if (!end_tok || !end_tok->is_power()) continue; - euf::snode* base_sn = end_tok->arg(0); + euf::snode* base_sn = end_tok->arg0(); expr* pow_exp = get_power_exp_expr(end_tok, m_seq); // NB: Shuvendu - this test is also redundant if (!base_sn || !pow_exp) @@ -3223,7 +3223,7 @@ namespace seq { return false; SASSERT(power->is_power() && power->num_args() >= 1); - euf::snode *base = power->arg(0); + euf::snode *base = power->arg0(); expr *exp_n = get_power_exponent(power); expr *zero = a.mk_int(0); expr *one = a.mk_int(1); @@ -3582,62 +3582,157 @@ namespace seq { // Modifier: apply_regex_factorization (Boolean Closure) // ----------------------------------------------------------------------- - struct tau_pair { - expr_ref m_p; - expr_ref m_q; - tau_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) { - SASSERT(p); - SASSERT(q); + // Cross-product intersection of two split-sets (split algebra): + // S1 ⊓ S2 = { ⟨Δ1⊓Δ2, ∇1⊓∇2⟩ | ⟨Δ1,∇1⟩∈S1, ⟨Δ2,∇2⟩∈S2 } + // Pairs where either component is the empty regex are dropped (∅⊓r ≡ ∅). + static bool intersect_sigma_pairs(ast_manager& m, seq_util& seq, + sigma_pairs const& s1, sigma_pairs const& s2, sigma_pairs& result, unsigned threshold) { + for (auto const& p1 : s1) { + for (auto const& p2 : s2) { + if (seq.re.is_empty(p1.m_p) || seq.re.is_empty(p2.m_p) || + seq.re.is_empty(p1.m_q) || seq.re.is_empty(p2.m_q)) + continue; + result.push_back(sigma_pair(seq.re.mk_inter(p1.m_p, p2.m_p), + seq.re.mk_inter(p1.m_q, p2.m_q), m)); + if (result.size() > threshold) + return false; + } } - }; - typedef vector tau_pairs; + return true; + } - static void compute_tau(ast_manager& m, seq_util& seq, euf::sgraph& sg, expr* r, tau_pairs& result) { + // Complement of a split-set via De Morgan: ~S = ⊓_{s∈S} ~s, + // ~⟨Δ,∇⟩ = { ⟨~Δ, .*⟩, ⟨.*, ~∇⟩ } and ~{} = { ⟨.*, .*⟩ }. + // str_sort is the sequence-element sort; mk_full_seq needs the regex sort. + // May produce up to 2^|sp| pairs (bounded downstream by the factorization threshold). + static bool complement_sigma_pairs(ast_manager& m, seq_util& seq, sort* str_sort, + sigma_pairs const& sp, sigma_pairs& result, unsigned threshold) { + + sort* re_sort = seq.re.mk_re(str_sort); + const expr_ref full(seq.re.mk_full_seq(re_sort), m); // .* + if (sp.empty()) { // ~{} = ⟨.*, .*⟩ + result.push_back(sigma_pair(full, full, m)); + return true; + } + sigma_pairs acc; + acc.push_back(sigma_pair(seq.re.mk_complement(sp[0].m_p), full, m)); + acc.push_back(sigma_pair(full, seq.re.mk_complement(sp[0].m_q), m)); + for (unsigned i = 1; i < sp.size(); ++i) { + sigma_pairs next; + next.push_back(sigma_pair(seq.re.mk_complement(sp[i].m_p), full, m)); + next.push_back(sigma_pair(full, seq.re.mk_complement(sp[i].m_q), m)); + sigma_pairs tmp; + if (intersect_sigma_pairs(m, seq, acc, next, tmp, threshold) || tmp.empty()) + break; + acc = std::move(tmp); + if (acc.size() > threshold) + return false; + } + result.append(acc); + return true; + } + + bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold) { SASSERT(r); sort* str_sort = nullptr; - if (!seq.is_re(r, str_sort)) return; - expr *body = nullptr; + if (!seq.is_re(r->get_expr(), str_sort)) + return false; + std::cout << "Computing sigma of " << snode_label_html(r, m, false) << std::endl; - if (seq.re.is_epsilon(r)) { + if (r->is_empty()) { const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, eps, m)); + result.push_back(sigma_pair(eps, eps, m)); + return true; } - else if (seq.str.is_unit(r) || seq.str.is_string(r) || seq.re.is_range(r) || seq.re.is_full_char(r) || - (seq.re.is_to_re(r) && seq.str.is_string(to_app(r)->get_arg(0)))) { - if (seq.re.is_to_re(r)) { - const expr * arg = to_app(r)->get_arg(0); + if (r->is_to_re()) { + const euf::snode* const c = r->arg0(); + if (c->is_range()) { + const expr_ref ex(c->get_expr(), m); + const expr_ref eps(seq.re.mk_epsilon(str_sort), m); + + result.push_back(sigma_pair(eps, ex, m)); + result.push_back(sigma_pair(ex, eps, m)); + return true; + } + if (c->is_empty()) { + const expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(sigma_pair(eps, eps, m)); + return true; + } + if (c->is_char()) { + unsigned val; + VERIFY(seq.is_const_char(c->arg0()->get_expr(), val)); + const expr_ref ex(seq.re.mk_to_re(seq.str.mk_string(zstring(val))), m); + const expr_ref eps(seq.re.mk_epsilon(str_sort), m); + + result.push_back(sigma_pair(eps, ex, m)); + result.push_back(sigma_pair(ex, eps, m)); + return true; + } + if (c->is_string()) { + const euf::snode * arg = r->arg0(); zstring s; - if (seq.str.is_string(arg, s) && s.length() > 1) { + if (arg->is_string(s, seq) && s.length() > 1) { for (unsigned i = 0; i <= s.length(); ++i) { expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m); expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m); - result.push_back(tau_pair(p, q, m)); + result.push_back(sigma_pair(p, q, m)); } - return; } + return true; } + UNREACHABLE(); + return false; + } + if (r->is_full_char()) { + const expr_ref ex(r->get_expr(), m); const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, r, m)); - result.push_back(tau_pair(r, eps, m)); + + result.push_back(sigma_pair(eps, ex, m)); + result.push_back(sigma_pair(ex, eps, m)); + return true; } - else if (seq.re.is_empty(r)) { - // empty set has no splits - } - else if (seq.re.is_union(r)) { - for (expr* arg : *to_app(r)) { - compute_tau(m, seq, sg, arg, result); + if (r->is_union()) { + // σ(r₁ ⊔ r₂) = σ(r₁) ∪ σ(r₂) + SASSERT(r->num_args() >= 2); + for (const euf::snode* const arg : *r) { + if (!compute_sigma(m, seq, rw, arg, result, threshold)) + return false; } + return true; } - else if (seq.re.is_concat(r)) { - const unsigned num_args = to_app(r)->get_num_args(); - if (num_args == 0) { - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, eps, m)); - return; + if (r->is_intersect()) { + // σ(r₁ ⊓ r₂ ⊓ …) = σ(r₁) ⊓ σ(r₂) ⊓ …; empty intersection (0 args) = {⟨.*,.*⟩} + const unsigned n = r->num_args(); + SASSERT(n >= 2); + sigma_pairs current; + if (!compute_sigma(m, seq, rw, r->arg0(), current, threshold)) + return false; + for (unsigned i = 1; i < n && !current.empty(); ++i) { + sigma_pairs arg_i; + compute_sigma(m, seq, rw, r->arg(i), arg_i, threshold); + sigma_pairs tmp; + if (!intersect_sigma_pairs(m, seq, current, arg_i, tmp, threshold)) + return false; + current = std::move(tmp); } - for (unsigned i = 0; i < num_args; ++i) { - tau_pairs tau_arg; - compute_tau(m, seq, sg, to_app(r)->get_arg(i), tau_arg); + result.append(current); + return true; + } + if (r->is_complement()) { + // σ(~r) = ~σ(r) + sigma_pairs body_pairs; + if (!compute_sigma(m, seq, rw, r->arg0(), body_pairs, threshold)) + return false; + return complement_sigma_pairs(m, seq, str_sort, body_pairs, result, threshold); + } + if (r->is_concat()) { + const unsigned n = r->num_args(); + SASSERT(n >= 2); + for (unsigned i = 0; i < n; ++i) { + sigma_pairs sigma_arg; + if (!compute_sigma(m, seq, rw, r->arg(i), sigma_arg, threshold)) + return false; expr_ref left(m); expr_ref right(m); @@ -3646,58 +3741,116 @@ namespace seq { left = seq.re.mk_epsilon(str_sort); else { for (unsigned j = 0; j < i; ++j) { - const auto arg = to_app(r)->get_arg(j); - left = left ? seq.re.mk_concat(left, arg) : arg; + const euf::snode* arg = r->arg(j); + left = left ? seq.re.mk_concat(left, arg->get_expr()) : arg->get_expr(); } } - if (i == num_args - 1) + if (i == n - 1) right = seq.re.mk_epsilon(str_sort); else { - for (unsigned j = i + 1; j < num_args; ++j) { - const auto arg = to_app(r)->get_arg(j); - right = right ? seq.re.mk_concat(right, arg) : arg; + for (unsigned j = i + 1; j < n; ++j) { + const euf::snode* arg = r->arg(j); + right = right ? seq.re.mk_concat(right, arg->get_expr()) : arg->get_expr(); } } - for (auto const &[tp, tq] : tau_arg) { - seq_rewriter rw(m); - auto p = rw.mk_re_append(left, tp); - auto q = rw.mk_re_append(tq, right); - result.push_back(tau_pair(p, q, m)); + for (auto const &[tp, tq] : sigma_arg) { + expr_ref p = rw.mk_re_append(left, tp); + expr_ref q = rw.mk_re_append(tq, right); + result.push_back(sigma_pair(p, q, m)); } } + return true; } - else if (seq.re.is_star(r, body) || seq.re.is_plus(r, body)) { - if (seq.re.is_plus(r)) { - const expr_ref star(seq.re.mk_star(body), m); - const expr_ref concat(seq.re.mk_concat(body, star), m); - compute_tau(m, seq, sg, concat, result); - return; - } + if (r->is_star()) { + const euf::snode* body = r->arg0(); + const expr_ref eps(seq.re.mk_epsilon(str_sort), m); + result.push_back(sigma_pair(eps, eps, m)); - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, eps, m)); - - tau_pairs tau_body; - compute_tau(m, seq, sg, body, tau_body); - for (auto const &[tp, tq] : tau_body) { - seq_rewriter rw(m); - auto p = rw.mk_re_append(r, tp); - auto q = rw.mk_re_append(tq, r); - result.push_back(tau_pair(p, q, m)); + sigma_pairs sigma_body; + if (!compute_sigma(m, seq, rw, body, sigma_body, threshold)) + return false; + for (auto const &[tp, tq] : sigma_body) { + auto p = rw.mk_re_append(r->get_expr(), tp); + auto q = rw.mk_re_append(tq, r->get_expr()); + result.push_back(sigma_pair(p, q, m)); } + return true; } - else if (seq.re.is_opt(r, body)) { - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, eps, m)); - compute_tau(m, seq, sg, body, result); + if (r->is_plus()) { + // r⁺ = r·r* ; by Kleene factorization σ(r⁺) = r*·σ(r)·r*. + // Same shape as the star rule but with the surrounding factor being + // body* without the {⟨ε,ε⟩} pair + const euf::snode* body = r->arg0(); + const expr_ref star(seq.re.mk_star(body->get_expr()), m); // body* + + sigma_pairs sigma_body; + if (!compute_sigma(m, seq, rw, body, sigma_body, threshold)) + return false; + for (auto const &[tp, tq] : sigma_body) { + auto p = rw.mk_re_append(star, tp); // body* · tp + auto q = rw.mk_re_append(tq, star); // tq · body* + result.push_back(sigma_pair(p, q, m)); + } + return true; } - else { - const expr_ref eps(seq.re.mk_epsilon(str_sort), m); - result.push_back(tau_pair(eps, r, m)); - result.push_back(tau_pair(r, eps, m)); + // the simplifier should have eliminated everything else already + UNREACHABLE(); + return false; + } + + void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg) { + return; // For now + if (pairs.size() <= 1) + return; + // Guard against pathological cost: subsumption is O(n^2) language-subset + // BFS checks. Large split-sets are left to the factorization threshold. + if (pairs.size() > 64) + return; + + struct row { euf::snode* p; euf::snode* q; unsigned idx; }; + + // Materialise snodes once; drop pairs with a structurally-empty component. + vector rows; + for (unsigned i = 0; i < pairs.size(); ++i) { + euf::snode* p = sg.mk(pairs[i].m_p); + euf::snode* q = sg.mk(pairs[i].m_q); + if (sr.is_empty_regex(p) || sr.is_empty_regex(q)) + continue; + rows.push_back({ p, q, i }); } + + // a subsumes b iff L(b.p) ⊆ L(a.p) and L(b.q) ⊆ L(a.q). + // is_language_subset may return l_undef (inconclusive); only treat a + // definite l_true as subsumption, so we never drop a needed split. + auto subsumes = [&](row const& a, row const& b) { + return sr.is_language_subset(b.p, a.p) == l_true && + sr.is_language_subset(b.q, a.q) == l_true; + }; + + vector kept; + for (row const& r : rows) { + bool redundant = false; + for (row const& k : kept) + if (subsumes(k, r)) { redundant = true; break; } + if (redundant) + continue; + // drop already-kept rows strictly subsumed by r + unsigned w = 0; + for (unsigned t = 0; t < kept.size(); ++t) { + if (subsumes(r, kept[t])) + continue; + kept[w++] = kept[t]; + } + kept.shrink(w); + kept.push_back(r); + } + + sigma_pairs result; + for (row const& k : kept) + result.push_back(pairs[k.idx]); + pairs.swap(result); } bool nielsen_graph::apply_regex_factorization(nielsen_node* node) { @@ -3713,7 +3866,9 @@ namespace seq { for (str_mem const& mem : node->str_mems()) { SASSERT(mem.well_formed()); - if (mem.m_str->is_empty() || mem.is_primitive() || !mem.m_regex->is_classical()) + // compute_sigma handles all regex forms (incl. complement / intersection), + // so the classical restriction is no longer needed. + if (mem.m_str->is_empty() || mem.is_primitive()) continue; euf::snode* first = mem.m_str->first(); @@ -3721,10 +3876,14 @@ namespace seq { SASSERT(!first->is_char()); euf::snode* tail = m_sg.drop_first(mem.m_str); SASSERT(tail); - std::cout << "Processing: " << mem_pp(mem, m) << std::endl; - tau_pairs pairs; - compute_tau(m, m_seq, m_sg, mem.m_regex->get_expr(), pairs); + sigma_pairs pairs; + seq_rewriter rw(m); + if (!compute_sigma(m, m_seq, rw, mem.m_regex, pairs, m_regex_factorization_threshold)) + continue; + + if (m_seq_regex) + simplify_sigma_pairs(pairs, *m_seq_regex, m_sg); vector feasible; dep_tracker eliminated_dep = mem.m_dep; @@ -4222,7 +4381,7 @@ namespace seq { return false; SASSERT(power->is_power() && power->num_args() >= 1); - euf::snode* base = power->arg(0); + euf::snode* base = power->arg0(); const expr_ref zero(a.mk_int(0), m); // Branch 1: enumerate all decompositions of the base. @@ -4331,7 +4490,7 @@ namespace seq { return false; SASSERT(power->is_power() && power->num_args() >= 1); - euf::snode* base = power->arg(0); + euf::snode* base = power->arg0(); expr* exp_n = get_power_exponent(power); SASSERT(exp_n); const expr_ref zero(a.mk_int(0), m); @@ -4375,7 +4534,7 @@ namespace seq { return false; SASSERT(power->is_power() && power->num_args() >= 1); - euf::snode* base = power->arg(0); + euf::snode* base = power->arg0(); expr* exp_n = get_power_exponent(power); SASSERT(exp_n); const expr_ref zero(a.mk_int(0), m); @@ -4535,7 +4694,7 @@ namespace seq { return expr_ref(a.mk_int(1), m); if (n->is_concat()) { - const expr_ref left = compute_length_expr(n->arg(0)); + const expr_ref left = compute_length_expr(n->arg0()); const expr_ref right = compute_length_expr(n->arg(1)); return expr_ref(a.mk_add(left, right), m); } @@ -4562,8 +4721,8 @@ namespace seq { expr_ref len_lhs = compute_length_expr(eq.m_lhs); expr_ref len_rhs = compute_length_expr(eq.m_rhs); TRACE(seq, - tout << "Length constraint from LHS " << snode_label_html(eq.m_lhs, m) << " to " << len_lhs << ":\n"; - tout << "Length constraint from RHS " << snode_label_html(eq.m_rhs, m) << " to " << len_rhs << "\n"); + tout << "Length constraint from LHS " << snode_label_html(eq.m_lhs, m, true) << " to " << len_lhs << ":\n"; + tout << "Length constraint from RHS " << snode_label_html(eq.m_rhs, m, true) << " to " << len_rhs << "\n"); expr_ref len_eq(m.mk_eq(len_lhs, len_rhs), m); constraints.push_back(length_constraint(len_eq, eq.m_dep, length_kind::eq, true, m)); diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index cbd8b2dac..eb08c110f 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -257,9 +257,34 @@ namespace seq { class seq_parikh; class seq_regex; // forward declaration (defined in smt/seq/seq_regex.h) - std::string snode_label_html(euf::snode const* n, obj_map& names, uint64_t& next_id, ast_manager& m); + std::string snode_label_html(euf::snode const* n, + obj_map& names, uint64_t& next_id, ast_manager& m, bool html_escape); - std::string snode_label_html(euf::snode const* n, ast_manager& m); + std::string snode_label_html(euf::snode const* n, ast_manager& m, bool html_escape); + + // Split-pair produced by compute_sigma: uv |= r iff exists i: u |= m_p[i] and v |= m_q[i] + struct sigma_pair { + expr_ref m_p; + expr_ref m_q; + sigma_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) { + SASSERT(p && q); + } + }; + typedef vector sigma_pairs; + + // Compute the split-set sigma(r) per the splitting rules of the paper + // "Extended Regular Expression Membership". Generalises the classical compute_tau + // with intersection and complement cases via the split-set algebra. + // `result` is appended to (not cleared). + bool compute_sigma(ast_manager& m, seq_util& seq, seq_rewriter& rw, const euf::snode* r, sigma_pairs& result, unsigned threshold); + + // Simplify a split-set in place using the split algebra's language-level rules + // (paper section "split-set simplification heuristics"): drop pairs with an + // empty-language component, and drop any split subsumed by another + // ( is subsumed by iff L(D_i) subseteq L(D_j) and L(N_i) subseteq L(N_j)). + // Subsumption tames the 2^k blow-up of sigma(~r) (e.g. sigma(~a*): 8 -> 2). + // Requires the regex emptiness/subset checker and an sgraph to build snodes. + void simplify_sigma_pairs(sigma_pairs& pairs, seq_regex& sr, euf::sgraph& sg); // simplification result for constraint processing // mirrors ZIPT's SimplifyResult enum @@ -377,8 +402,8 @@ namespace seq { // string equality constraint: lhs = rhs // mirrors ZIPT's StrEq (both sides are regex-free snode trees) struct str_eq { - euf::snode* m_lhs; - euf::snode* m_rhs; + euf::snode* m_lhs; // assumed to be non-null + euf::snode* m_rhs; // assumed to be non-null dep_tracker m_dep; str_eq(euf::snode* lhs, euf::snode* rhs, dep_tracker const& dep): @@ -400,7 +425,8 @@ namespace seq { bool contains_var(euf::snode* var) const; bool well_formed() const { - return !!m_lhs && !!m_rhs; + // assumed to be always true + return m_lhs && m_rhs; } }; @@ -411,13 +437,15 @@ namespace seq { }; inline std::ostream &operator<<(std::ostream &out, eq_pp const &p) { - return out << mk_pp(p.eq.m_lhs->get_expr(), p.m) << " == " << mk_pp(p.eq.m_rhs->get_expr(), p.m) << "\n"; + return out << snode_label_html(p.eq.m_lhs, p.m, false) + << " == " + << snode_label_html(p.eq.m_rhs, p.m, false); } // string disequality constraint: lhs != rhs struct str_deq { - euf::snode* m_lhs; - euf::snode* m_rhs; + euf::snode* m_lhs; // assumed to be non-null + euf::snode* m_rhs; // assumed to be non-null dep_tracker m_dep; str_deq(euf::snode* lhs, euf::snode* rhs, dep_tracker const& dep): @@ -440,7 +468,8 @@ namespace seq { } bool well_formed() const { - return !!m_lhs && !!m_rhs; + // assumed to be always true + return m_lhs && m_rhs; } }; @@ -451,17 +480,18 @@ namespace seq { }; inline std::ostream &operator<<(std::ostream &out, deq_pp const &p) { - return out << mk_pp(p.deq.m_lhs->get_expr(), p.m) << " != " << mk_pp(p.deq.m_rhs->get_expr(), p.m) << "\n"; + return out << snode_label_html(p.deq.m_lhs, p.m, false) + << " != " + << snode_label_html(p.deq.m_rhs, p.m, false); } // regex membership constraint: str in regex // mirrors ZIPT's StrMem struct str_mem { - euf::snode* m_str; - euf::snode* m_regex; + euf::snode* m_str; // assumed to be non-null + euf::snode* m_regex; // assumed to be non-null dep_tracker m_dep; - // str_mem(): m_str(nullptr), m_regex(nullptr), m_dep(nullptr) {} str_mem(euf::snode* str, euf::snode* regex, dep_tracker const& dep): m_str(str), m_regex(regex), m_dep(dep) {} @@ -481,17 +511,21 @@ namespace seq { bool contains_var(euf::snode* var) const; bool well_formed() const { - return !!m_str && !!m_regex; + // assumed to be always true + return m_str && m_regex; } }; struct mem_pp { str_mem const& mem; ast_manager &m; - mem_pp(str_mem const& mem, ast_manager& m) : m(m), mem(mem) {} + mem_pp(str_mem const& mem, ast_manager& m) : mem(mem), m(m) {} }; inline std::ostream &operator<<(std::ostream &out, mem_pp const &p) { - return out << mk_pp(p.mem.m_str->get_expr(), p.m) << " in " << mk_pp(p.mem.m_regex->get_expr(), p.m) << "\n"; + return out + << snode_label_html(p.mem.m_str, p.m, false) + << " in " + << snode_label_html(p.mem.m_regex, p.m, false); } // string variable substitution: var -> replacement @@ -907,6 +941,7 @@ namespace seq { bool m_parikh_enabled = true; bool m_signature_split = false; unsigned m_regex_factorization_threshold = 1; + bool m_regex_factorization_eager = false; unsigned m_fresh_cnt = 0; nielsen_stats m_stats; @@ -1050,6 +1085,7 @@ namespace seq { void set_signature_split(bool e) { m_signature_split = e; } void set_regex_factorization_threshold(unsigned max) { m_regex_factorization_threshold = max; } + void set_regex_factorization_eager(bool e) { m_regex_factorization_eager = e; } // display for debugging std::ostream& display(std::ostream& out) const; diff --git a/src/smt/seq/seq_nielsen_pp.cpp b/src/smt/seq/seq_nielsen_pp.cpp index dec2cd626..605a037b4 100644 --- a/src/smt/seq/seq_nielsen_pp.cpp +++ b/src/smt/seq/seq_nielsen_pp.cpp @@ -92,7 +92,7 @@ namespace seq { out << "nielsen_graph with " << m_nodes.size() << " nodes, " << m_edges.size() << " edges\n"; - for (nielsen_node* n : m_nodes) + for (const nielsen_node * n : m_nodes) display(out, n); return out; @@ -105,10 +105,12 @@ namespace seq { // ----------------------------------------------------------------------- // Helper: HTML-escape a string and replace literal \n with
. - static std::string dot_html_escape(std::string const& s) { + static std::string dot_html_escape(std::string const& s, const bool html_escape) { + if (!html_escape) + return s; std::string r; r.reserve(s.size()); - for (char c : s) { + for (const char c : s) { switch (c) { case '&': r += "&"; break; case '<': r += "<"; break; @@ -130,7 +132,7 @@ namespace seq { return result; } - std::string decode_recursive_name(expr* e, ast_manager& m) { + std::string decode_recursive_name(expr* e, ast_manager& m, bool html_escape) { SASSERT(e && is_app(e)); th_rewriter rw(m); const skolem sk(m, rw); @@ -141,7 +143,7 @@ namespace seq { } if (to_app(arg)->get_num_args() != 0) return ""; - std::string s = dot_html_escape(to_app(arg)->get_decl()->get_name().str()); + std::string s = dot_html_escape(to_app(arg)->get_decl()->get_name().str(), html_escape); if (cnt == 0) return s; return s + "[" + std::to_string(cnt) + "]"; @@ -159,7 +161,7 @@ namespace seq { // Helper: render an arithmetic/integer expression in infix HTML notation. // Recognises +, -, *, unary minus, numerals, str.len, and named constants; // falls back to HTML-escaped mk_pp for anything else. - static std::string arith_expr_html(expr* e, obj_map& names, uint64_t& next_id, ast_manager& m) { + static std::string arith_expr_html(expr* e, obj_map& names, uint64_t& next_id, ast_manager& m, bool html_escape) { if (!e) return "null"; arith_util arith(m); seq_util seq(m); @@ -169,39 +171,39 @@ namespace seq { if (!is_app(e)) { std::ostringstream os; os << mk_bounded_pp(e, m); - return dot_html_escape(os.str()); + return dot_html_escape(os.str(), html_escape); } app* a = to_app(e); expr* x, * y; if (m.is_or(e)) { app* ap = to_app(e); std::string res; - res = arith_expr_html(ap->get_arg(0), names, next_id, m); + res = arith_expr_html(ap->get_arg(0), names, next_id, m, html_escape); for (unsigned i = 1; i < ap->get_num_args(); ++i) { res += " || "; - res += arith_expr_html(ap->get_arg(i), names, next_id, m); + res += arith_expr_html(ap->get_arg(i), names, next_id, m, html_escape); } return res; } if (m.is_not(e, x)) - return "!(" + arith_expr_html(x, names, next_id, m) + ")"; + return "!(" + arith_expr_html(x, names, next_id, m, html_escape) + ")"; if (arith.is_lt(e, x, y)) { - return arith_expr_html(x, names, next_id, m) + " < " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " < " + arith_expr_html(y, names, next_id, m, html_escape); } if (arith.is_gt(e, x, y)) { - return arith_expr_html(x, names, next_id, m) + " > " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " > " + arith_expr_html(y, names, next_id, m, html_escape); } if (arith.is_le(e, x, y)) { - return arith_expr_html(x, names, next_id, m) + " ≤ " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " ≤ " + arith_expr_html(y, names, next_id, m, html_escape); } if (arith.is_ge(e, x, y)) { - return arith_expr_html(x, names, next_id, m) + " ≥ " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " ≥ " + arith_expr_html(y, names, next_id, m, html_escape); } if (m.is_eq(e, x, y)) { - return arith_expr_html(x, names, next_id, m) + " = " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " = " + arith_expr_html(y, names, next_id, m, html_escape); } if (arith.is_add(e)) { - std::string r = arith_expr_html(a->get_arg(0), names, next_id, m); + std::string r = arith_expr_html(a->get_arg(0), names, next_id, m, html_escape); for (unsigned i = 1; i < a->get_num_args(); ++i) { expr* arg = a->get_arg(i); // render (+ x (- y)) as "x - y" and (+ x (- n)) as "x - n" @@ -209,32 +211,32 @@ namespace seq { rational neg_val; if (arith.is_uminus(arg, neg_inner)) { r += " − "; // minus sign - r += arith_expr_html(neg_inner, names, next_id, m); + r += arith_expr_html(neg_inner, names, next_id, m, html_escape); } else if (arith.is_numeral(arg, neg_val) && neg_val.is_neg()) { r += " − "; // minus sign r += (-neg_val).to_string(); } else { r += " + "; - r += arith_expr_html(arg, names, next_id, m); + r += arith_expr_html(arg, names, next_id, m, html_escape); } } return r; } if (arith.is_sub(e, x, y)) - return arith_expr_html(x, names, next_id, m) + " − " + arith_expr_html(y, names, next_id, m); + return arith_expr_html(x, names, next_id, m, html_escape) + " − " + arith_expr_html(y, names, next_id, m, html_escape); if (arith.is_uminus(e, x)) - return "−" + arith_expr_html(x, names, next_id, m); + return "−" + arith_expr_html(x, names, next_id, m, html_escape); if (arith.is_mul(e)) { std::string r; for (unsigned i = 0; i < a->get_num_args(); ++i) { if (i > 0) r += " · "; // middle dot - r += arith_expr_html(a->get_arg(i), names, next_id, m); + r += arith_expr_html(a->get_arg(i), names, next_id, m, html_escape); } return r; } if (seq.str.is_length(e, x)) { - std::string name = decode_recursive_name(x, m); + std::string name = decode_recursive_name(x, m, html_escape); if (!name.empty()) { return "|" + name + "|"; } @@ -253,28 +255,23 @@ namespace seq { // named constant, fresh variable like n!0 if (a->get_num_args() == 0) - return dot_html_escape(a->get_decl()->get_name().str()); + return dot_html_escape(a->get_decl()->get_name().str(), html_escape); if (names.contains(e)) return names[e]; std::stringstream ss; ss << mk_bounded_pp(e, m); - std::string s = dot_html_escape(ss.str()); + std::string s = dot_html_escape(ss.str(), html_escape); names.insert(e, s); return s; - - // fallback - std::ostringstream os; - os << mk_pp(e, m); - return dot_html_escape(os.str()); } // Helper: render a constraint as an HTML string for DOT edge labels. static std::string constraint_html(constraint const& ic, obj_map& names, uint64_t& next_id, ast_manager& m) { - if (ic.fml) return arith_expr_html(ic.fml, names, next_id, m); + if (ic.fml) return arith_expr_html(ic.fml, names, next_id, m, true); return "null"; } - static std::string regex_expr_html(expr* e, obj_map& names, uint64_t& next_id, ast_manager& m, seq_util& seq) { + static std::string regex_expr_html(expr* e, obj_map& names, uint64_t& next_id, ast_manager& m, seq_util& seq, bool html_escape) { if (!e) return "null"; expr* a = nullptr, * b = nullptr; @@ -296,14 +293,14 @@ namespace seq { } if (seq.str.is_string(arg, s)) { if (!first) os << " "; - os << "\"" + dot_html_escape(s.encode()) + "\""; + os << "\"" + dot_html_escape(s.encode(), html_escape) + "\""; first = false; continue; } unsigned ch_val = 0; if (seq.str.is_unit(arg) && seq.is_const_char(to_app(arg)->get_arg(0), ch_val)) { if (!first) os << " "; - os << "\"" + dot_html_escape(zstring(ch_val).encode()) + "\""; + os << "\"" + dot_html_escape(zstring(ch_val).encode(), html_escape) + "\""; first = false; continue; } @@ -311,7 +308,7 @@ namespace seq { os << mk_pp(arg, m); first = false; } - return dot_html_escape(os.str()); + return dot_html_escape(os.str(), html_escape); } unsigned c; if (seq.is_const_char(e, c)) @@ -324,16 +321,16 @@ namespace seq { if (i > 0) res += " "; bool needs_parens = seq.re.is_union(ap->get_arg(i)); if (needs_parens) res += "("; - res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq); + res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq, html_escape); if (needs_parens) res += ")"; } return res; } if (m.is_ite(e)) { app* ap = to_app(e); - std::string cond = arith_expr_html(ap->get_arg(0), names, next_id, m); - std::string t = regex_expr_html(ap->get_arg(1), names, next_id, m, seq); - std::string f = regex_expr_html(ap->get_arg(2), names, next_id, m, seq); + std::string cond = arith_expr_html(ap->get_arg(0), names, next_id, m, html_escape); + std::string t = regex_expr_html(ap->get_arg(1), names, next_id, m, seq, html_escape); + std::string f = regex_expr_html(ap->get_arg(2), names, next_id, m, seq, html_escape); return cond + " ? (" + t + ") : (" + f + ")"; } if (seq.re.is_union(e)) { @@ -341,10 +338,10 @@ namespace seq { std::string res; if (ap->get_num_args() == 0) return "∅"; - res = regex_expr_html(ap->get_arg(0), names, next_id, m, seq); + res = regex_expr_html(ap->get_arg(0), names, next_id, m, seq, html_escape); for (unsigned i = 1; i < ap->get_num_args(); ++i) { res += " | "; - res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq); + res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq, html_escape); } return res; } @@ -355,7 +352,7 @@ namespace seq { if (i > 0) res += " & "; bool needs_parens = seq.re.is_union(ap->get_arg(i)) || seq.re.is_concat(ap->get_arg(i)); if (needs_parens) res += "("; - res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq); + res += regex_expr_html(ap->get_arg(i), names, next_id, m, seq, html_escape); if (needs_parens) res += ")"; } return res; @@ -363,21 +360,21 @@ namespace seq { if (seq.re.is_star(e, a)) { bool needs_parens = seq.re.is_union(a) || seq.re.is_concat(a) || seq.re.is_intersection(a); std::string res = needs_parens ? "(" : ""; - res += regex_expr_html(a, names, next_id, m, seq); + res += regex_expr_html(a, names, next_id, m, seq, html_escape); res += needs_parens ? ")*" : "*"; return res; } if (seq.re.is_plus(e, a)) { bool needs_parens = seq.re.is_union(a) || seq.re.is_concat(a) || seq.re.is_intersection(a); std::string res = needs_parens ? "(" : ""; - res += regex_expr_html(a, names, next_id, m, seq); + res += regex_expr_html(a, names, next_id, m, seq, html_escape); res += needs_parens ? ")+" : "+"; return res; } if (seq.re.is_opt(e, a)) { bool needs_parens = seq.re.is_union(a) || seq.re.is_concat(a) || seq.re.is_intersection(a); std::string res = needs_parens ? "(" : ""; - res += regex_expr_html(a, names, next_id, m, seq); + res += regex_expr_html(a, names, next_id, m, seq, html_escape); res += needs_parens ? ")?" : "?"; return res; } @@ -385,14 +382,14 @@ namespace seq { bool needs_parens = seq.re.is_union(a) || seq.re.is_concat(a) || seq.re.is_intersection(a); std::string res = "~"; res += needs_parens ? "(" : ""; - res += regex_expr_html(a, names, next_id, m, seq); + res += regex_expr_html(a, names, next_id, m, seq, html_escape); res += needs_parens ? ")" : ""; return res; } if (seq.re.is_range(e, a, b)) { zstring s1, s2; - std::string c1 = seq.str.is_string(a, s1) ? dot_html_escape(s1.encode()) : arith_expr_html(a, names, next_id, m); - std::string c2 = seq.str.is_string(b, s2) ? dot_html_escape(s2.encode()) : arith_expr_html(b, names, next_id, m); + std::string c1 = seq.str.is_string(a, s1) ? dot_html_escape(s1.encode(), html_escape) : arith_expr_html(a, names, next_id, m, html_escape); + std::string c2 = seq.str.is_string(b, s2) ? dot_html_escape(s2.encode(), html_escape) : arith_expr_html(b, names, next_id, m, html_escape); return "[" + c1 + "-" + c2 + "]"; } if (seq.re.is_full_char(e)) { @@ -407,14 +404,14 @@ namespace seq { std::ostringstream os; os << mk_pp(e, m); - return dot_html_escape(os.str()); + return dot_html_escape(os.str(), html_escape); } // Helper: render a snode as an HTML label for DOT output. // Groups consecutive s_char tokens into quoted strings, renders s_var by name, // shows s_power with superscripts, s_unit by its inner expression, // and falls back to mk_pp, HTML-escaped, for other token kinds. - std::string snode_label_html(euf::snode const* n, obj_map& names, uint64_t& next_id, ast_manager& m) { + std::string snode_label_html(euf::snode const* n, obj_map& names, uint64_t& next_id, ast_manager& m, bool html_escape) { if (!n) return "null"; seq_util seq(m); @@ -433,7 +430,7 @@ namespace seq { auto flush_chars = [&]() { if (char_acc.empty()) return; if (!first) result += " + "; - result += "\"" + dot_html_escape(char_acc) + "\""; + result += "\"" + dot_html_escape(char_acc, html_escape) + "\""; first = false; char_acc.clear(); }; @@ -459,7 +456,7 @@ namespace seq { if (!e) { result += "#" + std::to_string(tok->id()); } else if (tok->is_var()) { - std::string name = decode_recursive_name(e, m); + std::string name = decode_recursive_name(e, m, html_escape); if (!name.empty()) { result += name; } @@ -468,7 +465,7 @@ namespace seq { else { std::stringstream ss; ss << mk_bounded_pp(e, m); - std::string s = dot_html_escape(ss.str()); + std::string s = dot_html_escape(ss.str(), html_escape); names.insert(e, s); result += s; } @@ -477,40 +474,40 @@ namespace seq { std::cout << mk_pp(e, m) << std::endl; expr* ch = to_app(e)->get_arg(0); if (is_app(ch) && to_app(ch)->get_num_args() == 0) - result += "[" + dot_html_escape(to_app(ch)->get_decl()->get_name().str()) + "]"; + result += "[" + dot_html_escape(to_app(ch)->get_decl()->get_name().str(), html_escape) + "]"; else { std::ostringstream os; os << mk_pp(ch, m); - result += "[" + dot_html_escape(os.str()) + "]"; + result += "[" + dot_html_escape(os.str(), html_escape) + "]"; } } else if (tok->is_power()) { // seq.power(base, n): render as basen - std::string base_html = snode_label_html(tok->arg(0), m); + std::string base_html = snode_label_html(tok->arg(0), m, html_escape); if (tok->arg(0)->length() > 1) base_html = "(" + base_html + ")"; result += base_html; result += ""; expr* exp_expr = to_app(e)->get_arg(1); - result += arith_expr_html(exp_expr, names, next_id, m); + result += arith_expr_html(exp_expr, names, next_id, m, html_escape); result += ""; } - else if (e && seq.is_re(e)) - result += regex_expr_html(e, names, next_id, m, seq); + else if (seq.is_re(e)) + result += regex_expr_html(e, names, next_id, m, seq, html_escape); else { std::ostringstream os; os << mk_pp(e, m); - result += dot_html_escape(os.str()); + result += dot_html_escape(os.str(), html_escape); } } flush_chars(); return result; } - std::string snode_label_html(euf::snode const* n, ast_manager& m) { + std::string snode_label_html(euf::snode const* n, ast_manager& m, bool html_escape) { obj_map names; uint64_t next_id = 0; - return snode_label_html(n, names, next_id, m); + return snode_label_html(n, names, next_id, m, html_escape); } std::ostream& nielsen_node::to_html(std::ostream& out, ast_manager& m) const { @@ -530,27 +527,27 @@ namespace seq { for (auto const& eq : m_str_eq) { if (!any) { out << "Cnstr:
"; any = true; } if (!hasEq) { out << "Eq:
"; hasEq = true; } - out << snode_label_html(eq.m_lhs, names, next_id, m) + out << snode_label_html(eq.m_lhs, names, next_id, m, true) << " = " - << snode_label_html(eq.m_rhs, names, next_id, m) + << snode_label_html(eq.m_rhs, names, next_id, m, true) << "
"; } // string disequalities for (auto const& eq : m_str_deq) { if (!any) { out << "Cnstr:
"; any = true; } if (!hasDisEq) { out << "DisEq:
"; hasDisEq = true; } - out << snode_label_html(eq.m_lhs, names, next_id, m) + out << snode_label_html(eq.m_lhs, names, next_id, m, true) << " ≠ " - << snode_label_html(eq.m_rhs, names, next_id, m) + << snode_label_html(eq.m_rhs, names, next_id, m, true) << "
"; } // regex memberships for (auto const& mem : m_str_mem) { if (!any) { out << "Cnstr:
"; any = true; } if (!hasMem) { out << "Mem:
"; hasMem = true; } - out << snode_label_html(mem.m_str, names, next_id, m) + out << snode_label_html(mem.m_str, names, next_id, m, true) << " ∈ " - << regex_expr_html(mem.m_regex->get_expr(), names, next_id, m, graph().seq()) + << regex_expr_html(mem.m_regex->get_expr(), names, next_id, m, graph().seq(), true) << "
"; } // character ranges @@ -613,19 +610,6 @@ namespace seq { } } - // Returns true when the reason is a direct, non-propagated, conflict. - // Mirrors ZIPT's NielsenNode.IsActualConflict(): all conflicts except ChildrenFailed. - static bool is_actual_conflict(backtrack_reason r) { - return r == backtrack_reason::symbol_clash - || r == backtrack_reason::parikh_image - || r == backtrack_reason::subsumption - || r == backtrack_reason::arithmetic - || r == backtrack_reason::regex - || r == backtrack_reason::regex_widening - || r == backtrack_reason::character_range - || r == backtrack_reason::smt; - } - // Render the current substitution of the original (root) variables at `node`. // Walks the parent-edge chain from the root down to `node`, composing the // edge substitutions exactly the way seq_model::extract_assignments does, @@ -683,9 +667,9 @@ namespace seq { if (val == var) continue; // unchanged: variable is still free at this node if (!any) { out << "
Subst:
"; any = true; } - out << snode_label_html(var, names, next_id, m) + out << snode_label_html(var, names, next_id, m, true) << " → " - << snode_label_html(val, names, next_id, m) + << snode_label_html(val, names, next_id, m, true) << "
"; } } @@ -747,9 +731,9 @@ namespace seq { out << "[" << e->rule_name() << "]"; for (auto const& s : e->subst()) { out << "
"; - out << snode_label_html(s.m_var, m) + out << snode_label_html(s.m_var, m, true) << " → " // mapping arrow - << snode_label_html(s.m_replacement, m); + << snode_label_html(s.m_replacement, m, true); } // side constraints: integer equalities/inequalities for (auto const& ic : e->side_constraints()) { @@ -787,7 +771,7 @@ namespace seq { if (!start_state || !start_state->get_expr()) return out << "}\n"; - unsigned start_state_id = start_state->get_expr()->get_id(); + const unsigned start_state_id = start_state->get_expr()->get_id(); unsigned_vector todo; uint_set visited; @@ -796,11 +780,9 @@ namespace seq { todo.push_back(start_state_id); visited.insert(start_state_id); - ast_manager& m = m_sg.get_manager(); - auto sanitize = [](std::string const& s) { std::string res; - for (char c : s) { + for (const char c : s) { if (c == '"') res += "\\\""; else if (c == '\n') res += "\\n"; else res += c; @@ -816,8 +798,9 @@ namespace seq { if (it == m_partial_dfa_out.end()) continue; - for (unsigned edge_idx : it->second) { - if (edge_idx >= m_partial_dfa_edges.size()) continue; + for (const unsigned edge_idx : it->second) { + if (edge_idx >= m_partial_dfa_edges.size()) + continue; partial_dfa_edge const& e = m_partial_dfa_edges[edge_idx]; edges.push_back(&e); @@ -828,11 +811,17 @@ namespace seq { } } - for (unsigned node_id : visited) { + for (const unsigned node_id : visited) { expr* node_expr = nullptr; for (auto* e : edges) { - if (e->m_src->get_id() == node_id) { node_expr = e->m_src; break; } - if (e->m_dst->get_id() == node_id) { node_expr = e->m_dst; break; } + if (e->m_src->get_id() == node_id) { + node_expr = e->m_src; + break; + } + if (e->m_dst->get_id() == node_id) { + node_expr = e->m_dst; + break; + } } if (!node_expr) { for (expr* pinned : m_partial_dfa_pin) { diff --git a/src/smt/seq/seq_regex.cpp b/src/smt/seq/seq_regex.cpp index 4fe5df764..ac271db4c 100644 --- a/src/smt/seq/seq_regex.cpp +++ b/src/smt/seq/seq_regex.cpp @@ -324,8 +324,7 @@ namespace seq { // ----------------------------------------------------------------------- bool seq_regex::is_empty_regex(euf::snode* re) const { - if (!re) - return false; + SASSERT(re); // direct empty language constant if (re->is_fail()) return true; diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index fd528999f..b7dd8ef48 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -208,9 +208,8 @@ namespace smt { case l_true: { // regexes are disjoint: conflict enode_pair_vector eqs; - literal_vector lits; eqs.push_back({n1, n2}); - set_conflict(eqs, lits); + set_conflict(eqs); return; } default: break; @@ -454,37 +453,53 @@ namespace smt { SASSERT(mem.well_formed()); + expr* const re = mem.m_regex->get_expr(); + expr* const s = mem.m_str->get_expr(); + std::cout << "Propagating: " << seq::mem_pp(mem, m) << std::endl; + + if (mem.m_regex->is_full_seq()) { + // u \in .* can be ignored + m_ignored_mem.insert(mem.lit); + ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); + return; + } + + // try to rewrite into an easier form + expr_ref simpl(m); + m_th_rewriter(re, simpl); + if (simpl != re) { + // we could simplify; let's propagate it + const expr_ref e(m_seq.re.mk_in_re(s, simpl), m); + ctx.mk_th_axiom(get_id(), ~mem.lit, mk_literal(e)); + m_ignored_mem.insert(mem.lit); + ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); + // std::cout << "Simplified to " << seq::snode_label_html(m_sgraph.mk(simpl), m, false) << std::endl; + return; + } + // regex is ∅ → conflict if (m_regex.is_empty_regex(mem.m_regex)) { - enode_pair_vector eqs; literal_vector lits; lits.push_back(mem.lit); - set_conflict(eqs, lits); + set_conflict(lits); return; } // empty string in non-nullable regex → conflict - if (mem.m_str->is_empty() && m_seq.re.get_info(mem.m_regex->get_expr()).nullable == l_false) { - enode_pair_vector eqs; + if (mem.m_str->is_empty() && m_sgraph.re_nullable(mem.m_regex) == l_false) { literal_vector lits; lits.push_back(mem.lit); - set_conflict(eqs, lits); + set_conflict(lits); return; } - // ensure length term exists for the string argument - expr* s_expr = mem.m_str->get_expr(); - if (s_expr) - ensure_length_var(s_expr); - if (!get_fparams().m_nseq_regex_factorization_threshold) return; - // Boolean Closure Propagations - expr* re_expr = mem.m_regex->get_expr(); - if (m_seq.re.is_intersection(re_expr)) { - for (expr* arg : *to_app(re_expr)) { - expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m); + if (mem.m_regex->is_intersect()) { + // u \in r1 & r_2 → u \in r1 && u \in r2 + for (const euf::snode* const arg : *mem.m_regex) { + expr_ref in_r(m_seq.re.mk_in_re(s, arg->get_expr()), m); literal_vector lits; lits.push_back(~mem.lit); lits.push_back(mk_literal(in_r)); @@ -494,11 +509,12 @@ namespace smt { ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); return; } - if (m_seq.re.is_union(re_expr)) { + if (mem.m_regex->is_union()) { + // u \in r1 | r_2 → u \in r1 || u \in r2 literal_vector lits; lits.push_back(~mem.lit); - for (expr* arg : *to_app(re_expr)) { - expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m); + for (const euf::snode* const arg : *mem.m_regex) { + expr_ref in_r(m_seq.re.mk_in_re(s, arg->get_expr()), m); lits.push_back(mk_literal(in_r)); } ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); @@ -506,24 +522,81 @@ namespace smt { ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); return; } - if (m_seq.re.is_to_re(re_expr)) { - return; - zstring s; - expr_ref arg(to_app(re_expr)->get_arg(0), m); - if (m_seq.str.is_string(arg, s)) { + + if (mem.m_regex->is_to_re()) { + // u \in v (with v is constant) → u = v + zstring str; + const expr_ref arg(to_app(re)->get_arg(0), m); + if (m_seq.str.is_string(arg, str)) { literal_vector lits; lits.push_back(~mem.lit); - lits.push_back(mk_literal(m.mk_eq(s_expr, arg))); + lits.push_back(mk_literal(m.mk_eq(m_seq.str.mk_string(str), arg))); ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); m_ignored_mem.insert(mem.lit); ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); } } + + // Eager sigma factorization (token-level): when enabled, split a non-primitive + // membership s ∈ r at the boundary between the first concat argument (head) and + // the rest (tail), using compute_sigma. This mirrors the lazy Nielsen + // apply_regex_factorization and the paper's Reduce rule for x·u'. + // (s ∈ r) → ⋁_{⟨Δ,∇⟩∈σ(r)} ( head ∈ Δ ∧ tail ∈ ∇ ) + // Only fires for a concatenation s (single-variable s is already primitive). + if (get_fparams().m_nseq_regex_factorization_eager && + get_fparams().m_nseq_regex_factorization_threshold > 0 && + mem.m_str->is_concat()) { + const app* const a = to_app(s); + const unsigned na = a->get_num_args(); + SASSERT(na >= 2); + + const expr_ref head(a->get_arg(0), m); + const expr_ref tail(m_seq.str.mk_concat(na - 1, a->get_args() + 1, s->get_sort()), m); + + const unsigned threshold = get_fparams().m_nseq_regex_factorization_threshold; + seq::sigma_pairs pairs; + if (!seq::compute_sigma(m, m_seq, m_rewriter, mem.m_regex, pairs, threshold)) + // we give up + return; + + seq::simplify_sigma_pairs(pairs, m_regex, m_sgraph); + + if (pairs.empty()) { + // no viable splits + literal_vector lits; + lits.push_back(~mem.lit); + set_conflict(lits); + return; + } + if (pairs.size() <= threshold) { + TRACE(seq, tout << "eager regex fact: " << mk_pp(s, m) << " in " + << mk_pp(re, m) << " -> " << pairs.size() << " splits\n";); + + if (!ctx.e_internalized(head)) + ctx.internalize(head, false); + if (!ctx.e_internalized(tail)) + ctx.internalize(tail, false); + + // forward direction; mk_literal Tseitin-encodes each conjunction + literal_vector lits; + lits.push_back(~mem.lit); + for (auto const& sp : pairs) { + expr_ref mem_head(m_seq.re.mk_in_re(head, sp.m_p), m); + expr_ref mem_tail(m_seq.re.mk_in_re(tail, sp.m_q), m); + expr_ref conj(m.mk_and(mem_head, mem_tail), m); + lits.push_back(mk_literal(conj)); + } + ctx.mk_th_axiom(get_id(), lits.size(), lits.data()); + m_ignored_mem.insert(mem.lit); + ctx.push_trail(insert_map(m_ignored_mem, mem.lit)); + return; + } + } } - void theory_nseq::ensure_length_var(expr* e) { + void theory_nseq::ensure_length_var(expr* e) const { SASSERT(e && m_seq.is_seq(e)); - expr_ref len(m_seq.str.mk_length(e), m); + const expr_ref len(m_seq.str.mk_length(e), m); if (!ctx.e_internalized(len)) ctx.internalize(len, false); } @@ -773,6 +846,7 @@ namespace smt { m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh); m_nielsen.set_signature_split(get_fparams().m_nseq_signature); m_nielsen.set_regex_factorization_threshold(get_fparams().m_nseq_regex_factorization_threshold); + m_nielsen.set_regex_factorization_eager(get_fparams().m_nseq_regex_factorization_eager); } SASSERT(!m_nielsen.root()->is_currently_conflict()); @@ -1044,11 +1118,11 @@ namespace smt { std::cout << "The root node contained " << m_nielsen.root()->str_mems().size() << " memberships and " << m_nielsen.root()->str_eqs().size() << " equalities" << std::endl; unsigned idx = 0; for (auto& eq : m_nielsen.root()->str_eqs()) { - std::cout << "[" << (idx++) << "]: " << seq::eq_pp(eq, m); + std::cout << "[" << (idx++) << "]: " << seq::eq_pp(eq, m) << "\n"; } idx = 0; for (auto& mem : m_nielsen.root()->str_mems()) { - std::cout << "[" << (idx++) << "]: " << seq::mem_pp(mem, m); + std::cout << "[" << (idx++) << "]: " << seq::mem_pp(mem, m) << "\n"; } std::flush(std::cout); #endif @@ -1505,7 +1579,6 @@ namespace smt { if (result == l_true) { // Intersection is empty → the memberships on this variable are // jointly unsatisfiable. Assert a conflict from all their literals. - enode_pair_vector eqs; literal_vector lits; for (unsigned i : mem_indices) { SASSERT(ctx.get_assignment(mems[i]->lit) == l_true); // we already stored the polarity of the literal @@ -1513,7 +1586,7 @@ namespace smt { } TRACE(seq, tout << "nseq regex precheck: empty intersection for var " << var_id << ", conflict with " << lits.size() << " lits\n";); - set_conflict(eqs, lits); + set_conflict(lits); return l_true; // conflict asserted } if (result == l_undef) diff --git a/src/smt/theory_nseq.h b/src/smt/theory_nseq.h index 09d7f35b5..132470d98 100644 --- a/src/smt/theory_nseq.h +++ b/src/smt/theory_nseq.h @@ -137,6 +137,14 @@ namespace smt { void populate_nielsen_graph(); void explain_nielsen_conflict(); void set_conflict(enode_pair_vector const& eqs, literal_vector const& lits); + void set_conflict(literal_vector const& lits) { + const enode_pair_vector eqs; + set_conflict(eqs, lits); + } + void set_conflict(enode_pair_vector const& eqs) { + const literal_vector lits; + set_conflict(eqs, lits); + } void set_propagate(enode_pair_vector const &eqs, literal_vector const &lits, literal p); bool add_nielsen_assumptions(); euf::snode* get_snode(expr* e); @@ -147,7 +155,7 @@ namespace smt { void propagate_pos_mem(tracked_str_mem const& mem); void enqueue_axiom(expr* e); void dequeue_axiom(expr* e); - void ensure_length_var(expr* e); + void ensure_length_var(expr* e) const; // higher-order term unfolding bool unfold_ho_terms();