diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index d57a50fb1..f4e4bd18e 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -383,9 +383,11 @@ public: return result; } - // Split decomposition (sigma) of a regex; see seq_split.h. - bool split(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong) { - return m_split.compute(r, out, threshold, mode); + // Split decomposition (sigma) of a regex; see seq_split.h. `oracle` (optional) + // prunes non-viable splits during generation. + bool split(expr* r, split_set& out, unsigned threshold, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}) { + return m_split.compute(r, out, threshold, mode, oracle); } void simplify_split(split_set& s) { m_split.simplify(s); } diff --git a/src/ast/rewriter/seq_split.cpp b/src/ast/rewriter/seq_split.cpp index c915e5293..63cb68f4c 100644 --- a/src/ast/rewriter/seq_split.cpp +++ b/src/ast/rewriter/seq_split.cpp @@ -28,18 +28,26 @@ ast_manager& seq_split::m() const { return m_rw.m(); } seq_util& seq_split::seq() const { return m_rw.u(); } seq_util::rex& seq_split::re() const { return m_rw.u().re; } +// Add unless the (optional) lookahead oracle prunes it. +void seq_split::push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const { + if (!oracle || oracle(d, n)) + out.push_back(split_pair(d, n, m())); +} + // Cross-product intersection of two split-sets (split algebra): // S1 cap S2 = { | in S1, in S2 }. // Pairs where any component is bottom (the empty regex) are dropped. -bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold) { +bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle) { seq_util::rex& r = re(); for (auto const& p1 : s1) { for (auto const& p2 : s2) { if (r.is_empty(p1.m_d) || r.is_empty(p2.m_d) || r.is_empty(p1.m_n) || r.is_empty(p2.m_n)) continue; - result.push_back(split_pair(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), - m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m())); + const expr_ref di(m_rw.mk_regex_inter_normalize(p1.m_d, p2.m_d), m()); + const expr_ref ni(m_rw.mk_regex_inter_normalize(p1.m_n, p2.m_n), m()); + push(result, oracle, di, ni); if (result.size() > threshold) return false; } @@ -52,23 +60,27 @@ bool seq_split::intersect(split_set const& s1, split_set const& s2, split_set& r // May produce up to 2^|sp| pairs (bounded by the threshold). A threshold // overrun must abort entirely: a partial fold is a strictly weaker (unsound) // split-set, since each ~sp[i] further constrains ~S. -bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold) { +bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& result, + unsigned threshold, split_oracle const& oracle) { seq_util::rex& r = re(); sort* re_sort = r.mk_re(seq_sort); const expr_ref full(r.mk_full_seq(re_sort), m()); // .* if (sp.empty()) { // ~{} = <.*, .*> - result.push_back(split_pair(full, full, m())); + push(result, oracle, full, full); return true; } + // The acc/next pairs carry genuine output-orientation N components (the De + // Morgan ~ = {<~D,.*>, <.*,~N>}), so the oracle prunes them soundly and + // keeps the 2^|sp| fold from blowing up. split_set acc; - acc.push_back(split_pair(r.mk_complement(sp[0].m_d), full, m())); - acc.push_back(split_pair(full, r.mk_complement(sp[0].m_n), m())); + push(acc, oracle, r.mk_complement(sp[0].m_d), full); + push(acc, oracle, full, r.mk_complement(sp[0].m_n)); for (unsigned i = 1; i < sp.size(); ++i) { split_set next; - next.push_back(split_pair(r.mk_complement(sp[i].m_d), full, m())); - next.push_back(split_pair(full, r.mk_complement(sp[i].m_n), m())); + push(next, oracle, r.mk_complement(sp[i].m_d), full); + push(next, oracle, full, r.mk_complement(sp[i].m_n)); split_set tmp; - if (!intersect(acc, next, tmp, threshold)) + if (!intersect(acc, next, tmp, threshold, oracle)) return false; acc = std::move(tmp); if (acc.empty()) // intersection empty => ~S is empty @@ -80,14 +92,13 @@ bool seq_split::complement(sort* seq_sort, split_set const& sp, split_set& resul return true; } -bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode) { +bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mode mode, + split_oracle const& oracle) { SASSERT(r); seq_util& sq = seq(); seq_util::rex& rex = re(); ast_manager& mm = m(); - // std::cout << "compute sigma of " << mk_pp(r, m()) << std::endl; - sort* seq_sort = nullptr; if (!sq.is_re(r, seq_sort)) return false; @@ -99,7 +110,7 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo // epsilon: sigma(eps) = { } if (rex.is_epsilon(r)) { const expr_ref eps(rex.mk_epsilon(seq_sort), mm); - result.push_back(split_pair(eps, eps, mm)); + push(result, oracle, eps, eps); return true; } @@ -113,7 +124,7 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo for (unsigned i = 0; i <= str.length(); ++i) { const expr_ref p(rex.mk_to_re(sq.str.mk_string(str.extract(0, i))), mm); const expr_ref q(rex.mk_to_re(sq.str.mk_string(str.extract(i, str.length() - i))), mm); - result.push_back(split_pair(p, q, mm)); + push(result, oracle, p, q); } return true; } @@ -121,8 +132,8 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo if (sq.str.is_unit(s)) { const expr_ref ex(r, mm); const expr_ref eps(rex.mk_epsilon(seq_sort), mm); - result.push_back(split_pair(eps, ex, mm)); - result.push_back(split_pair(ex, eps, mm)); + push(result, oracle, eps, ex); + push(result, oracle, ex, eps); return true; } // to_re over a non-literal sequence: not handled. @@ -134,15 +145,15 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo if (rex.is_full_char(r) || rex.is_range(r) || rex.is_of_pred(r)) { const expr_ref ex(r, mm); const expr_ref eps(rex.mk_epsilon(seq_sort), mm); - result.push_back(split_pair(eps, ex, mm)); - result.push_back(split_pair(ex, eps, mm)); + push(result, oracle, eps, ex); + push(result, oracle, ex, eps); return true; } // .* : sigma(.*) = { <.*, .*> } if (rex.is_full_seq(r)) { const expr_ref ex(r, mm); - result.push_back(split_pair(ex, ex, mm)); + push(result, oracle, ex, ex); return true; } @@ -150,7 +161,7 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo if (rex.is_union(r)) { app* ap = to_app(r); for (unsigned i = 0; i < ap->get_num_args(); ++i) { - if (!compute(ap->get_arg(i), result, threshold, mode)) + if (!compute(ap->get_arg(i), result, threshold, mode, oracle)) return false; } return true; @@ -162,8 +173,11 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo app* ap = to_app(r); const unsigned n = ap->get_num_args(); for (unsigned i = 0; i < n; ++i) { + // Sound to pass the oracle into the sub-computation: N_inner.Sigma* + // over-approximates the final N_inner.right, so a prune here is a + // prune of the final pair too (prefix-compatible test). split_set sigma_arg; - if (!compute(ap->get_arg(i), sigma_arg, threshold, mode)) + if (!compute(ap->get_arg(i), sigma_arg, threshold, mode, oracle)) return false; expr_ref left(mm), right(mm); @@ -188,7 +202,7 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo for (auto const& [d, nn] : sigma_arg) { const expr_ref p = m_rw.mk_re_append(left, d); const expr_ref q = m_rw.mk_re_append(nn, right); - result.push_back(split_pair(p, q, mm)); + push(result, oracle, p, q); } } return true; @@ -197,14 +211,14 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo // star: sigma(a*) = { } cup a*.sigma(a).a* if (rex.is_star(r, a)) { const expr_ref eps(rex.mk_epsilon(seq_sort), mm); - result.push_back(split_pair(eps, eps, mm)); + push(result, oracle, eps, eps); split_set sa; - if (!compute(a, sa, threshold, mode)) + if (!compute(a, sa, threshold, mode, oracle)) return false; for (auto const& [d, n] : sa) { const expr_ref p = m_rw.mk_re_append(r, d); // a*.D const expr_ref q = m_rw.mk_re_append(n, r); // N.a* - result.push_back(split_pair(p, q, mm)); + push(result, oracle, p, q); } return true; } @@ -213,12 +227,12 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo if (rex.is_plus(r, a)) { const expr_ref star(rex.mk_star(a), mm); // a* split_set sa; - if (!compute(a, sa, threshold, mode)) + if (!compute(a, sa, threshold, mode, oracle)) return false; for (auto const& [d, n] : sa) { const expr_ref p = m_rw.mk_re_append(star, d); const expr_ref q = m_rw.mk_re_append(n, star); - result.push_back(split_pair(p, q, mm)); + push(result, oracle, p, q); } return true; } @@ -230,16 +244,16 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo app* ap = to_app(r); const unsigned n = ap->get_num_args(); split_set current; - if (!compute(ap->get_arg(0), current, threshold, mode)) + if (!compute(ap->get_arg(0), current, threshold, mode, oracle)) return false; // A give-up on any conjunct must propagate as a give-up: silently treating // it as the empty split-set would collapse the whole intersection to bottom // and be misreported as an (unsound) conflict. for (unsigned i = 1; i < n && !current.empty(); ++i) { split_set arg_i, tmp; - if (!compute(ap->get_arg(i), arg_i, threshold, mode)) + if (!compute(ap->get_arg(i), arg_i, threshold, mode, oracle)) return false; - if (!intersect(current, arg_i, tmp, threshold)) + if (!intersect(current, arg_i, tmp, threshold, oracle)) return false; current = std::move(tmp); } @@ -247,28 +261,31 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo return true; } - // complement: sigma(~a) = ~sigma(a) + // complement: sigma(~a) = ~sigma(a). + // The body is computed WITHOUT the oracle (the body's pairs are inverted, so + // their N is unrelated to the output N); the oracle is re-applied in complement(). if (rex.is_complement(r, a)) { if (mode == split_mode::weak) return false; split_set sa; if (!compute(a, sa, threshold, mode)) return false; - return complement(seq_sort, sa, result, threshold); + return complement(seq_sort, sa, result, threshold, oracle); } - // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b) + // difference: a \ b = a & ~b ; sigma(a \ b) = sigma(a) cap ~sigma(b). + // sigma(b) (used only inside the complement) is computed WITHOUT the oracle. if (rex.is_diff(r, a, b)) { if (mode == split_mode::weak) return false; split_set sa, sb, sb_compl, tmp; - if (!compute(a, sa, threshold, mode)) + if (!compute(a, sa, threshold, mode, oracle)) return false; if (!compute(b, sb, threshold, mode)) return false; - if (!complement(seq_sort, sb, sb_compl, threshold)) + if (!complement(seq_sort, sb, sb_compl, threshold, oracle)) return false; - if (!intersect(sa, sb_compl, tmp, threshold)) + if (!intersect(sa, sb_compl, tmp, threshold, oracle)) return false; result.append(tmp); return true; @@ -276,7 +293,6 @@ bool seq_split::compute(expr* r, split_set& result, unsigned threshold, split_mo // bounded loop / ite / other: not handled (paper "v1: bail"). TRACE(seq, tout << "seq_split: unsupported regex " << mk_pp(r, mm) << "\n";); - std::cout << "seq_split: unsupported regex " << mk_pp(r, mm) << std::endl; return false; } diff --git a/src/ast/rewriter/seq_split.h b/src/ast/rewriter/seq_split.h index 78f6dcee9..fb53330a0 100644 --- a/src/ast/rewriter/seq_split.h +++ b/src/ast/rewriter/seq_split.h @@ -33,6 +33,7 @@ Author: #include "ast/seq_decl_plugin.h" #include "ast/rewriter/seq_subset.h" +#include class seq_rewriter; @@ -57,6 +58,11 @@ typedef vector split_set; // give up (return false) on complement / intersection instead. enum class split_mode { weak, strong }; +// Optional lookahead oracle. Called for each candidate split as it is +// generated; returns true to keep it, false to prune it. An empty oracle (the +// default) keeps everything, so sigma is unchanged. See seq_split::compute. +typedef std::function split_oracle; + class seq_split { seq_rewriter& m_rw; // for mk_re_append + manager / seq_util access seq_subset m_subset; // language-subset checks for subsumption @@ -65,13 +71,18 @@ class seq_split { seq_util& seq() const; seq_util::rex& re() const; + // Push onto `out`, unless `oracle` rejects it. + void push(split_set& out, split_oracle const& oracle, expr* d, expr* n) const; + // S1 cap S2 = { } dropping any pair with a bottom - // component. Returns false on threshold overrun. - bool intersect(split_set const& s1, split_set const& s2, split_set& result, unsigned threshold); + // component (and any rejected by `oracle`). Returns false on threshold overrun. + bool intersect(split_set const& s1, split_set const& s2, split_set& result, + unsigned threshold, split_oracle const& oracle); // De Morgan complement of a split-set: ~S = cap_{s in S} ~s with // ~ = { <~D, .*>, <.*, ~N> } and ~{} = { <.*, .*> }. - bool complement(sort* seq_sort, split_set const& sp, split_set& result, unsigned threshold); + bool complement(sort* seq_sort, split_set const& sp, split_set& result, + unsigned threshold, split_oracle const& oracle); // same-D / same-N merge: groups pairs that share a (syntactically identical) // left (resp. right) component and unions the other component. @@ -84,7 +95,16 @@ public: // bounds the number of produced splits; an overrun, an unsupported regex // shape (bounded loop / ite), or a Boolean-closure case in weak mode makes // it return false ("give up"). - bool compute(expr* r, split_set& out, unsigned threshold, split_mode mode = split_mode::strong); + // + // `oracle` (optional) prunes non-viable splits *during* generation. It must + // be sound to apply at every generation step: a candidate N can still gain a + // prefix from a factor appended to its right later (concat/star), so the + // oracle must use a "prefix-compatible" test (prune only when N can never + // match the lookahead, even partially), NOT a strict "starts-with" test. + // The complement body is computed WITHOUT the oracle (inverted orientation); + // the oracle is re-applied to the complement's output fold. + bool compute(expr* r, split_set& out, unsigned threshold, + split_mode mode = split_mode::strong, split_oracle const& oracle = {}); // In-place simplification of a split-set: drop bottom components, apply the // same-D / same-N merge rules, and drop splits subsumed by another (using diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index e0ee27240..f17ec9164 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -2458,7 +2458,7 @@ namespace seq { euf::snode* tail = get_tail(var_head, a.mk_int(1), fwd); euf::snode* replacement = dir_concat(m_sg, char_head, tail, fwd); child = mk_child(node); - e = mk_edge(node, child, "nielsen const >", false); + e = mk_edge(node, child, "nielsen const >", false); e->add_side_constraint(mk_constraint(a.mk_ge(compute_length_expr(tail), a.mk_int(0)), eq.m_dep)); const nielsen_subst s2(var_head, replacement, eq.m_dep); e->add_subst(s2); @@ -2505,7 +2505,7 @@ namespace seq { { euf::snode* replacement = dir_concat(m_sg, rhead, get_tail(lhead, compute_length_expr(rhead).get(), fwd), fwd); nielsen_node* child = mk_child(node); - nielsen_edge* e = mk_edge(node, child, "nielsen var >", false); + nielsen_edge* e = mk_edge(node, child, "nielsen var >", false); const nielsen_subst s(lhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); @@ -2514,7 +2514,7 @@ namespace seq { { euf::snode* replacement = dir_concat(m_sg, lhead, get_tail(rhead, compute_length_expr(lhead).get(), fwd), fwd); nielsen_node* child = mk_child(node); - nielsen_edge* e = mk_edge(node, child, "nielsen var <", false); + nielsen_edge* e = mk_edge(node, child, "nielsen var <", false); const nielsen_subst s(rhead, replacement, eq.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); @@ -3180,13 +3180,13 @@ namespace seq { // Branch 1: pow_exp < count (i.e., count >= pow_exp + 1) { - nielsen_edge *e = mk_edge(node, mk_child(node), "power elim >", true); + nielsen_edge *e = mk_edge(node, mk_child(node), "power elim >", true); const expr_ref pow_plus1(a.mk_add(pow_exp, a.mk_int(1)), m); e->add_side_constraint(mk_constraint(a.mk_ge(norm_count, pow_plus1), eq.m_dep)); } // Branch 2: count <= pow_exp (i.e., pow_exp >= count) { - nielsen_edge *e = mk_edge(node, mk_child(node), "power elim <=", true); + nielsen_edge *e = mk_edge(node, mk_child(node), "power elim ≤", true); e->add_side_constraint(mk_constraint(a.mk_ge(pow_exp, norm_count), eq.m_dep)); } return true; @@ -3246,7 +3246,7 @@ namespace seq { euf::snode* replacement = dir_concat(m_sg, base, nested_power_snode, fwd); child = mk_child(node); - e = mk_edge(node, child, "unwinding >", true); + e = mk_edge(node, child, "unwinding >", true); const nielsen_subst s2(power, replacement, eq->m_dep); e->add_subst(s2); child->apply_subst(m_sg, s2); @@ -3576,6 +3576,30 @@ namespace seq { // Modifier: apply_regex_factorization (Boolean Closure) // ----------------------------------------------------------------------- + // Lookahead oracle for the split engine: is the split's right component + // `n_regex` prefix-compatible with the constant character sequence `c`? + // The factorization picks a boundary so the tail starts with c, hence the + // tail-regex ∇ must be able to match c as a prefix. We use a *prefix* test + // (not strict "starts-with"): we accept as soon as N accepts a prefix of c + // (a suffix appended downstream can complete it). This is sound to apply + // during split generation — it never drops a viable split. + bool nielsen_graph::split_lookahead_viable(expr* n_regex, zstring const& c) { + euf::snode* cur = m_sg.mk(n_regex); + if (!cur) + return true; // conservative: keep + for (unsigned i = 0; i < c.length(); ++i) { + if (m_sg.re_nullable(cur) == l_true) + return true; // N accepts the prefix c[0..i) → a suffix completes it + cur = m_sg.brzozowski_deriv(cur, m_sg.mk_char(c[i])); + if (!cur || cur->is_fail()) + return false; // N went (syntactically) dead before reaching c + } + // Consumed all of c without matching a prefix: keep iff δ_c(N) is non-empty + // (one BFS emptiness check rather than per-char; an empty-language state is + // never nullable, so the loop above can't wrongly early-accept it). + return !m_seq_regex->is_empty_regex(cur); + } + bool nielsen_graph::apply_regex_factorization(nielsen_node* node) { if (m_regex_factorization_threshold == 0) return false; @@ -3603,15 +3627,50 @@ namespace seq { euf::snode* first = mem.m_str->first(); SASSERT(first); - SASSERT(!first->is_char()); - euf::snode* tail = m_sg.drop_first(mem.m_str); - SASSERT(tail); + SASSERT(!first->is_char()); // constants are consumed earlier + + // Choose the factorization boundary so the tail starts with the + // LONGEST run of concrete characters c — this gives the split-engine + // lookahead oracle the most pruning information. head = u' (tokens + // before the run), tail = c · u''' (tokens from the run onward). + euf::snode_vector toks; + mem.m_str->collect_tokens(toks); + const unsigned total = toks.size(); + unsigned run_start = 0, run_len = 0; + for (unsigned i = 0; i < total; ) { + if (!toks[i]->is_char()) { ++i; continue; } + unsigned j = i; + while (j < total && toks[j]->is_char()) ++j; + if (j - i > run_len) { run_len = j - i; run_start = i; } + i = j; + } + // No constant run → fall back to splitting off the first token. + const unsigned p = (run_len == 0) ? 1 : run_start; + SASSERT(p >= 1); + euf::snode* head = (p == 1) ? first : m_sg.drop_right(mem.m_str, total - p); + euf::snode* tail = m_sg.drop_left(mem.m_str, p); + SASSERT(head && tail); + + // Build the constant lookahead c and (if non-empty) an oracle that + // prunes splits whose ∇ cannot match c. + zstring c; + for (unsigned i = 0; i < run_len; ++i) { + expr* ch; unsigned cv; + VERIFY(m_seq.str.is_unit(toks[run_start + i]->get_expr(), ch)); + VERIFY(m_seq.is_const_char(ch, cv)); + c = c + zstring(cv); + } + split_oracle oracle; + if (!c.empty()) + oracle = [this, c](expr*, expr* n) { return split_lookahead_viable(n, c); }; // Decompose the regex into a split-set via the shared seq_split engine - // (sigma from the paper): first ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩. + // (sigma from the paper): head ∈ Δ ∧ tail ∈ ∇ for each ⟨Δ,∇⟩, with the + // lookahead oracle pruning non-viable ∇ during generation. split_set pairs; seq_rewriter rw(m); - if (!rw.split(mem.m_regex->get_expr(), pairs, m_regex_factorization_threshold)) + if (!rw.split(mem.m_regex->get_expr(), pairs, m_regex_factorization_threshold, + split_mode::strong, oracle)) continue; rw.simplify_split(pairs); @@ -3623,14 +3682,18 @@ namespace seq { euf::snode* sn_p = m_sg.mk(tp); euf::snode* sn_q = m_sg.mk(tq); - // Also check intersection with other primitive constraints on `first` + // Also check intersection with other primitive constraints on `head`. + // Only valid when head is the single token `first`; for a multi-token + // head Δ constrains the whole prefix, so we only check Δ ≠ ∅. ptr_vector regexes_p; regexes_p.push_back(sn_p); dep_tracker first_filter_dep = nullptr; - for (auto const& prev_mem : node->str_mems()) { - if (prev_mem.m_str == first) { - regexes_p.push_back(prev_mem.m_regex); - first_filter_dep = m_dep_mgr.mk_join(first_filter_dep, prev_mem.m_dep); + if (head == first) { + for (auto const& prev_mem : node->str_mems()) { + if (prev_mem.m_str == first) { + regexes_p.push_back(prev_mem.m_regex); + first_filter_dep = m_dep_mgr.mk_join(first_filter_dep, prev_mem.m_dep); + } } } if (m_seq_regex->check_intersection_emptiness(regexes_p, 100) == l_true) { @@ -3666,7 +3729,7 @@ namespace seq { } } - child->add_str_mem(str_mem(first, m_p, m_dep)); + child->add_str_mem(str_mem(head, m_p, m_dep)); child->add_str_mem(str_mem(tail, m_q, m_dep)); } return true; @@ -4246,7 +4309,7 @@ namespace seq { euf::snode* power_snode = m_sg.mk(power_expr); euf::snode* replacement = dir_concat(m_sg, base, power_snode, fwd); nielsen_node* child = mk_child(node); - nielsen_edge* e = mk_edge(node, child, "unwinding eq >", false); + nielsen_edge* e = mk_edge(node, child, "unwinding eq >", false); const nielsen_subst s(power, replacement, eq->m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); @@ -4290,7 +4353,7 @@ namespace seq { euf::snode* replacement = dir_concat(m_sg, base, power_snode, fwd); nielsen_node* child = mk_child(node); - nielsen_edge* e = mk_edge(node, child, "unwinding mem >", false); + nielsen_edge* e = mk_edge(node, child, "unwinding mem >", false); const nielsen_subst s(power, replacement, mem->m_dep); // TODO review - ensure var does not occur in replacement. e->add_subst(s); child->apply_subst(m_sg, s); diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index a30a80442..24026264c 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -1316,6 +1316,13 @@ namespace seq { // generalized regex factorization (Boolean closure derivation rule) bool apply_regex_factorization(nielsen_node* node); + // Lookahead oracle for apply_regex_factorization's split() call: returns + // true iff the split's right component `n_regex` is prefix-compatible with + // the constant character sequence `c` (the tail of the factorization starts + // with c). Prunes splits whose tail-regex can never match c. Sound to + // apply during split generation (prefix-, not strict-, match). + bool split_lookahead_viable(expr* n_regex, zstring const& c); + // helper for apply_gpower_intr: fires the substitution. // `fwd=true` uses left-to-right decomposition; `fwd=false` mirrors ZIPT's // backward (right-to-left) direction. diff --git a/src/smt/seq/seq_nielsen_pp.cpp b/src/smt/seq/seq_nielsen_pp.cpp index 101fd53d0..a1e614255 100644 --- a/src/smt/seq/seq_nielsen_pp.cpp +++ b/src/smt/seq/seq_nielsen_pp.cpp @@ -471,7 +471,6 @@ namespace seq { } } else if (tok->is_unit()) { // seq.unit with non-literal character: show the character expression - std::cout << mk_pp(e, m) << std::endl; expr* ch = to_app(e)->get_arg(0); if (is_app(ch) && to_app(ch)->get_num_args() == 0) result += "[" + dot_html_escape(to_app(ch)->get_decl()->get_name().str(), html_escape) + "]"; diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index 2e71e22f0..c1d0fb6f7 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -287,13 +287,13 @@ namespace smt { void theory_nseq::assign_eh(bool_var v, bool is_true) { try { expr* e = ctx.bool_var2expr(v); - //std::cout << "assigned [" << sat::literal(v, !is_true) << "] " << mk_pp(e, m) << " = " << is_true << std::endl; + const literal lit(v, !is_true); + //std::cout << "assigned [" << lit << "] " << mk_pp(e, m) << " = " << is_true << std::endl; expr *s = nullptr, *re = nullptr, *a = nullptr, *b = nullptr; TRACE(seq, tout << (is_true ? "" : "¬") << mk_bounded_pp(e, m, 3) << "\n";); if (m_seq.str.is_in_re(e, s, re)) { euf::snode* sn_str = get_snode(s); euf::snode* sn_re = get_snode(re); - const literal lit(v, !is_true); const seq::dep_tracker dep = nullptr; if (is_true) { ctx.push_trail(restore_vector(m_prop_queue)); @@ -314,18 +314,63 @@ namespace smt { } } else if (m_seq.str.is_prefix(e)) { + zstring str; + if (m_seq.str.is_string(to_app(e)->get_arg(0), str)) { + // prefix(u, v) with u const => v \in u \Sigma^* + const expr_ref pre(m_seq.re.mk_in_re(to_app(e)->get_arg(1), m_seq.re.mk_concat( + m_seq.re.mk_to_re(to_app(e)->get_arg(0)), + m_seq.re.mk_full_seq(m_seq.re.mk_re(to_app(e)->get_arg(0)->get_sort())) + )), m); + ctx.internalize(pre, false); + literal l = ctx.get_literal(pre); + if (!is_true) + l = ~l; + ctx.mk_th_axiom(get_id(), ~lit, l); + return; + } if (is_true) m_axioms.prefix_true_axiom(e); else m_axioms.prefix_axiom(e); } else if (m_seq.str.is_suffix(e)) { + zstring str; + if (m_seq.str.is_string(to_app(e)->get_arg(0), str)) { + // suffix(u, v) with u const => v \in \Sigma* u + const expr_ref suff(m_seq.re.mk_in_re(to_app(e)->get_arg(1), m_seq.re.mk_concat( + m_seq.re.mk_full_seq(m_seq.re.mk_re(to_app(e)->get_arg(0)->get_sort())), + m_seq.re.mk_to_re(to_app(e)->get_arg(0)) + )), m); + ctx.internalize(suff, false); + literal l = ctx.get_literal(suff); + if (!is_true) + l = ~l; + ctx.mk_th_axiom(get_id(), ~lit, l); + return; + } if (is_true) m_axioms.suffix_true_axiom(e); else m_axioms.suffix_axiom(e); } else if (m_seq.str.is_contains(e)) { + zstring str; + if (m_seq.str.is_string(to_app(e)->get_arg(1), str)) { + // contains(u, v) with v const => u \in \Sigma* v \Sigma^* + sort* re_sort = m_seq.re.mk_re(to_app(e)->get_arg(0)->get_sort()); + expr* all = m_seq.re.mk_full_seq(re_sort); + const expr_ref con(m_seq.re.mk_in_re(to_app(e)->get_arg(0), m_seq.re.mk_concat( + all, + m_seq.re.mk_concat( + m_seq.re.mk_to_re(to_app(e)->get_arg(1)), all) + )), m); + ctx.internalize(con, false); + literal l = ctx.get_literal(con); + if (!is_true) + l = ~l; + ctx.mk_th_axiom(get_id(), ~lit, l); + return; + } if (is_true) m_axioms.contains_true_axiom(e); else @@ -457,7 +502,7 @@ namespace smt { expr* const re = mem.m_regex->get_expr(); expr* const s = mem.m_str->get_expr(); - // std::cout << "Propagating: " << seq::mem_pp(mem, m) << std::endl; + std::cout << "Propagating: " << seq::mem_pp(mem, m) << std::endl; if (!mem.m_str->is_empty()) { if (mem.m_str->first()->is_char()) { @@ -721,7 +766,6 @@ namespace smt { } else if (std::holds_alternative(item)) { auto const& mem = std::get(item); - std::cout << "Adding " << seq::mem_pp(mem, m) << std::endl; int triv = m_regex.check_trivial(mem); if (triv > 0) { ++num_mems;