diff --git a/src/ast/rewriter/seq_derive.cpp b/src/ast/rewriter/seq_derive.cpp index 9f60d5be7..408b1e922 100644 --- a/src/ast/rewriter/seq_derive.cpp +++ b/src/ast/rewriter/seq_derive.cpp @@ -52,7 +52,7 @@ namespace seq { expr_ref derive::operator()(expr* ele, expr* r) { SASSERT(m_util.is_re(r)); - if (m_trail.size() > 1000) + if (m_trail.size() > 100000) reset(); // Check top-level cache (post-simplify result) expr* cached = nullptr; @@ -576,6 +576,109 @@ namespace seq { return false; } + // Extract character range [lo, hi] from a derivative condition. + // Conditions are of the form: + // char_le(lo_expr, ele) && char_le(ele, hi_expr) → range [lo, hi] + // char_le(lo_expr, ele) → range [lo, max_char] + // char_le(ele, hi_expr) → range [0, hi] + // Returns false if not a recognizable range condition. + bool derive::extract_char_range(expr* cond, unsigned& lo, unsigned& hi) { + expr* e1 = nullptr, *e2 = nullptr, *lhs = nullptr, *rhs = nullptr; + lo = 0; + hi = u().max_char(); + + // Negation: ~(range [a,b]) = [0,a-1] ∪ [b+1,max] + // We don't handle negation here — it's handled via pred_implies logic + if (m.is_not(cond, e1)) + return false; + + // Conjunction: and(char_le(lo, x), char_le(x, hi)) + if (m.is_and(cond, e1, e2)) { + expr *a1, *a2, *b1, *b2; + unsigned v; + if (u().is_char_le(e1, a1, a2) && u().is_char_le(e2, b1, b2)) { + // e1: a1 <= a2, e2: b1 <= b2 + // Expect: lo <= ele (a1=const, a2=var) and ele <= hi (b1=var, b2=const) + // OR: ele <= hi (a1=var, a2=const) and lo <= ele (b1=const, b2=var) + if (u().is_const_char(a1, v) && u().is_const_char(b2, lo)) { + // e1: const <= a2, e2: b1 <= const + // This is: v <= ele and ele <= lo — wrong naming, let me fix + lo = v; + hi = 0; + if (u().is_const_char(b2, hi)) + return true; + } + } + // Try more carefully: extract from each conjunct + lo = 0; + hi = u().max_char(); + if (u().is_char_le(e1, a1, a2)) { + if (u().is_const_char(a1, v) && !u().is_const_char(a2, v)) + lo = std::max(lo, v); // v <= ele + else if (!u().is_const_char(a1, v) && u().is_const_char(a2, v)) + hi = std::min(hi, v); // ele <= v + } + if (u().is_char_le(e2, b1, b2)) { + unsigned v2; + if (u().is_const_char(b1, v2) && !u().is_const_char(b2, v2)) + lo = std::max(lo, v2); // v2 <= ele + else if (!u().is_const_char(b1, v2) && u().is_const_char(b2, v2)) + hi = std::min(hi, v2); // ele <= v2 + } + return lo <= hi; + } + + // Single char_le + if (u().is_char_le(cond, lhs, rhs)) { + unsigned v; + if (u().is_const_char(lhs, v) && !u().is_const_char(rhs, v)) { + lo = v; // v <= ele + return true; + } + if (!u().is_const_char(lhs, v) && u().is_const_char(rhs, v)) { + hi = v; // ele <= v + return true; + } + } + + return false; + } + + // Predicate implication for character range conditions. + // Returns true if: whenever cond_a is true, cond_b must also be true. + // Used for BDD-merge of derivative ITE trees. + bool derive::pred_implies(expr* a, expr* b) { + if (a == b) return true; + + expr *nota = nullptr, *notb = nullptr; + + // ~a implies ~b iff b implies a + if (m.is_not(a, nota) && m.is_not(b, notb)) + return pred_implies(notb, nota); + + unsigned lo_a, hi_a, lo_b, hi_b; + + // a implies b: range_a ⊆ range_b + if (extract_char_range(a, lo_a, hi_a) && extract_char_range(b, lo_b, hi_b)) + return lo_b <= lo_a && hi_a <= hi_b; + + // a implies ~b: range_a ∩ range_b = ∅ + if (m.is_not(b, notb)) { + if (extract_char_range(a, lo_a, hi_a) && extract_char_range(notb, lo_b, hi_b)) + return hi_a < lo_b || hi_b < lo_a; + } + + // ~a implies b: complement of range_a ⊆ range_b + // This is true when range_b covers everything outside range_a + // i.e., lo_b == 0 and hi_b >= max_char, minus range_a... complex, skip for now + if (m.is_not(a, nota)) { + if (extract_char_range(nota, lo_a, hi_a) && extract_char_range(b, lo_b, hi_b)) + return lo_b <= 0 && hi_b >= u().max_char(); // only if b is universal + } + + return false; + } + expr_ref derive::mk_union(expr* a, expr* b) { // Check op cache expr* cached = nullptr; @@ -618,8 +721,10 @@ namespace seq { return mk_deriv_concat(expr_ref(a1, m), tail); } - // ITE combination: if both are ITE with same condition, merge + // ITE handling for union expr *c1, *t1, *e1, *c2, *t2, *e2; + + // Same condition merge (cheap, always correct) if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2) && c1 == c2) { expr_ref then_br = mk_union(t1, t2); expr_ref else_br = mk_union(e1, e2); @@ -629,10 +734,10 @@ namespace seq { // Conservative ITE hoisting via subsumption: // Only hoist when at least one branch simplifies by is_subset. if (m.is_ite(a, c1, t1, e1)) { - bool t1_sub_b = is_subset(t1, b); // t1 ∪ b = b - bool b_sub_t1 = is_subset(b, t1); // t1 ∪ b = t1 - bool e1_sub_b = is_subset(e1, b); // e1 ∪ b = b - bool b_sub_e1 = is_subset(b, e1); // e1 ∪ b = e1 + bool t1_sub_b = is_subset(t1, b); + bool b_sub_t1 = is_subset(b, t1); + bool e1_sub_b = is_subset(e1, b); + bool b_sub_e1 = is_subset(b, e1); if (t1_sub_b || b_sub_t1 || e1_sub_b || b_sub_e1) { expr_ref then_br = t1_sub_b ? expr_ref(b, m) : b_sub_t1 ? expr_ref(t1, m) : mk_union(t1, b); expr_ref else_br = e1_sub_b ? expr_ref(b, m) : b_sub_e1 ? expr_ref(e1, m) : mk_union(e1, b); @@ -640,10 +745,10 @@ namespace seq { } } if (m.is_ite(b, c2, t2, e2)) { - bool t2_sub_a = is_subset(t2, a); // a ∪ t2 = a - bool a_sub_t2 = is_subset(a, t2); // a ∪ t2 = t2 - bool e2_sub_a = is_subset(e2, a); // a ∪ e2 = a - bool a_sub_e2 = is_subset(a, e2); // a ∪ e2 = e2 + bool t2_sub_a = is_subset(t2, a); + bool a_sub_t2 = is_subset(a, t2); + bool e2_sub_a = is_subset(e2, a); + bool a_sub_e2 = is_subset(a, e2); if (t2_sub_a || a_sub_t2 || e2_sub_a || a_sub_e2) { expr_ref then_br = t2_sub_a ? expr_ref(a, m) : a_sub_t2 ? expr_ref(t2, m) : mk_union(a, t2); expr_ref else_br = e2_sub_a ? expr_ref(a, m) : a_sub_e2 ? expr_ref(e2, m) : mk_union(a, e2); @@ -651,6 +756,36 @@ namespace seq { } } + // BDD merge for union: only when both are ITE and pred_implies fires + // (avoids exponential blowup when conditions are unrelated) + if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2)) { + // Only merge if we can determine the relationship between conditions + bool c1_imp_c2 = pred_implies(c1, c2); + bool c1_imp_nc2 = !c1_imp_c2 && pred_implies(c1, m.mk_not(c2)); + expr_ref notc1(m.mk_not(c1), m); + bool nc1_imp_c2 = pred_implies(notc1, c2); + bool nc1_imp_nc2 = !nc1_imp_c2 && pred_implies(notc1, m.mk_not(c2)); + if (c1_imp_c2 || c1_imp_nc2 || nc1_imp_c2 || nc1_imp_nc2) { + // pred_implies fires — safe to merge without exponential blowup + expr_ref r1(m), r2(m); + // Under c1: + if (c1_imp_c2) + r1 = mk_union(t1, t2); + else if (c1_imp_nc2) + r1 = mk_union(t1, e2); + else + r1 = mk_union(t1, b); + // Under ~c1: + if (nc1_imp_c2) + r2 = mk_union(e1, t2); + else if (nc1_imp_nc2) + r2 = mk_union(e1, e2); + else + r2 = mk_union(e1, b); + return mk_ite(c1, r1, r2); + } + } + // ACI: flatten, sort, deduplicate expr_ref_vector args(m); flatten_union(a, args); @@ -721,17 +856,48 @@ namespace seq { return mk_deriv_concat(expr_ref(a1, m), tail); } - // ITE combination: if both are ITE with same condition, merge + // ITE handling for intersection expr *c1, *t1, *e1, *c2, *t2, *e2; + + // Same condition merge if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2) && c1 == c2) { expr_ref then_br = mk_inter(t1, t2); expr_ref else_br = mk_inter(e1, e2); return mk_ite(c1, then_br, else_br); } - // ITE hoisting for intersection with depth bound. - // Unconditional hoisting needed for re.inter/re.diff/re.comp patterns, - // but bounded to prevent exponential blowup on union-heavy patterns. + // Both-ITE with pred_implies: exploit condition relationships (no depth cost) + if (m.is_ite(a, c1, t1, e1) && m.is_ite(b, c2, t2, e2)) { + // Order conditions: larger id on outside + if (c1->get_id() < c2->get_id()) { + std::swap(a, b); + std::swap(c1, c2); + std::swap(t1, t2); + std::swap(e1, e2); + } + expr_ref r1(m), r2(m); + bool have_r1 = false, have_r2 = false; + // Under c1: what do we know about c2? + if (pred_implies(c1, c2)) { + r1 = mk_inter(t1, t2); have_r1 = true; + } else if (pred_implies(c1, m.mk_not(c2))) { + r1 = mk_inter(t1, e2); have_r1 = true; + } + // Under ~c1: what do we know about c2? + expr_ref notc1(m.mk_not(c1), m); + if (pred_implies(notc1, c2)) { + r2 = mk_inter(e1, t2); have_r2 = true; + } else if (pred_implies(notc1, m.mk_not(c2))) { + r2 = mk_inter(e1, e2); have_r2 = true; + } + if (have_r1 || have_r2) { + if (!have_r1) r1 = mk_inter(t1, b); + if (!have_r2) r2 = mk_inter(e1, b); + return mk_ite(c1, r1, r2); + } + } + + // ITE hoisting with depth bound (fallback when pred_implies doesn't fire) if (m_inter_hoist_depth < m_max_inter_hoist_depth) { if (m.is_ite(a, c1, t1, e1)) { m_inter_hoist_depth++; @@ -854,17 +1020,10 @@ namespace seq { return mk_ite(c, ct, ce); } - // De Morgan: ~(r1 ∪ r2) → ~r1 ∩ ~r2 - expr* r1 = nullptr, * r2 = nullptr; - if (re().is_union(a, r1, r2)) { - expr_ref c1 = mk_complement(r1); - expr_ref c2 = mk_complement(r2); - return mk_inter(c1, c2); - } - // ~ε → .+ sort* s = nullptr; - if (re().is_to_re(a, r) && u().str.is_empty(r)) { + expr* r1 = nullptr; + if (re().is_to_re(a, r1) && u().str.is_empty(r1)) { VERIFY(m_util.is_re(a, s)); return expr_ref(re().mk_plus(re().mk_full_char(a->get_sort())), m); } @@ -973,39 +1132,61 @@ namespace seq { if (!is_ite1 && !is_ite2) return op(d1, d2); - // d1 is ITE, d2 is not + // d1 is ITE, d2 is not — linear distribution (no depth cost) if (is_ite1 && !is_ite2) { expr_ref then_r = ite_combine_binary(t1, d2, op); expr_ref else_r = ite_combine_binary(e1, d2, op); return mk_ite(c1, then_r, else_r); } - // d2 is ITE, d1 is not + // d2 is ITE, d1 is not — linear distribution (no depth cost) if (!is_ite1 && is_ite2) { expr_ref then_r = ite_combine_binary(d1, t2, op); expr_ref else_r = ite_combine_binary(d1, e2, op); return mk_ite(c2, then_r, else_r); } - // Both are ITE + // Both are ITE — this is the cross-product case, consume depth budget + m_inter_hoist_depth++; + expr_ref result(m); + if (c1 == c2) { - // Same condition: combine pairwise + // Same condition: combine pairwise (no cross-product) expr_ref then_r = ite_combine_binary(t1, t2, op); expr_ref else_r = ite_combine_binary(e1, e2, op); - return mk_ite(c1, then_r, else_r); - } - - // Order by condition id (larger id on outside for canonical form) - if (c1->get_id() > c2->get_id()) { - expr_ref then_r = ite_combine_binary(t1, d2, op); - expr_ref else_r = ite_combine_binary(e1, d2, op); - return mk_ite(c1, then_r, else_r); + result = mk_ite(c1, then_r, else_r); } else { - expr_ref then_r = ite_combine_binary(d1, t2, op); - expr_ref else_r = ite_combine_binary(d1, e2, op); - return mk_ite(c2, then_r, else_r); + // Different conditions. Order by id for canonical form. + if (c1->get_id() < c2->get_id()) { + std::swap(d1, d2); + std::swap(c1, c2); + std::swap(t1, t2); + std::swap(e1, e2); + } + + // Now c1->get_id() >= c2->get_id(). Hoist c1. + expr_ref r1(m), r2(m); + if (pred_implies(c1, c2)) + r1 = ite_combine_binary(t1, t2, op); + else if (pred_implies(c1, m.mk_not(c2))) + r1 = ite_combine_binary(t1, e2, op); + else + r1 = ite_combine_binary(t1, d2, op); + + expr_ref notc1(m.mk_not(c1), m); + if (pred_implies(notc1, c2)) + r2 = ite_combine_binary(e1, t2, op); + else if (pred_implies(notc1, m.mk_not(c2))) + r2 = ite_combine_binary(e1, e2, op); + else + r2 = ite_combine_binary(e1, d2, op); + + result = mk_ite(c1, r1, r2); } + + m_inter_hoist_depth--; + return result; } expr_ref derive::ite_combine_unary(expr* d, diff --git a/src/ast/rewriter/seq_derive.h b/src/ast/rewriter/seq_derive.h index 122619809..1c6623bb6 100644 --- a/src/ast/rewriter/seq_derive.h +++ b/src/ast/rewriter/seq_derive.h @@ -73,7 +73,7 @@ namespace seq { unsigned m_simp_depth { 0 }; static const unsigned m_max_simp_depth = 8; - // Intersection ITE hoisting depth limit + // ITE combine depth limit (bounds exponential blowup in BDD merge) unsigned m_inter_hoist_depth { 0 }; static const unsigned m_max_inter_hoist_depth = 4; @@ -129,6 +129,11 @@ namespace seq { // Lightweight subsumption check: returns true if L(a) ⊆ L(b) bool is_subset(expr* a, expr* b); + // Predicate implication for character range conditions. + // Returns true if condition a implies condition b. + bool pred_implies(expr* a, expr* b); + bool extract_char_range(expr* cond, unsigned& lo, unsigned& hi); + // Normalize reverse(r) by pushing reverse inward expr_ref normalize_reverse(expr* r); diff --git a/src/smt/seq_regex.cpp b/src/smt/seq_regex.cpp index 62dbfd3aa..a3c56159d 100644 --- a/src/smt/seq_regex.cpp +++ b/src/smt/seq_regex.cpp @@ -22,6 +22,7 @@ Author: #include "ast/ast_util.h" #include "ast/for_each_expr.h" #include +#include namespace smt { @@ -223,6 +224,40 @@ namespace smt { th.add_axiom(~lit); return true; } + // Second pass: deeper exploration for intersection/complement/diff regexes + // These are candidates for dead state detection (the result may be empty) + // For these, do unlimited depth exploration with a time budget + unsigned r_id = get_state_id(r); + expr* r1 = nullptr, *r2 = nullptr; + if (!m_state_graph.is_dead(r_id) && !m_state_graph.is_live(r_id) && + (re().is_intersection(r, r1, r2) || re().is_complement(r, r1) || re().is_diff(r, r1, r2))) { + // Collect all unexplored states and explore them iteratively + // with a time budget + auto pass2_start = std::chrono::steady_clock::now(); + bool changed = true; + while (changed && !m_state_graph.is_dead(r_id)) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - pass2_start).count(); + if (elapsed > 100) break; + changed = false; + for (unsigned i = 0; i < m_state_to_expr.size() && !m_state_graph.is_dead(r_id); ++i) { + unsigned st_id = i + 1; + if (m_state_graph.is_done(st_id) || m_state_graph.is_live(st_id) || m_state_graph.is_dead(st_id)) + continue; + // This is an unexplored state — explore it + expr* st = m_state_to_expr.get(i); + if (re().get_info(st).nullable == l_true) + continue; + if (update_state_graph(st, 1)) + changed = true; + } + } + if (m_state_graph.is_dead(r_id)) { + STRACE(seq_regex_brief, tout << "(dead2) ";); + th.add_axiom(~lit); + return true; + } + } } return false; } @@ -816,7 +851,7 @@ namespace smt { /* Update the state graph with expression r and all its derivatives. */ - bool seq_regex::update_state_graph(expr* r) { + bool seq_regex::update_state_graph(expr* r, unsigned depth) { unsigned r_id = get_state_id(r); if (m_state_graph.is_done(r_id)) return false; if (m_state_graph.get_size() >= m_max_state_graph_size) { @@ -859,15 +894,37 @@ namespace smt { m_state_graph.add_edge(r_id, dr_id, maybecycle); } m_state_graph.mark_done(r_id); - // Recursively explore unexplored targets for dead state detection - // Skip targets that are nullable to avoid state explosion - for (auto const& dr: derivatives) { - unsigned dr_id = get_state_id(dr); - if (m_state_graph.is_done(dr_id) || m_state_graph.is_live(dr_id)) - continue; - if (re().get_info(dr).nullable == l_true) - continue; - update_state_graph(dr); + // Explore direct targets for dead state detection (depth 1 only) + // This compensates for less-canonical derivative representations + if (depth < 1) { + for (auto const& dr: derivatives) { + unsigned dr_id = get_state_id(dr); + if (m_state_graph.is_done(dr_id) || m_state_graph.is_live(dr_id)) + continue; + if (re().get_info(dr).nullable == l_true) + continue; + update_state_graph(dr, depth + 1); + } + } + else if (depth == 1) { + // At depth 1, do lightweight exploration: compute derivatives + // of this state's targets but only to check if they're all dead. + // Don't add complex states to the graph — just mark them dead if + // their get_info says min_length == UINT_MAX or is_empty. + for (auto const& dr: derivatives) { + unsigned dr_id = get_state_id(dr); + if (m_state_graph.is_done(dr_id) || m_state_graph.is_live(dr_id)) + continue; + auto dr_info = re().get_info(dr); + if (dr_info.nullable == l_true) { + m_state_graph.add_state(dr_id); + m_state_graph.mark_live(dr_id); + } + else if (re().is_empty(dr) || dr_info.min_length == UINT_MAX) { + m_state_graph.add_state(dr_id); + m_state_graph.mark_done(dr_id); + } + } } } diff --git a/src/smt/seq_regex.h b/src/smt/seq_regex.h index 5c3fddd25..af03b3c50 100644 --- a/src/smt/seq_regex.h +++ b/src/smt/seq_regex.h @@ -124,7 +124,7 @@ namespace smt { // Note: Doesn't need to be sound or complete (doesn't affect soundness) bool can_be_in_cycle(expr* r1, expr* r2); // Update the graph - bool update_state_graph(expr* r); + bool update_state_graph(expr* r, unsigned depth = 0); // Printing expressions for seq_regex_brief std::string state_str(expr* e);