/*++ Copyright (c) 2026 Microsoft Corporation Module Name: seq_regex.cpp Abstract: Lazy regex membership processing for the Nielsen-based string solver. Author: Clemens Eisenhofer 2026-03-01 Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ #include "smt/seq/seq_regex.h" #include namespace seq { // ----------------------------------------------------------------------- // Stabilizer store // ----------------------------------------------------------------------- void seq_regex::reset_stabilizers() { m_stabilizers.reset(); m_self_stabilizing.reset(); } void seq_regex::add_stabilizer(euf::snode* regex, euf::snode* stabilizer) { if (!regex || !stabilizer) return; unsigned id = regex->id(); auto& stabs = m_stabilizers.insert_if_not_there(id, ptr_vector()); // De-duplicate by pointer equality (mirrors ZIPT Environment.AddStabilizer // which checks reference equality before adding). for (euf::snode* s : stabs) if (s == stabilizer) return; stabs.push_back(stabilizer); } euf::snode* seq_regex::get_stabilizer_union(euf::snode* regex) { if (!regex) return nullptr; if (!m_stabilizers.contains(regex->id())) return nullptr; auto& stabs = m_stabilizers[regex->id()]; if (stabs.empty()) return nullptr; // Single stabilizer: return it directly. if (stabs.size() == 1) return stabs[0]; // Multiple stabilizers: build re.union chain. // union(s1, union(s2, ... union(sN-1, sN)...)) seq_util& seq = m_sg.get_seq_util(); euf::snode* result = stabs[stabs.size() - 1]; for (unsigned i = stabs.size() - 1; i-- > 0; ) { expr* lhs = stabs[i]->get_expr(); expr* rhs = result->get_expr(); if (!lhs || !rhs) return nullptr; expr_ref un(seq.re.mk_union(lhs, rhs), m_sg.get_manager()); result = m_sg.mk(un); } return result; } bool seq_regex::has_stabilizers(euf::snode* regex) const { if (!regex) return false; if (!m_stabilizers.contains(regex->id())) return false; return !m_stabilizers[regex->id()].empty(); } ptr_vector const* seq_regex::get_stabilizers(euf::snode* regex) const { if (!regex) return nullptr; if (!m_stabilizers.contains(regex->id())) return nullptr; return &m_stabilizers[regex->id()]; } void seq_regex::set_self_stabilizing(euf::snode* regex) { if (!regex) return; m_self_stabilizing.insert(regex->id()); } bool seq_regex::is_self_stabilizing(euf::snode* regex) const { if (!regex) return false; return m_self_stabilizing.contains(regex->id()); } // ----------------------------------------------------------------------- // Self-stabilizing auto-detection // ----------------------------------------------------------------------- bool seq_regex::compute_self_stabilizing(euf::snode* regex) const { if (!regex) return false; // R* is always self-stabilizing: D(c, R*) = D(c,R) · R*, // so R* appears as the tail of every derivative and acts as // its own stabilizer. if (regex->is_star()) return true; // Σ* (full_seq, i.e., re.all / .*) is self-stabilizing: // D(c, Σ*) = Σ* for every character c. if (regex->is_full_seq()) return true; // ∅ (fail / empty language) is trivially self-stabilizing: // it has no live derivatives, so the flag is vacuously true. if (regex->is_fail()) return true; // Complement of full_seq is ∅ (complement of Σ*), which is // also trivially self-stabilizing. if (regex->is_complement() && regex->num_args() == 1 && regex->arg(0)->is_full_seq()) return true; // Loop with lo=0 and no upper bound behaves like R* // (r{0,} ≡ r*), so it is self-stabilizing. if (regex->is_loop() && regex->is_nullable()) { // A nullable loop with a star-like body: heuristic check. // Only mark as self-stabilizing if the body is a Kleene closure. // Loop(R, 0, ∞) ~ R* — but we rely on the sgraph to normalize // these, so only catch exact star nodes above. } return false; } // ----------------------------------------------------------------------- // Self-stabilizing propagation through derivatives // ----------------------------------------------------------------------- void seq_regex::propagate_self_stabilizing(euf::snode* parent, euf::snode* deriv) { if (!parent || !deriv) return; // If the derivative is already known to be self-stabilizing (either // inherently or from a prior propagation), nothing to do. if (is_self_stabilizing(deriv)) return; // If the derivative is itself inherently self-stabilizing // (e.g., it is a star or full_seq), mark it now. if (compute_self_stabilizing(deriv)) { set_self_stabilizing(deriv); return; } // Rule 1: Star parent. // D(c, R*) = D(c, R) · R*. The derivative always contains the // R* tail, so it is self-stabilizing regardless of D(c,R). if (parent->is_star()) { set_self_stabilizing(deriv); return; } // Rule 2: Full_seq parent. // D(c, Σ*) = Σ*, and Σ* is self-stabilizing. // (The derivative should be Σ* itself; mark it for safety.) if (parent->is_full_seq()) { set_self_stabilizing(deriv); return; } // Check if parent is self-stabilizing (either inherently or marked). bool parent_ss = is_self_stabilizing(parent) || compute_self_stabilizing(parent); // Rule 3: Concat parent R · S. // D(c, R·S) = D(c,R)·S | (nullable(R) ? D(c,S) : ∅). // If S is self-stabilizing, the D(c,R)·S branch inherits it. // If the whole parent R·S is self-stabilizing, the derivative is too. if (parent->is_concat() && parent->num_args() == 2) { euf::snode* tail = parent->arg(1); bool tail_ss = is_self_stabilizing(tail) || compute_self_stabilizing(tail); if (tail_ss || parent_ss) { set_self_stabilizing(deriv); return; } } // Rule 4: Union parent R | S. // D(c, R|S) = D(c,R) | D(c,S). // Self-stabilizing if both children are self-stabilizing. if (parent->is_union() && parent->num_args() == 2) { euf::snode* lhs = parent->arg(0); euf::snode* rhs = parent->arg(1); bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs); bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs); if (lhs_ss && rhs_ss) { set_self_stabilizing(deriv); return; } } // Rule 5: Intersection parent R ∩ S. // D(c, R∩S) = D(c,R) ∩ D(c,S). // Self-stabilizing if both children are self-stabilizing. if (parent->is_intersect() && parent->num_args() == 2) { euf::snode* lhs = parent->arg(0); euf::snode* rhs = parent->arg(1); bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs); bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs); if (lhs_ss && rhs_ss) { set_self_stabilizing(deriv); return; } } // Rule 6: Complement parent ~R. // D(c, ~R) = ~D(c, R). // Preserves self-stabilizing from R. if (parent->is_complement() && parent->num_args() == 1) { euf::snode* inner = parent->arg(0); bool inner_ss = is_self_stabilizing(inner) || compute_self_stabilizing(inner); if (inner_ss) { set_self_stabilizing(deriv); return; } } // Rule 7: Generic self-stabilizing parent. // If the parent was explicitly marked self-stabilizing (e.g., via // a previous propagation), propagate to the derivative. if (parent_ss) { set_self_stabilizing(deriv); return; } } // ----------------------------------------------------------------------- // Derivative with propagation // ----------------------------------------------------------------------- euf::snode* seq_regex::derivative_with_propagation(euf::snode* re, euf::snode* elem) { if (!re || !elem) return nullptr; euf::snode* deriv = derivative(re, elem); if (deriv) propagate_self_stabilizing(re, deriv); return deriv; } // ----------------------------------------------------------------------- // Uniform derivative (symbolic character consumption) // ----------------------------------------------------------------------- euf::snode* seq_regex::try_uniform_derivative(euf::snode* regex) { if (!regex) return nullptr; // Quick exits: trivial regexes with known uniform derivatives. // Σ* (full_seq) has derivative Σ* for every character. if (regex->is_full_seq()) return regex; // ∅ (fail) has derivative ∅ for every character — but this means // every character is rejected. Return fail so the caller can // detect a conflict. if (regex->is_fail()) return regex; // Compute minterms: the character-class partition of the alphabet // induced by the regex. euf::snode_vector minterms; m_sg.compute_minterms(regex, minterms); if (minterms.empty()) return nullptr; // Compute the derivative for each non-empty minterm. If all produce // the same result, the derivative is independent of the character // value and we can consume a symbolic character deterministically. euf::snode* uniform = nullptr; for (euf::snode* mt : minterms) { if (!mt || mt->is_fail()) continue; // empty character class — no character belongs to it euf::snode* deriv = m_sg.brzozowski_deriv(regex, mt); if (!deriv) return nullptr; // derivative computation failed if (!uniform) { uniform = deriv; } else if (uniform->id() != deriv->id()) { return nullptr; // different derivatives — not uniform } } return uniform; // may be nullptr if all minterms were fail/empty } // ----------------------------------------------------------------------- // Ground prefix consumption // ----------------------------------------------------------------------- bool seq_regex::is_empty_regex(euf::snode* re) const { if (!re) return false; // direct empty language constant if (re->is_fail()) return true; // kinds that are never empty if (re->is_star() || re->is_to_re() || re->is_full_char() || re->is_full_seq()) return false; // loop with lo == 0 accepts ε if (re->is_loop() && re->is_nullable()) return false; seq_util& seq = m_sg.get_seq_util(); expr* e = re->get_expr(); if (!e) return false; expr* r1, * r2; // union is empty iff both children are empty if (seq.re.is_union(e, r1, r2)) { SASSERT(re->num_args() == 2); return is_empty_regex(re->arg(0)) && is_empty_regex(re->arg(1)); } // regex concat is empty if either child is empty if (seq.re.is_concat(e, r1, r2)) { SASSERT(re->num_args() == 2); return is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1)); } // intersection is empty if either child is empty if (seq.re.is_intersection(e, r1, r2)) { SASSERT(re->num_args() == 2); if (is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1))) return true; } // complement of full_seq is empty if (re->is_complement() && re->num_args() == 1 && re->arg(0)->is_full_seq()) return true; // loop(empty, lo, _) with lo > 0 is empty if (re->is_loop() && re->num_args() >= 1 && is_empty_regex(re->arg(0))) return !re->is_nullable(); // empty if not nullable (i.e., lo > 0) return false; } // ----------------------------------------------------------------------- // BFS regex emptiness check — helper: collect character boundaries // ----------------------------------------------------------------------- void seq_regex::collect_char_boundaries(euf::snode* re, unsigned_vector& bounds) const { if (!re || !re->get_expr()) return; seq_util& seq = m_sg.get_seq_util(); expr* e = re->get_expr(); // Range predicate re.range(lo, hi): boundary at lo and hi+1 // Range arguments are string expressions (e.g., str.unit(ch)) expr* lo_expr = nullptr; expr* hi_expr = nullptr; if (seq.re.is_range(e, lo_expr, hi_expr)) { zstring s_lo, s_hi; if (lo_expr && seq.str.is_string(lo_expr, s_lo) && s_lo.length() == 1) bounds.push_back(s_lo[0]); if (hi_expr && seq.str.is_string(hi_expr, s_hi) && s_hi.length() == 1 && s_hi[0] < zstring::max_char()) bounds.push_back(s_hi[0] + 1); return; } // to_re(s): boundary at first character and first+1 expr* body = nullptr; if (seq.re.is_to_re(e, body)) { zstring s; if (seq.str.is_string(body, s) && s.length() > 0) { unsigned first_ch = s[0]; bounds.push_back(first_ch); if (first_ch < zstring::max_char()) bounds.push_back(first_ch + 1); } return; } // Leaf nodes with no character discrimination if (re->is_fail() || re->is_full_char() || re->is_full_seq()) return; // Recurse into children (handles union, concat, star, loop, etc.) for (unsigned i = 0; i < re->num_args(); ++i) collect_char_boundaries(re->arg(i), bounds); } // ----------------------------------------------------------------------- // BFS regex emptiness check — helper: alphabet representatives // ----------------------------------------------------------------------- void seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) { unsigned_vector bounds; bounds.push_back(0); // always include character 0 collect_char_boundaries(re, bounds); // Sort and deduplicate std::sort(bounds.begin(), bounds.end()); unsigned prev = UINT_MAX; for (unsigned b : bounds) { if (b != prev) { reps.push_back(m_sg.mk_char(b)); prev = b; } } } // ----------------------------------------------------------------------- // BFS regex emptiness check // ----------------------------------------------------------------------- lbool seq_regex::is_empty_bfs(euf::snode* re, unsigned max_states) { if (!re || !re->get_expr()) return l_undef; if (re->is_fail()) return l_true; if (re->is_nullable()) return l_false; // Structural quick checks for kinds that are never empty if (re->is_star() || re->is_full_char() || re->is_full_seq() || re->is_to_re()) return l_false; // Structural emptiness catches simple cases if (is_empty_regex(re)) return l_true; // Only handle ground regexes; non-ground can't be fully explored if (!re->is_ground()) return l_undef; // BFS over the Brzozowski derivative automaton. // Each state is a derivative regex snode identified by its id. // We explore states by computing derivatives for representative // characters from the alphabet partition. uint_set visited; euf::snode_vector worklist; worklist.push_back(re); visited.insert(re->id()); unsigned states_explored = 0; bool had_failed_deriv = false; while (!worklist.empty()) { if (states_explored >= max_states) return l_undef; euf::snode* current = worklist.back(); worklist.pop_back(); ++states_explored; // Compute representative characters for current state's // alphabet partition. Each representative is a concrete // character snode whose equivalence class has identical // derivative behavior. euf::snode_vector reps; get_alphabet_representatives(current, reps); if (reps.empty()) { // No representatives means no character predicates; // use a default character to explore the single partition. reps.push_back(m_sg.mk_char('a')); } for (euf::snode* ch : reps) { euf::snode* deriv = m_sg.brzozowski_deriv(current, ch); if (!deriv) { // Derivative computation failed for this character. // Track the failure but continue with other characters. had_failed_deriv = true; continue; } if (deriv->is_nullable()) return l_false; // found an accepting state if (deriv->is_fail()) continue; // dead-end, no need to explore further if (is_empty_regex(deriv)) continue; // structurally empty subtree if (!visited.contains(deriv->id())) { visited.insert(deriv->id()); worklist.push_back(deriv); } } } // Exhausted all reachable states without finding a nullable one. // If we had any failed derivative computations, the result is // inconclusive since we may have missed reachable states. if (had_failed_deriv) return l_undef; return l_true; } // ----------------------------------------------------------------------- // Multi-regex intersection emptiness check // BFS over the product of Brzozowski derivative automata. // Mirrors ZIPT NielsenNode.CheckEmptiness (NielsenNode.cs:1429-1469) // ----------------------------------------------------------------------- lbool seq_regex::check_intersection_emptiness(ptr_vector const& regexes, unsigned max_states) { if (regexes.empty()) return l_false; // empty intersection = full language (vacuously non-empty) // Quick checks: if any regex is fail/empty, intersection is empty for (euf::snode* re : regexes) { if (!re || !re->get_expr()) return l_undef; if (re->is_fail() || is_empty_regex(re)) return l_true; } // Check if all are nullable (intersection accepts ε) bool all_nullable = true; for (euf::snode* re : regexes) { if (!re->is_nullable()) { all_nullable = false; break; } } if (all_nullable) return l_false; // Single regex: delegate to is_empty_bfs if (regexes.size() == 1) return is_empty_bfs(regexes[0], max_states); // Build product BFS. State = tuple of regex snode ids. // Use a map from state hash to visited set. using state_t = svector; auto state_hash = [](state_t const& s) -> unsigned { unsigned h = 0; for (unsigned id : s) h = h * 31 + id; return h; }; auto state_eq = [](state_t const& a, state_t const& b) -> bool { if (a.size() != b.size()) return false; for (unsigned i = 0; i < a.size(); ++i) if (a[i] != b[i]) return false; return true; }; // Use simple set via sorted vector of hashes (good enough for bounded BFS) std::unordered_set visited_hashes; struct bfs_state { ptr_vector regexes; }; std::vector worklist; bfs_state initial; initial.regexes.append(regexes); worklist.push_back(std::move(initial)); state_t init_ids; for (euf::snode* re : regexes) init_ids.push_back(re->id()); visited_hashes.insert(state_hash(init_ids)); unsigned states_explored = 0; bool had_failed = false; // Collect alphabet representatives from the intersection of all regexes // (merge boundaries from all) unsigned_vector all_bounds; all_bounds.push_back(0); for (euf::snode* re : regexes) collect_char_boundaries(re, all_bounds); std::sort(all_bounds.begin(), all_bounds.end()); euf::snode_vector reps; unsigned prev = UINT_MAX; for (unsigned b : all_bounds) { if (b != prev) { reps.push_back(m_sg.mk_char(b)); prev = b; } } if (reps.empty()) reps.push_back(m_sg.mk_char('a')); while (!worklist.empty()) { if (states_explored >= max_states) return l_undef; bfs_state current = std::move(worklist.back()); worklist.pop_back(); ++states_explored; for (euf::snode* ch : reps) { ptr_vector derivs; bool any_fail = false; bool all_null = true; bool deriv_failed = false; for (euf::snode* re : current.regexes) { euf::snode* d = m_sg.brzozowski_deriv(re, ch); if (!d) { deriv_failed = true; break; } if (d->is_fail()) { any_fail = true; break; } if (!d->is_nullable()) all_null = false; derivs.push_back(d); } if (deriv_failed) { had_failed = true; continue; } if (any_fail) continue; // this character leads to empty intersection if (all_null) return l_false; // found an accepting state in the product // Check if any component is structurally empty bool any_empty = false; for (euf::snode* d : derivs) { if (is_empty_regex(d)) { any_empty = true; break; } } if (any_empty) continue; // Compute state hash and check visited state_t ids; for (euf::snode* d : derivs) ids.push_back(d->id()); unsigned h = state_hash(ids); if (visited_hashes.count(h) == 0) { visited_hashes.insert(h); bfs_state next; next.regexes.append(derivs); worklist.push_back(std::move(next)); } } } if (had_failed) return l_undef; return l_true; // exhausted all states, intersection is empty } // ----------------------------------------------------------------------- // Language subset check: L(A) ⊆ L(B) // via intersection(A, complement(B)) = ∅ // Mirrors ZIPT NielsenNode.IsLanguageSubset (NielsenNode.cs:1382-1385) // ----------------------------------------------------------------------- lbool seq_regex::is_language_subset(euf::snode* subset_re, euf::snode* superset_re) { if (!subset_re || !superset_re) return l_undef; // Quick checks if (subset_re->is_fail() || is_empty_regex(subset_re)) return l_true; // ∅ ⊆ anything if (superset_re->is_full_seq()) return l_true; // anything ⊆ Σ* if (subset_re == superset_re) return l_true; // L ⊆ L // Build complement(superset) seq_util& seq = m_sg.get_seq_util(); ast_manager& mgr = m_sg.get_manager(); expr* sup_expr = superset_re->get_expr(); if (!sup_expr) return l_undef; expr_ref comp(seq.re.mk_complement(sup_expr), mgr); euf::snode* comp_sn = m_sg.mk(comp); if (!comp_sn) return l_undef; // Build intersection and check emptiness // subset ∩ complement(superset) should be empty for subset relation expr* sub_expr = subset_re->get_expr(); if (!sub_expr) return l_undef; expr_ref inter(seq.re.mk_inter(sub_expr, comp.get()), mgr); euf::snode* inter_sn = m_sg.mk(inter); if (!inter_sn) return l_undef; return is_empty_bfs(inter_sn); } // ----------------------------------------------------------------------- // Collect primitive regex intersection for a variable // ----------------------------------------------------------------------- euf::snode* seq_regex::collect_primitive_regex_intersection( euf::snode* var, seq::nielsen_node const& node) { if (!var) return nullptr; seq_util& seq = m_sg.get_seq_util(); ast_manager& mgr = m_sg.get_manager(); euf::snode* result = nullptr; for (auto const& mem : node.str_mems()) { if (!mem.m_str || !mem.m_regex) continue; // Primitive constraint: str is a single variable if (!mem.is_primitive()) continue; euf::snode* first = mem.m_str->first(); if (!first || first != var) continue; if (!result) { result = mem.m_regex; } else { expr* r1 = result->get_expr(); expr* r2 = mem.m_regex->get_expr(); if (r1 && r2) { expr_ref inter(seq.re.mk_inter(r1, r2), mgr); result = m_sg.mk(inter); } } } return result; } // ----------------------------------------------------------------------- // Cycle detection // ----------------------------------------------------------------------- bool seq_regex::detect_cycle(seq::str_mem const& mem) const { return extract_cycle(mem) != nullptr; } // ----------------------------------------------------------------------- // Ground prefix consumption // ----------------------------------------------------------------------- seq_regex::simplify_status seq_regex::simplify_ground_prefix(seq::str_mem& mem) { if (!mem.m_str || !mem.m_regex) return simplify_status::ok; while (mem.m_str && !mem.m_str->is_empty()) { euf::snode* first = mem.m_str->first(); if (!first || !first->is_char()) break; euf::snode* parent_re = mem.m_regex; euf::snode* deriv = m_sg.brzozowski_deriv(parent_re, first); if (!deriv) break; if (deriv->is_fail()) return simplify_status::conflict; // propagate self-stabilizing flag from parent to derivative propagate_self_stabilizing(parent_re, deriv); mem.m_str = m_sg.drop_first(mem.m_str); mem.m_regex = deriv; } // check final state if (mem.m_str && mem.m_str->is_empty()) { if (mem.m_regex->is_nullable()) return simplify_status::satisfied; return simplify_status::conflict; } return simplify_status::ok; } // ----------------------------------------------------------------------- // Ground suffix consumption (best-effort) // ----------------------------------------------------------------------- seq_regex::simplify_status seq_regex::simplify_ground_suffix(seq::str_mem& mem) { // Suffix consumption via reverse derivatives is complex. // For now, only handle the case where the entire string is ground: // consume all characters from the front (which covers trailing chars // when the string is fully ground). if (!mem.m_str || !mem.m_regex) return simplify_status::ok; if (!mem.m_str->is_ground()) return simplify_status::ok; // If the string is ground, simplify_ground_prefix handles everything. return simplify_ground_prefix(mem); } // ----------------------------------------------------------------------- // Trivial checks // ----------------------------------------------------------------------- int seq_regex::check_trivial(seq::str_mem const& mem) const { if (!mem.m_str || !mem.m_regex) return 0; // regex is ∅ => always conflict if (is_empty_regex(mem.m_regex)) return -1; // regex is Σ* => always satisfied if (is_full_regex(mem.m_regex)) return 1; // empty string checks if (mem.m_str->is_empty()) { if (mem.m_regex->is_nullable()) return 1; return -1; } return 0; } // ----------------------------------------------------------------------- // Minterm computation with filtering // ----------------------------------------------------------------------- void seq_regex::get_minterms(euf::snode* regex, euf::snode_vector& minterms) { if (!regex) return; // compute raw minterms from the regex predicates euf::snode_vector raw; m_sg.compute_minterms(regex, raw); // filter: keep only minterms that are non-fail (non-empty character class). // note: minterms are regex character-class expressions, not concrete // characters, so we cannot compute Brzozowski derivatives with them. // callers should compute derivatives using concrete or fresh chars. for (euf::snode* mt : raw) { if (!mt || mt->is_fail()) continue; minterms.push_back(mt); } } // ----------------------------------------------------------------------- // Collect first characters // ----------------------------------------------------------------------- void seq_regex::collect_first_chars(euf::snode* re, euf::snode_vector& chars) { if (!re) return; // to_re(s): extract first character of the string body if (re->is_to_re()) { euf::snode* body = re->arg(0); if (body && !body->is_empty()) { euf::snode* first = body->first(); if (first && first->is_char()) { bool dup = false; for (euf::snode* c : chars) if (c == first) { dup = true; break; } if (!dup) chars.push_back(first); } // Handle string literals (classified as s_other in sgraph) else if (first && first->get_expr()) { seq_util& seq = m_sg.get_seq_util(); zstring s; if (seq.str.is_string(first->get_expr(), s) && s.length() > 0) { euf::snode* ch = m_sg.mk_char(s[0]); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); } } } return; } // leaf cases: produce representative characters for character classes if (re->is_full_char()) { // full character set (.): use 'a' as representative euf::snode* ch = m_sg.mk_char('a'); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); return; } // re.range(lo, hi): use lo as representative if (re->get_expr()) { seq_util& seq = m_sg.get_seq_util(); expr* lo = nullptr, *hi = nullptr; if (seq.re.is_range(re->get_expr(), lo, hi) && lo) { zstring s; unsigned ch_val = 'a'; if (seq.is_const_char(lo, ch_val)) { euf::snode* ch = m_sg.mk_char(ch_val); bool dup = false; for (euf::snode* c : chars) if (c == ch) { dup = true; break; } if (!dup) chars.push_back(ch); } return; } } if (re->is_fail() || re->is_full_seq()) return; // recurse into children (handles union, concat, star, loop, etc.) for (unsigned i = 0; i < re->num_args(); ++i) collect_first_chars(re->arg(i), chars); } // ----------------------------------------------------------------------- // Membership processing // ----------------------------------------------------------------------- bool seq_regex::process_str_mem(seq::str_mem const& mem, vector& out_mems) { if (!mem.m_str || !mem.m_regex) return true; // empty string: check nullable if (mem.m_str->is_empty()) return mem.m_regex->is_nullable(); // consume ground prefix: derive regex by each leading concrete char seq::str_mem working = mem; simplify_status st = simplify_ground_prefix(working); if (st == simplify_status::conflict) return false; if (st == simplify_status::satisfied) return true; // after ground prefix consumption, if the front is still a concrete // character we can take one more step (shouldn't happen after // simplify_ground_prefix, but guard defensively) euf::snode* first = working.m_str->first(); if (first && first->is_char()) { seq::str_mem derived = derive(working, first); if (is_empty_regex(derived.m_regex)) return false; out_mems.push_back(derived); return true; } // string starts with a non-ground element (variable or unit): // return the simplified constraint for the Nielsen graph to expand // via character-split modifiers. out_mems.push_back(working); return true; } // ----------------------------------------------------------------------- // History recording // ----------------------------------------------------------------------- seq::str_mem seq_regex::record_history(seq::str_mem const& mem, euf::snode* history_re) { // Build a history chain by prepending the new regex entry to the // existing history. Uses regex-concat as a cons cell: // new_history = re.concat(history_re, old_history) // where arg(0) is the latest entry and arg(1) is the tail. // If old_history is nullptr, the new entry becomes the terminal leaf. euf::snode* new_history = history_re; if (mem.m_history && history_re) { expr* re_expr = history_re->get_expr(); expr* old_expr = mem.m_history->get_expr(); if (re_expr && old_expr) { seq_util& seq = m_sg.get_seq_util(); expr_ref chain(seq.re.mk_concat(re_expr, old_expr), m_sg.get_manager()); new_history = m_sg.mk(chain); } } return seq::str_mem(mem.m_str, mem.m_regex, new_history, mem.m_id, mem.m_dep); } // ----------------------------------------------------------------------- // Cycle detection // ----------------------------------------------------------------------- euf::snode* seq_regex::extract_cycle(seq::str_mem const& mem) const { // Walk the history chain looking for a repeated regex. // A cycle exists when the current regex matches a regex in the history. if (!mem.m_regex || !mem.m_history) return nullptr; euf::snode* current = mem.m_regex; euf::snode* hist = mem.m_history; // Walk the history chain up to a bounded depth. // The history is structured as a chain of regex snapshots connected // via the sgraph's regex-concat: each level's arg(0) is a snapshot // and arg(1) is the tail. A leaf (non-concat) is a terminal entry. unsigned bound = 1000; while (hist && bound-- > 0) { euf::snode* entry = hist; euf::snode* tail = nullptr; // If the history node is a regex concat, decompose it: // arg(0) is the regex snapshot, arg(1) is the rest of the chain seq_util& seq = m_sg.get_seq_util(); if (hist->is_concat() && hist->get_expr() && seq.re.is_concat(hist->get_expr())) { entry = hist->arg(0); tail = hist->arg(1); } // Check pointer equality (fast, covers normalized regexes) if (entry == current) return entry; // Check expression-level equality as fallback if (entry->get_expr() && current->get_expr() && entry->get_expr() == current->get_expr()) return entry; hist = tail; } return nullptr; } // ----------------------------------------------------------------------- // Stabilizer from cycle // ----------------------------------------------------------------------- euf::snode* seq_regex::stabilizer_from_cycle(euf::snode* cycle_regex, euf::snode* current_regex) { if (!cycle_regex || !current_regex) return nullptr; expr* re_expr = cycle_regex->get_expr(); if (!re_expr) return nullptr; seq_util& seq = m_sg.get_seq_util(); expr_ref star_expr(seq.re.mk_star(re_expr), m_sg.get_manager()); return m_sg.mk(star_expr); } // ----------------------------------------------------------------------- // Extract cycle history tokens // ----------------------------------------------------------------------- euf::snode* seq_regex::extract_cycle_history(seq::str_mem const& current, seq::str_mem const& ancestor) { // The history is built by simplify_and_init as a left-associative // string concat chain: concat(concat(concat(nil, c1), c2), c3). // Extract the tokens consumed since the ancestor. if (!current.m_history) return nullptr; unsigned cur_len = current.m_history->length(); unsigned anc_len = ancestor.m_history ? ancestor.m_history->length() : 0; if (cur_len <= anc_len) return nullptr; if (anc_len == 0) return current.m_history; return m_sg.drop_left(current.m_history, anc_len); } // ----------------------------------------------------------------------- // Get filtered stabilizer star // Mirrors ZIPT StrMem.GetFilteredStabilizerStar (StrMem.cs:228-243) // ----------------------------------------------------------------------- euf::snode* seq_regex::get_filtered_stabilizer_star(euf::snode* re, euf::snode* excluded_char) { if (!re) return nullptr; ptr_vector const* stabs = get_stabilizers(re); if (!stabs || stabs->empty()) return nullptr; seq_util& seq = m_sg.get_seq_util(); ast_manager& m = m_sg.get_manager(); euf::snode* filtered_union = nullptr; for (euf::snode* s : *stabs) { if (!s) continue; // Keep only stabilizers whose language cannot start with excluded_char euf::snode* d = m_sg.brzozowski_deriv(s, excluded_char); if (d && d->is_fail()) { if (!filtered_union) { filtered_union = s; } else { expr* e1 = filtered_union->get_expr(); expr* e2 = s->get_expr(); if (e1 && e2) { expr_ref u(seq.re.mk_union(e1, e2), m); filtered_union = m_sg.mk(u); } } } } if (!filtered_union) return nullptr; expr* fe = filtered_union->get_expr(); if (!fe) return nullptr; expr_ref star_expr(seq.re.mk_star(fe), m); return m_sg.mk(star_expr); } // ----------------------------------------------------------------------- // Strengthened stabilizer construction with sub-cycle detection // Mirrors ZIPT StrMem.StabilizerFromCycle (StrMem.cs:163-225) // ----------------------------------------------------------------------- euf::snode* seq_regex::strengthened_stabilizer(euf::snode* cycle_regex, euf::snode* cycle_history) { if (!cycle_regex || !cycle_history) return nullptr; // Flatten the history concat chain into a vector of character tokens. euf::snode_vector tokens; cycle_history->collect_tokens(tokens); if (tokens.empty()) return nullptr; seq_util& seq = m_sg.get_seq_util(); ast_manager& m = m_sg.get_manager(); // Replay tokens on the cycle regex, detecting sub-cycles. // A sub-cycle is detected when the derivative returns to cycle_regex. svector> sub_cycles; unsigned cycle_start = 0; euf::snode* current_re = cycle_regex; for (unsigned i = 0; i < tokens.size(); ++i) { euf::snode* tok = tokens[i]; if (!tok) return nullptr; euf::snode* deriv = m_sg.brzozowski_deriv(current_re, tok); if (!deriv) return nullptr; // Sub-cycle: derivative returned to the cycle entry regex if (deriv == cycle_regex || (deriv->get_expr() && cycle_regex->get_expr() && deriv->get_expr() == cycle_regex->get_expr())) { sub_cycles.push_back(std::make_pair(cycle_start, i + 1)); cycle_start = i + 1; current_re = cycle_regex; } else { current_re = deriv; } } // Remaining tokens that don't complete a sub-cycle if (cycle_start < tokens.size()) sub_cycles.push_back(std::make_pair(cycle_start, tokens.size())); if (sub_cycles.empty()) return nullptr; // Build a stabilizer body for each sub-cycle. // body = to_re(t0) · [filteredStar(R1, t1)] · to_re(t1) · ... · to_re(t_{n-1}) euf::snode* overall_union = nullptr; for (auto const& sc : sub_cycles) { unsigned start = sc.first; unsigned end = sc.second; if (start >= end) continue; euf::snode* re_state = cycle_regex; euf::snode* body = nullptr; for (unsigned i = start; i < end; ++i) { euf::snode* tok = tokens[i]; if (!tok) break; // Insert filtered stabilizer star before each token after the first if (i > start) { euf::snode* filtered = get_filtered_stabilizer_star(re_state, tok); if (filtered) { expr* fe = filtered->get_expr(); if (fe) { if (!body) { body = filtered; } else { expr* be = body->get_expr(); if (be) { expr_ref cat(seq.re.mk_concat(be, fe), m); body = m_sg.mk(cat); } } } } } // Convert char token to regex: to_re(unit(tok)) expr* tok_expr = tok->get_expr(); if (!tok_expr) break; expr_ref unit_str(seq.str.mk_unit(tok_expr), m); expr_ref tok_re(seq.re.mk_to_re(unit_str), m); euf::snode* tok_re_sn = m_sg.mk(tok_re); if (!body) { body = tok_re_sn; } else { expr* be = body->get_expr(); expr* te = tok_re_sn->get_expr(); if (be && te) { expr_ref cat(seq.re.mk_concat(be, te), m); body = m_sg.mk(cat); } } // Advance the regex state euf::snode* deriv = m_sg.brzozowski_deriv(re_state, tok); if (!deriv) break; re_state = deriv; } if (!body) continue; if (!overall_union) { overall_union = body; } else { expr* oe = overall_union->get_expr(); expr* be = body->get_expr(); if (oe && be) { expr_ref u(seq.re.mk_union(oe, be), m); overall_union = m_sg.mk(u); } } } return overall_union; } // ----------------------------------------------------------------------- // Stabilizer-based subsumption (enhanced) // Mirrors ZIPT StrMem.TrySubsume (StrMem.cs:354-386) // ----------------------------------------------------------------------- bool seq_regex::try_subsume(seq::str_mem const& mem, seq::nielsen_node const& node) { if (!mem.m_str || !mem.m_regex) return false; // 1. Leading token must be a variable euf::snode* first = mem.m_str->first(); if (!first || !first->is_var()) return false; // 2. Must have stabilizers for the regex if (!has_stabilizers(mem.m_regex)) return false; // 3. Build stabStar = star(union(all stabilizers for this regex)) euf::snode* stab_union = get_stabilizer_union(mem.m_regex); if (!stab_union) return false; seq_util& seq = m_sg.get_seq_util(); ast_manager& mgr = m_sg.get_manager(); expr* su_expr = stab_union->get_expr(); if (!su_expr) return false; expr_ref stab_star(seq.re.mk_star(su_expr), mgr); euf::snode* stab_star_sn = m_sg.mk(stab_star); if (!stab_star_sn) return false; // 4. Collect all primitive regex constraints on variable `first` euf::snode* x_range = collect_primitive_regex_intersection(first, node); if (!x_range) return false; // 5. Check L(x_range) ⊆ L(stab_star) lbool result = is_language_subset(x_range, stab_star_sn); return result == l_true; } }