z3/src/smt/seq/seq_regex.cpp

/*++
Copyright (c) 2026 Microsoft Corporation

Module Name:

    seq_regex.cpp

Abstract:

    Lazy regex membership processing for the Nielsen-based string solver.

Author:

    Clemens Eisenhofer 2026-03-01
    Nikolaj Bjorner (nbjorner) 2026-03-01

--*/
#include "smt/seq/seq_regex.h"
#include <unordered_set>

namespace seq {

    // -----------------------------------------------------------------------
    // Stabilizer store
    // -----------------------------------------------------------------------

    void seq_regex::reset_stabilizers() {
        m_stabilizers.reset();
        m_self_stabilizing.reset();
    }

    void seq_regex::add_stabilizer(euf::snode* regex, euf::snode* stabilizer) {
        if (!regex || !stabilizer)
            return;

        unsigned id = regex->id();
        auto& stabs = m_stabilizers.insert_if_not_there(id, ptr_vector<euf::snode>());

        // De-duplicate by pointer equality (mirrors ZIPT Environment.AddStabilizer
        // which checks reference equality before adding).
        for (euf::snode* s : stabs)
            if (s == stabilizer)
                return;
        stabs.push_back(stabilizer);
    }

    euf::snode* seq_regex::get_stabilizer_union(euf::snode* regex) {
        if (!regex)
            return nullptr;

        if (!m_stabilizers.contains(regex->id()))
            return nullptr;

        auto& stabs = m_stabilizers[regex->id()];
        if (stabs.empty())
            return nullptr;

        // Single stabilizer: return it directly.
        if (stabs.size() == 1)
            return stabs[0];

        // Multiple stabilizers: build re.union chain.
        // union(s1, union(s2, ... union(sN-1, sN)...))
        seq_util& seq = m_sg.get_seq_util();
        euf::snode* result = stabs[stabs.size() - 1];
        for (unsigned i = stabs.size() - 1; i-- > 0; ) {
            expr* lhs = stabs[i]->get_expr();
            expr* rhs = result->get_expr();
            if (!lhs || !rhs)
                return nullptr;
            expr_ref un(seq.re.mk_union(lhs, rhs), m_sg.get_manager());
            result = m_sg.mk(un);
        }
        return result;
    }

    bool seq_regex::has_stabilizers(euf::snode* regex) const {
        if (!regex)
            return false;
        if (!m_stabilizers.contains(regex->id()))
            return false;
        return !m_stabilizers[regex->id()].empty();
    }

    ptr_vector<euf::snode> const* seq_regex::get_stabilizers(euf::snode* regex) const {
        if (!regex)
            return nullptr;
        if (!m_stabilizers.contains(regex->id()))
            return nullptr;
        return &m_stabilizers[regex->id()];
    }

    void seq_regex::set_self_stabilizing(euf::snode* regex) {
        if (!regex)
            return;
        m_self_stabilizing.insert(regex->id());
    }

    bool seq_regex::is_self_stabilizing(euf::snode* regex) const {
        if (!regex)
            return false;
        return m_self_stabilizing.contains(regex->id());
    }

    // -----------------------------------------------------------------------
    // Self-stabilizing auto-detection
    // -----------------------------------------------------------------------

    bool seq_regex::compute_self_stabilizing(euf::snode* regex) const {
        if (!regex)
            return false;

        // R* is always self-stabilizing: D(c, R*) = D(c,R) · R*,
        // so R* appears as the tail of every derivative and acts as
        // its own stabilizer.
        if (regex->is_star())
            return true;

        // Σ* (full_seq, i.e., re.all / .*) is self-stabilizing:
        // D(c, Σ*) = Σ* for every character c.
        if (regex->is_full_seq())
            return true;

        // ∅ (fail / empty language) is trivially self-stabilizing:
        // it has no live derivatives, so the flag is vacuously true.
        if (regex->is_fail())
            return true;

        // Complement of full_seq is ∅ (complement of Σ*), which is
        // also trivially self-stabilizing.
        if (regex->is_complement() && regex->num_args() == 1 &&
            regex->arg(0)->is_full_seq())
            return true;

        // Loop with lo=0 and no upper bound behaves like R*
        // (r{0,} ≡ r*), so it is self-stabilizing.
        if (regex->is_loop() && regex->is_nullable()) {
            // A nullable loop with a star-like body: heuristic check.
            // Only mark as self-stabilizing if the body is a Kleene closure.
            // Loop(R, 0, ∞) ~ R* — but we rely on the sgraph to normalize
            // these, so only catch exact star nodes above.
        }

        return false;
    }

    // -----------------------------------------------------------------------
    // Self-stabilizing propagation through derivatives
    // -----------------------------------------------------------------------

    void seq_regex::propagate_self_stabilizing(euf::snode* parent, euf::snode* deriv) {
        if (!parent || !deriv)
            return;

        // If the derivative is already known to be self-stabilizing (either
        // inherently or from a prior propagation), nothing to do.
        if (is_self_stabilizing(deriv))
            return;

        // If the derivative is itself inherently self-stabilizing
        // (e.g., it is a star or full_seq), mark it now.
        if (compute_self_stabilizing(deriv)) {
            set_self_stabilizing(deriv);
            return;
        }

        // Rule 1: Star parent.
        // D(c, R*) = D(c, R) · R*. The derivative always contains the
        // R* tail, so it is self-stabilizing regardless of D(c,R).
        if (parent->is_star()) {
            set_self_stabilizing(deriv);
            return;
        }

        // Rule 2: Full_seq parent.
        // D(c, Σ*) = Σ*, and Σ* is self-stabilizing.
        // (The derivative should be Σ* itself; mark it for safety.)
        if (parent->is_full_seq()) {
            set_self_stabilizing(deriv);
            return;
        }

        // Check if parent is self-stabilizing (either inherently or marked).
        bool parent_ss = is_self_stabilizing(parent) || compute_self_stabilizing(parent);

        // Rule 3: Concat parent R · S.
        // D(c, R·S) = D(c,R)·S | (nullable(R) ? D(c,S) : ∅).
        // If S is self-stabilizing, the D(c,R)·S branch inherits it.
        // If the whole parent R·S is self-stabilizing, the derivative is too.
        if (parent->is_concat() && parent->num_args() == 2) {
            euf::snode* tail = parent->arg(1);
            bool tail_ss = is_self_stabilizing(tail) || compute_self_stabilizing(tail);
            if (tail_ss || parent_ss) {
                set_self_stabilizing(deriv);
                return;
            }
        }

        // Rule 4: Union parent R | S.
        // D(c, R|S) = D(c,R) | D(c,S).
        // Self-stabilizing if both children are self-stabilizing.
        if (parent->is_union() && parent->num_args() == 2) {
            euf::snode* lhs = parent->arg(0);
            euf::snode* rhs = parent->arg(1);
            bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs);
            bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs);
            if (lhs_ss && rhs_ss) {
                set_self_stabilizing(deriv);
                return;
            }
        }

        // Rule 5: Intersection parent R ∩ S.
        // D(c, R∩S) = D(c,R) ∩ D(c,S).
        // Self-stabilizing if both children are self-stabilizing.
        if (parent->is_intersect() && parent->num_args() == 2) {
            euf::snode* lhs = parent->arg(0);
            euf::snode* rhs = parent->arg(1);
            bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs);
            bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs);
            if (lhs_ss && rhs_ss) {
                set_self_stabilizing(deriv);
                return;
            }
        }

        // Rule 6: Complement parent ~R.
        // D(c, ~R) = ~D(c, R).
        // Preserves self-stabilizing from R.
        if (parent->is_complement() && parent->num_args() == 1) {
            euf::snode* inner = parent->arg(0);
            bool inner_ss = is_self_stabilizing(inner) || compute_self_stabilizing(inner);
            if (inner_ss) {
                set_self_stabilizing(deriv);
                return;
            }
        }

        // Rule 7: Generic self-stabilizing parent.
        // If the parent was explicitly marked self-stabilizing (e.g., via
        // a previous propagation), propagate to the derivative.
        if (parent_ss) {
            set_self_stabilizing(deriv);
            return;
        }
    }

    // -----------------------------------------------------------------------
    // Derivative with propagation
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::derivative_with_propagation(euf::snode* re, euf::snode* elem) {
        if (!re || !elem)
            return nullptr;
        euf::snode* deriv = derivative(re, elem);
        if (deriv)
            propagate_self_stabilizing(re, deriv);
        return deriv;
    }

    // -----------------------------------------------------------------------
    // Uniform derivative (symbolic character consumption)
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::try_uniform_derivative(euf::snode* regex) {
        if (!regex)
            return nullptr;

        // Quick exits: trivial regexes with known uniform derivatives.
        // Σ* (full_seq) has derivative Σ* for every character.
        if (regex->is_full_seq())
            return regex;
        // ∅ (fail) has derivative ∅ for every character — but this means
        // every character is rejected.  Return fail so the caller can
        // detect a conflict.
        if (regex->is_fail())
            return regex;

        // Compute minterms: the character-class partition of the alphabet
        // induced by the regex.
        euf::snode_vector minterms;
        m_sg.compute_minterms(regex, minterms);
        if (minterms.empty())
            return nullptr;

        // Compute the derivative for each non-empty minterm.  If all produce
        // the same result, the derivative is independent of the character
        // value and we can consume a symbolic character deterministically.
        euf::snode* uniform = nullptr;
        for (euf::snode* mt : minterms) {
            if (!mt || mt->is_fail())
                continue;  // empty character class — no character belongs to it
            euf::snode* deriv = m_sg.brzozowski_deriv(regex, mt);
            if (!deriv)
                return nullptr;  // derivative computation failed
            if (!uniform) {
                uniform = deriv;
            } else if (uniform->id() != deriv->id()) {
                return nullptr;  // different derivatives — not uniform
            }
        }
        return uniform;  // may be nullptr if all minterms were fail/empty
    }

    // -----------------------------------------------------------------------
    // Ground prefix consumption
    // -----------------------------------------------------------------------

    bool seq_regex::is_empty_regex(euf::snode* re) const {
        if (!re)
            return false;
        // direct empty language constant
        if (re->is_fail())
            return true;
        // kinds that are never empty
        if (re->is_star() || re->is_to_re() ||
            re->is_full_char() || re->is_full_seq())
            return false;
        // loop with lo == 0 accepts ε
        if (re->is_loop() && re->is_nullable())
            return false;

        seq_util& seq = m_sg.get_seq_util();
        expr* e = re->get_expr();
        if (!e)
            return false;

        expr* r1, * r2;
        // union is empty iff both children are empty
        if (seq.re.is_union(e, r1, r2)) {
            SASSERT(re->num_args() == 2);
            return is_empty_regex(re->arg(0)) && is_empty_regex(re->arg(1));
        }
        // regex concat is empty if either child is empty
        if (seq.re.is_concat(e, r1, r2)) {
            SASSERT(re->num_args() == 2);
            return is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1));
        }
        // intersection is empty if either child is empty
        if (seq.re.is_intersection(e, r1, r2)) {
            SASSERT(re->num_args() == 2);
            if (is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1)))
                return true;
        }
        // complement of full_seq is empty
        if (re->is_complement() && re->num_args() == 1 && re->arg(0)->is_full_seq())
            return true;
        // loop(empty, lo, _) with lo > 0 is empty
        if (re->is_loop() && re->num_args() >= 1 && is_empty_regex(re->arg(0)))
            return !re->is_nullable(); // empty if not nullable (i.e., lo > 0)

        return false;
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check — helper: collect character boundaries
    // -----------------------------------------------------------------------

    void seq_regex::collect_char_boundaries(euf::snode* re, unsigned_vector& bounds) const {
        if (!re || !re->get_expr())
            return;

        seq_util& seq = m_sg.get_seq_util();
        expr* e = re->get_expr();

        // Range predicate re.range(lo, hi): boundary at lo and hi+1
        // Range arguments are string expressions (e.g., str.unit(ch))
        expr* lo_expr = nullptr;
        expr* hi_expr = nullptr;
        if (seq.re.is_range(e, lo_expr, hi_expr)) {
            zstring s_lo, s_hi;
            if (lo_expr && seq.str.is_string(lo_expr, s_lo) && s_lo.length() == 1)
                bounds.push_back(s_lo[0]);
            if (hi_expr && seq.str.is_string(hi_expr, s_hi) && s_hi.length() == 1 && s_hi[0] < zstring::max_char())
                bounds.push_back(s_hi[0] + 1);
            return;
        }

        // to_re(s): boundary at first character and first+1
        expr* body = nullptr;
        if (seq.re.is_to_re(e, body)) {
            zstring s;
            if (seq.str.is_string(body, s) && s.length() > 0) {
                unsigned first_ch = s[0];
                bounds.push_back(first_ch);
                if (first_ch < zstring::max_char())
                    bounds.push_back(first_ch + 1);
            }
            return;
        }

        // Leaf nodes with no character discrimination
        if (re->is_fail() || re->is_full_char() || re->is_full_seq())
            return;

        // Recurse into children (handles union, concat, star, loop, etc.)
        for (unsigned i = 0; i < re->num_args(); ++i)
            collect_char_boundaries(re->arg(i), bounds);
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check — helper: alphabet representatives
    // -----------------------------------------------------------------------

    void seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) {
        unsigned_vector bounds;
        bounds.push_back(0); // always include character 0
        collect_char_boundaries(re, bounds);

        // Sort and deduplicate
        std::sort(bounds.begin(), bounds.end());
        unsigned prev = UINT_MAX;
        for (unsigned b : bounds) {
            if (b != prev) {
                reps.push_back(m_sg.mk_char(b));
                prev = b;
            }
        }
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check
    // -----------------------------------------------------------------------

    lbool seq_regex::is_empty_bfs(euf::snode* re, unsigned max_states) {
        if (!re || !re->get_expr())
            return l_undef;
        if (re->is_fail())
            return l_true;
        if (re->is_nullable())
            return l_false;
        // Structural quick checks for kinds that are never empty
        if (re->is_star() || re->is_full_char() || re->is_full_seq() || re->is_to_re())
            return l_false;
        // Structural emptiness catches simple cases
        if (is_empty_regex(re))
            return l_true;
        // Only handle ground regexes; non-ground can't be fully explored
        if (!re->is_ground())
            return l_undef;

        // BFS over the Brzozowski derivative automaton.
        // Each state is a derivative regex snode identified by its id.
        // We explore states by computing derivatives for representative
        // characters from the alphabet partition.
        uint_set visited;
        euf::snode_vector worklist;
        worklist.push_back(re);
        visited.insert(re->id());

        unsigned states_explored = 0;
        bool had_failed_deriv = false;

        while (!worklist.empty()) {
            if (states_explored >= max_states)
                return l_undef;

            euf::snode* current = worklist.back();
            worklist.pop_back();
            ++states_explored;

            // Compute representative characters for current state's
            // alphabet partition. Each representative is a concrete
            // character snode whose equivalence class has identical
            // derivative behavior.
            euf::snode_vector reps;
            get_alphabet_representatives(current, reps);

            if (reps.empty()) {
                // No representatives means no character predicates;
                // use a default character to explore the single partition.
                reps.push_back(m_sg.mk_char('a'));
            }

            for (euf::snode* ch : reps) {
                euf::snode* deriv = m_sg.brzozowski_deriv(current, ch);
                if (!deriv) {
                    // Derivative computation failed for this character.
                    // Track the failure but continue with other characters.
                    had_failed_deriv = true;
                    continue;
                }
                if (deriv->is_nullable())
                    return l_false; // found an accepting state
                if (deriv->is_fail())
                    continue; // dead-end, no need to explore further
                if (is_empty_regex(deriv))
                    continue; // structurally empty subtree
                if (!visited.contains(deriv->id())) {
                    visited.insert(deriv->id());
                    worklist.push_back(deriv);
                }
            }
        }

        // Exhausted all reachable states without finding a nullable one.
        // If we had any failed derivative computations, the result is
        // inconclusive since we may have missed reachable states.
        if (had_failed_deriv)
            return l_undef;

        return l_true;
    }

    // -----------------------------------------------------------------------
    // Multi-regex intersection emptiness check
    // BFS over the product of Brzozowski derivative automata.
    // Mirrors ZIPT NielsenNode.CheckEmptiness (NielsenNode.cs:1429-1469)
    // -----------------------------------------------------------------------

    lbool seq_regex::check_intersection_emptiness(ptr_vector<euf::snode> const& regexes,
                                                    unsigned max_states) {
        if (regexes.empty())
            return l_false; // empty intersection = full language (vacuously non-empty)

        // Quick checks: if any regex is fail/empty, intersection is empty
        for (euf::snode* re : regexes) {
            if (!re || !re->get_expr())
                return l_undef;
            if (re->is_fail() || is_empty_regex(re))
                return l_true;
        }

        // Check if all are nullable (intersection accepts ε)
        bool all_nullable = true;
        for (euf::snode* re : regexes) {
            if (!re->is_nullable()) { all_nullable = false; break; }
        }
        if (all_nullable)
            return l_false;

        // Single regex: delegate to is_empty_bfs
        if (regexes.size() == 1)
            return is_empty_bfs(regexes[0], max_states);

        // Build product BFS. State = tuple of regex snode ids.
        // Use a map from state hash to visited set.
        using state_t = svector<unsigned>;

        auto state_hash = [](state_t const& s) -> unsigned {
            unsigned h = 0;
            for (unsigned id : s)
                h = h * 31 + id;
            return h;
        };

        auto state_eq = [](state_t const& a, state_t const& b) -> bool {
            if (a.size() != b.size()) return false;
            for (unsigned i = 0; i < a.size(); ++i)
                if (a[i] != b[i]) return false;
            return true;
        };

        // Use simple set via sorted vector of hashes (good enough for bounded BFS)
        std::unordered_set<unsigned> visited_hashes;

        struct bfs_state {
            ptr_vector<euf::snode> regexes;
        };

        std::vector<bfs_state> worklist;
        bfs_state initial;
        initial.regexes.append(regexes);
        worklist.push_back(std::move(initial));

        state_t init_ids;
        for (euf::snode* re : regexes)
            init_ids.push_back(re->id());
        visited_hashes.insert(state_hash(init_ids));

        unsigned states_explored = 0;
        bool had_failed = false;

        // Collect alphabet representatives from the intersection of all regexes
        // (merge boundaries from all)
        unsigned_vector all_bounds;
        all_bounds.push_back(0);
        for (euf::snode* re : regexes)
            collect_char_boundaries(re, all_bounds);
        std::sort(all_bounds.begin(), all_bounds.end());

        euf::snode_vector reps;
        unsigned prev = UINT_MAX;
        for (unsigned b : all_bounds) {
            if (b != prev) {
                reps.push_back(m_sg.mk_char(b));
                prev = b;
            }
        }
        if (reps.empty())
            reps.push_back(m_sg.mk_char('a'));

        while (!worklist.empty()) {
            if (states_explored >= max_states)
                return l_undef;

            bfs_state current = std::move(worklist.back());
            worklist.pop_back();
            ++states_explored;

            for (euf::snode* ch : reps) {
                ptr_vector<euf::snode> derivs;
                bool any_fail = false;
                bool all_null = true;
                bool deriv_failed = false;

                for (euf::snode* re : current.regexes) {
                    euf::snode* d = m_sg.brzozowski_deriv(re, ch);
                    if (!d) { deriv_failed = true; break; }
                    if (d->is_fail()) { any_fail = true; break; }
                    if (!d->is_nullable()) all_null = false;
                    derivs.push_back(d);
                }

                if (deriv_failed) { had_failed = true; continue; }
                if (any_fail) continue; // this character leads to empty intersection

                if (all_null)
                    return l_false; // found an accepting state in the product

                // Check if any component is structurally empty
                bool any_empty = false;
                for (euf::snode* d : derivs) {
                    if (is_empty_regex(d)) { any_empty = true; break; }
                }
                if (any_empty) continue;

                // Compute state hash and check visited
                state_t ids;
                for (euf::snode* d : derivs)
                    ids.push_back(d->id());
                unsigned h = state_hash(ids);
                if (visited_hashes.count(h) == 0) {
                    visited_hashes.insert(h);
                    bfs_state next;
                    next.regexes.append(derivs);
                    worklist.push_back(std::move(next));
                }
            }
        }

        if (had_failed)
            return l_undef;
        return l_true; // exhausted all states, intersection is empty
    }

    // -----------------------------------------------------------------------
    // Language subset check: L(A) ⊆ L(B)
    // via intersection(A, complement(B)) = ∅
    // Mirrors ZIPT NielsenNode.IsLanguageSubset (NielsenNode.cs:1382-1385)
    // -----------------------------------------------------------------------

    lbool seq_regex::is_language_subset(euf::snode* subset_re, euf::snode* superset_re) {
        if (!subset_re || !superset_re)
            return l_undef;

        // Quick checks
        if (subset_re->is_fail() || is_empty_regex(subset_re))
            return l_true;  // ∅ ⊆ anything
        if (superset_re->is_full_seq())
            return l_true;  // anything ⊆ Σ*
        if (subset_re == superset_re)
            return l_true;  // L ⊆ L

        // Build complement(superset)
        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();
        expr* sup_expr = superset_re->get_expr();
        if (!sup_expr)
            return l_undef;
        expr_ref comp(seq.re.mk_complement(sup_expr), mgr);
        euf::snode* comp_sn = m_sg.mk(comp);
        if (!comp_sn)
            return l_undef;

        // Build intersection and check emptiness
        // subset ∩ complement(superset) should be empty for subset relation
        expr* sub_expr = subset_re->get_expr();
        if (!sub_expr)
            return l_undef;
        expr_ref inter(seq.re.mk_inter(sub_expr, comp.get()), mgr);
        euf::snode* inter_sn = m_sg.mk(inter);
        if (!inter_sn)
            return l_undef;

        return is_empty_bfs(inter_sn);
    }

    // -----------------------------------------------------------------------
    // Collect primitive regex intersection for a variable
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::collect_primitive_regex_intersection(
            euf::snode* var, seq::nielsen_node const& node) {
        if (!var)
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();
        euf::snode* result = nullptr;

        for (auto const& mem : node.str_mems()) {
            if (!mem.m_str || !mem.m_regex)
                continue;
            // Primitive constraint: str is a single variable
            if (!mem.is_primitive())
                continue;
            euf::snode* first = mem.m_str->first();
            if (!first || first != var)
                continue;

            if (!result) {
                result = mem.m_regex;
            } else {
                expr* r1 = result->get_expr();
                expr* r2 = mem.m_regex->get_expr();
                if (r1 && r2) {
                    expr_ref inter(seq.re.mk_inter(r1, r2), mgr);
                    result = m_sg.mk(inter);
                }
            }
        }
        return result;
    }

    // -----------------------------------------------------------------------
    // Cycle detection
    // -----------------------------------------------------------------------

    bool seq_regex::detect_cycle(seq::str_mem const& mem) const {
        return extract_cycle(mem) != nullptr;
    }

    // -----------------------------------------------------------------------
    // Ground prefix consumption
    // -----------------------------------------------------------------------

    seq_regex::simplify_status seq_regex::simplify_ground_prefix(seq::str_mem& mem) {
        if (!mem.m_str || !mem.m_regex)
            return simplify_status::ok;

        while (mem.m_str && !mem.m_str->is_empty()) {
            euf::snode* first = mem.m_str->first();
            if (!first || !first->is_char())
                break;
            euf::snode* parent_re = mem.m_regex;
            euf::snode* deriv = m_sg.brzozowski_deriv(parent_re, first);
            if (!deriv)
                break;
            if (deriv->is_fail())
                return simplify_status::conflict;
            // propagate self-stabilizing flag from parent to derivative
            propagate_self_stabilizing(parent_re, deriv);
            mem.m_str = m_sg.drop_first(mem.m_str);
            mem.m_regex = deriv;
        }

        // check final state
        if (mem.m_str && mem.m_str->is_empty()) {
            if (mem.m_regex->is_nullable())
                return simplify_status::satisfied;
            return simplify_status::conflict;
        }

        return simplify_status::ok;
    }

    // -----------------------------------------------------------------------
    // Ground suffix consumption (best-effort)
    // -----------------------------------------------------------------------

    seq_regex::simplify_status seq_regex::simplify_ground_suffix(seq::str_mem& mem) {
        // Suffix consumption via reverse derivatives is complex.
        // For now, only handle the case where the entire string is ground:
        // consume all characters from the front (which covers trailing chars
        // when the string is fully ground).
        if (!mem.m_str || !mem.m_regex)
            return simplify_status::ok;
        if (!mem.m_str->is_ground())
            return simplify_status::ok;

        // If the string is ground, simplify_ground_prefix handles everything.
        return simplify_ground_prefix(mem);
    }

    // -----------------------------------------------------------------------
    // Trivial checks
    // -----------------------------------------------------------------------

    int seq_regex::check_trivial(seq::str_mem const& mem) const {
        if (!mem.m_str || !mem.m_regex)
            return 0;
        // regex is ∅ => always conflict
        if (is_empty_regex(mem.m_regex))
            return -1;
        // regex is Σ* => always satisfied
        if (is_full_regex(mem.m_regex))
            return 1;
        // empty string checks
        if (mem.m_str->is_empty()) {
            if (mem.m_regex->is_nullable())
                return 1;
            return -1;
        }
        return 0;
    }

    // -----------------------------------------------------------------------
    // Minterm computation with filtering
    // -----------------------------------------------------------------------

    void seq_regex::get_minterms(euf::snode* regex, euf::snode_vector& minterms) {
        if (!regex)
            return;

        // compute raw minterms from the regex predicates
        euf::snode_vector raw;
        m_sg.compute_minterms(regex, raw);

        // filter: keep only minterms that are non-fail (non-empty character class).
        // note: minterms are regex character-class expressions, not concrete
        // characters, so we cannot compute Brzozowski derivatives with them.
        // callers should compute derivatives using concrete or fresh chars.
        for (euf::snode* mt : raw) {
            if (!mt || mt->is_fail())
                continue;
            minterms.push_back(mt);
        }
    }

    // -----------------------------------------------------------------------
    // Collect first characters
    // -----------------------------------------------------------------------

    void seq_regex::collect_first_chars(euf::snode* re, euf::snode_vector& chars) {
        if (!re)
            return;

        // to_re(s): extract first character of the string body
        if (re->is_to_re()) {
            euf::snode* body = re->arg(0);
            if (body && !body->is_empty()) {
                euf::snode* first = body->first();
                if (first && first->is_char()) {
                    bool dup = false;
                    for (euf::snode* c : chars)
                        if (c == first) { dup = true; break; }
                    if (!dup)
                        chars.push_back(first);
                }
                // Handle string literals (classified as s_other in sgraph)
                else if (first && first->get_expr()) {
                    seq_util& seq = m_sg.get_seq_util();
                    zstring s;
                    if (seq.str.is_string(first->get_expr(), s) && s.length() > 0) {
                        euf::snode* ch = m_sg.mk_char(s[0]);
                        bool dup = false;
                        for (euf::snode* c : chars)
                            if (c == ch) { dup = true; break; }
                        if (!dup)
                            chars.push_back(ch);
                    }
                }
            }
            return;
        }

        // leaf cases: produce representative characters for character classes
        if (re->is_full_char()) {
            // full character set (.): use 'a' as representative
            euf::snode* ch = m_sg.mk_char('a');
            bool dup = false;
            for (euf::snode* c : chars)
                if (c == ch) { dup = true; break; }
            if (!dup)
                chars.push_back(ch);
            return;
        }

        // re.range(lo, hi): use lo as representative
        if (re->get_expr()) {
            seq_util& seq = m_sg.get_seq_util();
            expr* lo = nullptr, *hi = nullptr;
            if (seq.re.is_range(re->get_expr(), lo, hi) && lo) {
                zstring s;
                unsigned ch_val = 'a';
                if (seq.is_const_char(lo, ch_val)) {
                    euf::snode* ch = m_sg.mk_char(ch_val);
                    bool dup = false;
                    for (euf::snode* c : chars)
                        if (c == ch) { dup = true; break; }
                    if (!dup)
                        chars.push_back(ch);
                }
                return;
            }
        }

        if (re->is_fail() || re->is_full_seq())
            return;

        // recurse into children (handles union, concat, star, loop, etc.)
        for (unsigned i = 0; i < re->num_args(); ++i)
            collect_first_chars(re->arg(i), chars);
    }

    // -----------------------------------------------------------------------
    // Membership processing
    // -----------------------------------------------------------------------

    bool seq_regex::process_str_mem(seq::str_mem const& mem,
                                     vector<seq::str_mem>& out_mems) {
        if (!mem.m_str || !mem.m_regex)
            return true;
        // empty string: check nullable
        if (mem.m_str->is_empty())
            return mem.m_regex->is_nullable();

        // consume ground prefix: derive regex by each leading concrete char
        seq::str_mem working = mem;
        simplify_status st = simplify_ground_prefix(working);
        if (st == simplify_status::conflict)
            return false;
        if (st == simplify_status::satisfied)
            return true;

        // after ground prefix consumption, if the front is still a concrete
        // character we can take one more step (shouldn't happen after
        // simplify_ground_prefix, but guard defensively)
        euf::snode* first = working.m_str->first();
        if (first && first->is_char()) {
            seq::str_mem derived = derive(working, first);
            if (is_empty_regex(derived.m_regex))
                return false;
            out_mems.push_back(derived);
            return true;
        }

        // string starts with a non-ground element (variable or unit):
        // return the simplified constraint for the Nielsen graph to expand
        // via character-split modifiers.
        out_mems.push_back(working);
        return true;
    }

    // -----------------------------------------------------------------------
    // History recording
    // -----------------------------------------------------------------------

    seq::str_mem seq_regex::record_history(seq::str_mem const& mem, euf::snode* history_re) {
        // Build a history chain by prepending the new regex entry to the
        // existing history. Uses regex-concat as a cons cell:
        //   new_history = re.concat(history_re, old_history)
        // where arg(0) is the latest entry and arg(1) is the tail.
        // If old_history is nullptr, the new entry becomes the terminal leaf.
        euf::snode* new_history = history_re;
        if (mem.m_history && history_re) {
            expr* re_expr = history_re->get_expr();
            expr* old_expr = mem.m_history->get_expr();
            if (re_expr && old_expr) {
                seq_util& seq = m_sg.get_seq_util();
                expr_ref chain(seq.re.mk_concat(re_expr, old_expr), m_sg.get_manager());
                new_history = m_sg.mk(chain);
            }
        }
        return seq::str_mem(mem.m_str, mem.m_regex, new_history, mem.m_id, mem.m_dep);
    }

    // -----------------------------------------------------------------------
    // Cycle detection
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::extract_cycle(seq::str_mem const& mem) const {
        // Walk the history chain looking for a repeated regex.
        // A cycle exists when the current regex matches a regex in the history.
        if (!mem.m_regex || !mem.m_history)
            return nullptr;

        euf::snode* current = mem.m_regex;
        euf::snode* hist = mem.m_history;

        // Walk the history chain up to a bounded depth.
        // The history is structured as a chain of regex snapshots connected
        // via the sgraph's regex-concat: each level's arg(0) is a snapshot
        // and arg(1) is the tail. A leaf (non-concat) is a terminal entry.
        unsigned bound = 1000;
        while (hist && bound-- > 0) {
            euf::snode* entry = hist;
            euf::snode* tail = nullptr;

            // If the history node is a regex concat, decompose it:
            // arg(0) is the regex snapshot, arg(1) is the rest of the chain
            seq_util& seq = m_sg.get_seq_util();
            if (hist->is_concat() && hist->get_expr() &&
                seq.re.is_concat(hist->get_expr())) {
                entry = hist->arg(0);
                tail = hist->arg(1);
            }

            // Check pointer equality (fast, covers normalized regexes)
            if (entry == current)
                return entry;
            // Check expression-level equality as fallback
            if (entry->get_expr() && current->get_expr() &&
                entry->get_expr() == current->get_expr())
                return entry;

            hist = tail;
        }
        return nullptr;
    }

    // -----------------------------------------------------------------------
    // Stabilizer from cycle
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::stabilizer_from_cycle(euf::snode* cycle_regex,
                                                   euf::snode* current_regex) {
        if (!cycle_regex || !current_regex)
            return nullptr;

        expr* re_expr = cycle_regex->get_expr();
        if (!re_expr)
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        expr_ref star_expr(seq.re.mk_star(re_expr), m_sg.get_manager());
        return m_sg.mk(star_expr);
    }

    // -----------------------------------------------------------------------
    // Extract cycle history tokens
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::extract_cycle_history(seq::str_mem const& current,
                                                   seq::str_mem const& ancestor) {
        // The history is built by simplify_and_init as a left-associative
        // string concat chain: concat(concat(concat(nil, c1), c2), c3).
        // Extract the tokens consumed since the ancestor.
        if (!current.m_history)
            return nullptr;

        unsigned cur_len = current.m_history->length();
        unsigned anc_len = ancestor.m_history ? ancestor.m_history->length() : 0;

        if (cur_len <= anc_len)
            return nullptr;

        if (anc_len == 0)
            return current.m_history;

        return m_sg.drop_left(current.m_history, anc_len);
    }

    // -----------------------------------------------------------------------
    // Get filtered stabilizer star
    // Mirrors ZIPT StrMem.GetFilteredStabilizerStar (StrMem.cs:228-243)
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::get_filtered_stabilizer_star(euf::snode* re,
                                                          euf::snode* excluded_char) {
        if (!re)
            return nullptr;

        ptr_vector<euf::snode> const* stabs = get_stabilizers(re);
        if (!stabs || stabs->empty())
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& m = m_sg.get_manager();
        euf::snode* filtered_union = nullptr;

        for (euf::snode* s : *stabs) {
            if (!s)
                continue;
            // Keep only stabilizers whose language cannot start with excluded_char
            euf::snode* d = m_sg.brzozowski_deriv(s, excluded_char);
            if (d && d->is_fail()) {
                if (!filtered_union) {
                    filtered_union = s;
                } else {
                    expr* e1 = filtered_union->get_expr();
                    expr* e2 = s->get_expr();
                    if (e1 && e2) {
                        expr_ref u(seq.re.mk_union(e1, e2), m);
                        filtered_union = m_sg.mk(u);
                    }
                }
            }
        }

        if (!filtered_union)
            return nullptr;

        expr* fe = filtered_union->get_expr();
        if (!fe)
            return nullptr;
        expr_ref star_expr(seq.re.mk_star(fe), m);
        return m_sg.mk(star_expr);
    }

    // -----------------------------------------------------------------------
    // Strengthened stabilizer construction with sub-cycle detection
    // Mirrors ZIPT StrMem.StabilizerFromCycle (StrMem.cs:163-225)
    // -----------------------------------------------------------------------

    euf::snode* seq_regex::strengthened_stabilizer(euf::snode* cycle_regex,
                                                     euf::snode* cycle_history) {
        if (!cycle_regex || !cycle_history)
            return nullptr;

        // Flatten the history concat chain into a vector of character tokens.
        euf::snode_vector tokens;
        cycle_history->collect_tokens(tokens);

        if (tokens.empty())
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& m = m_sg.get_manager();

        // Replay tokens on the cycle regex, detecting sub-cycles.
        // A sub-cycle is detected when the derivative returns to cycle_regex.
        svector<std::pair<unsigned, unsigned>> sub_cycles;
        unsigned cycle_start = 0;
        euf::snode* current_re = cycle_regex;

        for (unsigned i = 0; i < tokens.size(); ++i) {
            euf::snode* tok = tokens[i];
            if (!tok)
                return nullptr;

            euf::snode* deriv = m_sg.brzozowski_deriv(current_re, tok);
            if (!deriv)
                return nullptr;

            // Sub-cycle: derivative returned to the cycle entry regex
            if (deriv == cycle_regex ||
                (deriv->get_expr() && cycle_regex->get_expr() &&
                 deriv->get_expr() == cycle_regex->get_expr())) {
                sub_cycles.push_back(std::make_pair(cycle_start, i + 1));
                cycle_start = i + 1;
                current_re = cycle_regex;
            } else {
                current_re = deriv;
            }
        }

        // Remaining tokens that don't complete a sub-cycle
        if (cycle_start < tokens.size())
            sub_cycles.push_back(std::make_pair(cycle_start, tokens.size()));

        if (sub_cycles.empty())
            return nullptr;

        // Build a stabilizer body for each sub-cycle.
        // body = to_re(t0) · [filteredStar(R1, t1)] · to_re(t1) · ... · to_re(t_{n-1})
        euf::snode* overall_union = nullptr;

        for (auto const& sc : sub_cycles) {
            unsigned start = sc.first;
            unsigned end = sc.second;
            if (start >= end)
                continue;

            euf::snode* re_state = cycle_regex;
            euf::snode* body = nullptr;

            for (unsigned i = start; i < end; ++i) {
                euf::snode* tok = tokens[i];
                if (!tok)
                    break;

                // Insert filtered stabilizer star before each token after the first
                if (i > start) {
                    euf::snode* filtered = get_filtered_stabilizer_star(re_state, tok);
                    if (filtered) {
                        expr* fe = filtered->get_expr();
                        if (fe) {
                            if (!body) {
                                body = filtered;
                            } else {
                                expr* be = body->get_expr();
                                if (be) {
                                    expr_ref cat(seq.re.mk_concat(be, fe), m);
                                    body = m_sg.mk(cat);
                                }
                            }
                        }
                    }
                }

                // Convert char token to regex: to_re(unit(tok))
                expr* tok_expr = tok->get_expr();
                if (!tok_expr)
                    break;

                expr_ref unit_str(seq.str.mk_unit(tok_expr), m);
                expr_ref tok_re(seq.re.mk_to_re(unit_str), m);
                euf::snode* tok_re_sn = m_sg.mk(tok_re);

                if (!body) {
                    body = tok_re_sn;
                } else {
                    expr* be = body->get_expr();
                    expr* te = tok_re_sn->get_expr();
                    if (be && te) {
                        expr_ref cat(seq.re.mk_concat(be, te), m);
                        body = m_sg.mk(cat);
                    }
                }

                // Advance the regex state
                euf::snode* deriv = m_sg.brzozowski_deriv(re_state, tok);
                if (!deriv)
                    break;
                re_state = deriv;
            }

            if (!body)
                continue;

            if (!overall_union) {
                overall_union = body;
            } else {
                expr* oe = overall_union->get_expr();
                expr* be = body->get_expr();
                if (oe && be) {
                    expr_ref u(seq.re.mk_union(oe, be), m);
                    overall_union = m_sg.mk(u);
                }
            }
        }

        return overall_union;
    }

    // -----------------------------------------------------------------------
    // Stabilizer-based subsumption (enhanced)
    // Mirrors ZIPT StrMem.TrySubsume (StrMem.cs:354-386)
    // -----------------------------------------------------------------------

    bool seq_regex::try_subsume(seq::str_mem const& mem, seq::nielsen_node const& node) {
        if (!mem.m_str || !mem.m_regex)
            return false;

        // 1. Leading token must be a variable
        euf::snode* first = mem.m_str->first();
        if (!first || !first->is_var())
            return false;

        // 2. Must have stabilizers for the regex
        if (!has_stabilizers(mem.m_regex))
            return false;

        // 3. Build stabStar = star(union(all stabilizers for this regex))
        euf::snode* stab_union = get_stabilizer_union(mem.m_regex);
        if (!stab_union)
            return false;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();
        expr* su_expr = stab_union->get_expr();
        if (!su_expr)
            return false;
        expr_ref stab_star(seq.re.mk_star(su_expr), mgr);
        euf::snode* stab_star_sn = m_sg.mk(stab_star);
        if (!stab_star_sn)
            return false;

        // 4. Collect all primitive regex constraints on variable `first`
        euf::snode* x_range = collect_primitive_regex_intersection(first, node);
        if (!x_range)
            return false;

        // 5. Check L(x_range) ⊆ L(stab_star)
        lbool result = is_language_subset(x_range, stab_star_sn);
        return result == l_true;
    }

}