z3/src/smt/seq/seq_regex.cpp

/*++
Copyright (c) 2026 Microsoft Corporation

Module Name:

    seq_regex.cpp

Abstract:

    Lazy regex membership processing for the Nielsen-based string solver.

Author:

    Clemens Eisenhofer 2026-03-01
    Nikolaj Bjorner (nbjorner) 2026-03-01

--*/
#include "smt/seq/seq_regex.h"

namespace seq {

    // NSB code review: change the stabilizer store so that
    //  - the regexes in the domain of m_stabilizers are added to a trail
    //    (expr_ref_vector),
    //  - the range is a vector of expressions, not snodes,
    //  - the regexes in the range of m_stabilizers are added to the trail as well.
    // This is to ensure that the expressions remain valid after scope changes.
    // Maybe all regexes entered are created at base level for quantifier-free
    // formulas, but we should not assume this. The sgraph can also change based
    // on scope, whereas the stabilizer data structure persists across search.

    // Collect possible first characters of a syntactically known *string*
    // expression (the body of to_re). Regex operators (union, complement,
    // intersection, ...) are not expected here.
    void collect_possible_first_chars(seq_util& seq, euf::sgraph const& sg, expr* str,
                                      unsigned_vector& bounds, bool& may_be_empty) {
        // For a leading character c of `str`, the boundaries c and c + 1 are
        // appended to `bounds` (c + 1 is skipped when c is zstring::max_char()).
        // `may_be_empty` is reset to false on entry and becomes true only for
        // the empty string literal, or for a concatenation whose first part
        // may be empty and whose second part may be empty as well.
        may_be_empty = false;
        VERIFY(str);
        sort* re_sort = nullptr;
        VERIFY(!seq.is_re(str, re_sort));

        // Shared by the single-character and the string-literal case.
        auto add_char_interval = [&bounds](unsigned c) {
            bounds.push_back(c);
            if (c < zstring::max_char())
                bounds.push_back(c + 1);
        };

        // Case 1: the expression decodes to a single character.
        unsigned single = 0;
        if (sg.decode_re_char(str, single)) {
            add_char_interval(single);
            return;
        }

        // Case 2: string literal; only its first character matters.
        zstring literal;
        if (seq.str.is_string(str, literal)) {
            if (literal.length() == 0)
                may_be_empty = true;
            else
                add_char_interval(literal[0]);
            return;
        }

        // Case 3: concatenation. Anything else is not expected here.
        expr* head = nullptr;
        expr* tail = nullptr;
        if (!seq.str.is_concat(str, head, tail)) {
            UNREACHABLE();
            return;
        }

        bool head_may_be_empty = false;
        collect_possible_first_chars(seq, sg, head, bounds, head_may_be_empty);
        if (!head_may_be_empty)
            return;

        // The head can vanish, so the first character may come from the tail.
        bool tail_may_be_empty = false;
        collect_possible_first_chars(seq, sg, tail, bounds, tail_may_be_empty);
        may_be_empty = tail_may_be_empty;
    }


    // -----------------------------------------------------------------------
    // Stabilizer store
    // -----------------------------------------------------------------------

    // Clears both the self-stabilizing marks and the per-regex stabilizer lists.
    void seq_regex::reset_stabilizers() {
        m_self_stabilizing.reset();
        m_stabilizers.reset();
    }

    // Records `stabilizer` in the list stored under the id of `regex`.
    // Null arguments are ignored; a stabilizer that is already listed
    // (same pointer) is not added a second time.
    void seq_regex::add_stabilizer(euf::snode* regex, euf::snode* stabilizer) {
        if (regex == nullptr || stabilizer == nullptr)
            return;

        auto& known = m_stabilizers.insert_if_not_there(regex->id(), ptr_vector<euf::snode>());

        // Pointer-equality de-duplication (mirrors ZIPT
        // Environment.AddStabilizer, which checks reference equality).
        bool already_present = false;
        for (unsigned i = 0; i < known.size() && !already_present; ++i)
            already_present = (known[i] == stabilizer);

        if (!already_present)
            known.push_back(stabilizer);
    }

    // Returns a regex snode for the union of all stabilizers recorded for
    // `regex`:
    //   - nullptr if `regex` is null or has no recorded stabilizers,
    //   - the stabilizer itself if exactly one is recorded,
    //   - otherwise the right-nested chain
    //       union(s1, union(s2, ... union(sN-1, sN)...)).
    // nullptr is also returned when an intermediate node has no expression
    // or when the sgraph does not produce a node for a union.
    euf::snode* seq_regex::get_stabilizer_union(euf::snode* regex) {
        if (!regex)
            return nullptr;

        if (!m_stabilizers.contains(regex->id()))
            return nullptr;

        auto& stabs = m_stabilizers[regex->id()];
        if (stabs.empty())
            return nullptr;

        // Single stabilizer: return it directly.
        if (stabs.size() == 1)
            return stabs[0];

        // Multiple stabilizers: fold from the last element towards the first.
        seq_util& seq = m_sg.get_seq_util();
        euf::snode* result = stabs[stabs.size() - 1];
        for (unsigned i = stabs.size() - 1; i-- > 0; ) {
            expr* lhs = stabs[i]->get_expr();
            expr* rhs = result->get_expr();
            if (!lhs || !rhs)
                return nullptr;
            expr_ref un(seq.re.mk_union(lhs, rhs), m_sg.get_manager());
            result = m_sg.mk(un);
            // `result` is dereferenced in the next iteration; bail out on a
            // null node, as the other m_sg.mk call sites in this file do.
            if (!result)
                return nullptr;
        }
        return result;
    }

    // True iff a non-empty stabilizer list is stored for `regex`.
    // A null `regex` has no stabilizers.
    bool seq_regex::has_stabilizers(euf::snode* regex) const {
        if (regex == nullptr)
            return false;
        unsigned const id = regex->id();
        return m_stabilizers.contains(id) && !m_stabilizers[id].empty();
    }

    // Returns a pointer to the stabilizer list stored for `regex`, or nullptr
    // when `regex` is null or no list is stored under its id.
    ptr_vector<euf::snode> const* seq_regex::get_stabilizers(euf::snode* regex) const {
        if (regex == nullptr)
            return nullptr;
        unsigned const id = regex->id();
        return m_stabilizers.contains(id) ? &m_stabilizers[id] : nullptr;
    }

    // Marks `regex` (by id) as self-stabilizing; a null `regex` is ignored.
    void seq_regex::set_self_stabilizing(euf::snode* regex) {
        if (regex != nullptr)
            m_self_stabilizing.insert(regex->id());
    }

    // True iff `regex` is non-null and its id has been marked via
    // set_self_stabilizing.
    bool seq_regex::is_self_stabilizing(euf::snode* regex) const {
        return regex != nullptr && m_self_stabilizing.contains(regex->id());
    }

    // -----------------------------------------------------------------------
    // Self-stabilizing auto-detection
    // -----------------------------------------------------------------------

    // Structural test for regexes that are self-stabilizing by their shape
    // alone. Returns false for a null `regex` and for every shape that is
    // not listed below; it does not consult the marks stored in
    // m_self_stabilizing.
    bool seq_regex::compute_self_stabilizing(euf::snode* regex) const {
        if (!regex)
            return false;

        // R* is always self-stabilizing: D(c, R*) = D(c,R) · R*,
        // so R* appears as the tail of every derivative and acts as
        // its own stabilizer.
        if (regex->is_star())
            return true;

        // Σ* (full_seq, i.e., re.all / .*) is self-stabilizing:
        // D(c, Σ*) = Σ* for every character c.
        if (regex->is_full_seq())
            return true;

        // ∅ (fail / empty language) is trivially self-stabilizing:
        // it has no live derivatives, so the flag is vacuously true.
        if (regex->is_fail())
            return true;

        // Complement of full_seq is ∅ (complement of Σ*), which is
        // also trivially self-stabilizing.
        if (regex->is_complement() && regex->num_args() == 1 &&
            regex->arg(0)->is_full_seq())
            return true;

        // Loops are intentionally NOT recognized here, not even
        // Loop(R, 0, ∞) ~ R*: the sgraph is relied upon to normalize such
        // loops, so only exact star nodes are caught above.
        return false;
    }

    // -----------------------------------------------------------------------
    // Self-stabilizing propagation through derivatives
    // -----------------------------------------------------------------------

    // Decides whether `deriv`, a derivative of `parent`, should carry the
    // self-stabilizing mark, and sets the mark if so. Nothing happens when
    // either argument is null or `deriv` is already marked.
    void seq_regex::propagate_self_stabilizing(euf::snode* parent, euf::snode* deriv) {
        if (!parent || !deriv)
            return;
        if (is_self_stabilizing(deriv))
            return;

        // A node counts as self-stabilizing when it is either explicitly
        // marked or structurally recognized.
        auto known_ss = [this](euf::snode* n) {
            return is_self_stabilizing(n) || compute_self_stabilizing(n);
        };
        // For binary nodes: both arguments are self-stabilizing.
        auto both_args_ss = [&known_ss](euf::snode* n) {
            bool const lhs_ss = known_ss(n->arg(0));
            bool const rhs_ss = known_ss(n->arg(1));
            return lhs_ss && rhs_ss;
        };

        // The derivative is inherently self-stabilizing (e.g. star, full_seq),
        // or Rule 1 / Rule 2 applies:
        //   D(c, R*) = D(c, R) · R*  always keeps the R* tail,
        //   D(c, Σ*) = Σ*.
        bool inherit = compute_self_stabilizing(deriv)
                    || parent->is_star()
                    || parent->is_full_seq();

        if (!inherit) {
            bool const parent_ss = known_ss(parent);
            bool const binary    = parent->num_args() == 2;

            // Rule 3: D(c, R·S) = D(c,R)·S | (nullable(R) ? D(c,S) : ∅).
            // Inherit if the tail S or the whole concat is self-stabilizing.
            if (parent->is_concat() && binary)
                inherit = known_ss(parent->arg(1)) || parent_ss;

            // Rule 4: D(c, R|S) = D(c,R) | D(c,S); both children must qualify.
            if (!inherit && parent->is_union() && binary)
                inherit = both_args_ss(parent);

            // Rule 5: D(c, R∩S) = D(c,R) ∩ D(c,S); both children must qualify.
            if (!inherit && parent->is_intersect() && binary)
                inherit = both_args_ss(parent);

            // Rule 6: D(c, ~R) = ~D(c, R); inherit from R.
            if (!inherit && parent->is_complement() && parent->num_args() == 1)
                inherit = known_ss(parent->arg(0));

            // Rule 7: any parent that is itself self-stabilizing.
            if (!inherit)
                inherit = parent_ss;
        }

        if (inherit)
            set_self_stabilizing(deriv);
    }

    // -----------------------------------------------------------------------
    // Derivative with propagation
    // -----------------------------------------------------------------------

    // Computes derivative(re, elem) and, when a derivative is produced,
    // propagates the self-stabilizing mark from `re` to it.
    // Returns nullptr if either argument is null.
    euf::snode* seq_regex::derivative_with_propagation(euf::snode* re, euf::snode* elem) {
        if (re == nullptr || elem == nullptr)
            return nullptr;

        euf::snode* const result = derivative(re, elem);
        if (result != nullptr)
            propagate_self_stabilizing(re, result);
        return result;
    }

    // -----------------------------------------------------------------------
    // Uniform derivative (symbolic character consumption)
    // -----------------------------------------------------------------------

    // Tries to find a single derivative of `regex` that is the same for every
    // non-empty minterm. Returns that derivative, or nullptr when `regex` is
    // null, no minterms are produced, a derivative cannot be computed, two
    // minterms disagree, or every minterm is null/fail.
    // NOTE(review): the comment in get_minterms states that derivatives cannot
    // be computed with minterms, yet a minterm is passed to brzozowski_deriv
    // below - confirm that this is supported by the sgraph.
    euf::snode* seq_regex::try_uniform_derivative(euf::snode* regex) {
        if (regex == nullptr)
            return nullptr;

        // Σ* derives to Σ* and ∅ derives to ∅ for every character; returning
        // fail lets the caller detect that every character is rejected.
        if (regex->is_full_seq() || regex->is_fail())
            return regex;

        // Character-class partition of the alphabet induced by the regex.
        euf::snode_vector classes;
        m_sg.compute_minterms(regex, classes);
        if (classes.empty())
            return nullptr;

        euf::snode* common = nullptr;
        for (euf::snode* cls : classes) {
            // Skip empty character classes: no character belongs to them.
            if (cls == nullptr || cls->is_fail())
                continue;

            euf::snode* d = m_sg.brzozowski_deriv(regex, cls);
            if (d == nullptr)
                return nullptr;            // derivative computation failed

            if (common == nullptr)
                common = d;                // first candidate
            else if (common->id() != d->id())
                return nullptr;            // derivatives differ: not uniform
        }
        return common;
    }

    // -----------------------------------------------------------------------
    // Structural emptiness check
    // -----------------------------------------------------------------------

    // Cheap syntactic emptiness test. A result of true means `re` was
    // recognized as denoting the empty language; false means "not recognized
    // as empty" (including null `re` and nodes without an expression).
    bool seq_regex::is_empty_regex(euf::snode* re) const {
        if (re == nullptr)
            return false;

        // The empty-language constant itself.
        if (re->is_fail())
            return true;

        // Shapes that always accept at least one word; a nullable loop
        // accepts ε.
        bool const never_empty =
            re->is_star() || re->is_to_re() ||
            re->is_full_char() || re->is_full_seq() ||
            (re->is_loop() && re->is_nullable());
        if (never_empty)
            return false;

        seq_util& seq = m_sg.get_seq_util();
        expr* e = re->get_expr();
        if (e == nullptr)
            return false;

        expr* lhs = nullptr;
        expr* rhs = nullptr;

        // R | S is empty iff both R and S are empty.
        if (seq.re.is_union(e, lhs, rhs)) {
            SASSERT(re->num_args() == 2);
            return is_empty_regex(re->arg(0)) && is_empty_regex(re->arg(1));
        }

        // R · S is empty if R or S is empty.
        if (seq.re.is_concat(e, lhs, rhs)) {
            SASSERT(re->num_args() == 2);
            return is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1));
        }

        // R ∩ S is empty if R or S is empty; otherwise keep checking the
        // remaining shapes below.
        if (seq.re.is_intersection(e, lhs, rhs)) {
            SASSERT(re->num_args() == 2);
            if (is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1)))
                return true;
        }

        // ~Σ* is empty.
        if (re->is_complement() && re->num_args() == 1 && re->arg(0)->is_full_seq())
            return true;

        // A loop over an empty body is empty unless the loop is nullable
        // (i.e., lo == 0).
        return re->is_loop() && re->num_args() >= 1 &&
               is_empty_regex(re->arg(0)) && !re->is_nullable();
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check — helper: collect character boundaries
    // This is faster than computing the actual minterms but probably not minimal
    // -----------------------------------------------------------------------
    // Appends to `bounds` the character values at which the behavior of `re`
    // may change: lo and hi + 1 for ranges, first-character boundaries for
    // to_re bodies, and recursively the boundaries of all arguments of
    // composite nodes. `bounds` is neither sorted nor de-duplicated here.
    void seq_regex::collect_char_boundaries(euf::snode* re, unsigned_vector& bounds) const {
        SASSERT(re && re->get_expr());

        expr* e = re->get_expr();
        seq_util& seq = m_sg.get_seq_util();

        // re.range(lo, hi): the arguments are string expressions
        // (e.g., str.unit(ch)); each end contributes only if it decodes.
        expr* range_lo = nullptr;
        expr* range_hi = nullptr;
        if (seq.re.is_range(e, range_lo, range_hi)) {
            unsigned lo_ch = 0;
            unsigned hi_ch = 0;
            if (m_sg.decode_re_char(range_lo, lo_ch))
                bounds.push_back(lo_ch);
            if (m_sg.decode_re_char(range_hi, hi_ch) && hi_ch < zstring::max_char())
                bounds.push_back(hi_ch + 1);
            return;
        }

        // to_re(s): only the possible first characters of s matter; whether
        // s may be empty is not needed here.
        expr* str_body = nullptr;
        if (seq.re.is_to_re(e, str_body)) {
            bool ignored_may_be_empty = false;
            collect_possible_first_chars(seq, m_sg, str_body, bounds, ignored_may_be_empty);
            return;
        }

        // Leaves that do not distinguish between characters.
        bool const plain_leaf = re->is_fail() || re->is_full_char() || re->is_full_seq();
        if (plain_leaf)
            return;

        // Any other leaf is a regex constructor that boundary extraction
        // does not account for; fail loudly in debug builds.
        unsigned const arity = re->num_args();
        if (arity == 0) {
            UNREACHABLE();
            return;
        }

        // Composite node (union, concat, star, loop, ...): visit all arguments.
        for (unsigned idx = 0; idx != arity; ++idx)
            collect_char_boundaries(re->arg(idx), bounds);
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check — helper: alphabet representatives
    // Faster alternative to computing all minterms and taking representatives of them
    // -----------------------------------------------------------------------
    // Appends to `reps` one concrete character snode per alphabet interval
    // delimited by the character boundaries of `re`. Returns false (leaving
    // `reps` untouched) when `re` is null or has no expression; otherwise
    // returns true with at least one representative appended.
    bool seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) {
        if (re == nullptr || re->get_expr() == nullptr)
            return false;

        seq_util& seq = m_sg.get_seq_util();
        unsigned const top_char = seq.max_char();

        // Boundary points: the alphabet ends plus everything induced by the
        // regex predicates.
        unsigned_vector cuts;
        cuts.push_back(0);
        if (top_char < UINT_MAX)
            cuts.push_back(top_char + 1);
        collect_char_boundaries(re, cuts);
        std::sort(cuts.begin(), cuts.end());

        // Each pair of neighbouring, distinct boundaries [lo, hi) is one
        // interval; its lower end serves as representative. Repeated
        // boundary values form empty pairs (lo == hi) and are skipped, so no
        // separate de-duplication pass is needed.
        for (unsigned k = 1; k < cuts.size(); ++k) {
            unsigned const lo = cuts[k - 1];
            unsigned const hi = cuts[k];
            if (lo < hi && lo <= top_char)
                reps.push_back(m_sg.mk_char(lo));
        }

        // Defensive fallback for degenerate inputs.
        if (reps.empty())
            reps.push_back(m_sg.mk_char(0));
        return true;
    }

    // -----------------------------------------------------------------------
    // BFS regex emptiness check
    // -----------------------------------------------------------------------

    // Decides emptiness of the language of `re` by exploring the automaton
    // whose states are the Brzozowski derivatives of `re`.
    //   l_true  - no nullable derivative is reachable (language is empty),
    //   l_false - `re` or a reachable derivative is nullable, or `re` has a
    //             shape that is never empty,
    //   l_undef - `re` is non-ground or of kind s_other, `max_states` states
    //             have been expanded, the manager's inc() check fails, no
    //             alphabet representatives are available, or a derivative
    //             could not be computed.
    // Precondition (asserted in debug builds): `re` and its expression are
    // non-null.
    lbool seq_regex::is_empty_bfs(euf::snode* re, unsigned max_states) {
        SASSERT(re && re->get_expr());
        if (re->is_fail())
            return l_true;
        if (re->is_nullable())
            return l_false;
        // Structural quick checks for kinds that are never empty
        if (re->is_star() || re->is_full_char() || re->is_full_seq() || re->is_to_re())
            return l_false;
        // Structural emptiness catches simple cases
        if (is_empty_regex(re))
            return l_true;
        // Only handle ground regexes; non-ground can't be fully explored
        if (!re->is_ground())
            return l_undef;
        // s_other snodes (unrecognized regex kinds, e.g. re.+) cannot be
        // efficiently explored: the alphabet partition is trivially {∅} and
        // derivative computations may be slow.  Report l_undef and let the
        // caller fall back to a more capable procedure.
        if (re->kind() == euf::snode_kind::s_other)
            return l_undef;

        // Worklist exploration of the derivative automaton. Each state is a
        // derivative regex snode identified by its id; states are taken from
        // the back of the worklist.
        uint_set visited;
        euf::snode_vector worklist;
        worklist.push_back(re);
        visited.insert(re->id());

        unsigned states_explored = 0;

        while (!worklist.empty()) {
            if (!m_sg.get_manager().inc())
                return l_undef;
            if (states_explored >= max_states)
                return l_undef;

            euf::snode* current = worklist.back();
            worklist.pop_back();
            ++states_explored;

            // One concrete character per alphabet interval of the current
            // state; characters of the same interval are treated as having
            // identical derivative behavior.
            euf::snode_vector reps;
            if (!get_alphabet_representatives(current, reps))
                return l_undef;

            if (reps.empty())
                // Nothing found = dead-end
                continue;

            for (euf::snode* ch : reps) {
                if (!m_sg.get_manager().inc())
                    return l_undef;
                euf::snode* deriv = m_sg.brzozowski_deriv(current, ch);
                // A debug-only assertion would be compiled out in release
                // builds and the null pointer dereferenced below. Other call
                // sites in this file treat a null derivative as a failed
                // computation, so report "unknown" here.
                if (!deriv)
                    return l_undef;
                if (deriv->is_nullable())
                    return l_false; // found an accepting state
                if (deriv->is_fail())
                    continue; // dead-end, no need to explore further
                if (is_empty_regex(deriv))
                    continue; // structurally empty subtree
                if (!visited.contains(deriv->id())) {
                    visited.insert(deriv->id());
                    worklist.push_back(deriv);
                }
            }
        }

        // Every reachable state was expanded without finding a nullable one.
        return l_true;
    }

    // -----------------------------------------------------------------------
    // Multi-regex intersection emptiness check
    // BFS over the product of Brzozowski derivative automata.
    // Mirrors ZIPT NielsenNode.CheckEmptiness (NielsenNode.cs:1429-1469)
    // -----------------------------------------------------------------------

    // Checks whether the intersection of all `regexes` is empty by building
    // the left-nested re.inter of their expressions and delegating to
    // is_empty_bfs with the given state budget.
    //   - an empty vector yields l_false (the empty intersection is the full
    //     language),
    //   - a single regex is passed to is_empty_bfs directly,
    //   - l_undef is returned when an entry is null, an expression is
    //     missing, or the sgraph does not produce a node for an intersection.
    lbool seq_regex::check_intersection_emptiness(ptr_vector<euf::snode> const& regexes,
                                                    unsigned max_states) {
        if (regexes.empty())
            return l_false; // empty intersection = full language (vacuously non-empty)

        // A null entry cannot be interpreted as a regex and would be
        // dereferenced below; answer "unknown" instead.
        for (euf::snode* r : regexes)
            if (!r)
                return l_undef;

        // Single regex: delegate to is_empty_bfs
        if (regexes.size() == 1)
            return is_empty_bfs(regexes[0], max_states);

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();

        // Fold: ((r0 ∩ r1) ∩ r2) ∩ ...
        euf::snode* result = regexes[0];
        for (unsigned i = 1; i < regexes.size(); ++i) {
            expr* r1 = result->get_expr();
            expr* r2 = regexes[i]->get_expr();
            if (!r1 || !r2) return l_undef;
            expr_ref inter(seq.re.mk_inter(r1, r2), mgr);
            result = m_sg.mk(inter);
            if (!result)
                return l_undef;
        }

        return is_empty_bfs(result, max_states);
    }

    // -----------------------------------------------------------------------
    // Language subset check: L(A) ⊆ L(B)
    // via intersection(A, complement(B)) = ∅
    // Mirrors ZIPT NielsenNode.IsLanguageSubset (NielsenNode.cs:1382-1385)
    // -----------------------------------------------------------------------

    // Tests L(subset_re) ⊆ L(superset_re) by checking that
    // subset_re ∩ ~superset_re is empty. Returns l_undef when an argument is
    // null, an expression is missing, or a node cannot be created; otherwise
    // the verdict of the quick checks or of is_empty_bfs (called with its
    // default state budget).
    lbool seq_regex::is_language_subset(euf::snode* subset_re, euf::snode* superset_re) {
        if (subset_re == nullptr || superset_re == nullptr)
            return l_undef;

        // Trivial cases: ∅ ⊆ anything, anything ⊆ Σ*, L ⊆ L.
        bool const trivially_subset =
            subset_re->is_fail() || is_empty_regex(subset_re) ||
            superset_re->is_full_seq() ||
            subset_re == superset_re;
        if (trivially_subset)
            return l_true;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();

        // ~superset
        expr* super_e = superset_re->get_expr();
        if (super_e == nullptr)
            return l_undef;
        expr_ref comp(seq.re.mk_complement(super_e), mgr);
        euf::snode* comp_node = m_sg.mk(comp);
        if (comp_node == nullptr)
            return l_undef;

        // subset ∩ ~superset; the subset relation holds iff this is empty.
        expr* sub_e = subset_re->get_expr();
        if (sub_e == nullptr)
            return l_undef;
        expr_ref inter(seq.re.mk_inter(sub_e, comp.get()), mgr);
        euf::snode* inter_node = m_sg.mk(inter);
        if (inter_node == nullptr)
            return l_undef;

        return is_empty_bfs(inter_node);
    }

    // -----------------------------------------------------------------------
    // Collect primitive regex intersection for a variable
    // -----------------------------------------------------------------------

    // Intersects the regexes of all primitive memberships in `node` whose
    // string starts with `var`. Returns nullptr when there is no such
    // membership. A membership whose regex (or the accumulated result) has no
    // expression is skipped rather than intersected.
    euf::snode* seq_regex::collect_primitive_regex_intersection(
            euf::snode* var, seq::nielsen_node const& node) {
        SASSERT(var);

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& mgr = m_sg.get_manager();
        euf::snode* result = nullptr;

        for (auto const& mem : node.str_mems()) {
            // Only complete, primitive constraints (str is a single variable).
            if (!mem.m_str || !mem.m_regex || !mem.is_primitive())
                continue;

            euf::snode* head = mem.m_str->first();
            if (!head || head != var)
                continue;

            // The first matching constraint seeds the accumulator.
            if (!result) {
                result = mem.m_regex;
                continue;
            }

            expr* acc_e = result->get_expr();
            expr* cur_e = mem.m_regex->get_expr();
            if (!acc_e || !cur_e)
                continue;

            expr_ref inter(seq.re.mk_inter(acc_e, cur_e), mgr);
            result = m_sg.mk(inter);
        }
        return result;
    }

    // -----------------------------------------------------------------------
    // Cycle detection
    // -----------------------------------------------------------------------

    // True iff extract_cycle finds a cycle for `mem`.
    bool seq_regex::detect_cycle(seq::str_mem const& mem) const {
        bool const found = extract_cycle(mem) != nullptr;
        return found;
    }

    // -----------------------------------------------------------------------
    // Ground prefix consumption
    // -----------------------------------------------------------------------

    // Consumes leading concrete characters of mem.m_str by taking the
    // corresponding derivatives of mem.m_regex; `mem` is updated in place.
    //   conflict  - a derivative is fail, or the string became empty and the
    //               remaining regex is not nullable,
    //   satisfied - the string became empty and the remaining regex is
    //               nullable,
    //   ok        - otherwise (including null string or regex).
    seq_regex::simplify_status seq_regex::simplify_ground_prefix(seq::str_mem& mem) {
        if (!mem.m_str || !mem.m_regex)
            return simplify_status::ok;

        for (;;) {
            if (!mem.m_str || mem.m_str->is_empty())
                break;

            euf::snode* head = mem.m_str->first();
            if (!head || !head->is_char())
                break;                          // front is not a concrete character

            euf::snode* before = mem.m_regex;
            euf::snode* after  = m_sg.brzozowski_deriv(before, head);
            if (!after)
                break;                          // no derivative available
            if (after->is_fail())
                return simplify_status::conflict;

            // Carry the self-stabilizing flag over to the derivative.
            propagate_self_stabilizing(before, after);
            mem.m_str   = m_sg.drop_first(mem.m_str);
            mem.m_regex = after;
        }

        // Something is left over (or the string is gone entirely): undecided.
        if (!mem.m_str || !mem.m_str->is_empty())
            return simplify_status::ok;

        // The whole string was consumed: the remaining regex must accept ε.
        return mem.m_regex->is_nullable() ? simplify_status::satisfied
                                          : simplify_status::conflict;
    }

    // -----------------------------------------------------------------------
    // Ground suffix consumption (best-effort)
    // -----------------------------------------------------------------------

    // Best-effort suffix simplification. Reverse derivatives are not
    // implemented; the only case handled is a fully ground string, which is
    // consumed from the front via simplify_ground_prefix (this covers the
    // trailing characters as well). Everything else yields ok.
    seq_regex::simplify_status seq_regex::simplify_ground_suffix(seq::str_mem& mem) {
        bool const fully_ground = mem.m_str && mem.m_regex && mem.m_str->is_ground();
        if (fully_ground)
            return simplify_ground_prefix(mem);
        return simplify_status::ok;
    }

    // -----------------------------------------------------------------------
    // Trivial checks
    // -----------------------------------------------------------------------

    // Quick verdict for a membership constraint:
    //   -1  conflict  (regex is structurally empty, or the string is empty
    //                  and the regex is not nullable),
    //    1  satisfied (regex is full, or the string is empty and the regex
    //                  is nullable),
    //    0  undecided (including null string or regex).
    // The empty-regex test takes precedence over the full-regex test.
    int seq_regex::check_trivial(seq::str_mem const& mem) const {
        if (!mem.m_str || !mem.m_regex)
            return 0;

        if (is_empty_regex(mem.m_regex))
            return -1;                 // regex is ∅
        if (is_full_regex(mem.m_regex))
            return 1;                  // regex is Σ*

        if (!mem.m_str->is_empty())
            return 0;

        // Empty string: decided by nullability.
        return mem.m_regex->is_nullable() ? 1 : -1;
    }

    // -----------------------------------------------------------------------
    // Minterm computation with filtering
    // -----------------------------------------------------------------------

    // Appends to `minterms` the non-null, non-fail minterms of `regex`
    // (i.e. the non-empty character classes). Does nothing for a null
    // `regex`.
    // Note: minterms are regex character-class expressions, not concrete
    // characters, so we cannot compute Brzozowski derivatives with them;
    // callers should compute derivatives using concrete or fresh chars.
    void seq_regex::get_minterms(euf::snode* regex, euf::snode_vector& minterms) {
        if (regex == nullptr)
            return;

        // Unfiltered minterms induced by the regex predicates.
        euf::snode_vector unfiltered;
        m_sg.compute_minterms(regex, unfiltered);

        for (euf::snode* cls : unfiltered)
            if (cls != nullptr && !cls->is_fail())
                minterms.push_back(cls);
    }

    // -----------------------------------------------------------------------
    // Membership processing
    // -----------------------------------------------------------------------

    // Processes one membership constraint `mem`.
    // Returns false when the constraint is found to be conflicting and true
    // otherwise. When a residual constraint remains it is appended to
    // `out_mems`; nothing is appended when the constraint is already decided.
    // A constraint with a missing string or regex is treated as satisfied.
    bool seq_regex::process_str_mem(seq::str_mem const& mem,
                                     vector<seq::str_mem>& out_mems) {
        if (!(mem.m_str && mem.m_regex))
            return true;

        // The empty string is accepted exactly by nullable regexes.
        if (mem.m_str->is_empty())
            return mem.m_regex->is_nullable();

        // Work on a copy so that the caller's constraint stays untouched while
        // the ground prefix is consumed.
        seq::str_mem residual = mem;
        switch (simplify_ground_prefix(residual)) {
        case simplify_status::conflict:
            return false;
        case simplify_status::satisfied:
            return true;
        default:
            break;
        }

        euf::snode* head = residual.m_str->first();
        bool const head_is_char = head && head->is_char();

        if (!head_is_char) {
            // The string starts with a non-ground element: hand the simplified
            // constraint back for the Nielsen graph to expand via
            // character-split modifiers.
            out_mems.push_back(residual);
            return true;
        }

        // Defensive extra step: a concrete character is not expected at the
        // front after simplify_ground_prefix, but if one is there, derive by it.
        seq::str_mem stepped = derive(residual, head);
        if (is_empty_regex(stepped.m_regex))
            return false;
        out_mems.push_back(stepped);
        return true;
    }

    // -----------------------------------------------------------------------
    // History recording
    // -----------------------------------------------------------------------

    // Returns a copy of `mem` whose history has `history_re` prepended.
    // The chain uses regex-concat as a cons cell:
    //     new_history = re.concat(history_re, old_history)
    // so arg(0) is the most recent entry and arg(1) is the tail. When `mem`
    // has no history yet, `history_re` itself becomes the terminal leaf.
    //
    // NOTE(review): when `history_re` is null, or either expression is
    // missing, the result carries `history_re` alone and the previous history
    // is not kept. Confirm that callers expect this.
    seq::str_mem seq_regex::record_history(seq::str_mem const& mem, euf::snode* history_re) {
        euf::snode* chained = history_re;

        bool const can_chain = mem.m_history && history_re;
        if (can_chain) {
            expr* latest = history_re->get_expr();
            expr* older = mem.m_history->get_expr();
            if (latest && older) {
                seq_util& seq = m_sg.get_seq_util();
                ast_manager& m = m_sg.get_manager();
                expr_ref cell(seq.re.mk_concat(latest, older), m);
                chained = m_sg.mk(cell);
            }
        }

        return seq::str_mem(mem.m_str, mem.m_regex, chained, mem.m_id, mem.m_dep);
    }

    // -----------------------------------------------------------------------
    // Cycle detection
    // -----------------------------------------------------------------------

    // Searches the history chain of `mem` for an entry equal to its current
    // regex. Returns the matching history entry, or nullptr when there is no
    // regex, no history, or no match within the first 1000 chain links.
    //
    // The chain is made of regex-concat cells: arg(0) is a regex snapshot and
    // arg(1) is the rest of the chain. A node that is not such a cell is
    // treated as the final entry.
    euf::snode* seq_regex::extract_cycle(seq::str_mem const& mem) const {
        if (!(mem.m_regex && mem.m_history))
            return nullptr;

        euf::snode* const target = mem.m_regex;
        seq_util& seq = m_sg.get_seq_util();

        // Bounded walk over the chain.
        unsigned remaining = 1000;
        for (euf::snode* node = mem.m_history; node && remaining > 0; --remaining) {
            euf::snode* snapshot = node;
            euf::snode* rest = nullptr;

            // Split a cons cell into its snapshot and its tail.
            expr* node_expr = node->get_expr();
            if (node->is_concat() && node_expr && seq.re.is_concat(node_expr)) {
                snapshot = node->arg(0);
                rest = node->arg(1);
            }

            // Same snode: fast path.
            if (snapshot == target)
                return snapshot;

            // Otherwise compare the underlying expressions.
            expr* snapshot_expr = snapshot->get_expr();
            expr* target_expr = target->get_expr();
            if (snapshot_expr && target_expr && snapshot_expr == target_expr)
                return snapshot;

            node = rest;
        }
        return nullptr;
    }

    // -----------------------------------------------------------------------
    // Stabilizer from cycle
    // -----------------------------------------------------------------------

    // Builds the basic stabilizer star(cycle_regex) as an snode.
    // Returns nullptr when either argument is null or when `cycle_regex` has
    // no expression. `current_regex` is only checked for null.
    euf::snode* seq_regex::stabilizer_from_cycle(euf::snode* cycle_regex,
                                                   euf::snode* current_regex) {
        if (cycle_regex == nullptr || current_regex == nullptr)
            return nullptr;

        if (expr* body = cycle_regex->get_expr()) {
            ast_manager& m = m_sg.get_manager();
            expr_ref starred(m_sg.get_seq_util().re.mk_star(body), m);
            return m_sg.mk(starred);
        }
        return nullptr;
    }

    // -----------------------------------------------------------------------
    // Extract cycle history tokens
    // -----------------------------------------------------------------------

    // Returns the part of `current`'s history that was added after
    // `ancestor`'s history, i.e. the tokens consumed since the ancestor.
    // Per the original note, the history is built by simplify_and_init as a
    // left-associative string concat chain: concat(concat(concat(nil, c1), c2), c3).
    //
    // Returns nullptr when `current` has no history or its history is not
    // longer than the ancestor's.
    euf::snode* seq_regex::extract_cycle_history(seq::str_mem const& current,
                                                   seq::str_mem const& ancestor) {
        if (!current.m_history)
            return nullptr;

        unsigned const total = current.m_history->length();
        unsigned skip = 0;
        if (ancestor.m_history)
            skip = ancestor.m_history->length();

        // Nothing new was consumed since the ancestor.
        if (total <= skip)
            return nullptr;

        // The ancestor had no history: everything in `current` is new.
        if (skip == 0)
            return current.m_history;

        return m_sg.drop_left(current.m_history, skip);
    }

    // -----------------------------------------------------------------------
    // Get filtered stabilizer star
    // Mirrors ZIPT StrMem.GetFilteredStabilizerStar (StrMem.cs:228-243)
    // -----------------------------------------------------------------------

    // Builds star(union of the stabilizers of `re` whose Brzozowski derivative
    // by `excluded_char` is the fail node), i.e. of those stabilizers whose
    // language cannot start with `excluded_char`.
    // Returns nullptr when `re` is null, has no stabilizers, or none passes
    // the filter. A stabilizer whose expression is missing is left out of the
    // union.
    euf::snode* seq_regex::get_filtered_stabilizer_star(euf::snode* re,
                                                          euf::snode* excluded_char) {
        if (!re)
            return nullptr;

        ptr_vector<euf::snode> const* candidates = get_stabilizers(re);
        if (!candidates || candidates->empty())
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& m = m_sg.get_manager();
        euf::snode* kept = nullptr;

        for (euf::snode* stab : *candidates) {
            if (!stab)
                continue;

            // Skip stabilizers that may start with the excluded character.
            euf::snode* after_char = m_sg.brzozowski_deriv(stab, excluded_char);
            if (!after_char || !after_char->is_fail())
                continue;

            // The first survivor seeds the union.
            if (!kept) {
                kept = stab;
                continue;
            }

            expr* lhs = kept->get_expr();
            expr* rhs = stab->get_expr();
            if (!lhs || !rhs)
                continue;
            expr_ref joined(seq.re.mk_union(lhs, rhs), m);
            kept = m_sg.mk(joined);
        }

        if (!kept)
            return nullptr;

        expr* kept_expr = kept->get_expr();
        if (!kept_expr)
            return nullptr;

        expr_ref starred(seq.re.mk_star(kept_expr), m);
        return m_sg.mk(starred);
    }

    // -----------------------------------------------------------------------
    // Strengthened stabilizer construction with sub-cycle detection
    // Mirrors ZIPT StrMem.StabilizerFromCycle (StrMem.cs:163-225)
    // -----------------------------------------------------------------------

    // Builds a strengthened stabilizer body for the cycle that starts and ends
    // at `cycle_regex`, using the tokens recorded in `cycle_history`.
    //
    // Phase 1 replays the tokens on `cycle_regex` with Brzozowski derivatives
    // and cuts the token sequence into sub-cycles: a sub-cycle ends whenever
    // the derivative is `cycle_regex` again (same snode or same expression).
    // Tokens left over after the last cut form a final sub-cycle.
    //
    // Phase 2 builds one regex body per sub-cycle
    //     body = to_re(t0) · [filteredStar(R1, t1)] · to_re(t1) · ... · to_re(t_{n-1})
    // where R_i is the regex state before token t_i and the bracketed filtered
    // stabilizer star is inserted only when one is available.
    //
    // Returns the union of all sub-cycle bodies, or nullptr when an argument
    // is null, the history has no tokens, or any step needed to spell a
    // sub-cycle completely fails. A body that covers only a prefix of its
    // sub-cycle is never returned: such a truncated body does not describe a
    // full loop iteration, so the function fails closed instead, in the same
    // way the replay phase does.
    euf::snode* seq_regex::strengthened_stabilizer(euf::snode* cycle_regex,
                                                     euf::snode* cycle_history) {
        if (!cycle_regex || !cycle_history)
            return nullptr;

        // Flatten the history concat chain into a vector of character tokens.
        euf::snode_vector tokens;
        cycle_history->collect_tokens(tokens);

        if (tokens.empty())
            return nullptr;

        seq_util& seq = m_sg.get_seq_util();
        ast_manager& m = m_sg.get_manager();

        // ---- Phase 1: replay tokens and record sub-cycle boundaries --------
        // Each pair is a half-open token index range [first, second).
        svector<std::pair<unsigned, unsigned>> sub_cycles;
        unsigned cycle_start = 0;
        euf::snode* current_re = cycle_regex;

        for (unsigned i = 0; i < tokens.size(); ++i) {
            euf::snode* tok = tokens[i];
            if (!tok)
                return nullptr;

            euf::snode* deriv = m_sg.brzozowski_deriv(current_re, tok);
            if (!deriv)
                return nullptr;

            // Sub-cycle: derivative returned to the cycle entry regex
            bool const back_at_entry =
                deriv == cycle_regex ||
                (deriv->get_expr() && cycle_regex->get_expr() &&
                 deriv->get_expr() == cycle_regex->get_expr());

            if (back_at_entry) {
                sub_cycles.push_back(std::make_pair(cycle_start, i + 1));
                cycle_start = i + 1;
                current_re = cycle_regex;
            } else {
                current_re = deriv;
            }
        }

        // Remaining tokens that don't complete a sub-cycle
        if (cycle_start < tokens.size())
            sub_cycles.push_back(std::make_pair(cycle_start, tokens.size()));

        if (sub_cycles.empty())
            return nullptr;

        // ---- Phase 2: build one body per sub-cycle and union them ----------
        euf::snode* overall_union = nullptr;

        for (auto const& sc : sub_cycles) {
            unsigned const start = sc.first;
            unsigned const end = sc.second;
            if (start >= end)
                continue;

            // Every sub-cycle starts at the cycle entry regex.
            euf::snode* re_state = cycle_regex;
            euf::snode* body = nullptr;

            for (unsigned i = start; i < end; ++i) {
                euf::snode* tok = tokens[i];
                expr* tok_expr = tok ? tok->get_expr() : nullptr;
                // Without the token expression the sub-cycle cannot be spelled
                // out completely; do not continue with a truncated body.
                if (!tok_expr)
                    return nullptr;

                // Insert filtered stabilizer star before each token after the
                // first. This part is optional: when no star is available (or
                // it cannot be attached) the body is simply left without it.
                // `body` is non-null here because the first token of the
                // sub-cycle was appended successfully.
                if (i > start) {
                    euf::snode* filtered = get_filtered_stabilizer_star(re_state, tok);
                    expr* fe = filtered ? filtered->get_expr() : nullptr;
                    expr* be = body->get_expr();
                    if (fe && be) {
                        expr_ref cat(seq.re.mk_concat(be, fe), m);
                        euf::snode* extended = m_sg.mk(cat);
                        if (extended)
                            body = extended;
                    }
                }

                // Convert char token to regex: to_re(unit(tok))
                expr_ref unit_str(seq.str.mk_unit(tok_expr), m);
                expr_ref tok_re(seq.re.mk_to_re(unit_str), m);
                euf::snode* tok_re_sn = m_sg.mk(tok_re);
                if (!tok_re_sn)
                    return nullptr;

                if (!body) {
                    body = tok_re_sn;
                } else {
                    expr* be = body->get_expr();
                    expr* te = tok_re_sn->get_expr();
                    // Dropping a token would change the word the body spells.
                    if (!be || !te)
                        return nullptr;
                    expr_ref cat(seq.re.mk_concat(be, te), m);
                    body = m_sg.mk(cat);
                    if (!body)
                        return nullptr;
                }

                // Advance the regex state. The state after the last token of
                // the sub-cycle is not used, so it is not computed.
                if (i + 1 < end) {
                    euf::snode* deriv = m_sg.brzozowski_deriv(re_state, tok);
                    if (!deriv)
                        return nullptr;
                    re_state = deriv;
                }
            }

            if (!body)
                continue;

            if (!overall_union) {
                overall_union = body;
            } else {
                expr* oe = overall_union->get_expr();
                expr* be = body->get_expr();
                if (oe && be) {
                    expr_ref u(seq.re.mk_union(oe, be), m);
                    overall_union = m_sg.mk(u);
                }
            }
        }

        return overall_union;
    }

    // -----------------------------------------------------------------------
    // Stabilizer-based subsumption (enhanced)
    // Mirrors ZIPT StrMem.TrySubsume (StrMem.cs:354-386)
    // -----------------------------------------------------------------------

    // Stabilizer-based subsumption test for `mem` in the context of `node`.
    // Returns true only when all of the following hold:
    //   1. the string of `mem` starts with a variable x,
    //   2. the regex of `mem` has stabilizers,
    //   3. the intersection of the primitive regex constraints on x in `node`
    //      is a sub-language of star(union of those stabilizers).
    // Every failure along the way, including an undetermined subset check,
    // yields false.
    bool seq_regex::try_subsume(seq::str_mem const& mem, seq::nielsen_node const& node) {
        if (!(mem.m_str && mem.m_regex))
            return false;

        // The leading token must be a variable.
        euf::snode* head = mem.m_str->first();
        bool const head_is_var = head && head->is_var();
        if (!head_is_var)
            return false;

        // There must be stabilizers for the regex.
        if (!has_stabilizers(mem.m_regex))
            return false;

        // star(union(all stabilizers for this regex))
        euf::snode* all_stabs = get_stabilizer_union(mem.m_regex);
        expr* all_stabs_expr = all_stabs ? all_stabs->get_expr() : nullptr;
        if (!all_stabs_expr)
            return false;

        expr_ref closure(m_sg.get_seq_util().re.mk_star(all_stabs_expr),
                         m_sg.get_manager());
        euf::snode* closure_sn = m_sg.mk(closure);
        if (!closure_sn)
            return false;

        // All primitive regex constraints on the leading variable.
        euf::snode* var_lang = collect_primitive_regex_intersection(head, node);
        if (!var_lang)
            return false;

        // L(var_lang) ⊆ L(closure) ?
        return is_language_subset(var_lang, closure_sn) == l_true;
    }

    // Translates a minterm regex expression into the set of characters it
    // denotes. Recognised shapes, tried in this order: full_char, range with
    // decodable bounds, complement, union, intersection, difference,
    // to_re of a decodable character, and the empty regex.
    // `re_expr` must not be null. Any other shape hits UNREACHABLE().
    char_set seq_regex::minterm_to_char_set(expr* re_expr) {
        seq_util& seq = m_sg.get_seq_util();
        unsigned alphabet_max = seq.max_char();

        VERIFY(re_expr);

        // Whole alphabet: [0, max_char].
        if (seq.re.is_full_char(re_expr))
            return char_set::full(alphabet_max);

        // Range [lo, hi]; hi is inclusive in Z3's regex representation.
        expr* lo_expr = nullptr;
        expr* hi_expr = nullptr;
        if (seq.re.is_range(re_expr, lo_expr, hi_expr)) {
            unsigned lo = 0, hi = 0;
            bool const lo_known = lo_expr && m_sg.decode_re_char(lo_expr, lo);
            bool const hi_known = hi_expr && m_sg.decode_re_char(hi_expr, hi);

            // A range whose bounds cannot both be decoded falls through to
            // the remaining cases below.
            if (lo_known && hi_known) {
                SASSERT(lo <= hi);
                if (lo > hi)
                    return char_set();
                // char_range takes an exclusive upper bound.
                return char_set(char_range(lo, hi + 1));
            }
        }

        // Complement: alphabet minus the inner set.
        expr* negated = nullptr;
        if (seq.re.is_complement(re_expr, negated))
            return minterm_to_char_set(negated).complement(alphabet_max);

        expr* r1 = nullptr, *r2 = nullptr;

        // Union: characters of either operand.
        if (seq.re.is_union(re_expr, r1, r2)) {
            char_set merged = minterm_to_char_set(r1);
            merged.add(minterm_to_char_set(r2));
            return merged;
        }

        // Intersection: characters of both operands.
        if (seq.re.is_intersection(re_expr, r1, r2))
            return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2));

        // Difference: r1 \ r2 = r1 ∩ complement(r2).
        if (seq.re.is_diff(re_expr, r1, r2))
            return minterm_to_char_set(r1).intersect_with(
                       minterm_to_char_set(r2).complement(alphabet_max));

        // to_re of a single decodable character: singleton set.
        expr* str_arg = nullptr;
        if (seq.re.is_to_re(re_expr, str_arg)) {
            unsigned code = 0;
            if (m_sg.decode_re_char(str_arg, code)) {
                char_set single;
                single.add(code);
                return single;
            }
        }

        // Empty regex: no characters.
        if (seq.re.is_empty(re_expr))
            return char_set();

        // Unexpected minterm shape: fail loudly rather than silently
        // returning a conservative approximation.
        UNREACHABLE();
        return char_set::full(alphabet_max);
    }

}