3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-03-18 11:04:09 +00:00
z3/src/smt/seq/seq_regex.cpp
copilot-swe-agent[bot] e01872dc70 rename nseq_regex/state/model to seq_regex/state/model in smt/seq; add Clemens Eisenhofer as co-author
Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
2026-03-14 16:54:32 +00:00

1281 lines
47 KiB
C++

/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_regex.cpp
Abstract:
Lazy regex membership processing for the Nielsen-based string solver.
Author:
Clemens Eisenhofer 2026-03-01
Nikolaj Bjorner (nbjorner) 2026-03-01
--*/
#include "smt/seq/seq_regex.h"
#include <unordered_set>
namespace seq {
// -----------------------------------------------------------------------
// Stabilizer store
// -----------------------------------------------------------------------
void seq_regex::reset_stabilizers() {
m_stabilizers.reset();
m_self_stabilizing.reset();
}
void seq_regex::add_stabilizer(euf::snode* regex, euf::snode* stabilizer) {
if (!regex || !stabilizer)
return;
unsigned id = regex->id();
auto& stabs = m_stabilizers.insert_if_not_there(id, ptr_vector<euf::snode>());
// De-duplicate by pointer equality (mirrors ZIPT Environment.AddStabilizer
// which checks reference equality before adding).
for (euf::snode* s : stabs)
if (s == stabilizer)
return;
stabs.push_back(stabilizer);
}
euf::snode* seq_regex::get_stabilizer_union(euf::snode* regex) {
if (!regex)
return nullptr;
if (!m_stabilizers.contains(regex->id()))
return nullptr;
auto& stabs = m_stabilizers[regex->id()];
if (stabs.empty())
return nullptr;
// Single stabilizer: return it directly.
if (stabs.size() == 1)
return stabs[0];
// Multiple stabilizers: build re.union chain.
// union(s1, union(s2, ... union(sN-1, sN)...))
seq_util& seq = m_sg.get_seq_util();
euf::snode* result = stabs[stabs.size() - 1];
for (unsigned i = stabs.size() - 1; i-- > 0; ) {
expr* lhs = stabs[i]->get_expr();
expr* rhs = result->get_expr();
if (!lhs || !rhs)
return nullptr;
expr_ref un(seq.re.mk_union(lhs, rhs), m_sg.get_manager());
result = m_sg.mk(un);
}
return result;
}
bool seq_regex::has_stabilizers(euf::snode* regex) const {
if (!regex)
return false;
if (!m_stabilizers.contains(regex->id()))
return false;
return !m_stabilizers[regex->id()].empty();
}
ptr_vector<euf::snode> const* seq_regex::get_stabilizers(euf::snode* regex) const {
if (!regex)
return nullptr;
if (!m_stabilizers.contains(regex->id()))
return nullptr;
return &m_stabilizers[regex->id()];
}
void seq_regex::set_self_stabilizing(euf::snode* regex) {
if (!regex)
return;
m_self_stabilizing.insert(regex->id());
}
bool seq_regex::is_self_stabilizing(euf::snode* regex) const {
if (!regex)
return false;
return m_self_stabilizing.contains(regex->id());
}
// -----------------------------------------------------------------------
// Self-stabilizing auto-detection
// -----------------------------------------------------------------------
bool seq_regex::compute_self_stabilizing(euf::snode* regex) const {
if (!regex)
return false;
// R* is always self-stabilizing: D(c, R*) = D(c,R) · R*,
// so R* appears as the tail of every derivative and acts as
// its own stabilizer.
if (regex->is_star())
return true;
// Σ* (full_seq, i.e., re.all / .*) is self-stabilizing:
// D(c, Σ*) = Σ* for every character c.
if (regex->is_full_seq())
return true;
// ∅ (fail / empty language) is trivially self-stabilizing:
// it has no live derivatives, so the flag is vacuously true.
if (regex->is_fail())
return true;
// Complement of full_seq is ∅ (complement of Σ*), which is
// also trivially self-stabilizing.
if (regex->is_complement() && regex->num_args() == 1 &&
regex->arg(0)->is_full_seq())
return true;
// Loop with lo=0 and no upper bound behaves like R*
// (r{0,} ≡ r*), so it is self-stabilizing.
if (regex->is_loop() && regex->is_nullable()) {
// A nullable loop with a star-like body: heuristic check.
// Only mark as self-stabilizing if the body is a Kleene closure.
// Loop(R, 0, ∞) ~ R* — but we rely on the sgraph to normalize
// these, so only catch exact star nodes above.
}
return false;
}
// -----------------------------------------------------------------------
// Self-stabilizing propagation through derivatives
// -----------------------------------------------------------------------
void seq_regex::propagate_self_stabilizing(euf::snode* parent, euf::snode* deriv) {
if (!parent || !deriv)
return;
// If the derivative is already known to be self-stabilizing (either
// inherently or from a prior propagation), nothing to do.
if (is_self_stabilizing(deriv))
return;
// If the derivative is itself inherently self-stabilizing
// (e.g., it is a star or full_seq), mark it now.
if (compute_self_stabilizing(deriv)) {
set_self_stabilizing(deriv);
return;
}
// Rule 1: Star parent.
// D(c, R*) = D(c, R) · R*. The derivative always contains the
// R* tail, so it is self-stabilizing regardless of D(c,R).
if (parent->is_star()) {
set_self_stabilizing(deriv);
return;
}
// Rule 2: Full_seq parent.
// D(c, Σ*) = Σ*, and Σ* is self-stabilizing.
// (The derivative should be Σ* itself; mark it for safety.)
if (parent->is_full_seq()) {
set_self_stabilizing(deriv);
return;
}
// Check if parent is self-stabilizing (either inherently or marked).
bool parent_ss = is_self_stabilizing(parent) || compute_self_stabilizing(parent);
// Rule 3: Concat parent R · S.
// D(c, R·S) = D(c,R)·S | (nullable(R) ? D(c,S) : ∅).
// If S is self-stabilizing, the D(c,R)·S branch inherits it.
// If the whole parent R·S is self-stabilizing, the derivative is too.
if (parent->is_concat() && parent->num_args() == 2) {
euf::snode* tail = parent->arg(1);
bool tail_ss = is_self_stabilizing(tail) || compute_self_stabilizing(tail);
if (tail_ss || parent_ss) {
set_self_stabilizing(deriv);
return;
}
}
// Rule 4: Union parent R | S.
// D(c, R|S) = D(c,R) | D(c,S).
// Self-stabilizing if both children are self-stabilizing.
if (parent->is_union() && parent->num_args() == 2) {
euf::snode* lhs = parent->arg(0);
euf::snode* rhs = parent->arg(1);
bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs);
bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs);
if (lhs_ss && rhs_ss) {
set_self_stabilizing(deriv);
return;
}
}
// Rule 5: Intersection parent R ∩ S.
// D(c, R∩S) = D(c,R) ∩ D(c,S).
// Self-stabilizing if both children are self-stabilizing.
if (parent->is_intersect() && parent->num_args() == 2) {
euf::snode* lhs = parent->arg(0);
euf::snode* rhs = parent->arg(1);
bool lhs_ss = is_self_stabilizing(lhs) || compute_self_stabilizing(lhs);
bool rhs_ss = is_self_stabilizing(rhs) || compute_self_stabilizing(rhs);
if (lhs_ss && rhs_ss) {
set_self_stabilizing(deriv);
return;
}
}
// Rule 6: Complement parent ~R.
// D(c, ~R) = ~D(c, R).
// Preserves self-stabilizing from R.
if (parent->is_complement() && parent->num_args() == 1) {
euf::snode* inner = parent->arg(0);
bool inner_ss = is_self_stabilizing(inner) || compute_self_stabilizing(inner);
if (inner_ss) {
set_self_stabilizing(deriv);
return;
}
}
// Rule 7: Generic self-stabilizing parent.
// If the parent was explicitly marked self-stabilizing (e.g., via
// a previous propagation), propagate to the derivative.
if (parent_ss) {
set_self_stabilizing(deriv);
return;
}
}
// -----------------------------------------------------------------------
// Derivative with propagation
// -----------------------------------------------------------------------
euf::snode* seq_regex::derivative_with_propagation(euf::snode* re, euf::snode* elem) {
if (!re || !elem)
return nullptr;
euf::snode* deriv = derivative(re, elem);
if (deriv)
propagate_self_stabilizing(re, deriv);
return deriv;
}
// -----------------------------------------------------------------------
// Uniform derivative (symbolic character consumption)
// -----------------------------------------------------------------------
euf::snode* seq_regex::try_uniform_derivative(euf::snode* regex) {
if (!regex)
return nullptr;
// Quick exits: trivial regexes with known uniform derivatives.
// Σ* (full_seq) has derivative Σ* for every character.
if (regex->is_full_seq())
return regex;
// ∅ (fail) has derivative ∅ for every character — but this means
// every character is rejected. Return fail so the caller can
// detect a conflict.
if (regex->is_fail())
return regex;
// Compute minterms: the character-class partition of the alphabet
// induced by the regex.
euf::snode_vector minterms;
m_sg.compute_minterms(regex, minterms);
if (minterms.empty())
return nullptr;
// Compute the derivative for each non-empty minterm. If all produce
// the same result, the derivative is independent of the character
// value and we can consume a symbolic character deterministically.
euf::snode* uniform = nullptr;
for (euf::snode* mt : minterms) {
if (!mt || mt->is_fail())
continue; // empty character class — no character belongs to it
euf::snode* deriv = m_sg.brzozowski_deriv(regex, mt);
if (!deriv)
return nullptr; // derivative computation failed
if (!uniform) {
uniform = deriv;
} else if (uniform->id() != deriv->id()) {
return nullptr; // different derivatives — not uniform
}
}
return uniform; // may be nullptr if all minterms were fail/empty
}
// -----------------------------------------------------------------------
// Ground prefix consumption
// -----------------------------------------------------------------------
bool seq_regex::is_empty_regex(euf::snode* re) const {
if (!re)
return false;
// direct empty language constant
if (re->is_fail())
return true;
// kinds that are never empty
if (re->is_star() || re->is_to_re() ||
re->is_full_char() || re->is_full_seq())
return false;
// loop with lo == 0 accepts ε
if (re->is_loop() && re->is_nullable())
return false;
seq_util& seq = m_sg.get_seq_util();
expr* e = re->get_expr();
if (!e)
return false;
expr* r1, * r2;
// union is empty iff both children are empty
if (seq.re.is_union(e, r1, r2)) {
SASSERT(re->num_args() == 2);
return is_empty_regex(re->arg(0)) && is_empty_regex(re->arg(1));
}
// regex concat is empty if either child is empty
if (seq.re.is_concat(e, r1, r2)) {
SASSERT(re->num_args() == 2);
return is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1));
}
// intersection is empty if either child is empty
if (seq.re.is_intersection(e, r1, r2)) {
SASSERT(re->num_args() == 2);
if (is_empty_regex(re->arg(0)) || is_empty_regex(re->arg(1)))
return true;
}
// complement of full_seq is empty
if (re->is_complement() && re->num_args() == 1 && re->arg(0)->is_full_seq())
return true;
// loop(empty, lo, _) with lo > 0 is empty
if (re->is_loop() && re->num_args() >= 1 && is_empty_regex(re->arg(0)))
return !re->is_nullable(); // empty if not nullable (i.e., lo > 0)
return false;
}
// -----------------------------------------------------------------------
// BFS regex emptiness check — helper: collect character boundaries
// -----------------------------------------------------------------------
void seq_regex::collect_char_boundaries(euf::snode* re, unsigned_vector& bounds) const {
if (!re || !re->get_expr())
return;
seq_util& seq = m_sg.get_seq_util();
expr* e = re->get_expr();
// Range predicate re.range(lo, hi): boundary at lo and hi+1
// Range arguments are string expressions (e.g., str.unit(ch))
expr* lo_expr = nullptr;
expr* hi_expr = nullptr;
if (seq.re.is_range(e, lo_expr, hi_expr)) {
zstring s_lo, s_hi;
if (lo_expr && seq.str.is_string(lo_expr, s_lo) && s_lo.length() == 1)
bounds.push_back(s_lo[0]);
if (hi_expr && seq.str.is_string(hi_expr, s_hi) && s_hi.length() == 1 && s_hi[0] < zstring::max_char())
bounds.push_back(s_hi[0] + 1);
return;
}
// to_re(s): boundary at first character and first+1
expr* body = nullptr;
if (seq.re.is_to_re(e, body)) {
zstring s;
if (seq.str.is_string(body, s) && s.length() > 0) {
unsigned first_ch = s[0];
bounds.push_back(first_ch);
if (first_ch < zstring::max_char())
bounds.push_back(first_ch + 1);
}
return;
}
// Leaf nodes with no character discrimination
if (re->is_fail() || re->is_full_char() || re->is_full_seq())
return;
// Recurse into children (handles union, concat, star, loop, etc.)
for (unsigned i = 0; i < re->num_args(); ++i)
collect_char_boundaries(re->arg(i), bounds);
}
// -----------------------------------------------------------------------
// BFS regex emptiness check — helper: alphabet representatives
// -----------------------------------------------------------------------
void seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) {
unsigned_vector bounds;
bounds.push_back(0); // always include character 0
collect_char_boundaries(re, bounds);
// Sort and deduplicate
std::sort(bounds.begin(), bounds.end());
unsigned prev = UINT_MAX;
for (unsigned b : bounds) {
if (b != prev) {
reps.push_back(m_sg.mk_char(b));
prev = b;
}
}
}
// -----------------------------------------------------------------------
// BFS regex emptiness check
// -----------------------------------------------------------------------
lbool seq_regex::is_empty_bfs(euf::snode* re, unsigned max_states) {
if (!re || !re->get_expr())
return l_undef;
if (re->is_fail())
return l_true;
if (re->is_nullable())
return l_false;
// Structural quick checks for kinds that are never empty
if (re->is_star() || re->is_full_char() || re->is_full_seq() || re->is_to_re())
return l_false;
// Structural emptiness catches simple cases
if (is_empty_regex(re))
return l_true;
// Only handle ground regexes; non-ground can't be fully explored
if (!re->is_ground())
return l_undef;
// BFS over the Brzozowski derivative automaton.
// Each state is a derivative regex snode identified by its id.
// We explore states by computing derivatives for representative
// characters from the alphabet partition.
uint_set visited;
euf::snode_vector worklist;
worklist.push_back(re);
visited.insert(re->id());
unsigned states_explored = 0;
bool had_failed_deriv = false;
while (!worklist.empty()) {
if (states_explored >= max_states)
return l_undef;
euf::snode* current = worklist.back();
worklist.pop_back();
++states_explored;
// Compute representative characters for current state's
// alphabet partition. Each representative is a concrete
// character snode whose equivalence class has identical
// derivative behavior.
euf::snode_vector reps;
get_alphabet_representatives(current, reps);
if (reps.empty()) {
// No representatives means no character predicates;
// use a default character to explore the single partition.
reps.push_back(m_sg.mk_char('a'));
}
for (euf::snode* ch : reps) {
euf::snode* deriv = m_sg.brzozowski_deriv(current, ch);
if (!deriv) {
// Derivative computation failed for this character.
// Track the failure but continue with other characters.
had_failed_deriv = true;
continue;
}
if (deriv->is_nullable())
return l_false; // found an accepting state
if (deriv->is_fail())
continue; // dead-end, no need to explore further
if (is_empty_regex(deriv))
continue; // structurally empty subtree
if (!visited.contains(deriv->id())) {
visited.insert(deriv->id());
worklist.push_back(deriv);
}
}
}
// Exhausted all reachable states without finding a nullable one.
// If we had any failed derivative computations, the result is
// inconclusive since we may have missed reachable states.
if (had_failed_deriv)
return l_undef;
return l_true;
}
// -----------------------------------------------------------------------
// Multi-regex intersection emptiness check
// BFS over the product of Brzozowski derivative automata.
// Mirrors ZIPT NielsenNode.CheckEmptiness (NielsenNode.cs:1429-1469)
// -----------------------------------------------------------------------
lbool seq_regex::check_intersection_emptiness(ptr_vector<euf::snode> const& regexes,
unsigned max_states) {
if (regexes.empty())
return l_false; // empty intersection = full language (vacuously non-empty)
// Quick checks: if any regex is fail/empty, intersection is empty
for (euf::snode* re : regexes) {
if (!re || !re->get_expr())
return l_undef;
if (re->is_fail() || is_empty_regex(re))
return l_true;
}
// Check if all are nullable (intersection accepts ε)
bool all_nullable = true;
for (euf::snode* re : regexes) {
if (!re->is_nullable()) { all_nullable = false; break; }
}
if (all_nullable)
return l_false;
// Single regex: delegate to is_empty_bfs
if (regexes.size() == 1)
return is_empty_bfs(regexes[0], max_states);
// Build product BFS. State = tuple of regex snode ids.
// Use a map from state hash to visited set.
using state_t = svector<unsigned>;
auto state_hash = [](state_t const& s) -> unsigned {
unsigned h = 0;
for (unsigned id : s)
h = h * 31 + id;
return h;
};
auto state_eq = [](state_t const& a, state_t const& b) -> bool {
if (a.size() != b.size()) return false;
for (unsigned i = 0; i < a.size(); ++i)
if (a[i] != b[i]) return false;
return true;
};
// Use simple set via sorted vector of hashes (good enough for bounded BFS)
std::unordered_set<unsigned> visited_hashes;
struct bfs_state {
ptr_vector<euf::snode> regexes;
};
std::vector<bfs_state> worklist;
bfs_state initial;
initial.regexes.append(regexes);
worklist.push_back(std::move(initial));
state_t init_ids;
for (euf::snode* re : regexes)
init_ids.push_back(re->id());
visited_hashes.insert(state_hash(init_ids));
unsigned states_explored = 0;
bool had_failed = false;
// Collect alphabet representatives from the intersection of all regexes
// (merge boundaries from all)
unsigned_vector all_bounds;
all_bounds.push_back(0);
for (euf::snode* re : regexes)
collect_char_boundaries(re, all_bounds);
std::sort(all_bounds.begin(), all_bounds.end());
euf::snode_vector reps;
unsigned prev = UINT_MAX;
for (unsigned b : all_bounds) {
if (b != prev) {
reps.push_back(m_sg.mk_char(b));
prev = b;
}
}
if (reps.empty())
reps.push_back(m_sg.mk_char('a'));
while (!worklist.empty()) {
if (states_explored >= max_states)
return l_undef;
bfs_state current = std::move(worklist.back());
worklist.pop_back();
++states_explored;
for (euf::snode* ch : reps) {
ptr_vector<euf::snode> derivs;
bool any_fail = false;
bool all_null = true;
bool deriv_failed = false;
for (euf::snode* re : current.regexes) {
euf::snode* d = m_sg.brzozowski_deriv(re, ch);
if (!d) { deriv_failed = true; break; }
if (d->is_fail()) { any_fail = true; break; }
if (!d->is_nullable()) all_null = false;
derivs.push_back(d);
}
if (deriv_failed) { had_failed = true; continue; }
if (any_fail) continue; // this character leads to empty intersection
if (all_null)
return l_false; // found an accepting state in the product
// Check if any component is structurally empty
bool any_empty = false;
for (euf::snode* d : derivs) {
if (is_empty_regex(d)) { any_empty = true; break; }
}
if (any_empty) continue;
// Compute state hash and check visited
state_t ids;
for (euf::snode* d : derivs)
ids.push_back(d->id());
unsigned h = state_hash(ids);
if (visited_hashes.count(h) == 0) {
visited_hashes.insert(h);
bfs_state next;
next.regexes.append(derivs);
worklist.push_back(std::move(next));
}
}
}
if (had_failed)
return l_undef;
return l_true; // exhausted all states, intersection is empty
}
// -----------------------------------------------------------------------
// Language subset check: L(A) ⊆ L(B)
// via intersection(A, complement(B)) = ∅
// Mirrors ZIPT NielsenNode.IsLanguageSubset (NielsenNode.cs:1382-1385)
// -----------------------------------------------------------------------
lbool seq_regex::is_language_subset(euf::snode* subset_re, euf::snode* superset_re) {
if (!subset_re || !superset_re)
return l_undef;
// Quick checks
if (subset_re->is_fail() || is_empty_regex(subset_re))
return l_true; // ∅ ⊆ anything
if (superset_re->is_full_seq())
return l_true; // anything ⊆ Σ*
if (subset_re == superset_re)
return l_true; // L ⊆ L
// Build complement(superset)
seq_util& seq = m_sg.get_seq_util();
ast_manager& mgr = m_sg.get_manager();
expr* sup_expr = superset_re->get_expr();
if (!sup_expr)
return l_undef;
expr_ref comp(seq.re.mk_complement(sup_expr), mgr);
euf::snode* comp_sn = m_sg.mk(comp);
if (!comp_sn)
return l_undef;
// Build intersection and check emptiness
// subset ∩ complement(superset) should be empty for subset relation
expr* sub_expr = subset_re->get_expr();
if (!sub_expr)
return l_undef;
expr_ref inter(seq.re.mk_inter(sub_expr, comp.get()), mgr);
euf::snode* inter_sn = m_sg.mk(inter);
if (!inter_sn)
return l_undef;
return is_empty_bfs(inter_sn);
}
// -----------------------------------------------------------------------
// Collect primitive regex intersection for a variable
// -----------------------------------------------------------------------
euf::snode* seq_regex::collect_primitive_regex_intersection(
euf::snode* var, seq::nielsen_node const& node) {
if (!var)
return nullptr;
seq_util& seq = m_sg.get_seq_util();
ast_manager& mgr = m_sg.get_manager();
euf::snode* result = nullptr;
for (auto const& mem : node.str_mems()) {
if (!mem.m_str || !mem.m_regex)
continue;
// Primitive constraint: str is a single variable
if (!mem.is_primitive())
continue;
euf::snode* first = mem.m_str->first();
if (!first || first != var)
continue;
if (!result) {
result = mem.m_regex;
} else {
expr* r1 = result->get_expr();
expr* r2 = mem.m_regex->get_expr();
if (r1 && r2) {
expr_ref inter(seq.re.mk_inter(r1, r2), mgr);
result = m_sg.mk(inter);
}
}
}
return result;
}
// -----------------------------------------------------------------------
// Cycle detection
// -----------------------------------------------------------------------
bool seq_regex::detect_cycle(seq::str_mem const& mem) const {
return extract_cycle(mem) != nullptr;
}
// -----------------------------------------------------------------------
// Ground prefix consumption
// -----------------------------------------------------------------------
seq_regex::simplify_status seq_regex::simplify_ground_prefix(seq::str_mem& mem) {
if (!mem.m_str || !mem.m_regex)
return simplify_status::ok;
while (mem.m_str && !mem.m_str->is_empty()) {
euf::snode* first = mem.m_str->first();
if (!first || !first->is_char())
break;
euf::snode* parent_re = mem.m_regex;
euf::snode* deriv = m_sg.brzozowski_deriv(parent_re, first);
if (!deriv)
break;
if (deriv->is_fail())
return simplify_status::conflict;
// propagate self-stabilizing flag from parent to derivative
propagate_self_stabilizing(parent_re, deriv);
mem.m_str = m_sg.drop_first(mem.m_str);
mem.m_regex = deriv;
}
// check final state
if (mem.m_str && mem.m_str->is_empty()) {
if (mem.m_regex->is_nullable())
return simplify_status::satisfied;
return simplify_status::conflict;
}
return simplify_status::ok;
}
// -----------------------------------------------------------------------
// Ground suffix consumption (best-effort)
// -----------------------------------------------------------------------
seq_regex::simplify_status seq_regex::simplify_ground_suffix(seq::str_mem& mem) {
// Suffix consumption via reverse derivatives is complex.
// For now, only handle the case where the entire string is ground:
// consume all characters from the front (which covers trailing chars
// when the string is fully ground).
if (!mem.m_str || !mem.m_regex)
return simplify_status::ok;
if (!mem.m_str->is_ground())
return simplify_status::ok;
// If the string is ground, simplify_ground_prefix handles everything.
return simplify_ground_prefix(mem);
}
// -----------------------------------------------------------------------
// Trivial checks
// -----------------------------------------------------------------------
int seq_regex::check_trivial(seq::str_mem const& mem) const {
if (!mem.m_str || !mem.m_regex)
return 0;
// regex is ∅ => always conflict
if (is_empty_regex(mem.m_regex))
return -1;
// regex is Σ* => always satisfied
if (is_full_regex(mem.m_regex))
return 1;
// empty string checks
if (mem.m_str->is_empty()) {
if (mem.m_regex->is_nullable())
return 1;
return -1;
}
return 0;
}
// -----------------------------------------------------------------------
// Minterm computation with filtering
// -----------------------------------------------------------------------
void seq_regex::get_minterms(euf::snode* regex, euf::snode_vector& minterms) {
if (!regex)
return;
// compute raw minterms from the regex predicates
euf::snode_vector raw;
m_sg.compute_minterms(regex, raw);
// filter: keep only minterms that are non-fail (non-empty character class).
// note: minterms are regex character-class expressions, not concrete
// characters, so we cannot compute Brzozowski derivatives with them.
// callers should compute derivatives using concrete or fresh chars.
for (euf::snode* mt : raw) {
if (!mt || mt->is_fail())
continue;
minterms.push_back(mt);
}
}
// -----------------------------------------------------------------------
// Collect first characters
// -----------------------------------------------------------------------
void seq_regex::collect_first_chars(euf::snode* re, euf::snode_vector& chars) {
if (!re)
return;
// to_re(s): extract first character of the string body
if (re->is_to_re()) {
euf::snode* body = re->arg(0);
if (body && !body->is_empty()) {
euf::snode* first = body->first();
if (first && first->is_char()) {
bool dup = false;
for (euf::snode* c : chars)
if (c == first) { dup = true; break; }
if (!dup)
chars.push_back(first);
}
// Handle string literals (classified as s_other in sgraph)
else if (first && first->get_expr()) {
seq_util& seq = m_sg.get_seq_util();
zstring s;
if (seq.str.is_string(first->get_expr(), s) && s.length() > 0) {
euf::snode* ch = m_sg.mk_char(s[0]);
bool dup = false;
for (euf::snode* c : chars)
if (c == ch) { dup = true; break; }
if (!dup)
chars.push_back(ch);
}
}
}
return;
}
// leaf cases: produce representative characters for character classes
if (re->is_full_char()) {
// full character set (.): use 'a' as representative
euf::snode* ch = m_sg.mk_char('a');
bool dup = false;
for (euf::snode* c : chars)
if (c == ch) { dup = true; break; }
if (!dup)
chars.push_back(ch);
return;
}
// re.range(lo, hi): use lo as representative
if (re->get_expr()) {
seq_util& seq = m_sg.get_seq_util();
expr* lo = nullptr, *hi = nullptr;
if (seq.re.is_range(re->get_expr(), lo, hi) && lo) {
zstring s;
unsigned ch_val = 'a';
if (seq.is_const_char(lo, ch_val)) {
euf::snode* ch = m_sg.mk_char(ch_val);
bool dup = false;
for (euf::snode* c : chars)
if (c == ch) { dup = true; break; }
if (!dup)
chars.push_back(ch);
}
return;
}
}
if (re->is_fail() || re->is_full_seq())
return;
// recurse into children (handles union, concat, star, loop, etc.)
for (unsigned i = 0; i < re->num_args(); ++i)
collect_first_chars(re->arg(i), chars);
}
// -----------------------------------------------------------------------
// Membership processing
// -----------------------------------------------------------------------
bool seq_regex::process_str_mem(seq::str_mem const& mem,
vector<seq::str_mem>& out_mems) {
if (!mem.m_str || !mem.m_regex)
return true;
// empty string: check nullable
if (mem.m_str->is_empty())
return mem.m_regex->is_nullable();
// consume ground prefix: derive regex by each leading concrete char
seq::str_mem working = mem;
simplify_status st = simplify_ground_prefix(working);
if (st == simplify_status::conflict)
return false;
if (st == simplify_status::satisfied)
return true;
// after ground prefix consumption, if the front is still a concrete
// character we can take one more step (shouldn't happen after
// simplify_ground_prefix, but guard defensively)
euf::snode* first = working.m_str->first();
if (first && first->is_char()) {
seq::str_mem derived = derive(working, first);
if (is_empty_regex(derived.m_regex))
return false;
out_mems.push_back(derived);
return true;
}
// string starts with a non-ground element (variable or unit):
// return the simplified constraint for the Nielsen graph to expand
// via character-split modifiers.
out_mems.push_back(working);
return true;
}
// -----------------------------------------------------------------------
// History recording
// -----------------------------------------------------------------------
seq::str_mem seq_regex::record_history(seq::str_mem const& mem, euf::snode* history_re) {
// Build a history chain by prepending the new regex entry to the
// existing history. Uses regex-concat as a cons cell:
// new_history = re.concat(history_re, old_history)
// where arg(0) is the latest entry and arg(1) is the tail.
// If old_history is nullptr, the new entry becomes the terminal leaf.
euf::snode* new_history = history_re;
if (mem.m_history && history_re) {
expr* re_expr = history_re->get_expr();
expr* old_expr = mem.m_history->get_expr();
if (re_expr && old_expr) {
seq_util& seq = m_sg.get_seq_util();
expr_ref chain(seq.re.mk_concat(re_expr, old_expr), m_sg.get_manager());
new_history = m_sg.mk(chain);
}
}
return seq::str_mem(mem.m_str, mem.m_regex, new_history, mem.m_id, mem.m_dep);
}
// -----------------------------------------------------------------------
// Cycle detection
// -----------------------------------------------------------------------
euf::snode* seq_regex::extract_cycle(seq::str_mem const& mem) const {
// Walk the history chain looking for a repeated regex.
// A cycle exists when the current regex matches a regex in the history.
if (!mem.m_regex || !mem.m_history)
return nullptr;
euf::snode* current = mem.m_regex;
euf::snode* hist = mem.m_history;
// Walk the history chain up to a bounded depth.
// The history is structured as a chain of regex snapshots connected
// via the sgraph's regex-concat: each level's arg(0) is a snapshot
// and arg(1) is the tail. A leaf (non-concat) is a terminal entry.
unsigned bound = 1000;
while (hist && bound-- > 0) {
euf::snode* entry = hist;
euf::snode* tail = nullptr;
// If the history node is a regex concat, decompose it:
// arg(0) is the regex snapshot, arg(1) is the rest of the chain
seq_util& seq = m_sg.get_seq_util();
if (hist->is_concat() && hist->get_expr() &&
seq.re.is_concat(hist->get_expr())) {
entry = hist->arg(0);
tail = hist->arg(1);
}
// Check pointer equality (fast, covers normalized regexes)
if (entry == current)
return entry;
// Check expression-level equality as fallback
if (entry->get_expr() && current->get_expr() &&
entry->get_expr() == current->get_expr())
return entry;
hist = tail;
}
return nullptr;
}
// -----------------------------------------------------------------------
// Stabilizer from cycle
// -----------------------------------------------------------------------
euf::snode* seq_regex::stabilizer_from_cycle(euf::snode* cycle_regex,
euf::snode* current_regex) {
if (!cycle_regex || !current_regex)
return nullptr;
expr* re_expr = cycle_regex->get_expr();
if (!re_expr)
return nullptr;
seq_util& seq = m_sg.get_seq_util();
expr_ref star_expr(seq.re.mk_star(re_expr), m_sg.get_manager());
return m_sg.mk(star_expr);
}
// -----------------------------------------------------------------------
// Extract cycle history tokens
// -----------------------------------------------------------------------
euf::snode* seq_regex::extract_cycle_history(seq::str_mem const& current,
seq::str_mem const& ancestor) {
// The history is built by simplify_and_init as a left-associative
// string concat chain: concat(concat(concat(nil, c1), c2), c3).
// Extract the tokens consumed since the ancestor.
if (!current.m_history)
return nullptr;
unsigned cur_len = current.m_history->length();
unsigned anc_len = ancestor.m_history ? ancestor.m_history->length() : 0;
if (cur_len <= anc_len)
return nullptr;
if (anc_len == 0)
return current.m_history;
return m_sg.drop_left(current.m_history, anc_len);
}
// -----------------------------------------------------------------------
// Get filtered stabilizer star
// Mirrors ZIPT StrMem.GetFilteredStabilizerStar (StrMem.cs:228-243)
// -----------------------------------------------------------------------
euf::snode* seq_regex::get_filtered_stabilizer_star(euf::snode* re,
euf::snode* excluded_char) {
if (!re)
return nullptr;
ptr_vector<euf::snode> const* stabs = get_stabilizers(re);
if (!stabs || stabs->empty())
return nullptr;
seq_util& seq = m_sg.get_seq_util();
ast_manager& m = m_sg.get_manager();
euf::snode* filtered_union = nullptr;
for (euf::snode* s : *stabs) {
if (!s)
continue;
// Keep only stabilizers whose language cannot start with excluded_char
euf::snode* d = m_sg.brzozowski_deriv(s, excluded_char);
if (d && d->is_fail()) {
if (!filtered_union) {
filtered_union = s;
} else {
expr* e1 = filtered_union->get_expr();
expr* e2 = s->get_expr();
if (e1 && e2) {
expr_ref u(seq.re.mk_union(e1, e2), m);
filtered_union = m_sg.mk(u);
}
}
}
}
if (!filtered_union)
return nullptr;
expr* fe = filtered_union->get_expr();
if (!fe)
return nullptr;
expr_ref star_expr(seq.re.mk_star(fe), m);
return m_sg.mk(star_expr);
}
// -----------------------------------------------------------------------
// Strengthened stabilizer construction with sub-cycle detection
// Mirrors ZIPT StrMem.StabilizerFromCycle (StrMem.cs:163-225)
// -----------------------------------------------------------------------
euf::snode* seq_regex::strengthened_stabilizer(euf::snode* cycle_regex,
euf::snode* cycle_history) {
if (!cycle_regex || !cycle_history)
return nullptr;
// Flatten the history concat chain into a vector of character tokens.
euf::snode_vector tokens;
cycle_history->collect_tokens(tokens);
if (tokens.empty())
return nullptr;
seq_util& seq = m_sg.get_seq_util();
ast_manager& m = m_sg.get_manager();
// Replay tokens on the cycle regex, detecting sub-cycles.
// A sub-cycle is detected when the derivative returns to cycle_regex.
svector<std::pair<unsigned, unsigned>> sub_cycles;
unsigned cycle_start = 0;
euf::snode* current_re = cycle_regex;
for (unsigned i = 0; i < tokens.size(); ++i) {
euf::snode* tok = tokens[i];
if (!tok)
return nullptr;
euf::snode* deriv = m_sg.brzozowski_deriv(current_re, tok);
if (!deriv)
return nullptr;
// Sub-cycle: derivative returned to the cycle entry regex
if (deriv == cycle_regex ||
(deriv->get_expr() && cycle_regex->get_expr() &&
deriv->get_expr() == cycle_regex->get_expr())) {
sub_cycles.push_back(std::make_pair(cycle_start, i + 1));
cycle_start = i + 1;
current_re = cycle_regex;
} else {
current_re = deriv;
}
}
// Remaining tokens that don't complete a sub-cycle
if (cycle_start < tokens.size())
sub_cycles.push_back(std::make_pair(cycle_start, tokens.size()));
if (sub_cycles.empty())
return nullptr;
// Build a stabilizer body for each sub-cycle.
// body = to_re(t0) · [filteredStar(R1, t1)] · to_re(t1) · ... · to_re(t_{n-1})
euf::snode* overall_union = nullptr;
for (auto const& sc : sub_cycles) {
unsigned start = sc.first;
unsigned end = sc.second;
if (start >= end)
continue;
euf::snode* re_state = cycle_regex;
euf::snode* body = nullptr;
for (unsigned i = start; i < end; ++i) {
euf::snode* tok = tokens[i];
if (!tok)
break;
// Insert filtered stabilizer star before each token after the first
if (i > start) {
euf::snode* filtered = get_filtered_stabilizer_star(re_state, tok);
if (filtered) {
expr* fe = filtered->get_expr();
if (fe) {
if (!body) {
body = filtered;
} else {
expr* be = body->get_expr();
if (be) {
expr_ref cat(seq.re.mk_concat(be, fe), m);
body = m_sg.mk(cat);
}
}
}
}
}
// Convert char token to regex: to_re(unit(tok))
expr* tok_expr = tok->get_expr();
if (!tok_expr)
break;
expr_ref unit_str(seq.str.mk_unit(tok_expr), m);
expr_ref tok_re(seq.re.mk_to_re(unit_str), m);
euf::snode* tok_re_sn = m_sg.mk(tok_re);
if (!body) {
body = tok_re_sn;
} else {
expr* be = body->get_expr();
expr* te = tok_re_sn->get_expr();
if (be && te) {
expr_ref cat(seq.re.mk_concat(be, te), m);
body = m_sg.mk(cat);
}
}
// Advance the regex state
euf::snode* deriv = m_sg.brzozowski_deriv(re_state, tok);
if (!deriv)
break;
re_state = deriv;
}
if (!body)
continue;
if (!overall_union) {
overall_union = body;
} else {
expr* oe = overall_union->get_expr();
expr* be = body->get_expr();
if (oe && be) {
expr_ref u(seq.re.mk_union(oe, be), m);
overall_union = m_sg.mk(u);
}
}
}
return overall_union;
}
// -----------------------------------------------------------------------
// Stabilizer-based subsumption (enhanced)
// Mirrors ZIPT StrMem.TrySubsume (StrMem.cs:354-386)
// -----------------------------------------------------------------------
bool seq_regex::try_subsume(seq::str_mem const& mem, seq::nielsen_node const& node) {
if (!mem.m_str || !mem.m_regex)
return false;
// 1. Leading token must be a variable
euf::snode* first = mem.m_str->first();
if (!first || !first->is_var())
return false;
// 2. Must have stabilizers for the regex
if (!has_stabilizers(mem.m_regex))
return false;
// 3. Build stabStar = star(union(all stabilizers for this regex))
euf::snode* stab_union = get_stabilizer_union(mem.m_regex);
if (!stab_union)
return false;
seq_util& seq = m_sg.get_seq_util();
ast_manager& mgr = m_sg.get_manager();
expr* su_expr = stab_union->get_expr();
if (!su_expr)
return false;
expr_ref stab_star(seq.re.mk_star(su_expr), mgr);
euf::snode* stab_star_sn = m_sg.mk(stab_star);
if (!stab_star_sn)
return false;
// 4. Collect all primitive regex constraints on variable `first`
euf::snode* x_range = collect_primitive_regex_intersection(first, node);
if (!x_range)
return false;
// 5. Check L(x_range) ⊆ L(stab_star)
lbool result = is_language_subset(x_range, stab_star_sn);
return result == l_true;
}
}