diff --git a/src/smt/seq/CMakeLists.txt b/src/smt/seq/CMakeLists.txt index db63e4c6f..e6ed34c34 100644 --- a/src/smt/seq/CMakeLists.txt +++ b/src/smt/seq/CMakeLists.txt @@ -1,5 +1,6 @@ z3_add_component(smt_seq SOURCES + seq_parikh.cpp seq_nielsen.cpp COMPONENT_DEPENDENCIES euf diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 6c2b4a299..f741c5558 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -20,6 +20,7 @@ Author: --*/ #include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_parikh.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "ast/rewriter/th_rewriter.h" @@ -443,10 +444,12 @@ namespace seq { nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver): m_sg(sg), m_solver(solver), + m_parikh(alloc(seq_parikh, sg)) { m_len_vars(sg.get_manager()) { } nielsen_graph::~nielsen_graph() { + dealloc(m_parikh); reset(); } @@ -1864,6 +1867,23 @@ namespace seq { // nielsen_graph: search // ----------------------------------------------------------------------- + void nielsen_graph::apply_parikh_to_node(nielsen_node& node) { + if (node.m_parikh_applied) return; + node.m_parikh_applied = true; + + // Generate modular length constraints (len(str) = min_len + stride·k, etc.) + // and append them to the node's integer constraint list. + m_parikh->apply_to_node(node); + + // Lightweight feasibility pre-check: does the Parikh modular constraint + // contradict the variable's current integer bounds? If so, mark this + // node as a Parikh-image conflict immediately (avoids a solver call). + if (!node.is_currently_conflict() && m_parikh->check_parikh_conflict(node)) { + node.set_general_conflict(true); + node.set_reason(backtrack_reason::parikh_image); + } + } + void nielsen_graph::assert_root_constraints_to_solver() { if (m_root_constraints_asserted) return; m_root_constraints_asserted = true; @@ -1970,6 +1990,17 @@ namespace seq { return search_result::unsat; } + // Apply Parikh image filter: generate modular length constraints and + // perform a lightweight feasibility pre-check. The filter is guarded + // internally (m_parikh_applied) so it only runs once per node. + // Note: Parikh filtering is skipped for satisfied nodes (returned above); + // a fully satisfied node has no remaining memberships to filter. + apply_parikh_to_node(*node); + if (node->is_currently_conflict()) { + ++m_stats.m_num_simplify_conflict; + return search_result::unsat; + } + // Assert any new int_constraints added during simplify_and_init for this // node into the current solver scope. Constraints inherited from the parent // (indices 0..m_parent_ic_count-1) are already present at the enclosing @@ -3333,8 +3364,15 @@ namespace seq { nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); - // TODO: derive char_set from minterm and add as range constraint - // child->add_char_range(fresh_char, minterm_to_char_set(mt)); + // Constrain fresh_char to the character class of this minterm. + // This is the key Parikh pruning step: when x → ?c · x' is + // generated from minterm m_i, ?c must belong to the character + // class described by m_i so that str ∈ derivative(R, m_i). + if (mt->get_expr()) { + char_set cs = m_parikh->minterm_to_char_set(mt->get_expr()); + if (!cs.is_empty()) + child->add_char_range(fresh_char, cs); + } created = true; } diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index cafe39be5..4cbf16e06 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -250,6 +250,7 @@ namespace seq { class nielsen_node; class nielsen_edge; class nielsen_graph; + class seq_parikh; // Parikh image filter (see seq_parikh.h) /** * Abstract interface for an incremental solver used by nielsen_graph @@ -528,6 +529,9 @@ namespace seq { // evaluation index for run tracking unsigned m_eval_idx = 0; + // Parikh filter: set to true once apply_parikh_to_node has been applied + // to this node. Prevents duplicate constraint generation across DFS runs. + bool m_parikh_applied = false; // number of int_constraints inherited from the parent node at clone time. // int_constraints[0..m_parent_ic_count) are already asserted at the // parent's solver scope; only [m_parent_ic_count..end) need to be @@ -731,6 +735,9 @@ namespace seq { // Set to true after assert_root_constraints_to_solver() is first called. bool m_root_constraints_asserted = false; + // Parikh image filter: generates modular length constraints from regex + // memberships. Allocated in the constructor; owned by this graph. + seq_parikh* m_parikh = nullptr; // ----------------------------------------------- // Modification counter for substitution length tracking. // mirrors ZIPT's LocalInfo.CurrentModificationCnt @@ -863,6 +870,16 @@ namespace seq { private: search_result search_dfs(nielsen_node* node, unsigned depth, svector& cur_path); + // Apply the Parikh image filter to a node: generate modular length + // constraints from regex memberships and append them to the node's + // int_constraints. Also performs a lightweight feasibility pre-check; + // if a Parikh conflict is detected the node's conflict flag is set with + // backtrack_reason::parikh_image. + // + // Guarded by node.m_parikh_applied so that constraints are generated + // only once per node across DFS iterations. + void apply_parikh_to_node(nielsen_node& node); + // create a fresh variable with a unique name euf::snode* mk_fresh_var(); diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp new file mode 100644 index 000000000..3f6deba40 --- /dev/null +++ b/src/smt/seq/seq_parikh.cpp @@ -0,0 +1,387 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.cpp + +Abstract: + + Parikh image filter implementation for the ZIPT-based Nielsen string + solver. See seq_parikh.h for the full design description. + + The key operation is compute_length_stride(re), which performs a + structural traversal of the regex to find the period k such that all + string lengths in L(re) are congruent to min_length(re) modulo k. + The stride is used to generate modular length constraints that help + the integer subsolver prune infeasible Nielsen graph nodes. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ + +#include "smt/seq/seq_parikh.h" +#include "util/mpz.h" +#include + +namespace seq { + + seq_parikh::seq_parikh(euf::sgraph& sg) + : m(sg.get_manager()), seq(m), a(m), m_fresh_cnt(0) {} + + expr_ref seq_parikh::mk_fresh_int_var() { + std::string name = "pk!" + std::to_string(m_fresh_cnt++); + return expr_ref(m.mk_fresh_const(name.c_str(), a.mk_int()), m); + } + + // ----------------------------------------------------------------------- + // Stride computation + // ----------------------------------------------------------------------- + + // compute_length_stride: structural traversal of regex expression. + // + // Return value semantics: + // 0 — fixed length (or empty language): no modular constraint needed + // beyond the min == max bounds. + // 1 — all integer lengths ≥ min_len are achievable: no useful modular + // constraint. + // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): + // modular constraint len(str) = min_len + k·j is useful. + unsigned seq_parikh::compute_length_stride(expr* re) { + if (!re) return 1; + + expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; + unsigned lo = 0, hi = 0; + + // Empty language: no strings exist; stride is irrelevant. + if (seq.re.is_empty(re)) + return 0; + + // Epsilon regex {""}: single fixed length 0. + if (seq.re.is_epsilon(re)) + return 0; + + // to_re(concrete_string): fixed-length, no modular constraint needed. + if (seq.re.is_to_re(re, s)) { + // min_length == max_length, covered by bounds. + return 0; + } + + // Single character: range, full_char — fixed length 1. + if (seq.re.is_range(re) || seq.re.is_full_char(re)) + return 0; + + // full_seq (.* / Σ*): every length ≥ 0 is possible. + if (seq.re.is_full_seq(re)) + return 1; + + // r* — Kleene star. + // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... + // If all lengths in L(r) are congruent to c modulo s (c = min_len, s = stride), + // then L(r*) includes lengths {0, c, c+s, 2c, 2c+s, 2c+2s, ...} and + // the overall GCD is gcd(c, s). This is strictly more accurate than + // the previous gcd(min, max) approximation, which can be unsound when + // the body contains lengths whose GCD is smaller than gcd(min, max). + if (seq.re.is_star(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + // stride(r*) = gcd(min_length(r), stride(r)) + // when inner=0 (fixed-length body), gcd(mn, 0) = mn → stride = mn + return u_gcd(mn, inner); + } + + // r+ — one or more: same stride analysis as r*. + if (seq.re.is_plus(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); + } + + // r? — zero or one: lengths = {0} ∪ lengths(r) + // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). + if (seq.re.is_opt(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + // L(r?) includes length 0 and all lengths of L(r). + // GCD(stride(r), min_len(r)) is a valid stride because: + // - the gap from 0 to min_len(r) is min_len(r) itself, and + // - subsequent lengths grow in steps governed by stride(r). + // A result > 1 gives a useful modular constraint; result == 1 + // means every non-negative integer is achievable (no constraint). + if (inner == 0) + return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 + return u_gcd(inner, mn); + } + + // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). + if (seq.re.is_concat(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // union(r1, r2): lengths from either branch → need GCD of both + // strides and the difference between their minimum lengths. + if (seq.re.is_union(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + unsigned m1 = seq.re.min_length(r1); + unsigned m2 = seq.re.min_length(r2); + unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); + // Replace 0-strides with d for GCD computation: + // a fixed-length branch only introduces constraint via its offset. + unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); + g = u_gcd(g, d); + return g; + } + + // loop(r, lo, hi): the length of any word is a sum of lo..hi copies of + // lengths from L(r). Since all lengths in L(r) are ≡ min_len(r) mod + // stride(r), the overall stride is gcd(min_len(r), stride(r)). + if (seq.re.is_loop(re, r1, lo, hi)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); + } + if (seq.re.is_loop(re, r1, lo)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); + } + + // intersection(r1, r2): lengths must be in both languages. + // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid + // stride for the intersection (every length in the intersection is + // also in r1 and in r2). + if (seq.re.is_intersection(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // For complement, diff, reverse, derivative, of_pred, and anything + // else we cannot analyse statically: be conservative and return 1 + // (no useful modular constraint rather than an unsound one). + return 1; + } + + // ----------------------------------------------------------------------- + // Constraint generation + // ----------------------------------------------------------------------- + + void seq_parikh::generate_parikh_constraints(str_mem const& mem, + vector& out) { + if (!mem.m_regex || !mem.m_str) + return; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + return; + + // Length bounds from the regex. + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + + // If min_len >= max_len the bounds already pin the length exactly + // (or the language is empty — empty language is detected by simplify_and_init + // via Brzozowski derivative / is_empty checks, not here). + // We only generate modular constraints when the length is variable. + if (min_len >= max_len) + return; + + unsigned stride = compute_length_stride(re_expr); + + // stride == 1: every integer length is possible — no useful constraint. + // stride == 0: fixed length or empty — handled by bounds. + if (stride <= 1) + return; + + // Build len(str) as an arithmetic expression. + expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); + + // Introduce fresh integer variable k ≥ 0. + expr_ref k_var = mk_fresh_int_var(); + + // Constraint 1: len(str) = min_len + stride · k + expr_ref min_expr(a.mk_int(min_len), m); + expr_ref stride_expr(a.mk_int(stride), m); + expr_ref stride_k(a.mk_mul(stride_expr, k_var), m); + expr_ref rhs(a.mk_add(min_expr, stride_k), m); + out.push_back(int_constraint(len_str, rhs, + int_constraint_kind::eq, mem.m_dep, m)); + + // Constraint 2: k ≥ 0 + expr_ref zero(a.mk_int(0), m); + out.push_back(int_constraint(k_var, zero, + int_constraint_kind::ge, mem.m_dep, m)); + + // Constraint 3 (optional): k ≤ max_k when max_len is bounded. + // max_k = floor((max_len - min_len) / stride) + // This gives the solver an explicit upper bound on k. + // The subtraction is safe because min_len < max_len is guaranteed + // by the early return above. + if (max_len != UINT_MAX) { + SASSERT(max_len > min_len); + unsigned range = max_len - min_len; + unsigned max_k = range / stride; + expr_ref max_k_expr(a.mk_int(max_k), m); + out.push_back(int_constraint(k_var, max_k_expr, + int_constraint_kind::le, mem.m_dep, m)); + } + } + + void seq_parikh::apply_to_node(nielsen_node& node) { + vector constraints; + for (str_mem const& mem : node.str_mems()) + generate_parikh_constraints(mem, constraints); + for (auto& ic : constraints) + node.add_int_constraint(ic); + } + + // ----------------------------------------------------------------------- + // Quick Parikh feasibility check (no solver call) + // ----------------------------------------------------------------------- + + // Returns true if a Parikh conflict is detected: there exists a membership + // str ∈ re for a single-variable str where the modular length constraint + // len(str) = min_len + stride * k (k ≥ 0) + // is inconsistent with the variable's current integer bounds [lb, ub]. + // + // This check is lightweight — it uses only modular arithmetic on the already- + // known regex min/max lengths and the per-variable bounds stored in the node. + bool seq_parikh::check_parikh_conflict(nielsen_node& node) { + for (str_mem const& mem : node.str_mems()) { + if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) + continue; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + continue; + + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + if (min_len >= max_len) continue; // fixed or empty — no stride constraint + + unsigned stride = compute_length_stride(re_expr); + if (stride <= 1) continue; // no useful modular constraint + // stride > 1 guaranteed from here onward. + SASSERT(stride > 1); + + unsigned lb = node.var_lb(mem.m_str); + unsigned ub = node.var_ub(mem.m_str); + + // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? + // + // First find the smallest k satisfying the lower bound: + // k_min = 0 if min_len ≥ lb + // k_min = ⌈(lb - min_len) / stride⌉ otherwise + // + // Then verify min_len + stride * k_min ≤ ub. + unsigned k_min = 0; + if (lb > min_len) { + unsigned gap = lb - min_len; + // Ceiling division: k_min = ceil(gap / stride). + // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. + // In that case k_min would be huge, and min_len + stride*k_min would + // also overflow ub → treat as a conflict immediately. + if (gap > UINT_MAX - (stride - 1)) { + return true; // ceiling division would overflow → k_min too large + } + k_min = (gap + stride - 1) / stride; + } + + // Overflow guard: stride * k_min may overflow unsigned. + unsigned len_at_k_min; + if (k_min > (UINT_MAX - min_len) / stride) { + // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. + return true; + } + len_at_k_min = min_len + stride * k_min; + + if (ub != UINT_MAX && len_at_k_min > ub) + return true; // no valid k exists → Parikh conflict + } + return false; + } + + // ----------------------------------------------------------------------- + // minterm → char_set conversion + // ----------------------------------------------------------------------- + + // Convert a regex minterm expression to a char_set. + // + // Minterms are Boolean combinations of character-class predicates + // (re.range, re.full_char, complement, intersection) produced by + // sgraph::compute_minterms(). This function converts them to a + // concrete char_set so that fresh character variables introduced by + // apply_regex_var_split can be constrained with add_char_range(). + // + // The implementation is recursive and handles: + // re.full_char → [0, max_char] (full alphabet) + // re.range(lo, hi) → [lo, hi+1) (hi inclusive in Z3) + // re.complement(r) → alphabet \ minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sides + // re.diff(r1, r2) → r1 ∩ complement(r2) + // re.to_re(str.unit(c)) → singleton {c} + // re.empty → empty set + // anything else → [0, max_char] (conservative) + char_set seq_parikh::minterm_to_char_set(expr* re_expr) { + unsigned max_c = seq.max_char(); + + if (!re_expr) + return char_set::full(max_c); + + // full_char: the whole alphabet [0, max_char] + if (seq.re.is_full_char(re_expr)) + return char_set::full(max_c); + + // range [lo, hi] (hi inclusive in Z3's regex representation) + unsigned lo = 0, hi = 0; + if (seq.re.is_range(re_expr, lo, hi)) { + // lo > hi is a degenerate range; should not arise from well-formed minterms + SASSERT(lo <= hi); + if (lo > hi) return char_set(); + // char_range uses exclusive upper bound; Z3 hi is inclusive + return char_set(char_range(lo, hi + 1)); + } + + // complement: alphabet minus the inner set + expr* inner = nullptr; + if (seq.re.is_complement(re_expr, inner)) + return minterm_to_char_set(inner).complement(max_c); + + // intersection: characters present in both sets + expr* r1 = nullptr, *r2 = nullptr; + if (seq.re.is_intersection(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2)); + + // difference: r1 minus r2 = r1 ∩ complement(r2) + if (seq.re.is_diff(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with( + minterm_to_char_set(r2).complement(max_c)); + + // to_re(str.unit(c)): singleton character set + expr* str_arg = nullptr; + expr* ch_expr = nullptr; + unsigned char_val = 0; + if (seq.re.is_to_re(re_expr, str_arg) && + seq.str.is_unit(str_arg, ch_expr) && + seq.is_const_char(ch_expr, char_val)) { + char_set cs; + cs.add(char_val); + return cs; + } + + // empty regex: no characters can appear + if (seq.re.is_empty(re_expr)) + return char_set(); + + // Conservative fallback: return the full alphabet so that + // no unsound constraints are added for unrecognised expressions. + return char_set::full(max_c); + } + +} // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h new file mode 100644 index 000000000..8e8a4e178 --- /dev/null +++ b/src/smt/seq/seq_parikh.h @@ -0,0 +1,150 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.h + +Abstract: + + Parikh image filter for the ZIPT-based Nielsen string solver. + + Implements Parikh-based arithmetic constraint generation for + nielsen_node instances. For a regex membership constraint str ∈ r, + the Parikh image of r constrains the multiset of characters in str. + This module computes the "length stride" (period) of the length + language of r and generates modular arithmetic constraints of the form + + len(str) = min_len + stride · k (k ≥ 0, k fresh integer) + + which tighten the arithmetic subproblem beyond the simple min/max + length bounds already produced by nielsen_node::init_var_bounds_from_mems(). + + For example: + • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k + • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k + • str ∈ ab|abc → stride = 1 (no useful modular constraint) + + The generated int_constraints are added to the node's integer constraint + set and discharged by the integer subsolver (see seq_nielsen.h, + simple_solver). + + Implements the Parikh filter described in ZIPT + (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) + replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ +#pragma once + +#include "ast/arith_decl_plugin.h" +#include "ast/seq_decl_plugin.h" +#include "smt/seq/seq_nielsen.h" + +namespace seq { + + /** + * Parikh image filter: generates modular length constraints from + * regex membership constraints in a nielsen_node. + * + * Usage: + * seq_parikh parikh(sg); + * parikh.apply_to_node(node); // adds constraints to node + * + * Or per-membership: + * vector out; + * parikh.generate_parikh_constraints(mem, out); + */ + class seq_parikh { + ast_manager& m; + seq_util seq; + arith_util a; + unsigned m_fresh_cnt; // counter for fresh variable names + + // Compute the stride (period) of the length language of a regex. + // + // The stride k satisfies: all lengths in L(re) are congruent to + // min_length(re) modulo k. A stride of 1 means every integer + // length is possible (no useful modular constraint). A stride of + // 0 is a sentinel meaning the language is empty or has a single + // fixed length (already captured by bounds). + // + // Examples: + // stride(to_re("ab")) = 0 (fixed length 2) + // stride((ab)*) = 2 (lengths 0, 2, 4, ...) + // stride((abc)*) = 3 (lengths 0, 3, 6, ...) + // stride(a*b*) = 1 (all lengths possible) + // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) + // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) + unsigned compute_length_stride(expr* re); + + // Create a fresh integer variable (name "pk!N") for use as the + // Parikh multiplier variable k in len(str) = min_len + stride·k. + expr_ref mk_fresh_int_var(); + + public: + explicit seq_parikh(euf::sgraph& sg); + + // Generate Parikh modular length constraints for one membership. + // + // When stride > 1 and min_len < max_len (bounds don't pin length exactly, + // and the language is non-empty): + // adds: len(str) = min_len + stride · k (equality) + // k ≥ 0 (non-negativity) + // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) + // These tighten the integer constraint set for the subsolver. + // Dependencies are copied from mem.m_dep. + // Does nothing when min_len ≥ max_len (empty or fixed-length language). + void generate_parikh_constraints(str_mem const& mem, + vector& out); + + // Apply Parikh constraints to all memberships at a node. + // Calls generate_parikh_constraints for each str_mem in the node + // and appends the resulting int_constraints to node.int_constraints(). + void apply_to_node(nielsen_node& node); + + // Quick Parikh feasibility check (no solver call). + // + // For each single-variable membership str ∈ re, checks whether the + // modular constraint len(str) = min_len + stride · k (k ≥ 0) + // has any solution given the current per-variable bounds stored in + // node.var_lb(str) and node.var_ub(str). + // + // Returns true when a conflict is detected (no valid k exists for + // some membership). The caller should then mark the node with + // backtrack_reason::parikh_image. + // + // This is a lightweight pre-check that avoids calling the integer + // subsolver. It is sound (never returns true for a satisfiable node) + // but incomplete (may miss conflicts that require the full solver). + bool check_parikh_conflict(nielsen_node& node); + + // Compute the length stride of a regex expression. + // Exposed for testing and external callers. + unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + + // Convert a regex minterm expression to a char_set. + // + // A minterm is a Boolean combination of character-class predicates + // (re.range, re.full_char, complement, intersection) that names a + // single, indivisible character equivalence class. Minterms are + // produced by sgraph::compute_minterms and used in + // apply_regex_var_split to constrain fresh character variables. + // + // Supported expressions: + // re.full_char → full set [0, max_char] + // re.range(lo, hi) → char_set [lo, hi] (inclusive on both ends) + // re.complement(r) → complement of minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sets + // re.diff(r1, r2) → r1 minus r2 (= r1 ∩ complement(r2)) + // re.to_re(unit(c)) → singleton {c} + // re.empty → empty set + // anything else → full set (conservative, sound over-approximation) + char_set minterm_to_char_set(expr* minterm_re); + }; + +} // namespace seq diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index acc598edf..1620bd790 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -132,6 +132,7 @@ add_executable(test-z3 sls_test.cpp sls_seq_plugin.cpp seq_nielsen.cpp + seq_parikh.cpp nseq_basic.cpp nseq_regex.cpp nseq_zipt.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index 9fc8bf1ec..0f378df67 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -287,6 +287,7 @@ int main(int argc, char ** argv) { TST(scoped_vector); TST(sls_seq_plugin); TST(seq_nielsen); + TST(seq_parikh); TST(nseq_basic); TST(nseq_regex); TST(nseq_zipt); diff --git a/src/test/seq_parikh.cpp b/src/test/seq_parikh.cpp new file mode 100644 index 000000000..83bbc3da4 --- /dev/null +++ b/src/test/seq_parikh.cpp @@ -0,0 +1,883 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.cpp + +Abstract: + + Unit tests for seq_parikh (Parikh image filter for the ZIPT Nielsen solver). + + Tests cover: + - compute_length_stride / get_length_stride for all regex forms + - generate_parikh_constraints: constraint shape, count, and dependency + - apply_to_node: integration with nielsen_node + - check_parikh_conflict: lightweight feasibility pre-check + - minterm_to_char_set: regex-minterm to char_set conversion + +Author: + + Clemens Eisenhofer 2026-03-11 + Nikolaj Bjorner (nbjorner) 2026-03-11 + +--*/ + +#include "util/util.h" +#include "util/zstring.h" +#include "ast/euf/euf_egraph.h" +#include "ast/euf/euf_sgraph.h" +#include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_parikh.h" +#include "ast/arith_decl_plugin.h" +#include "ast/reg_decl_plugins.h" +#include "ast/ast_pp.h" +#include + +// --------------------------------------------------------------------------- +// Minimal solver stub (no-op) +// --------------------------------------------------------------------------- +class parikh_test_solver : public seq::simple_solver { +public: + void push() override {} + void pop(unsigned) override {} + void assert_expr(expr*) override {} + lbool check() override { return l_true; } +}; + +// --------------------------------------------------------------------------- +// Helpers to build common regex expressions +// --------------------------------------------------------------------------- + +// build to_re("AB") — a fixed two-character string regex +static expr_ref mk_to_re_ab(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref ch_b(seq.str.mk_char('B'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref unit_b(seq.str.mk_unit(ch_b), m); + expr_ref ab(seq.str.mk_concat(unit_a, unit_b), m); + return expr_ref(seq.re.mk_to_re(ab), m); +} + +// build (ab)* — star of the two-character sequence +static expr_ref mk_ab_star(ast_manager& m, seq_util& seq) { + return expr_ref(seq.re.mk_star(mk_to_re_ab(m, seq)), m); +} + +// build (abc)* — star of a three-character sequence +static expr_ref mk_abc_star(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref ch_b(seq.str.mk_char('B'), m); + expr_ref ch_c(seq.str.mk_char('C'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref unit_b(seq.str.mk_unit(ch_b), m); + expr_ref unit_c(seq.str.mk_unit(ch_c), m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + expr_ref abc(seq.str.mk_concat(unit_a, seq.str.mk_concat(unit_b, unit_c)), m); + return expr_ref(seq.re.mk_star(seq.re.mk_to_re(abc)), m); +} + +// build to_re("A") — single-character regex +static expr_ref mk_to_re_a(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + return expr_ref(seq.re.mk_to_re(unit_a), m); +} + +// --------------------------------------------------------------------------- +// Stride tests: compute_length_stride / get_length_stride +// --------------------------------------------------------------------------- + +// stride(to_re("AB")) == 0 (fixed length) +static void test_stride_fixed_length() { + std::cout << "test_stride_fixed_length\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_to_re_ab(m, seq); + SASSERT(parikh.get_length_stride(re) == 0); +} + +// stride((ab)*) == 2 +static void test_stride_star_fixed_body() { + std::cout << "test_stride_star_fixed_body\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_ab_star(m, seq); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride((abc)*) == 3 +static void test_stride_star_three_char() { + std::cout << "test_stride_star_three_char\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_abc_star(m, seq); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((abc)*) = " << stride << "\n"; + SASSERT(stride == 3); +} + +// stride((ab)+) == 2 +static void test_stride_plus() { + std::cout << "test_stride_plus\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re_body = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_plus(re_body), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)+) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(a* b*) == 1 — union of independent stars → every length possible +static void test_stride_concat_stars() { + std::cout << "test_stride_concat_stars\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref a_star(seq.re.mk_star(mk_to_re_a(m, seq)), m); + expr_ref b_star(seq.re.mk_star(mk_to_re_a(m, seq)), m); + expr_ref re(seq.re.mk_concat(a_star, b_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(a* b*) = " << stride << "\n"; + // both stars have stride 1 (single-char body → gcd(1,0)=1) → gcd(1,1)=1 + SASSERT(stride == 1); +} + +// stride((ab)* | (abc)*) == gcd(2,3) = 1 +static void test_stride_union_no_common_period() { + std::cout << "test_stride_union_no_common_period\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab_star = mk_ab_star(m, seq); + expr_ref abc_star = mk_abc_star(m, seq); + expr_ref re(seq.re.mk_union(ab_star, abc_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*|(abc)*) = " << stride << "\n"; + // lengths: {0,2,4,...} union {0,3,6,...} → GCD(2,3)=1 + SASSERT(stride == 1); +} + +// stride((ab)*|(de)*) == gcd(2,2) = 2 +static void test_stride_union_same_period() { + std::cout << "test_stride_union_same_period\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab_star = mk_ab_star(m, seq); + // de_star: (de)* — same stride 2 + expr_ref ch_d(seq.str.mk_char('D'), m); + expr_ref ch_e(seq.str.mk_char('E'), m); + expr_ref unit_d(seq.str.mk_unit(ch_d), m); + expr_ref unit_e(seq.str.mk_unit(ch_e), m); + expr_ref de(seq.str.mk_concat(unit_d, unit_e), m); + expr_ref de_star(seq.re.mk_star(seq.re.mk_to_re(de)), m); + expr_ref re(seq.re.mk_union(ab_star, de_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*|(de)*) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(loop((ab), 1, 3)) == 2 — loop with fixed-length body +static void test_stride_loop() { + std::cout << "test_stride_loop\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_loop(ab, 1, 3), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(loop(ab,1,3)) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(re.full_seq) == 1 (every length possible) +static void test_stride_full_seq() { + std::cout << "test_stride_full_seq\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_full_seq(str_sort), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(.*) = " << stride << "\n"; + SASSERT(stride == 1); +} + +// stride(re.empty) == 0 +static void test_stride_empty_regex() { + std::cout << "test_stride_empty_regex\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_empty(str_sort), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(empty) = " << stride << "\n"; + SASSERT(stride == 0); +} + +// stride(re.epsilon) == 0 +static void test_stride_epsilon() { + std::cout << "test_stride_epsilon\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + // epsilon is to_re("") — empty string + sort_ref str_s(seq.str.mk_string_sort(), m); + expr_ref empty_str(seq.str.mk_empty(str_s), m); + expr_ref re(seq.re.mk_to_re(empty_str), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(epsilon) = " << stride << "\n"; + SASSERT(stride == 0); +} + +// stride((ab)?) == 2 (gcd(2, 0) = 2 via opt handling; min_len(ab)=2) +static void test_stride_opt() { + std::cout << "test_stride_opt\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_opt(ab), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)?) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// --------------------------------------------------------------------------- +// generate_parikh_constraints tests +// --------------------------------------------------------------------------- + +// (ab)* → len(x) = 0 + 2·k, k ≥ 0 (stride 2, min_len 0) +static void test_generate_constraints_ab_star() { + std::cout << "test_generate_constraints_ab_star\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + arith_util arith(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // expect at least: len(x)=0+2k and k>=0 + // (no upper bound because max_length is UINT_MAX for unbounded star) + std::cout << " generated " << out.size() << " constraints\n"; + SASSERT(out.size() >= 2); + + // check that one constraint is an equality (len(x) = 0 + 2·k) + bool has_eq = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::eq) has_eq = true; + SASSERT(has_eq); + + // check that one constraint is k >= 0 + bool has_ge = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::ge) has_ge = true; + SASSERT(has_ge); + + // should NOT have an upper bound (star is unbounded) + bool has_le = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::le) has_le = true; + SASSERT(!has_le); +} + +// loop((ab), 1, 3): bounded → k ≤ floor((6-2)/2) = 2 +static void test_generate_constraints_bounded_loop() { + std::cout << "test_generate_constraints_bounded_loop\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + // loop("ab", 1, 3): min_len=2, max_len=6, stride=2 + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_loop(ab, 1, 3), m); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // expect: eq + ge + le = 3 constraints + std::cout << " generated " << out.size() << " constraints\n"; + SASSERT(out.size() == 3); + + bool has_eq = false, has_ge = false, has_le = false; + for (auto const& ic : out) { + if (ic.m_kind == seq::int_constraint_kind::eq) has_eq = true; + if (ic.m_kind == seq::int_constraint_kind::ge) has_ge = true; + if (ic.m_kind == seq::int_constraint_kind::le) has_le = true; + } + SASSERT(has_eq); + SASSERT(has_ge); + SASSERT(has_le); +} + +// stride == 1 → no constraints generated +static void test_generate_constraints_stride_one() { + std::cout << "test_generate_constraints_stride_one\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + euf::snode* x = sg.mk_var(symbol("x")); + // full_seq: stride=1 → no modular constraint + expr_ref re(seq.re.mk_full_seq(str_sort), m); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + std::cout << " generated " << out.size() << " constraints (expect 0)\n"; + SASSERT(out.empty()); +} + +// fixed-length regex (min == max) → no constraints generated +static void test_generate_constraints_fixed_length() { + std::cout << "test_generate_constraints_fixed_length\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_to_re_ab(m, seq); // fixed len 2 + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + std::cout << " generated " << out.size() << " constraints (expect 0)\n"; + SASSERT(out.empty()); +} + +// dependency is propagated to all generated constraints +static void test_generate_constraints_dep_propagated() { + std::cout << "test_generate_constraints_dep_propagated\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(7); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // all generated constraints must carry bit 7 in their dependency + for (auto const& ic : out) { + SASSERT(!ic.m_dep.empty()); + seq::dep_tracker d7; d7.insert(7); + SASSERT(d7.subset_of(ic.m_dep)); + } + std::cout << " all constraints carry dep bit 7\n"; +} + +// --------------------------------------------------------------------------- +// apply_to_node tests +// --------------------------------------------------------------------------- + +// applying to a node with one membership adds constraints to node +static void test_apply_to_node_adds_constraints() { + std::cout << "test_apply_to_node_adds_constraints\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2 → generates constraints + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // root node should have no int_constraints initially + SASSERT(ng.root() != nullptr); + unsigned before = ng.root()->int_constraints().size(); + + parikh.apply_to_node(*ng.root()); + + unsigned after = ng.root()->int_constraints().size(); + std::cout << " before=" << before << " after=" << after << "\n"; + SASSERT(after > before); +} + +// applying twice is idempotent (m_parikh_applied would prevent double-add +// via nielsen_graph::apply_parikh_to_node, but seq_parikh::apply_to_node +// itself does not guard — so calling apply_to_node directly adds again; +// this test verifies the direct call does add, not the idempotency guard) +static void test_apply_to_node_stride_one_no_constraints() { + std::cout << "test_apply_to_node_stride_one_no_constraints\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re(seq.re.mk_full_seq(str_sort), m); // stride 1 → no constraints + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + unsigned before = ng.root()->int_constraints().size(); + parikh.apply_to_node(*ng.root()); + unsigned after = ng.root()->int_constraints().size(); + std::cout << " before=" << before << " after=" << after << " (expect no change)\n"; + SASSERT(after == before); +} + +// --------------------------------------------------------------------------- +// check_parikh_conflict tests +// --------------------------------------------------------------------------- + +// no conflict when var_lb=0, var_ub=UINT_MAX (unconstrained) +static void test_check_conflict_unconstrained_no_conflict() { + std::cout << "test_check_conflict_unconstrained_no_conflict\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0 + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // no bounds set → default lb=0, ub=UINT_MAX → no conflict + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0)\n"; + SASSERT(!conflict); +} + +// conflict: lb=3, ub=5, stride=2, min_len=0 +// valid lengths: 0,2,4,6,... ∩ [3,5] = {4} → no conflict +static void test_check_conflict_valid_k_exists() { + std::cout << "test_check_conflict_valid_k_exists\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0; lengths 0,2,4,... + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=3, ub=5: length 4 is achievable (k=2) → no conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 3, dep); + ng.root()->add_upper_int_bound(x, 5, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0)\n"; + SASSERT(!conflict); +} + +// conflict: lb=3, ub=3, stride=2, min_len=0 +// valid lengths: {0,2,4,...} ∩ [3,3] = {} → conflict +static void test_check_conflict_no_valid_k() { + std::cout << "test_check_conflict_no_valid_k\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0; lengths {0,2,4,...} + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=3, ub=3: only odd length 3 — never a multiple of 2 → conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 3, dep); + ng.root()->add_upper_int_bound(x, 3, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 1)\n"; + SASSERT(conflict); +} + +// conflict: lb=5, ub=5, stride=3, min_len=0 +// valid lengths of (abc)*: {0,3,6,...} ∩ {5} = {} → conflict +static void test_check_conflict_abc_star() { + std::cout << "test_check_conflict_abc_star\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_abc_star(m, seq); // stride 3, min_len 0; lengths {0,3,6,...} + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=5, ub=5 → no valid k (5 is not a multiple of 3) → conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 5, dep); + ng.root()->add_upper_int_bound(x, 5, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 1)\n"; + SASSERT(conflict); +} + +// no conflict for stride==1 regex even with narrow bounds +static void test_check_conflict_stride_one_never_conflicts() { + std::cout << "test_check_conflict_stride_one_never_conflicts\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re(seq.re.mk_full_seq(str_sort), m); // stride 1 → no constraint + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 7, dep); + ng.root()->add_upper_int_bound(x, 7, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0: stride=1 skipped)\n"; + SASSERT(!conflict); +} + +// --------------------------------------------------------------------------- +// minterm_to_char_set tests +// --------------------------------------------------------------------------- + +// re.full_char → full alphabet [0, max_char] +static void test_minterm_full_char() { + std::cout << "test_minterm_full_char\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_full_char(str_sort), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " full_char char_count = " << cs.char_count() << "\n"; + SASSERT(cs.is_full(seq.max_char())); +} + +// re.empty → empty char_set +static void test_minterm_empty_regex() { + std::cout << "test_minterm_empty_regex\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_empty(str_sort), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " empty regex → char_set empty: " << cs.is_empty() << "\n"; + SASSERT(cs.is_empty()); +} + +// re.range('A','Z') → 26 characters +static void test_minterm_range() { + std::cout << "test_minterm_range\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + // Z3 re.range takes string arguments "A" and "Z" + expr_ref lo_str(seq.str.mk_string(zstring("A")), m); + expr_ref hi_str(seq.str.mk_string(zstring("Z")), m); + expr_ref re(seq.re.mk_range(lo_str, hi_str), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " range(A,Z) char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 26); + SASSERT(cs.contains('A')); + SASSERT(cs.contains('Z')); + SASSERT(!cs.contains('a')); +} + +// complement of re.range('A','Z') should not contain A-Z +static void test_minterm_complement() { + std::cout << "test_minterm_complement\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref lo_str(seq.str.mk_string(zstring("A")), m); + expr_ref hi_str(seq.str.mk_string(zstring("Z")), m); + expr_ref range(seq.re.mk_range(lo_str, hi_str), m); + expr_ref re(seq.re.mk_complement(range), m); + char_set cs = parikh.minterm_to_char_set(re); + + // complement of [A-Z] should not contain any letter in A-Z + for (unsigned c = 'A'; c <= 'Z'; ++c) + SASSERT(!cs.contains(c)); + // but should contain e.g. '0' + SASSERT(cs.contains('0')); + std::cout << " complement ok: A-Z excluded, '0' included\n"; +} + +// intersection of range('A','Z') and range('M','Z') == range('M','Z') +static void test_minterm_intersection() { + std::cout << "test_minterm_intersection\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref lo_az(seq.str.mk_string(zstring("A")), m); + expr_ref hi_az(seq.str.mk_string(zstring("Z")), m); + expr_ref lo_mz(seq.str.mk_string(zstring("M")), m); + + expr_ref range_az(seq.re.mk_range(lo_az, hi_az), m); + expr_ref range_mz(seq.re.mk_range(lo_mz, hi_az), m); + expr_ref re(seq.re.mk_inter(range_az, range_mz), m); + char_set cs = parikh.minterm_to_char_set(re); + + // intersection [A-Z] ∩ [M-Z] = [M-Z]: 14 characters + std::cout << " intersection [A-Z]∩[M-Z] char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 14); // M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z + SASSERT(!cs.contains('A')); + SASSERT(cs.contains('M')); + SASSERT(cs.contains('Z')); +} + +// diff(range('A','Z'), range('A','M')) == range('N','Z') +static void test_minterm_diff() { + std::cout << "test_minterm_diff\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref lo_az(seq.str.mk_string(zstring("A")), m); + expr_ref hi_az(seq.str.mk_string(zstring("Z")), m); + expr_ref lo_am(seq.str.mk_string(zstring("A")), m); + expr_ref hi_am(seq.str.mk_string(zstring("M")), m); + + expr_ref range_az(seq.re.mk_range(lo_az, hi_az), m); + expr_ref range_am(seq.re.mk_range(lo_am, hi_am), m); + expr_ref re(seq.re.mk_diff(range_az, range_am), m); + char_set cs = parikh.minterm_to_char_set(re); + + // diff [A-Z] \ [A-M] = [N-Z]: 13 characters + std::cout << " diff [A-Z]\\[A-M] char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 13); // N..Z + SASSERT(!cs.contains('A')); + SASSERT(!cs.contains('M')); + SASSERT(cs.contains('N')); + SASSERT(cs.contains('Z')); +} + +// to_re(unit('A')) → singleton {'A'} +static void test_minterm_singleton() { + std::cout << "test_minterm_singleton\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref re(seq.re.mk_to_re(unit_a), m); + char_set cs = parikh.minterm_to_char_set(re); + + std::cout << " singleton char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 1); + SASSERT(cs.contains('A')); + SASSERT(!cs.contains('B')); +} + +// nullptr → full set (conservative fallback) +static void test_minterm_nullptr_is_full() { + std::cout << "test_minterm_nullptr_is_full\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + char_set cs = parikh.minterm_to_char_set(nullptr); + SASSERT(cs.is_full(seq.max_char())); + std::cout << " nullptr → full set ok\n"; +} + +// --------------------------------------------------------------------------- +// Entry point +// --------------------------------------------------------------------------- + +void tst_seq_parikh() { + // stride tests + test_stride_fixed_length(); + test_stride_star_fixed_body(); + test_stride_star_three_char(); + test_stride_plus(); + test_stride_concat_stars(); + test_stride_union_no_common_period(); + test_stride_union_same_period(); + test_stride_loop(); + test_stride_full_seq(); + test_stride_empty_regex(); + test_stride_epsilon(); + test_stride_opt(); + + // generate_parikh_constraints tests + test_generate_constraints_ab_star(); + test_generate_constraints_bounded_loop(); + test_generate_constraints_stride_one(); + test_generate_constraints_fixed_length(); + test_generate_constraints_dep_propagated(); + + // apply_to_node tests + test_apply_to_node_adds_constraints(); + test_apply_to_node_stride_one_no_constraints(); + + // check_parikh_conflict tests + test_check_conflict_unconstrained_no_conflict(); + test_check_conflict_valid_k_exists(); + test_check_conflict_no_valid_k(); + test_check_conflict_abc_star(); + test_check_conflict_stride_one_never_conflicts(); + + // minterm_to_char_set tests + test_minterm_full_char(); + test_minterm_empty_regex(); + test_minterm_range(); + test_minterm_complement(); + test_minterm_intersection(); + test_minterm_diff(); + test_minterm_singleton(); + test_minterm_nullptr_is_full(); +}