diff --git a/src/smt/seq/CMakeLists.txt b/src/smt/seq/CMakeLists.txt index 9ea9c4bab..e6ed34c34 100644 --- a/src/smt/seq/CMakeLists.txt +++ b/src/smt/seq/CMakeLists.txt @@ -1,6 +1,6 @@ z3_add_component(smt_seq SOURCES - nseq_parikh.cpp + seq_parikh.cpp seq_nielsen.cpp COMPONENT_DEPENDENCIES euf diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 788221f68..56f2bb344 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -5,334 +5,11 @@ Module Name: nseq_parikh.cpp -Abstract: +Note: - Parikh image filter implementation for the ZIPT-based Nielsen string - solver. See nseq_parith.h for the full design description. - - The key operation is compute_length_stride(re), which performs a - structural traversal of the regex to find the period k such that all - string lengths in L(re) are congruent to min_length(re) modulo k. - The stride is used to generate modular length constraints that help - the integer subsolver prune infeasible Nielsen graph nodes. - -Author: - - Clemens Eisenhofer 2026-03-10 - Nikolaj Bjorner (nbjorner) 2026-03-10 + This file is retained for backwards compatibility. + The canonical implementation is now smt/seq/seq_parikh.cpp. --*/ +// intentionally empty — see seq_parikh.cpp -#include "smt/seq/nseq_parith.h" -#include "ast/arith_decl_plugin.h" -#include "ast/seq_decl_plugin.h" -#include "util/mpz.h" -#include - -namespace seq { - - nseq_parith::nseq_parith(euf::sgraph& sg) - : m_sg(sg), m_fresh_cnt(0) {} - - expr_ref nseq_parith::mk_fresh_int_var() { - ast_manager& m = m_sg.get_manager(); - arith_util arith(m); - std::string name = "pk!" + std::to_string(m_fresh_cnt++); - return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m); - } - - // ----------------------------------------------------------------------- - // Stride computation - // ----------------------------------------------------------------------- - - // compute_length_stride: structural traversal of regex expression. - // - // Return value semantics: - // 0 — fixed length (or empty language): no modular constraint needed - // beyond the min == max bounds. - // 1 — all integer lengths ≥ min_len are achievable: no useful modular - // constraint. - // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): - // modular constraint len(str) = min_len + k·j is useful. - unsigned nseq_parith::compute_length_stride(expr* re) { - if (!re) return 1; - - seq_util& seq = m_sg.get_seq_util(); - expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; - unsigned lo = 0, hi = 0; - - // Empty language: no strings exist; stride is irrelevant. - if (seq.re.is_empty(re)) - return 0; - - // Epsilon regex {""}: single fixed length 0. - if (seq.re.is_epsilon(re)) - return 0; - - // to_re(concrete_string): fixed-length, no modular constraint needed. - if (seq.re.is_to_re(re, s)) { - // min_length == max_length, covered by bounds. - return 0; - } - - // Single character: range, full_char — fixed length 1. - if (seq.re.is_range(re) || seq.re.is_full_char(re)) - return 0; - - // full_seq (.* / Σ*): every length ≥ 0 is possible. - if (seq.re.is_full_seq(re)) - return 1; - - // r* — Kleene star. - // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... - // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. - // If r has variable length, strides from different iterations combine - // by GCD. - if (seq.re.is_star(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - // When the body has unbounded length (mx == UINT_MAX), different - // iterations can produce many different lengths, and the stride of - // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if - // mn == 1). This is conservative: we use the body's min-length - // as the only available fixed quantity. - if (mx == UINT_MAX) mx = mn; - if (mn == mx) { - // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. - // When mn == 1 the stride would be 1, which gives no useful - // modular constraint, so return 0 instead. - return (mn > 1) ? mn : 0; - } - // Variable-length body: GCD of min and max lengths - return u_gcd(mn, mx); - } - - // r+ — one or more: same stride analysis as r*. - if (seq.re.is_plus(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; // same conservative treatment as star - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - - // r? — zero or one: lengths = {0} ∪ lengths(r) - // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). - if (seq.re.is_opt(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned inner = compute_length_stride(r1); - // L(r?) includes length 0 and all lengths of L(r). - // GCD(stride(r), min_len(r)) is a valid stride because: - // - the gap from 0 to min_len(r) is min_len(r) itself, and - // - subsequent lengths grow in steps governed by stride(r). - // A result > 1 gives a useful modular constraint; result == 1 - // means every non-negative integer is achievable (no constraint). - if (inner == 0) - return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 - return u_gcd(inner, mn); - } - - // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). - if (seq.re.is_concat(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - return u_gcd(s1, s2); - } - - // union(r1, r2): lengths from either branch → need GCD of both - // strides and the difference between their minimum lengths. - if (seq.re.is_union(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - unsigned m1 = seq.re.min_length(r1); - unsigned m2 = seq.re.min_length(r2); - unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); - // Replace 0-strides with d for GCD computation: - // a fixed-length branch only introduces constraint via its offset. - unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); - g = u_gcd(g, d); - return g; - } - - // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. - // stride = len(r) when r is fixed-length; otherwise GCD. - if (seq.re.is_loop(re, r1, lo, hi)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - if (seq.re.is_loop(re, r1, lo)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - - // intersection(r1, r2): lengths must be in both languages. - // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid - // stride for the intersection (every length in the intersection is - // also in r1 and in r2). - if (seq.re.is_intersection(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - return u_gcd(s1, s2); - } - - // For complement, diff, reverse, derivative, of_pred, and anything - // else we cannot analyse statically: be conservative and return 1 - // (no useful modular constraint rather than an unsound one). - return 1; - } - - // ----------------------------------------------------------------------- - // Constraint generation - // ----------------------------------------------------------------------- - - void nseq_parith::generate_parikh_constraints(str_mem const& mem, - vector& out) { - if (!mem.m_regex || !mem.m_str) - return; - - ast_manager& m = m_sg.get_manager(); - seq_util& seq = m_sg.get_seq_util(); - arith_util arith(m); - - expr* re_expr = mem.m_regex->get_expr(); - if (!re_expr || !seq.is_re(re_expr)) - return; - - // Length bounds from the regex. - unsigned min_len = seq.re.min_length(re_expr); - unsigned max_len = seq.re.max_length(re_expr); - - // If min_len >= max_len the bounds already pin the length exactly - // (or the language is empty — empty language is detected by simplify_and_init - // via Brzozowski derivative / is_empty checks, not here). - // We only generate modular constraints when the length is variable. - if (min_len >= max_len) - return; - - unsigned stride = compute_length_stride(re_expr); - - // stride == 1: every integer length is possible — no useful constraint. - // stride == 0: fixed length or empty — handled by bounds. - if (stride <= 1) - return; - - // Build len(str) as an arithmetic expression. - expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); - - // Introduce fresh integer variable k ≥ 0. - expr_ref k_var = mk_fresh_int_var(); - - // Constraint 1: len(str) = min_len + stride · k - expr_ref min_expr(arith.mk_int(min_len), m); - expr_ref stride_expr(arith.mk_int(stride), m); - expr_ref stride_k(arith.mk_mul(stride_expr, k_var), m); - expr_ref rhs(arith.mk_add(min_expr, stride_k), m); - out.push_back(int_constraint(len_str, rhs, - int_constraint_kind::eq, mem.m_dep, m)); - - // Constraint 2: k ≥ 0 - expr_ref zero(arith.mk_int(0), m); - out.push_back(int_constraint(k_var, zero, - int_constraint_kind::ge, mem.m_dep, m)); - - // Constraint 3 (optional): k ≤ max_k when max_len is bounded. - // max_k = floor((max_len - min_len) / stride) - // This gives the solver an explicit upper bound on k. - // The subtraction is safe because min_len < max_len is guaranteed - // by the early return above. - if (max_len != UINT_MAX) { - SASSERT(max_len > min_len); - unsigned range = max_len - min_len; - unsigned max_k = range / stride; - expr_ref max_k_expr(arith.mk_int(max_k), m); - out.push_back(int_constraint(k_var, max_k_expr, - int_constraint_kind::le, mem.m_dep, m)); - } - } - - void nseq_parith::apply_to_node(nielsen_node& node) { - vector constraints; - for (str_mem const& mem : node.str_mems()) - generate_parikh_constraints(mem, constraints); - for (auto& ic : constraints) - node.add_int_constraint(ic); - } - - // ----------------------------------------------------------------------- - // Quick Parikh feasibility check (no solver call) - // ----------------------------------------------------------------------- - - // Returns true if a Parikh conflict is detected: there exists a membership - // str ∈ re for a single-variable str where the modular length constraint - // len(str) = min_len + stride * k (k ≥ 0) - // is inconsistent with the variable's current integer bounds [lb, ub]. - // - // This check is lightweight — it uses only modular arithmetic on the already- - // known regex min/max lengths and the per-variable bounds stored in the node. - bool nseq_parith::check_parikh_conflict(nielsen_node& node) { - seq_util& seq = m_sg.get_seq_util(); - - for (str_mem const& mem : node.str_mems()) { - if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) - continue; - - expr* re_expr = mem.m_regex->get_expr(); - if (!re_expr || !seq.is_re(re_expr)) - continue; - - unsigned min_len = seq.re.min_length(re_expr); - unsigned max_len = seq.re.max_length(re_expr); - if (min_len >= max_len) continue; // fixed or empty — no stride constraint - - unsigned stride = compute_length_stride(re_expr); - if (stride <= 1) continue; // no useful modular constraint - // stride > 1 guaranteed from here onward. - SASSERT(stride > 1); - - unsigned lb = node.var_lb(mem.m_str); - unsigned ub = node.var_ub(mem.m_str); - - // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? - // - // First find the smallest k satisfying the lower bound: - // k_min = 0 if min_len ≥ lb - // k_min = ⌈(lb - min_len) / stride⌉ otherwise - // - // Then verify min_len + stride * k_min ≤ ub. - unsigned k_min = 0; - if (lb > min_len) { - unsigned gap = lb - min_len; - // Ceiling division: k_min = ceil(gap / stride). - // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. - // In that case k_min would be huge, and min_len + stride*k_min would - // also overflow ub → treat as a conflict immediately. - if (gap > UINT_MAX - (stride - 1)) { - return true; // ceiling division would overflow → k_min too large - } - k_min = (gap + stride - 1) / stride; - } - - // Overflow guard: stride * k_min may overflow unsigned. - unsigned len_at_k_min; - if (k_min > (UINT_MAX - min_len) / stride) { - // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. - return true; - } - len_at_k_min = min_len + stride * k_min; - - if (ub != UINT_MAX && len_at_k_min > ub) - return true; // no valid k exists → Parikh conflict - } - return false; - } - -} // namespace seq diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index 5d019cfe4..182f54a55 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -5,124 +5,16 @@ Module Name: nseq_parith.h -Abstract: +Note: - Parikh image filter for the ZIPT-based Nielsen string solver. - - Implements Parikh-based arithmetic constraint generation for - nielsen_node instances. For a regex membership constraint str ∈ r, - the Parikh image of r constrains the multiset of characters in str. - This module computes the "length stride" (period) of the length - language of r and generates modular arithmetic constraints of the form - - len(str) = min_len + stride · k (k ≥ 0, k fresh integer) - - which tighten the arithmetic subproblem beyond the simple min/max - length bounds already produced by nielsen_node::init_var_bounds_from_mems(). - - For example: - • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k - • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k - • str ∈ ab|abc → stride = 1 (no useful modular constraint) - - The generated int_constraints are added to the node's integer constraint - set and discharged by the integer subsolver (see seq_nielsen.h, - simple_solver). - - Implements the Parikh filter described in ZIPT - (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) - replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. - -Author: - - Clemens Eisenhofer 2026-03-10 - Nikolaj Bjorner (nbjorner) 2026-03-10 + This file is retained for backwards compatibility. + The canonical header is now smt/seq/seq_parikh.h. --*/ #pragma once - -#include "ast/euf/euf_sgraph.h" -#include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_parikh.h" namespace seq { - - /** - * Parikh image filter: generates modular length constraints from - * regex membership constraints in a nielsen_node. - * - * Usage: - * nseq_parith parith(sg); - * parith.apply_to_node(node); // adds constraints to node - * - * Or per-membership: - * vector out; - * parith.generate_parikh_constraints(mem, out); - */ - class nseq_parith { - euf::sgraph& m_sg; - unsigned m_fresh_cnt; // counter for fresh variable names - - // Compute the stride (period) of the length language of a regex. - // - // The stride k satisfies: all lengths in L(re) are congruent to - // min_length(re) modulo k. A stride of 1 means every integer - // length is possible (no useful modular constraint). A stride of - // 0 is a sentinel meaning the language is empty or has a single - // fixed length (already captured by bounds). - // - // Examples: - // stride(to_re("ab")) = 0 (fixed length 2) - // stride((ab)*) = 2 (lengths 0, 2, 4, ...) - // stride((abc)*) = 3 (lengths 0, 3, 6, ...) - // stride(a*b*) = 1 (all lengths possible) - // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) - // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) - unsigned compute_length_stride(expr* re); - - // Create a fresh integer variable (name "pk!N") for use as the - // Parikh multiplier variable k in len(str) = min_len + stride·k. - expr_ref mk_fresh_int_var(); - - public: - explicit nseq_parith(euf::sgraph& sg); - - // Generate Parikh modular length constraints for one membership. - // - // When stride > 1 and min_len < max_len (bounds don't pin length exactly, - // and the language is non-empty): - // adds: len(str) = min_len + stride · k (equality) - // k ≥ 0 (non-negativity) - // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) - // These tighten the integer constraint set for the subsolver. - // Dependencies are copied from mem.m_dep. - // Does nothing when min_len ≥ max_len (empty or fixed-length language). - void generate_parikh_constraints(str_mem const& mem, - vector& out); - - // Apply Parikh constraints to all memberships at a node. - // Calls generate_parikh_constraints for each str_mem in the node - // and appends the resulting int_constraints to node.int_constraints(). - void apply_to_node(nielsen_node& node); - - // Quick Parikh feasibility check (no solver call). - // - // For each single-variable membership str ∈ re, checks whether the - // modular constraint len(str) = min_len + stride · k (k ≥ 0) - // has any solution given the current per-variable bounds stored in - // node.var_lb(str) and node.var_ub(str). - // - // Returns true when a conflict is detected (no valid k exists for - // some membership). The caller should then mark the node with - // backtrack_reason::parikh_image. - // - // This is a lightweight pre-check that avoids calling the integer - // subsolver. It is sound (never returns true for a satisfiable node) - // but incomplete (may miss conflicts that require the full solver). - bool check_parikh_conflict(nielsen_node& node); - - // Compute the length stride of a regex expression. - // Exposed for testing and external callers. - unsigned get_length_stride(expr* re) { return compute_length_stride(re); } - }; - -} // namespace seq + // backwards-compat alias + using nseq_parith = seq_parikh; +} diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 4524dd7c0..81fd1b2ba 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -20,7 +20,7 @@ Author: --*/ #include "smt/seq/seq_nielsen.h" -#include "smt/seq/nseq_parith.h" +#include "smt/seq/seq_parikh.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "util/hashtable.h" @@ -433,7 +433,7 @@ namespace seq { nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver): m_sg(sg), m_solver(solver), - m_parith(alloc(nseq_parith, sg)) { + m_parith(alloc(seq_parikh, sg)) { } nielsen_graph::~nielsen_graph() { diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 70ab76edc..0be8ae274 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -248,7 +248,7 @@ namespace seq { class nielsen_node; class nielsen_edge; class nielsen_graph; - class nseq_parith; // Parikh image filter (defined in nseq_parith.h) + class seq_parikh; // Parikh image filter (see seq_parikh.h) /** * Abstract interface for an incremental solver used by nielsen_graph @@ -718,7 +718,7 @@ namespace seq { // Parikh image filter: generates modular length constraints from regex // memberships. Allocated in the constructor; owned by this graph. - nseq_parith* m_parith = nullptr; + seq_parikh* m_parith = nullptr; public: // Construct with a caller-supplied solver. Ownership is NOT transferred; diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp new file mode 100644 index 000000000..255d09d72 --- /dev/null +++ b/src/smt/seq/seq_parikh.cpp @@ -0,0 +1,327 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.cpp + +Abstract: + + Parikh image filter implementation for the ZIPT-based Nielsen string + solver. See seq_parikh.h for the full design description. + + The key operation is compute_length_stride(re), which performs a + structural traversal of the regex to find the period k such that all + string lengths in L(re) are congruent to min_length(re) modulo k. + The stride is used to generate modular length constraints that help + the integer subsolver prune infeasible Nielsen graph nodes. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ + +#include "smt/seq/seq_parikh.h" +#include "util/mpz.h" +#include + +namespace seq { + + seq_parikh::seq_parikh(euf::sgraph& sg) + : m(sg.get_manager()), seq(m), a(m), m_fresh_cnt(0) {} + + expr_ref seq_parikh::mk_fresh_int_var() { + std::string name = "pk!" + std::to_string(m_fresh_cnt++); + return expr_ref(m.mk_fresh_const(name.c_str(), a.mk_int()), m); + } + + // ----------------------------------------------------------------------- + // Stride computation + // ----------------------------------------------------------------------- + + // compute_length_stride: structural traversal of regex expression. + // + // Return value semantics: + // 0 — fixed length (or empty language): no modular constraint needed + // beyond the min == max bounds. + // 1 — all integer lengths ≥ min_len are achievable: no useful modular + // constraint. + // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): + // modular constraint len(str) = min_len + k·j is useful. + unsigned seq_parikh::compute_length_stride(expr* re) { + if (!re) return 1; + + expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; + unsigned lo = 0, hi = 0; + + // Empty language: no strings exist; stride is irrelevant. + if (seq.re.is_empty(re)) + return 0; + + // Epsilon regex {""}: single fixed length 0. + if (seq.re.is_epsilon(re)) + return 0; + + // to_re(concrete_string): fixed-length, no modular constraint needed. + if (seq.re.is_to_re(re, s)) { + // min_length == max_length, covered by bounds. + return 0; + } + + // Single character: range, full_char — fixed length 1. + if (seq.re.is_range(re) || seq.re.is_full_char(re)) + return 0; + + // full_seq (.* / Σ*): every length ≥ 0 is possible. + if (seq.re.is_full_seq(re)) + return 1; + + // r* — Kleene star. + // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... + // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. + // If r has variable length, strides from different iterations combine + // by GCD. + if (seq.re.is_star(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + // When the body has unbounded length (mx == UINT_MAX), different + // iterations can produce many different lengths, and the stride of + // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if + // mn == 1). This is conservative: we use the body's min-length + // as the only available fixed quantity. + if (mx == UINT_MAX) mx = mn; + if (mn == mx) { + // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. + // When mn == 1 the stride would be 1, which gives no useful + // modular constraint, so return 0 instead. + return (mn > 1) ? mn : 0; + } + // Variable-length body: GCD of min and max lengths + return u_gcd(mn, mx); + } + + // r+ — one or more: same stride analysis as r*. + if (seq.re.is_plus(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; // same conservative treatment as star + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + + // r? — zero or one: lengths = {0} ∪ lengths(r) + // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). + if (seq.re.is_opt(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + // L(r?) includes length 0 and all lengths of L(r). + // GCD(stride(r), min_len(r)) is a valid stride because: + // - the gap from 0 to min_len(r) is min_len(r) itself, and + // - subsequent lengths grow in steps governed by stride(r). + // A result > 1 gives a useful modular constraint; result == 1 + // means every non-negative integer is achievable (no constraint). + if (inner == 0) + return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 + return u_gcd(inner, mn); + } + + // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). + if (seq.re.is_concat(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // union(r1, r2): lengths from either branch → need GCD of both + // strides and the difference between their minimum lengths. + if (seq.re.is_union(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + unsigned m1 = seq.re.min_length(r1); + unsigned m2 = seq.re.min_length(r2); + unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); + // Replace 0-strides with d for GCD computation: + // a fixed-length branch only introduces constraint via its offset. + unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); + g = u_gcd(g, d); + return g; + } + + // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. + // stride = len(r) when r is fixed-length; otherwise GCD. + if (seq.re.is_loop(re, r1, lo, hi)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + if (seq.re.is_loop(re, r1, lo)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + + // intersection(r1, r2): lengths must be in both languages. + // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid + // stride for the intersection (every length in the intersection is + // also in r1 and in r2). + if (seq.re.is_intersection(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // For complement, diff, reverse, derivative, of_pred, and anything + // else we cannot analyse statically: be conservative and return 1 + // (no useful modular constraint rather than an unsound one). + return 1; + } + + // ----------------------------------------------------------------------- + // Constraint generation + // ----------------------------------------------------------------------- + + void seq_parikh::generate_parikh_constraints(str_mem const& mem, + vector& out) { + if (!mem.m_regex || !mem.m_str) + return; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + return; + + // Length bounds from the regex. + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + + // If min_len >= max_len the bounds already pin the length exactly + // (or the language is empty — empty language is detected by simplify_and_init + // via Brzozowski derivative / is_empty checks, not here). + // We only generate modular constraints when the length is variable. + if (min_len >= max_len) + return; + + unsigned stride = compute_length_stride(re_expr); + + // stride == 1: every integer length is possible — no useful constraint. + // stride == 0: fixed length or empty — handled by bounds. + if (stride <= 1) + return; + + // Build len(str) as an arithmetic expression. + expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); + + // Introduce fresh integer variable k ≥ 0. + expr_ref k_var = mk_fresh_int_var(); + + // Constraint 1: len(str) = min_len + stride · k + expr_ref min_expr(a.mk_int(min_len), m); + expr_ref stride_expr(a.mk_int(stride), m); + expr_ref stride_k(a.mk_mul(stride_expr, k_var), m); + expr_ref rhs(a.mk_add(min_expr, stride_k), m); + out.push_back(int_constraint(len_str, rhs, + int_constraint_kind::eq, mem.m_dep, m)); + + // Constraint 2: k ≥ 0 + expr_ref zero(a.mk_int(0), m); + out.push_back(int_constraint(k_var, zero, + int_constraint_kind::ge, mem.m_dep, m)); + + // Constraint 3 (optional): k ≤ max_k when max_len is bounded. + // max_k = floor((max_len - min_len) / stride) + // This gives the solver an explicit upper bound on k. + // The subtraction is safe because min_len < max_len is guaranteed + // by the early return above. + if (max_len != UINT_MAX) { + SASSERT(max_len > min_len); + unsigned range = max_len - min_len; + unsigned max_k = range / stride; + expr_ref max_k_expr(a.mk_int(max_k), m); + out.push_back(int_constraint(k_var, max_k_expr, + int_constraint_kind::le, mem.m_dep, m)); + } + } + + void seq_parikh::apply_to_node(nielsen_node& node) { + vector constraints; + for (str_mem const& mem : node.str_mems()) + generate_parikh_constraints(mem, constraints); + for (auto& ic : constraints) + node.add_int_constraint(ic); + } + + // ----------------------------------------------------------------------- + // Quick Parikh feasibility check (no solver call) + // ----------------------------------------------------------------------- + + // Returns true if a Parikh conflict is detected: there exists a membership + // str ∈ re for a single-variable str where the modular length constraint + // len(str) = min_len + stride * k (k ≥ 0) + // is inconsistent with the variable's current integer bounds [lb, ub]. + // + // This check is lightweight — it uses only modular arithmetic on the already- + // known regex min/max lengths and the per-variable bounds stored in the node. + bool seq_parikh::check_parikh_conflict(nielsen_node& node) { + for (str_mem const& mem : node.str_mems()) { + if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) + continue; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + continue; + + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + if (min_len >= max_len) continue; // fixed or empty — no stride constraint + + unsigned stride = compute_length_stride(re_expr); + if (stride <= 1) continue; // no useful modular constraint + // stride > 1 guaranteed from here onward. + SASSERT(stride > 1); + + unsigned lb = node.var_lb(mem.m_str); + unsigned ub = node.var_ub(mem.m_str); + + // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? + // + // First find the smallest k satisfying the lower bound: + // k_min = 0 if min_len ≥ lb + // k_min = ⌈(lb - min_len) / stride⌉ otherwise + // + // Then verify min_len + stride * k_min ≤ ub. + unsigned k_min = 0; + if (lb > min_len) { + unsigned gap = lb - min_len; + // Ceiling division: k_min = ceil(gap / stride). + // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. + // In that case k_min would be huge, and min_len + stride*k_min would + // also overflow ub → treat as a conflict immediately. + if (gap > UINT_MAX - (stride - 1)) { + return true; // ceiling division would overflow → k_min too large + } + k_min = (gap + stride - 1) / stride; + } + + // Overflow guard: stride * k_min may overflow unsigned. + unsigned len_at_k_min; + if (k_min > (UINT_MAX - min_len) / stride) { + // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. + return true; + } + len_at_k_min = min_len + stride * k_min; + + if (ub != UINT_MAX && len_at_k_min > ub) + return true; // no valid k exists → Parikh conflict + } + return false; + } + +} // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h new file mode 100644 index 000000000..730854832 --- /dev/null +++ b/src/smt/seq/seq_parikh.h @@ -0,0 +1,131 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.h + +Abstract: + + Parikh image filter for the ZIPT-based Nielsen string solver. + + Implements Parikh-based arithmetic constraint generation for + nielsen_node instances. For a regex membership constraint str ∈ r, + the Parikh image of r constrains the multiset of characters in str. + This module computes the "length stride" (period) of the length + language of r and generates modular arithmetic constraints of the form + + len(str) = min_len + stride · k (k ≥ 0, k fresh integer) + + which tighten the arithmetic subproblem beyond the simple min/max + length bounds already produced by nielsen_node::init_var_bounds_from_mems(). + + For example: + • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k + • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k + • str ∈ ab|abc → stride = 1 (no useful modular constraint) + + The generated int_constraints are added to the node's integer constraint + set and discharged by the integer subsolver (see seq_nielsen.h, + simple_solver). + + Implements the Parikh filter described in ZIPT + (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) + replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ +#pragma once + +#include "ast/arith_decl_plugin.h" +#include "ast/seq_decl_plugin.h" +#include "smt/seq/seq_nielsen.h" + +namespace seq { + + /** + * Parikh image filter: generates modular length constraints from + * regex membership constraints in a nielsen_node. + * + * Usage: + * seq_parikh parikh(sg); + * parikh.apply_to_node(node); // adds constraints to node + * + * Or per-membership: + * vector out; + * parikh.generate_parikh_constraints(mem, out); + */ + class seq_parikh { + ast_manager& m; + seq_util seq; + arith_util a; + unsigned m_fresh_cnt; // counter for fresh variable names + + // Compute the stride (period) of the length language of a regex. + // + // The stride k satisfies: all lengths in L(re) are congruent to + // min_length(re) modulo k. A stride of 1 means every integer + // length is possible (no useful modular constraint). A stride of + // 0 is a sentinel meaning the language is empty or has a single + // fixed length (already captured by bounds). + // + // Examples: + // stride(to_re("ab")) = 0 (fixed length 2) + // stride((ab)*) = 2 (lengths 0, 2, 4, ...) + // stride((abc)*) = 3 (lengths 0, 3, 6, ...) + // stride(a*b*) = 1 (all lengths possible) + // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) + // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) + unsigned compute_length_stride(expr* re); + + // Create a fresh integer variable (name "pk!N") for use as the + // Parikh multiplier variable k in len(str) = min_len + stride·k. + expr_ref mk_fresh_int_var(); + + public: + explicit seq_parikh(euf::sgraph& sg); + + // Generate Parikh modular length constraints for one membership. + // + // When stride > 1 and min_len < max_len (bounds don't pin length exactly, + // and the language is non-empty): + // adds: len(str) = min_len + stride · k (equality) + // k ≥ 0 (non-negativity) + // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) + // These tighten the integer constraint set for the subsolver. + // Dependencies are copied from mem.m_dep. + // Does nothing when min_len ≥ max_len (empty or fixed-length language). + void generate_parikh_constraints(str_mem const& mem, + vector& out); + + // Apply Parikh constraints to all memberships at a node. + // Calls generate_parikh_constraints for each str_mem in the node + // and appends the resulting int_constraints to node.int_constraints(). + void apply_to_node(nielsen_node& node); + + // Quick Parikh feasibility check (no solver call). + // + // For each single-variable membership str ∈ re, checks whether the + // modular constraint len(str) = min_len + stride · k (k ≥ 0) + // has any solution given the current per-variable bounds stored in + // node.var_lb(str) and node.var_ub(str). + // + // Returns true when a conflict is detected (no valid k exists for + // some membership). The caller should then mark the node with + // backtrack_reason::parikh_image. + // + // This is a lightweight pre-check that avoids calling the integer + // subsolver. It is sound (never returns true for a satisfiable node) + // but incomplete (may miss conflicts that require the full solver). + bool check_parikh_conflict(nielsen_node& node); + + // Compute the length stride of a regex expression. + // Exposed for testing and external callers. + unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + }; + +} // namespace seq