From 3054f0cb4144fa45c272e7125c1013fddbea7b41 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 23:43:18 +0000 Subject: [PATCH 01/13] Initial plan From 334df71b1198fcbfb70204db5910918b512056c3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 00:05:26 +0000 Subject: [PATCH 02/13] Add nseq_parith.h and nseq_parikh.cpp: Parikh filter for ZIPT string solver Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/CMakeLists.txt | 1 + src/smt/seq/nseq_parikh.cpp | 274 ++++++++++++++++++++++++++++++++++++ src/smt/seq/nseq_parith.h | 112 +++++++++++++++ 3 files changed, 387 insertions(+) create mode 100644 src/smt/seq/nseq_parikh.cpp create mode 100644 src/smt/seq/nseq_parith.h diff --git a/src/smt/seq/CMakeLists.txt b/src/smt/seq/CMakeLists.txt index db63e4c6f..9ea9c4bab 100644 --- a/src/smt/seq/CMakeLists.txt +++ b/src/smt/seq/CMakeLists.txt @@ -1,5 +1,6 @@ z3_add_component(smt_seq SOURCES + nseq_parikh.cpp seq_nielsen.cpp COMPONENT_DEPENDENCIES euf diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp new file mode 100644 index 000000000..9bc2f4bcd --- /dev/null +++ b/src/smt/seq/nseq_parikh.cpp @@ -0,0 +1,274 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + nseq_parikh.cpp + +Abstract: + + Parikh image filter implementation for the ZIPT-based Nielsen string + solver. See nseq_parith.h for the full design description. + + The key operation is compute_length_stride(re), which performs a + structural traversal of the regex to find the period k such that all + string lengths in L(re) are congruent to min_length(re) modulo k. + The stride is used to generate modular length constraints that help + the integer subsolver prune infeasible Nielsen graph nodes. + +Author: + + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ + +#include "smt/seq/nseq_parith.h" +#include "ast/arith_decl_plugin.h" +#include "ast/seq_decl_plugin.h" +#include + +namespace seq { + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + // GCD via Euclidean algorithm. gcd(0, x) = x, gcd(0, 0) = 0. + unsigned nseq_parith::gcd(unsigned a, unsigned b) { + if (a == 0 && b == 0) return 0; + while (b != 0) { + unsigned t = b; + b = a % b; + a = t; + } + return a; + } + + nseq_parith::nseq_parith(euf::sgraph& sg) + : m_sg(sg), m_fresh_cnt(0) {} + + expr_ref nseq_parith::mk_fresh_int_var() { + ast_manager& m = m_sg.get_manager(); + arith_util arith(m); + std::string name = "pk!" + std::to_string(m_fresh_cnt++); + return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m); + } + + // ----------------------------------------------------------------------- + // Stride computation + // ----------------------------------------------------------------------- + + // compute_length_stride: structural traversal of regex expression. + // + // Return value semantics: + // 0 — fixed length (or empty language): no modular constraint needed + // beyond the min == max bounds. + // 1 — all integer lengths ≥ min_len are achievable: no useful modular + // constraint. + // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): + // modular constraint len(str) = min_len + k·j is useful. + unsigned nseq_parith::compute_length_stride(expr* re) { + if (!re) return 1; + + seq_util& seq = m_sg.get_seq_util(); + expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; + unsigned lo = 0, hi = 0; + + // Empty language: no strings exist; stride is irrelevant. + if (seq.re.is_empty(re)) + return 0; + + // Epsilon regex {""}: single fixed length 0. + if (seq.re.is_epsilon(re)) + return 0; + + // to_re(concrete_string): fixed-length, no modular constraint needed. + if (seq.re.is_to_re(re, s)) { + // min_length == max_length, covered by bounds. + return 0; + } + + // Single character: range, full_char — fixed length 1. + if (seq.re.is_range(re) || seq.re.is_full_char(re)) + return 0; + + // full_seq (.* / Σ*): every length ≥ 0 is possible. + if (seq.re.is_full_seq(re)) + return 1; + + // r* — Kleene star. + // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... + // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. + // If r has variable length, strides from different iterations combine + // by GCD. + if (seq.re.is_star(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + // When the body has unbounded length (mx == UINT_MAX), different + // iterations can produce many different lengths, and the stride of + // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if + // mn == 1). This is conservative: we use the body's min-length + // as the only available fixed quantity. + if (mx == UINT_MAX) mx = mn; + if (mn == mx) { + // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. + // When mn == 1 the stride would be 1, which gives no useful + // modular constraint, so return 0 instead. + return (mn > 1) ? mn : 0; + } + // Variable-length body: GCD of min and max lengths + return gcd(mn, mx); + } + + // r+ — one or more: same stride analysis as r*. + if (seq.re.is_plus(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; // same conservative treatment as star + if (mn == mx) + return (mn > 1) ? mn : 0; + return gcd(mn, mx); + } + + // r? — zero or one: lengths = {0} ∪ lengths(r) + // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). + if (seq.re.is_opt(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + // L(r?) includes length 0 and all lengths of L(r). + // GCD(stride(r), min_len(r)) is a valid stride because: + // - the gap from 0 to min_len(r) is min_len(r) itself, and + // - subsequent lengths grow in steps governed by stride(r). + // A result > 1 gives a useful modular constraint; result == 1 + // means every non-negative integer is achievable (no constraint). + if (inner == 0) + return gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 + return gcd(inner, mn); + } + + // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). + if (seq.re.is_concat(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + // 0 (fixed) on either side: result is governed by the other. + if (s1 == 0 && s2 == 0) return 0; + if (s1 == 0) return s2; + if (s2 == 0) return s1; + return gcd(s1, s2); + } + + // union(r1, r2): lengths from either branch → need GCD of both + // strides and the difference between their minimum lengths. + if (seq.re.is_union(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + unsigned m1 = seq.re.min_length(r1); + unsigned m2 = seq.re.min_length(r2); + unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); + // Replace 0-strides with d for GCD computation: + // a fixed-length branch only introduces constraint via its offset. + unsigned g = gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); + g = gcd(g, d); + return g; + } + + // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. + // stride = len(r) when r is fixed-length; otherwise GCD. + if (seq.re.is_loop(re, r1, lo, hi)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return gcd(mn, mx); + } + if (seq.re.is_loop(re, r1, lo)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return gcd(mn, mx); + } + + // intersection(r1, r2): lengths must be in both languages. + // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid + // stride for the intersection (every length in the intersection is + // also in r1 and in r2). + if (seq.re.is_intersection(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + if (s1 == 0 && s2 == 0) return 0; + if (s1 == 0) return s2; + if (s2 == 0) return s1; + return gcd(s1, s2); + } + + // For complement, diff, reverse, derivative, of_pred, and anything + // else we cannot analyse statically: be conservative and return 1 + // (no useful modular constraint rather than an unsound one). + return 1; + } + + // ----------------------------------------------------------------------- + // Constraint generation + // ----------------------------------------------------------------------- + + void nseq_parith::generate_parikh_constraints(str_mem const& mem, + vector& out) { + if (!mem.m_regex || !mem.m_str) + return; + + ast_manager& m = m_sg.get_manager(); + seq_util& seq = m_sg.get_seq_util(); + arith_util arith(m); + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + return; + + // Length bounds from the regex. + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + + // If min_len == max_len the bounds already pin the length exactly; + // no modular constraint is needed. + if (min_len == max_len) + return; + + unsigned stride = compute_length_stride(re_expr); + + // stride == 1: every integer length is possible — no useful constraint. + // stride == 0: fixed length or empty — handled by bounds. + if (stride <= 1) + return; + + // Build len(str) as an arithmetic expression. + expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); + + // Introduce fresh integer variable k ≥ 0. + expr_ref k_var = mk_fresh_int_var(); + + // Constraint 1: len(str) = min_len + stride · k + expr_ref min_expr(arith.mk_int(min_len), m); + expr_ref stride_expr(arith.mk_int(stride), m); + expr_ref stride_k(arith.mk_mul(stride_expr, k_var), m); + expr_ref rhs(arith.mk_add(min_expr, stride_k), m); + out.push_back(int_constraint(len_str, rhs, + int_constraint_kind::eq, mem.m_dep, m)); + + // Constraint 2: k ≥ 0 + expr_ref zero(arith.mk_int(0), m); + out.push_back(int_constraint(k_var, zero, + int_constraint_kind::ge, mem.m_dep, m)); + } + + void nseq_parith::apply_to_node(nielsen_node& node) { + vector constraints; + for (str_mem const& mem : node.str_mems()) + generate_parikh_constraints(mem, constraints); + for (auto& ic : constraints) + node.add_int_constraint(ic); + } + +} // namespace seq diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h new file mode 100644 index 000000000..e7f84e247 --- /dev/null +++ b/src/smt/seq/nseq_parith.h @@ -0,0 +1,112 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + nseq_parith.h + +Abstract: + + Parikh image filter for the ZIPT-based Nielsen string solver. + + Implements Parikh-based arithmetic constraint generation for + nielsen_node instances. For a regex membership constraint str ∈ r, + the Parikh image of r constrains the multiset of characters in str. + This module computes the "length stride" (period) of the length + language of r and generates modular arithmetic constraints of the form + + len(str) = min_len + stride · k (k ≥ 0, k fresh integer) + + which tighten the arithmetic subproblem beyond the simple min/max + length bounds already produced by nielsen_node::init_var_bounds_from_mems(). + + For example: + • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k + • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k + • str ∈ ab|abc → stride = 1 (no useful modular constraint) + + The generated int_constraints are added to the node's integer constraint + set and discharged by the integer subsolver (see seq_nielsen.h, + simple_solver). + + Implements the Parikh filter described in ZIPT + (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) + replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. + +Author: + + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ +#pragma once + +#include "ast/euf/euf_sgraph.h" +#include "smt/seq/seq_nielsen.h" + +namespace seq { + + /** + * Parikh image filter: generates modular length constraints from + * regex membership constraints in a nielsen_node. + * + * Usage: + * nseq_parith parith(sg); + * parith.apply_to_node(node); // adds constraints to node + * + * Or per-membership: + * vector out; + * parith.generate_parikh_constraints(mem, out); + */ + class nseq_parith { + euf::sgraph& m_sg; + unsigned m_fresh_cnt; // counter for fresh variable names + + // Compute GCD of a and b. gcd(0, x) = x by convention. + // Returns 0 only when both arguments are 0. + static unsigned gcd(unsigned a, unsigned b); + + // Compute the stride (period) of the length language of a regex. + // + // The stride k satisfies: all lengths in L(re) are congruent to + // min_length(re) modulo k. A stride of 1 means every integer + // length is possible (no useful modular constraint). A stride of + // 0 is a sentinel meaning the language is empty or has a single + // fixed length (already captured by bounds). + // + // Examples: + // stride(to_re("ab")) = 0 (fixed length 2) + // stride((ab)*) = 2 (lengths 0, 2, 4, ...) + // stride((abc)*) = 3 (lengths 0, 3, 6, ...) + // stride(a*b*) = 1 (all lengths possible) + // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) + // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) + unsigned compute_length_stride(expr* re); + + // Create a fresh integer variable (name "pk!N") for use as the + // Parikh multiplier variable k in len(str) = min_len + stride·k. + expr_ref mk_fresh_int_var(); + + public: + explicit nseq_parith(euf::sgraph& sg); + + // Generate Parikh modular length constraints for one membership. + // + // When stride > 1 and min_len < max_len (bounds don't pin length): + // adds: len(str) = min_len + stride · k (equality) + // k ≥ 0 (non-negativity) + // These tighten the integer constraint set for the subsolver. + // Dependencies are copied from mem.m_dep. + void generate_parikh_constraints(str_mem const& mem, + vector& out); + + // Apply Parikh constraints to all memberships at a node. + // Calls generate_parikh_constraints for each str_mem in the node + // and appends the resulting int_constraints to node.int_constraints(). + void apply_to_node(nielsen_node& node); + + // Compute the length stride of a regex expression. + // Exposed for testing and external callers. + unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + }; + +} // namespace seq From d267e452a2402bb8449823fdd6536f56993da5ee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 04:21:30 +0000 Subject: [PATCH 03/13] Add Clemens Eisenhofer as co-author to nseq_parith.h and nseq_parikh.cpp Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 1 + src/smt/seq/nseq_parith.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 9bc2f4bcd..4ce843052 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -18,6 +18,7 @@ Abstract: Author: + Clemens Eisenhofer 2026-03-10 Nikolaj Bjorner (nbjorner) 2026-03-10 --*/ diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index e7f84e247..719efa5d9 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -35,6 +35,7 @@ Abstract: Author: + Clemens Eisenhofer 2026-03-10 Nikolaj Bjorner (nbjorner) 2026-03-10 --*/ From 4cdfceabc5c7dbbd10072f418c7b96db433f953b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 04:24:16 +0000 Subject: [PATCH 04/13] Use u_gcd from util/mpz.h instead of local gcd definition Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 36 +++++++++++------------------------- src/smt/seq/nseq_parith.h | 4 ---- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 4ce843052..a6d1d20e5 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -26,25 +26,11 @@ Author: #include "smt/seq/nseq_parith.h" #include "ast/arith_decl_plugin.h" #include "ast/seq_decl_plugin.h" +#include "util/mpz.h" #include namespace seq { - // ----------------------------------------------------------------------- - // Helpers - // ----------------------------------------------------------------------- - - // GCD via Euclidean algorithm. gcd(0, x) = x, gcd(0, 0) = 0. - unsigned nseq_parith::gcd(unsigned a, unsigned b) { - if (a == 0 && b == 0) return 0; - while (b != 0) { - unsigned t = b; - b = a % b; - a = t; - } - return a; - } - nseq_parith::nseq_parith(euf::sgraph& sg) : m_sg(sg), m_fresh_cnt(0) {} @@ -118,7 +104,7 @@ namespace seq { return (mn > 1) ? mn : 0; } // Variable-length body: GCD of min and max lengths - return gcd(mn, mx); + return u_gcd(mn, mx); } // r+ — one or more: same stride analysis as r*. @@ -128,7 +114,7 @@ namespace seq { if (mx == UINT_MAX) mx = mn; // same conservative treatment as star if (mn == mx) return (mn > 1) ? mn : 0; - return gcd(mn, mx); + return u_gcd(mn, mx); } // r? — zero or one: lengths = {0} ∪ lengths(r) @@ -143,8 +129,8 @@ namespace seq { // A result > 1 gives a useful modular constraint; result == 1 // means every non-negative integer is achievable (no constraint). if (inner == 0) - return gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 - return gcd(inner, mn); + return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 + return u_gcd(inner, mn); } // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). @@ -155,7 +141,7 @@ namespace seq { if (s1 == 0 && s2 == 0) return 0; if (s1 == 0) return s2; if (s2 == 0) return s1; - return gcd(s1, s2); + return u_gcd(s1, s2); } // union(r1, r2): lengths from either branch → need GCD of both @@ -168,8 +154,8 @@ namespace seq { unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); // Replace 0-strides with d for GCD computation: // a fixed-length branch only introduces constraint via its offset. - unsigned g = gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); - g = gcd(g, d); + unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); + g = u_gcd(g, d); return g; } @@ -181,7 +167,7 @@ namespace seq { if (mx == UINT_MAX) mx = mn; if (mn == mx) return (mn > 1) ? mn : 0; - return gcd(mn, mx); + return u_gcd(mn, mx); } if (seq.re.is_loop(re, r1, lo)) { unsigned mn = seq.re.min_length(r1); @@ -189,7 +175,7 @@ namespace seq { if (mx == UINT_MAX) mx = mn; if (mn == mx) return (mn > 1) ? mn : 0; - return gcd(mn, mx); + return u_gcd(mn, mx); } // intersection(r1, r2): lengths must be in both languages. @@ -202,7 +188,7 @@ namespace seq { if (s1 == 0 && s2 == 0) return 0; if (s1 == 0) return s2; if (s2 == 0) return s1; - return gcd(s1, s2); + return u_gcd(s1, s2); } // For complement, diff, reverse, derivative, of_pred, and anything diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index 719efa5d9..6f8fc7ae3 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -62,10 +62,6 @@ namespace seq { euf::sgraph& m_sg; unsigned m_fresh_cnt; // counter for fresh variable names - // Compute GCD of a and b. gcd(0, x) = x by convention. - // Returns 0 only when both arguments are 0. - static unsigned gcd(unsigned a, unsigned b); - // Compute the stride (period) of the length language of a regex. // // The stride k satisfies: all lengths in L(re) are congruent to From 35ee8f917d02839d675a97b7fd7eb5238ec181c3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 04:26:26 +0000 Subject: [PATCH 05/13] Remove redundant zero-guards before u_gcd calls in concat/intersection cases Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index a6d1d20e5..9ee4690b8 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -137,10 +137,6 @@ namespace seq { if (seq.re.is_concat(re, r1, r2)) { unsigned s1 = compute_length_stride(r1); unsigned s2 = compute_length_stride(r2); - // 0 (fixed) on either side: result is governed by the other. - if (s1 == 0 && s2 == 0) return 0; - if (s1 == 0) return s2; - if (s2 == 0) return s1; return u_gcd(s1, s2); } @@ -185,9 +181,6 @@ namespace seq { if (seq.re.is_intersection(re, r1, r2)) { unsigned s1 = compute_length_stride(r1); unsigned s2 = compute_length_stride(r2); - if (s1 == 0 && s2 == 0) return 0; - if (s1 == 0) return s2; - if (s2 == 0) return s1; return u_gcd(s1, s2); } From eca5fcc7bb27a27e2847a3f536ca32ecacd82bf0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 05:10:30 +0000 Subject: [PATCH 06/13] Integrate nseq_parith into nielsen_graph; add k upper bound and check_parikh_conflict Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 71 +++++++++++++++++++++++++++++++++++++ src/smt/seq/nseq_parith.h | 17 +++++++++ src/smt/seq/seq_nielsen.cpp | 31 +++++++++++++++- src/smt/seq/seq_nielsen.h | 19 ++++++++++ 4 files changed, 137 insertions(+), 1 deletion(-) diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 9ee4690b8..13efbd94a 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -241,6 +241,18 @@ namespace seq { expr_ref zero(arith.mk_int(0), m); out.push_back(int_constraint(k_var, zero, int_constraint_kind::ge, mem.m_dep, m)); + + // Constraint 3 (optional): k ≤ max_k when max_len is bounded. + // max_k = floor((max_len - min_len) / stride) + // This gives the solver an explicit upper bound on k, which tightens + // the search space when combined with other constraints on len(str). + if (max_len != UINT_MAX) { + unsigned range = max_len - min_len; // max_len >= min_len here + unsigned max_k = range / stride; + expr_ref max_k_expr(arith.mk_int(max_k), m); + out.push_back(int_constraint(k_var, max_k_expr, + int_constraint_kind::le, mem.m_dep, m)); + } } void nseq_parith::apply_to_node(nielsen_node& node) { @@ -251,4 +263,63 @@ namespace seq { node.add_int_constraint(ic); } + // ----------------------------------------------------------------------- + // Quick Parikh feasibility check (no solver call) + // ----------------------------------------------------------------------- + + // Returns true if a Parikh conflict is detected: there exists a membership + // str ∈ re for a single-variable str where the modular length constraint + // len(str) = min_len + stride * k (k ≥ 0) + // is inconsistent with the variable's current integer bounds [lb, ub]. + // + // This check is lightweight — it uses only modular arithmetic on the already- + // known regex min/max lengths and the per-variable bounds stored in the node. + bool nseq_parith::check_parikh_conflict(nielsen_node& node) { + seq_util& seq = m_sg.get_seq_util(); + + for (str_mem const& mem : node.str_mems()) { + if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) + continue; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + continue; + + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + if (min_len >= max_len) continue; // fixed or empty — no stride constraint + + unsigned stride = compute_length_stride(re_expr); + if (stride <= 1) continue; // no useful modular constraint + + unsigned lb = node.var_lb(mem.m_str); + unsigned ub = node.var_ub(mem.m_str); + + // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? + // + // First find the smallest k satisfying the lower bound: + // k_min = 0 if min_len ≥ lb + // k_min = ⌈(lb - min_len) / stride⌉ otherwise + // + // Then verify min_len + stride * k_min ≤ ub. + unsigned k_min = 0; + if (lb > min_len) { + unsigned gap = lb - min_len; + k_min = (gap + stride - 1) / stride; // ceiling division + } + + // Overflow guard: stride * k_min may overflow unsigned. + unsigned len_at_k_min; + if (k_min > (UINT_MAX - min_len) / stride) { + // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. + return true; + } + len_at_k_min = min_len + stride * k_min; + + if (ub != UINT_MAX && len_at_k_min > ub) + return true; // no valid k exists → Parikh conflict + } + return false; + } + } // namespace seq diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index 6f8fc7ae3..e10e5b6eb 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -91,6 +91,7 @@ namespace seq { // When stride > 1 and min_len < max_len (bounds don't pin length): // adds: len(str) = min_len + stride · k (equality) // k ≥ 0 (non-negativity) + // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) // These tighten the integer constraint set for the subsolver. // Dependencies are copied from mem.m_dep. void generate_parikh_constraints(str_mem const& mem, @@ -101,6 +102,22 @@ namespace seq { // and appends the resulting int_constraints to node.int_constraints(). void apply_to_node(nielsen_node& node); + // Quick Parikh feasibility check (no solver call). + // + // For each single-variable membership str ∈ re, checks whether the + // modular constraint len(str) = min_len + stride · k (k ≥ 0) + // has any solution given the current per-variable bounds stored in + // node.var_lb(str) and node.var_ub(str). + // + // Returns true when a conflict is detected (no valid k exists for + // some membership). The caller should then mark the node with + // backtrack_reason::parikh_image. + // + // This is a lightweight pre-check that avoids calling the integer + // subsolver. It is sound (never returns true for a satisfiable node) + // but incomplete (may miss conflicts that require the full solver). + bool check_parikh_conflict(nielsen_node& node); + // Compute the length stride of a regex expression. // Exposed for testing and external callers. unsigned get_length_stride(expr* re) { return compute_length_stride(re); } diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 6a2c730ca..98f114fa6 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -20,6 +20,7 @@ Author: --*/ #include "smt/seq/seq_nielsen.h" +#include "smt/seq/nseq_parith.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "util/hashtable.h" @@ -431,10 +432,12 @@ namespace seq { nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver): m_sg(sg), - m_solver(solver) { + m_solver(solver), + m_parith(alloc(nseq_parith, sg)) { } nielsen_graph::~nielsen_graph() { + dealloc(m_parith); reset(); } @@ -1032,6 +1035,23 @@ namespace seq { // nielsen_graph: search // ----------------------------------------------------------------------- + void nielsen_graph::apply_parikh_to_node(nielsen_node& node) { + if (node.m_parikh_applied) return; + node.m_parikh_applied = true; + + // Generate modular length constraints (len(str) = min_len + stride·k, etc.) + // and append them to the node's integer constraint list. + m_parith->apply_to_node(node); + + // Lightweight feasibility pre-check: does the Parikh modular constraint + // contradict the variable's current integer bounds? If so, mark this + // node as a Parikh-image conflict immediately (avoids a solver call). + if (!node.is_currently_conflict() && m_parith->check_parikh_conflict(node)) { + node.m_is_general_conflict = true; + node.m_reason = backtrack_reason::parikh_image; + } + } + void nielsen_graph::assert_root_constraints_to_solver() { if (m_root_constraints_asserted) return; m_root_constraints_asserted = true; @@ -1129,6 +1149,15 @@ namespace seq { return search_result::sat; } + // Apply Parikh image filter: generate modular length constraints and + // perform a lightweight feasibility pre-check. The filter is guarded + // internally (m_parikh_applied) so it only runs once per node. + apply_parikh_to_node(*node); + if (node->is_currently_conflict()) { + ++m_stats.m_num_simplify_conflict; + return search_result::unsat; + } + // integer feasibility check: collect side constraints along the path // and verify they are jointly satisfiable using the LP solver if (!cur_path.empty() && !check_int_feasibility(node, cur_path)) { diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 1660be312..70ab76edc 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -248,6 +248,7 @@ namespace seq { class nielsen_node; class nielsen_edge; class nielsen_graph; + class nseq_parith; // Parikh image filter (defined in nseq_parith.h) /** * Abstract interface for an incremental solver used by nielsen_graph @@ -519,6 +520,10 @@ namespace seq { // evaluation index for run tracking unsigned m_eval_idx = 0; + // Parikh filter: set to true once apply_parikh_to_node has been applied + // to this node. Prevents duplicate constraint generation across DFS runs. + bool m_parikh_applied = false; + public: nielsen_node(nielsen_graph* graph, unsigned id); @@ -711,6 +716,10 @@ namespace seq { // Set to true after assert_root_constraints_to_solver() is first called. bool m_root_constraints_asserted = false; + // Parikh image filter: generates modular length constraints from regex + // memberships. Allocated in the constructor; owned by this graph. + nseq_parith* m_parith = nullptr; + public: // Construct with a caller-supplied solver. Ownership is NOT transferred; // the caller is responsible for keeping the solver alive. @@ -816,6 +825,16 @@ namespace seq { private: search_result search_dfs(nielsen_node* node, unsigned depth, svector& cur_path); + // Apply the Parikh image filter to a node: generate modular length + // constraints from regex memberships and append them to the node's + // int_constraints. Also performs a lightweight feasibility pre-check; + // if a Parikh conflict is detected the node's conflict flag is set with + // backtrack_reason::parikh_image. + // + // Guarded by node.m_parikh_applied so that constraints are generated + // only once per node across DFS iterations. + void apply_parikh_to_node(nielsen_node& node); + // create a fresh variable with a unique name euf::snode* mk_fresh_var(); From 4ac5315846d7c18c82d1c5592c5a314cf66f5297 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 05:15:20 +0000 Subject: [PATCH 07/13] Fix review: min>=max guard, ceiling-div overflow, SASSERT, accessor methods, comments Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 27 ++++++++++++++++++++------- src/smt/seq/nseq_parith.h | 4 +++- src/smt/seq/seq_nielsen.cpp | 6 ++++-- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 13efbd94a..788221f68 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -211,9 +211,11 @@ namespace seq { unsigned min_len = seq.re.min_length(re_expr); unsigned max_len = seq.re.max_length(re_expr); - // If min_len == max_len the bounds already pin the length exactly; - // no modular constraint is needed. - if (min_len == max_len) + // If min_len >= max_len the bounds already pin the length exactly + // (or the language is empty — empty language is detected by simplify_and_init + // via Brzozowski derivative / is_empty checks, not here). + // We only generate modular constraints when the length is variable. + if (min_len >= max_len) return; unsigned stride = compute_length_stride(re_expr); @@ -244,10 +246,12 @@ namespace seq { // Constraint 3 (optional): k ≤ max_k when max_len is bounded. // max_k = floor((max_len - min_len) / stride) - // This gives the solver an explicit upper bound on k, which tightens - // the search space when combined with other constraints on len(str). + // This gives the solver an explicit upper bound on k. + // The subtraction is safe because min_len < max_len is guaranteed + // by the early return above. if (max_len != UINT_MAX) { - unsigned range = max_len - min_len; // max_len >= min_len here + SASSERT(max_len > min_len); + unsigned range = max_len - min_len; unsigned max_k = range / stride; expr_ref max_k_expr(arith.mk_int(max_k), m); out.push_back(int_constraint(k_var, max_k_expr, @@ -291,6 +295,8 @@ namespace seq { unsigned stride = compute_length_stride(re_expr); if (stride <= 1) continue; // no useful modular constraint + // stride > 1 guaranteed from here onward. + SASSERT(stride > 1); unsigned lb = node.var_lb(mem.m_str); unsigned ub = node.var_ub(mem.m_str); @@ -305,7 +311,14 @@ namespace seq { unsigned k_min = 0; if (lb > min_len) { unsigned gap = lb - min_len; - k_min = (gap + stride - 1) / stride; // ceiling division + // Ceiling division: k_min = ceil(gap / stride). + // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. + // In that case k_min would be huge, and min_len + stride*k_min would + // also overflow ub → treat as a conflict immediately. + if (gap > UINT_MAX - (stride - 1)) { + return true; // ceiling division would overflow → k_min too large + } + k_min = (gap + stride - 1) / stride; } // Overflow guard: stride * k_min may overflow unsigned. diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index e10e5b6eb..5d019cfe4 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -88,12 +88,14 @@ namespace seq { // Generate Parikh modular length constraints for one membership. // - // When stride > 1 and min_len < max_len (bounds don't pin length): + // When stride > 1 and min_len < max_len (bounds don't pin length exactly, + // and the language is non-empty): // adds: len(str) = min_len + stride · k (equality) // k ≥ 0 (non-negativity) // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) // These tighten the integer constraint set for the subsolver. // Dependencies are copied from mem.m_dep. + // Does nothing when min_len ≥ max_len (empty or fixed-length language). void generate_parikh_constraints(str_mem const& mem, vector& out); diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 98f114fa6..4524dd7c0 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -1047,8 +1047,8 @@ namespace seq { // contradict the variable's current integer bounds? If so, mark this // node as a Parikh-image conflict immediately (avoids a solver call). if (!node.is_currently_conflict() && m_parith->check_parikh_conflict(node)) { - node.m_is_general_conflict = true; - node.m_reason = backtrack_reason::parikh_image; + node.set_general_conflict(true); + node.set_reason(backtrack_reason::parikh_image); } } @@ -1152,6 +1152,8 @@ namespace seq { // Apply Parikh image filter: generate modular length constraints and // perform a lightweight feasibility pre-check. The filter is guarded // internally (m_parikh_applied) so it only runs once per node. + // Note: Parikh filtering is skipped for satisfied nodes (returned above); + // a fully satisfied node has no remaining memberships to filter. apply_parikh_to_node(*node); if (node->is_currently_conflict()) { ++m_stats.m_num_simplify_conflict; From 213ddd36ba72d254825ca2f66f3debcdfada3c3e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 05:41:16 +0000 Subject: [PATCH 08/13] =?UTF-8?q?Rename=20nseq=5Fparikh=E2=86=92seq=5Fpari?= =?UTF-8?q?kh;=20add=20m/seq/a=20member=20attributes=20to=20seq=5Fparikh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/CMakeLists.txt | 2 +- src/smt/seq/nseq_parikh.cpp | 331 +----------------------------------- src/smt/seq/nseq_parith.h | 122 +------------ src/smt/seq/seq_nielsen.cpp | 4 +- src/smt/seq/seq_nielsen.h | 4 +- src/smt/seq/seq_parikh.cpp | 327 +++++++++++++++++++++++++++++++++++ src/smt/seq/seq_parikh.h | 131 ++++++++++++++ 7 files changed, 474 insertions(+), 447 deletions(-) create mode 100644 src/smt/seq/seq_parikh.cpp create mode 100644 src/smt/seq/seq_parikh.h diff --git a/src/smt/seq/CMakeLists.txt b/src/smt/seq/CMakeLists.txt index 9ea9c4bab..e6ed34c34 100644 --- a/src/smt/seq/CMakeLists.txt +++ b/src/smt/seq/CMakeLists.txt @@ -1,6 +1,6 @@ z3_add_component(smt_seq SOURCES - nseq_parikh.cpp + seq_parikh.cpp seq_nielsen.cpp COMPONENT_DEPENDENCIES euf diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp index 788221f68..56f2bb344 100644 --- a/src/smt/seq/nseq_parikh.cpp +++ b/src/smt/seq/nseq_parikh.cpp @@ -5,334 +5,11 @@ Module Name: nseq_parikh.cpp -Abstract: +Note: - Parikh image filter implementation for the ZIPT-based Nielsen string - solver. See nseq_parith.h for the full design description. - - The key operation is compute_length_stride(re), which performs a - structural traversal of the regex to find the period k such that all - string lengths in L(re) are congruent to min_length(re) modulo k. - The stride is used to generate modular length constraints that help - the integer subsolver prune infeasible Nielsen graph nodes. - -Author: - - Clemens Eisenhofer 2026-03-10 - Nikolaj Bjorner (nbjorner) 2026-03-10 + This file is retained for backwards compatibility. + The canonical implementation is now smt/seq/seq_parikh.cpp. --*/ +// intentionally empty — see seq_parikh.cpp -#include "smt/seq/nseq_parith.h" -#include "ast/arith_decl_plugin.h" -#include "ast/seq_decl_plugin.h" -#include "util/mpz.h" -#include - -namespace seq { - - nseq_parith::nseq_parith(euf::sgraph& sg) - : m_sg(sg), m_fresh_cnt(0) {} - - expr_ref nseq_parith::mk_fresh_int_var() { - ast_manager& m = m_sg.get_manager(); - arith_util arith(m); - std::string name = "pk!" + std::to_string(m_fresh_cnt++); - return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m); - } - - // ----------------------------------------------------------------------- - // Stride computation - // ----------------------------------------------------------------------- - - // compute_length_stride: structural traversal of regex expression. - // - // Return value semantics: - // 0 — fixed length (or empty language): no modular constraint needed - // beyond the min == max bounds. - // 1 — all integer lengths ≥ min_len are achievable: no useful modular - // constraint. - // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): - // modular constraint len(str) = min_len + k·j is useful. - unsigned nseq_parith::compute_length_stride(expr* re) { - if (!re) return 1; - - seq_util& seq = m_sg.get_seq_util(); - expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; - unsigned lo = 0, hi = 0; - - // Empty language: no strings exist; stride is irrelevant. - if (seq.re.is_empty(re)) - return 0; - - // Epsilon regex {""}: single fixed length 0. - if (seq.re.is_epsilon(re)) - return 0; - - // to_re(concrete_string): fixed-length, no modular constraint needed. - if (seq.re.is_to_re(re, s)) { - // min_length == max_length, covered by bounds. - return 0; - } - - // Single character: range, full_char — fixed length 1. - if (seq.re.is_range(re) || seq.re.is_full_char(re)) - return 0; - - // full_seq (.* / Σ*): every length ≥ 0 is possible. - if (seq.re.is_full_seq(re)) - return 1; - - // r* — Kleene star. - // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... - // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. - // If r has variable length, strides from different iterations combine - // by GCD. - if (seq.re.is_star(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - // When the body has unbounded length (mx == UINT_MAX), different - // iterations can produce many different lengths, and the stride of - // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if - // mn == 1). This is conservative: we use the body's min-length - // as the only available fixed quantity. - if (mx == UINT_MAX) mx = mn; - if (mn == mx) { - // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. - // When mn == 1 the stride would be 1, which gives no useful - // modular constraint, so return 0 instead. - return (mn > 1) ? mn : 0; - } - // Variable-length body: GCD of min and max lengths - return u_gcd(mn, mx); - } - - // r+ — one or more: same stride analysis as r*. - if (seq.re.is_plus(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; // same conservative treatment as star - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - - // r? — zero or one: lengths = {0} ∪ lengths(r) - // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). - if (seq.re.is_opt(re, r1)) { - unsigned mn = seq.re.min_length(r1); - unsigned inner = compute_length_stride(r1); - // L(r?) includes length 0 and all lengths of L(r). - // GCD(stride(r), min_len(r)) is a valid stride because: - // - the gap from 0 to min_len(r) is min_len(r) itself, and - // - subsequent lengths grow in steps governed by stride(r). - // A result > 1 gives a useful modular constraint; result == 1 - // means every non-negative integer is achievable (no constraint). - if (inner == 0) - return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 - return u_gcd(inner, mn); - } - - // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). - if (seq.re.is_concat(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - return u_gcd(s1, s2); - } - - // union(r1, r2): lengths from either branch → need GCD of both - // strides and the difference between their minimum lengths. - if (seq.re.is_union(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - unsigned m1 = seq.re.min_length(r1); - unsigned m2 = seq.re.min_length(r2); - unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); - // Replace 0-strides with d for GCD computation: - // a fixed-length branch only introduces constraint via its offset. - unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); - g = u_gcd(g, d); - return g; - } - - // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. - // stride = len(r) when r is fixed-length; otherwise GCD. - if (seq.re.is_loop(re, r1, lo, hi)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - if (seq.re.is_loop(re, r1, lo)) { - unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); - } - - // intersection(r1, r2): lengths must be in both languages. - // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid - // stride for the intersection (every length in the intersection is - // also in r1 and in r2). - if (seq.re.is_intersection(re, r1, r2)) { - unsigned s1 = compute_length_stride(r1); - unsigned s2 = compute_length_stride(r2); - return u_gcd(s1, s2); - } - - // For complement, diff, reverse, derivative, of_pred, and anything - // else we cannot analyse statically: be conservative and return 1 - // (no useful modular constraint rather than an unsound one). - return 1; - } - - // ----------------------------------------------------------------------- - // Constraint generation - // ----------------------------------------------------------------------- - - void nseq_parith::generate_parikh_constraints(str_mem const& mem, - vector& out) { - if (!mem.m_regex || !mem.m_str) - return; - - ast_manager& m = m_sg.get_manager(); - seq_util& seq = m_sg.get_seq_util(); - arith_util arith(m); - - expr* re_expr = mem.m_regex->get_expr(); - if (!re_expr || !seq.is_re(re_expr)) - return; - - // Length bounds from the regex. - unsigned min_len = seq.re.min_length(re_expr); - unsigned max_len = seq.re.max_length(re_expr); - - // If min_len >= max_len the bounds already pin the length exactly - // (or the language is empty — empty language is detected by simplify_and_init - // via Brzozowski derivative / is_empty checks, not here). - // We only generate modular constraints when the length is variable. - if (min_len >= max_len) - return; - - unsigned stride = compute_length_stride(re_expr); - - // stride == 1: every integer length is possible — no useful constraint. - // stride == 0: fixed length or empty — handled by bounds. - if (stride <= 1) - return; - - // Build len(str) as an arithmetic expression. - expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); - - // Introduce fresh integer variable k ≥ 0. - expr_ref k_var = mk_fresh_int_var(); - - // Constraint 1: len(str) = min_len + stride · k - expr_ref min_expr(arith.mk_int(min_len), m); - expr_ref stride_expr(arith.mk_int(stride), m); - expr_ref stride_k(arith.mk_mul(stride_expr, k_var), m); - expr_ref rhs(arith.mk_add(min_expr, stride_k), m); - out.push_back(int_constraint(len_str, rhs, - int_constraint_kind::eq, mem.m_dep, m)); - - // Constraint 2: k ≥ 0 - expr_ref zero(arith.mk_int(0), m); - out.push_back(int_constraint(k_var, zero, - int_constraint_kind::ge, mem.m_dep, m)); - - // Constraint 3 (optional): k ≤ max_k when max_len is bounded. - // max_k = floor((max_len - min_len) / stride) - // This gives the solver an explicit upper bound on k. - // The subtraction is safe because min_len < max_len is guaranteed - // by the early return above. - if (max_len != UINT_MAX) { - SASSERT(max_len > min_len); - unsigned range = max_len - min_len; - unsigned max_k = range / stride; - expr_ref max_k_expr(arith.mk_int(max_k), m); - out.push_back(int_constraint(k_var, max_k_expr, - int_constraint_kind::le, mem.m_dep, m)); - } - } - - void nseq_parith::apply_to_node(nielsen_node& node) { - vector constraints; - for (str_mem const& mem : node.str_mems()) - generate_parikh_constraints(mem, constraints); - for (auto& ic : constraints) - node.add_int_constraint(ic); - } - - // ----------------------------------------------------------------------- - // Quick Parikh feasibility check (no solver call) - // ----------------------------------------------------------------------- - - // Returns true if a Parikh conflict is detected: there exists a membership - // str ∈ re for a single-variable str where the modular length constraint - // len(str) = min_len + stride * k (k ≥ 0) - // is inconsistent with the variable's current integer bounds [lb, ub]. - // - // This check is lightweight — it uses only modular arithmetic on the already- - // known regex min/max lengths and the per-variable bounds stored in the node. - bool nseq_parith::check_parikh_conflict(nielsen_node& node) { - seq_util& seq = m_sg.get_seq_util(); - - for (str_mem const& mem : node.str_mems()) { - if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) - continue; - - expr* re_expr = mem.m_regex->get_expr(); - if (!re_expr || !seq.is_re(re_expr)) - continue; - - unsigned min_len = seq.re.min_length(re_expr); - unsigned max_len = seq.re.max_length(re_expr); - if (min_len >= max_len) continue; // fixed or empty — no stride constraint - - unsigned stride = compute_length_stride(re_expr); - if (stride <= 1) continue; // no useful modular constraint - // stride > 1 guaranteed from here onward. - SASSERT(stride > 1); - - unsigned lb = node.var_lb(mem.m_str); - unsigned ub = node.var_ub(mem.m_str); - - // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? - // - // First find the smallest k satisfying the lower bound: - // k_min = 0 if min_len ≥ lb - // k_min = ⌈(lb - min_len) / stride⌉ otherwise - // - // Then verify min_len + stride * k_min ≤ ub. - unsigned k_min = 0; - if (lb > min_len) { - unsigned gap = lb - min_len; - // Ceiling division: k_min = ceil(gap / stride). - // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. - // In that case k_min would be huge, and min_len + stride*k_min would - // also overflow ub → treat as a conflict immediately. - if (gap > UINT_MAX - (stride - 1)) { - return true; // ceiling division would overflow → k_min too large - } - k_min = (gap + stride - 1) / stride; - } - - // Overflow guard: stride * k_min may overflow unsigned. - unsigned len_at_k_min; - if (k_min > (UINT_MAX - min_len) / stride) { - // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. - return true; - } - len_at_k_min = min_len + stride * k_min; - - if (ub != UINT_MAX && len_at_k_min > ub) - return true; // no valid k exists → Parikh conflict - } - return false; - } - -} // namespace seq diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h index 5d019cfe4..182f54a55 100644 --- a/src/smt/seq/nseq_parith.h +++ b/src/smt/seq/nseq_parith.h @@ -5,124 +5,16 @@ Module Name: nseq_parith.h -Abstract: +Note: - Parikh image filter for the ZIPT-based Nielsen string solver. - - Implements Parikh-based arithmetic constraint generation for - nielsen_node instances. For a regex membership constraint str ∈ r, - the Parikh image of r constrains the multiset of characters in str. - This module computes the "length stride" (period) of the length - language of r and generates modular arithmetic constraints of the form - - len(str) = min_len + stride · k (k ≥ 0, k fresh integer) - - which tighten the arithmetic subproblem beyond the simple min/max - length bounds already produced by nielsen_node::init_var_bounds_from_mems(). - - For example: - • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k - • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k - • str ∈ ab|abc → stride = 1 (no useful modular constraint) - - The generated int_constraints are added to the node's integer constraint - set and discharged by the integer subsolver (see seq_nielsen.h, - simple_solver). - - Implements the Parikh filter described in ZIPT - (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) - replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. - -Author: - - Clemens Eisenhofer 2026-03-10 - Nikolaj Bjorner (nbjorner) 2026-03-10 + This file is retained for backwards compatibility. + The canonical header is now smt/seq/seq_parikh.h. --*/ #pragma once - -#include "ast/euf/euf_sgraph.h" -#include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_parikh.h" namespace seq { - - /** - * Parikh image filter: generates modular length constraints from - * regex membership constraints in a nielsen_node. - * - * Usage: - * nseq_parith parith(sg); - * parith.apply_to_node(node); // adds constraints to node - * - * Or per-membership: - * vector out; - * parith.generate_parikh_constraints(mem, out); - */ - class nseq_parith { - euf::sgraph& m_sg; - unsigned m_fresh_cnt; // counter for fresh variable names - - // Compute the stride (period) of the length language of a regex. - // - // The stride k satisfies: all lengths in L(re) are congruent to - // min_length(re) modulo k. A stride of 1 means every integer - // length is possible (no useful modular constraint). A stride of - // 0 is a sentinel meaning the language is empty or has a single - // fixed length (already captured by bounds). - // - // Examples: - // stride(to_re("ab")) = 0 (fixed length 2) - // stride((ab)*) = 2 (lengths 0, 2, 4, ...) - // stride((abc)*) = 3 (lengths 0, 3, 6, ...) - // stride(a*b*) = 1 (all lengths possible) - // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) - // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) - unsigned compute_length_stride(expr* re); - - // Create a fresh integer variable (name "pk!N") for use as the - // Parikh multiplier variable k in len(str) = min_len + stride·k. - expr_ref mk_fresh_int_var(); - - public: - explicit nseq_parith(euf::sgraph& sg); - - // Generate Parikh modular length constraints for one membership. - // - // When stride > 1 and min_len < max_len (bounds don't pin length exactly, - // and the language is non-empty): - // adds: len(str) = min_len + stride · k (equality) - // k ≥ 0 (non-negativity) - // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) - // These tighten the integer constraint set for the subsolver. - // Dependencies are copied from mem.m_dep. - // Does nothing when min_len ≥ max_len (empty or fixed-length language). - void generate_parikh_constraints(str_mem const& mem, - vector& out); - - // Apply Parikh constraints to all memberships at a node. - // Calls generate_parikh_constraints for each str_mem in the node - // and appends the resulting int_constraints to node.int_constraints(). - void apply_to_node(nielsen_node& node); - - // Quick Parikh feasibility check (no solver call). - // - // For each single-variable membership str ∈ re, checks whether the - // modular constraint len(str) = min_len + stride · k (k ≥ 0) - // has any solution given the current per-variable bounds stored in - // node.var_lb(str) and node.var_ub(str). - // - // Returns true when a conflict is detected (no valid k exists for - // some membership). The caller should then mark the node with - // backtrack_reason::parikh_image. - // - // This is a lightweight pre-check that avoids calling the integer - // subsolver. It is sound (never returns true for a satisfiable node) - // but incomplete (may miss conflicts that require the full solver). - bool check_parikh_conflict(nielsen_node& node); - - // Compute the length stride of a regex expression. - // Exposed for testing and external callers. - unsigned get_length_stride(expr* re) { return compute_length_stride(re); } - }; - -} // namespace seq + // backwards-compat alias + using nseq_parith = seq_parikh; +} diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 4524dd7c0..81fd1b2ba 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -20,7 +20,7 @@ Author: --*/ #include "smt/seq/seq_nielsen.h" -#include "smt/seq/nseq_parith.h" +#include "smt/seq/seq_parikh.h" #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "util/hashtable.h" @@ -433,7 +433,7 @@ namespace seq { nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver): m_sg(sg), m_solver(solver), - m_parith(alloc(nseq_parith, sg)) { + m_parith(alloc(seq_parikh, sg)) { } nielsen_graph::~nielsen_graph() { diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 70ab76edc..0be8ae274 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -248,7 +248,7 @@ namespace seq { class nielsen_node; class nielsen_edge; class nielsen_graph; - class nseq_parith; // Parikh image filter (defined in nseq_parith.h) + class seq_parikh; // Parikh image filter (see seq_parikh.h) /** * Abstract interface for an incremental solver used by nielsen_graph @@ -718,7 +718,7 @@ namespace seq { // Parikh image filter: generates modular length constraints from regex // memberships. Allocated in the constructor; owned by this graph. - nseq_parith* m_parith = nullptr; + seq_parikh* m_parith = nullptr; public: // Construct with a caller-supplied solver. Ownership is NOT transferred; diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp new file mode 100644 index 000000000..255d09d72 --- /dev/null +++ b/src/smt/seq/seq_parikh.cpp @@ -0,0 +1,327 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.cpp + +Abstract: + + Parikh image filter implementation for the ZIPT-based Nielsen string + solver. See seq_parikh.h for the full design description. + + The key operation is compute_length_stride(re), which performs a + structural traversal of the regex to find the period k such that all + string lengths in L(re) are congruent to min_length(re) modulo k. + The stride is used to generate modular length constraints that help + the integer subsolver prune infeasible Nielsen graph nodes. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ + +#include "smt/seq/seq_parikh.h" +#include "util/mpz.h" +#include + +namespace seq { + + seq_parikh::seq_parikh(euf::sgraph& sg) + : m(sg.get_manager()), seq(m), a(m), m_fresh_cnt(0) {} + + expr_ref seq_parikh::mk_fresh_int_var() { + std::string name = "pk!" + std::to_string(m_fresh_cnt++); + return expr_ref(m.mk_fresh_const(name.c_str(), a.mk_int()), m); + } + + // ----------------------------------------------------------------------- + // Stride computation + // ----------------------------------------------------------------------- + + // compute_length_stride: structural traversal of regex expression. + // + // Return value semantics: + // 0 — fixed length (or empty language): no modular constraint needed + // beyond the min == max bounds. + // 1 — all integer lengths ≥ min_len are achievable: no useful modular + // constraint. + // k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k): + // modular constraint len(str) = min_len + k·j is useful. + unsigned seq_parikh::compute_length_stride(expr* re) { + if (!re) return 1; + + expr* r1 = nullptr, *r2 = nullptr, *s = nullptr; + unsigned lo = 0, hi = 0; + + // Empty language: no strings exist; stride is irrelevant. + if (seq.re.is_empty(re)) + return 0; + + // Epsilon regex {""}: single fixed length 0. + if (seq.re.is_epsilon(re)) + return 0; + + // to_re(concrete_string): fixed-length, no modular constraint needed. + if (seq.re.is_to_re(re, s)) { + // min_length == max_length, covered by bounds. + return 0; + } + + // Single character: range, full_char — fixed length 1. + if (seq.re.is_range(re) || seq.re.is_full_char(re)) + return 0; + + // full_seq (.* / Σ*): every length ≥ 0 is possible. + if (seq.re.is_full_seq(re)) + return 1; + + // r* — Kleene star. + // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... + // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. + // If r has variable length, strides from different iterations combine + // by GCD. + if (seq.re.is_star(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + // When the body has unbounded length (mx == UINT_MAX), different + // iterations can produce many different lengths, and the stride of + // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if + // mn == 1). This is conservative: we use the body's min-length + // as the only available fixed quantity. + if (mx == UINT_MAX) mx = mn; + if (mn == mx) { + // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. + // When mn == 1 the stride would be 1, which gives no useful + // modular constraint, so return 0 instead. + return (mn > 1) ? mn : 0; + } + // Variable-length body: GCD of min and max lengths + return u_gcd(mn, mx); + } + + // r+ — one or more: same stride analysis as r*. + if (seq.re.is_plus(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; // same conservative treatment as star + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + + // r? — zero or one: lengths = {0} ∪ lengths(r) + // stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length). + if (seq.re.is_opt(re, r1)) { + unsigned mn = seq.re.min_length(r1); + unsigned inner = compute_length_stride(r1); + // L(r?) includes length 0 and all lengths of L(r). + // GCD(stride(r), min_len(r)) is a valid stride because: + // - the gap from 0 to min_len(r) is min_len(r) itself, and + // - subsequent lengths grow in steps governed by stride(r). + // A result > 1 gives a useful modular constraint; result == 1 + // means every non-negative integer is achievable (no constraint). + if (inner == 0) + return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1 + return u_gcd(inner, mn); + } + + // concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)). + if (seq.re.is_concat(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // union(r1, r2): lengths from either branch → need GCD of both + // strides and the difference between their minimum lengths. + if (seq.re.is_union(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + unsigned m1 = seq.re.min_length(r1); + unsigned m2 = seq.re.min_length(r2); + unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1); + // Replace 0-strides with d for GCD computation: + // a fixed-length branch only introduces constraint via its offset. + unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2); + g = u_gcd(g, d); + return g; + } + + // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. + // stride = len(r) when r is fixed-length; otherwise GCD. + if (seq.re.is_loop(re, r1, lo, hi)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + if (seq.re.is_loop(re, r1, lo)) { + unsigned mn = seq.re.min_length(r1); + unsigned mx = seq.re.max_length(r1); + if (mx == UINT_MAX) mx = mn; + if (mn == mx) + return (mn > 1) ? mn : 0; + return u_gcd(mn, mx); + } + + // intersection(r1, r2): lengths must be in both languages. + // A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid + // stride for the intersection (every length in the intersection is + // also in r1 and in r2). + if (seq.re.is_intersection(re, r1, r2)) { + unsigned s1 = compute_length_stride(r1); + unsigned s2 = compute_length_stride(r2); + return u_gcd(s1, s2); + } + + // For complement, diff, reverse, derivative, of_pred, and anything + // else we cannot analyse statically: be conservative and return 1 + // (no useful modular constraint rather than an unsound one). + return 1; + } + + // ----------------------------------------------------------------------- + // Constraint generation + // ----------------------------------------------------------------------- + + void seq_parikh::generate_parikh_constraints(str_mem const& mem, + vector& out) { + if (!mem.m_regex || !mem.m_str) + return; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + return; + + // Length bounds from the regex. + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + + // If min_len >= max_len the bounds already pin the length exactly + // (or the language is empty — empty language is detected by simplify_and_init + // via Brzozowski derivative / is_empty checks, not here). + // We only generate modular constraints when the length is variable. + if (min_len >= max_len) + return; + + unsigned stride = compute_length_stride(re_expr); + + // stride == 1: every integer length is possible — no useful constraint. + // stride == 0: fixed length or empty — handled by bounds. + if (stride <= 1) + return; + + // Build len(str) as an arithmetic expression. + expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m); + + // Introduce fresh integer variable k ≥ 0. + expr_ref k_var = mk_fresh_int_var(); + + // Constraint 1: len(str) = min_len + stride · k + expr_ref min_expr(a.mk_int(min_len), m); + expr_ref stride_expr(a.mk_int(stride), m); + expr_ref stride_k(a.mk_mul(stride_expr, k_var), m); + expr_ref rhs(a.mk_add(min_expr, stride_k), m); + out.push_back(int_constraint(len_str, rhs, + int_constraint_kind::eq, mem.m_dep, m)); + + // Constraint 2: k ≥ 0 + expr_ref zero(a.mk_int(0), m); + out.push_back(int_constraint(k_var, zero, + int_constraint_kind::ge, mem.m_dep, m)); + + // Constraint 3 (optional): k ≤ max_k when max_len is bounded. + // max_k = floor((max_len - min_len) / stride) + // This gives the solver an explicit upper bound on k. + // The subtraction is safe because min_len < max_len is guaranteed + // by the early return above. + if (max_len != UINT_MAX) { + SASSERT(max_len > min_len); + unsigned range = max_len - min_len; + unsigned max_k = range / stride; + expr_ref max_k_expr(a.mk_int(max_k), m); + out.push_back(int_constraint(k_var, max_k_expr, + int_constraint_kind::le, mem.m_dep, m)); + } + } + + void seq_parikh::apply_to_node(nielsen_node& node) { + vector constraints; + for (str_mem const& mem : node.str_mems()) + generate_parikh_constraints(mem, constraints); + for (auto& ic : constraints) + node.add_int_constraint(ic); + } + + // ----------------------------------------------------------------------- + // Quick Parikh feasibility check (no solver call) + // ----------------------------------------------------------------------- + + // Returns true if a Parikh conflict is detected: there exists a membership + // str ∈ re for a single-variable str where the modular length constraint + // len(str) = min_len + stride * k (k ≥ 0) + // is inconsistent with the variable's current integer bounds [lb, ub]. + // + // This check is lightweight — it uses only modular arithmetic on the already- + // known regex min/max lengths and the per-variable bounds stored in the node. + bool seq_parikh::check_parikh_conflict(nielsen_node& node) { + for (str_mem const& mem : node.str_mems()) { + if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var()) + continue; + + expr* re_expr = mem.m_regex->get_expr(); + if (!re_expr || !seq.is_re(re_expr)) + continue; + + unsigned min_len = seq.re.min_length(re_expr); + unsigned max_len = seq.re.max_length(re_expr); + if (min_len >= max_len) continue; // fixed or empty — no stride constraint + + unsigned stride = compute_length_stride(re_expr); + if (stride <= 1) continue; // no useful modular constraint + // stride > 1 guaranteed from here onward. + SASSERT(stride > 1); + + unsigned lb = node.var_lb(mem.m_str); + unsigned ub = node.var_ub(mem.m_str); + + // Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ? + // + // First find the smallest k satisfying the lower bound: + // k_min = 0 if min_len ≥ lb + // k_min = ⌈(lb - min_len) / stride⌉ otherwise + // + // Then verify min_len + stride * k_min ≤ ub. + unsigned k_min = 0; + if (lb > min_len) { + unsigned gap = lb - min_len; + // Ceiling division: k_min = ceil(gap / stride). + // Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX. + // In that case k_min would be huge, and min_len + stride*k_min would + // also overflow ub → treat as a conflict immediately. + if (gap > UINT_MAX - (stride - 1)) { + return true; // ceiling division would overflow → k_min too large + } + k_min = (gap + stride - 1) / stride; + } + + // Overflow guard: stride * k_min may overflow unsigned. + unsigned len_at_k_min; + if (k_min > (UINT_MAX - min_len) / stride) { + // Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict. + return true; + } + len_at_k_min = min_len + stride * k_min; + + if (ub != UINT_MAX && len_at_k_min > ub) + return true; // no valid k exists → Parikh conflict + } + return false; + } + +} // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h new file mode 100644 index 000000000..730854832 --- /dev/null +++ b/src/smt/seq/seq_parikh.h @@ -0,0 +1,131 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.h + +Abstract: + + Parikh image filter for the ZIPT-based Nielsen string solver. + + Implements Parikh-based arithmetic constraint generation for + nielsen_node instances. For a regex membership constraint str ∈ r, + the Parikh image of r constrains the multiset of characters in str. + This module computes the "length stride" (period) of the length + language of r and generates modular arithmetic constraints of the form + + len(str) = min_len + stride · k (k ≥ 0, k fresh integer) + + which tighten the arithmetic subproblem beyond the simple min/max + length bounds already produced by nielsen_node::init_var_bounds_from_mems(). + + For example: + • str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k + • str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k + • str ∈ ab|abc → stride = 1 (no useful modular constraint) + + The generated int_constraints are added to the node's integer constraint + set and discharged by the integer subsolver (see seq_nielsen.h, + simple_solver). + + Implements the Parikh filter described in ZIPT + (https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints) + replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic. + +Author: + + Clemens Eisenhofer 2026-03-10 + Nikolaj Bjorner (nbjorner) 2026-03-10 + +--*/ +#pragma once + +#include "ast/arith_decl_plugin.h" +#include "ast/seq_decl_plugin.h" +#include "smt/seq/seq_nielsen.h" + +namespace seq { + + /** + * Parikh image filter: generates modular length constraints from + * regex membership constraints in a nielsen_node. + * + * Usage: + * seq_parikh parikh(sg); + * parikh.apply_to_node(node); // adds constraints to node + * + * Or per-membership: + * vector out; + * parikh.generate_parikh_constraints(mem, out); + */ + class seq_parikh { + ast_manager& m; + seq_util seq; + arith_util a; + unsigned m_fresh_cnt; // counter for fresh variable names + + // Compute the stride (period) of the length language of a regex. + // + // The stride k satisfies: all lengths in L(re) are congruent to + // min_length(re) modulo k. A stride of 1 means every integer + // length is possible (no useful modular constraint). A stride of + // 0 is a sentinel meaning the language is empty or has a single + // fixed length (already captured by bounds). + // + // Examples: + // stride(to_re("ab")) = 0 (fixed length 2) + // stride((ab)*) = 2 (lengths 0, 2, 4, ...) + // stride((abc)*) = 3 (lengths 0, 3, 6, ...) + // stride(a*b*) = 1 (all lengths possible) + // stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...) + // stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...) + unsigned compute_length_stride(expr* re); + + // Create a fresh integer variable (name "pk!N") for use as the + // Parikh multiplier variable k in len(str) = min_len + stride·k. + expr_ref mk_fresh_int_var(); + + public: + explicit seq_parikh(euf::sgraph& sg); + + // Generate Parikh modular length constraints for one membership. + // + // When stride > 1 and min_len < max_len (bounds don't pin length exactly, + // and the language is non-empty): + // adds: len(str) = min_len + stride · k (equality) + // k ≥ 0 (non-negativity) + // k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded) + // These tighten the integer constraint set for the subsolver. + // Dependencies are copied from mem.m_dep. + // Does nothing when min_len ≥ max_len (empty or fixed-length language). + void generate_parikh_constraints(str_mem const& mem, + vector& out); + + // Apply Parikh constraints to all memberships at a node. + // Calls generate_parikh_constraints for each str_mem in the node + // and appends the resulting int_constraints to node.int_constraints(). + void apply_to_node(nielsen_node& node); + + // Quick Parikh feasibility check (no solver call). + // + // For each single-variable membership str ∈ re, checks whether the + // modular constraint len(str) = min_len + stride · k (k ≥ 0) + // has any solution given the current per-variable bounds stored in + // node.var_lb(str) and node.var_ub(str). + // + // Returns true when a conflict is detected (no valid k exists for + // some membership). The caller should then mark the node with + // backtrack_reason::parikh_image. + // + // This is a lightweight pre-check that avoids calling the integer + // subsolver. It is sound (never returns true for a satisfiable node) + // but incomplete (may miss conflicts that require the full solver). + bool check_parikh_conflict(nielsen_node& node); + + // Compute the length stride of a regex expression. + // Exposed for testing and external callers. + unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + }; + +} // namespace seq From 45a574d7fad82d6b92ae17c2e431b45866cf8bfa Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:48:05 +0000 Subject: [PATCH 09/13] Remove nseq_parith.h and nseq_parikh.cpp backwards-compat shims Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/nseq_parikh.cpp | 15 --------------- src/smt/seq/nseq_parith.h | 20 -------------------- 2 files changed, 35 deletions(-) delete mode 100644 src/smt/seq/nseq_parikh.cpp delete mode 100644 src/smt/seq/nseq_parith.h diff --git a/src/smt/seq/nseq_parikh.cpp b/src/smt/seq/nseq_parikh.cpp deleted file mode 100644 index 56f2bb344..000000000 --- a/src/smt/seq/nseq_parikh.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/*++ -Copyright (c) 2026 Microsoft Corporation - -Module Name: - - nseq_parikh.cpp - -Note: - - This file is retained for backwards compatibility. - The canonical implementation is now smt/seq/seq_parikh.cpp. - ---*/ -// intentionally empty — see seq_parikh.cpp - diff --git a/src/smt/seq/nseq_parith.h b/src/smt/seq/nseq_parith.h deleted file mode 100644 index 182f54a55..000000000 --- a/src/smt/seq/nseq_parith.h +++ /dev/null @@ -1,20 +0,0 @@ -/*++ -Copyright (c) 2026 Microsoft Corporation - -Module Name: - - nseq_parith.h - -Note: - - This file is retained for backwards compatibility. - The canonical header is now smt/seq/seq_parikh.h. - ---*/ -#pragma once -#include "smt/seq/seq_parikh.h" - -namespace seq { - // backwards-compat alias - using nseq_parith = seq_parikh; -} From 194dfe6c610d15780fb2c077be94cb32a2a22e6a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:21:27 +0000 Subject: [PATCH 10/13] Port ZIPT Parikh features: minterm_to_char_set, char range constraints, fix stride soundness Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/seq_nielsen.cpp | 11 ++- src/smt/seq/seq_parikh.cpp | 129 ++++++++++++++++++++++++++---------- src/smt/seq/seq_parikh.h | 19 ++++++ 3 files changed, 122 insertions(+), 37 deletions(-) diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 81fd1b2ba..58fd96692 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -2301,8 +2301,15 @@ namespace seq { nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); - // TODO: derive char_set from minterm and add as range constraint - // child->add_char_range(fresh_char, minterm_to_char_set(mt)); + // Constrain fresh_char to the character class of this minterm. + // This is the key Parikh pruning step: when x → ?c · x' is + // generated from minterm m_i, ?c must belong to the character + // class described by m_i so that str ∈ derivative(R, m_i). + if (mt->get_expr()) { + char_set cs = m_parith->minterm_to_char_set(mt->get_expr()); + if (!cs.is_empty()) + child->add_char_range(fresh_char, cs); + } created = true; } diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp index 255d09d72..367412dad 100644 --- a/src/smt/seq/seq_parikh.cpp +++ b/src/smt/seq/seq_parikh.cpp @@ -80,36 +80,24 @@ namespace seq { // r* — Kleene star. // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... - // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. - // If r has variable length, strides from different iterations combine - // by GCD. + // If all lengths in L(r) are congruent to c modulo s (c = min_len, s = stride), + // then L(r*) includes lengths {0, c, c+s, 2c, 2c+s, 2c+2s, ...} and + // the overall GCD is gcd(c, s). This is strictly more accurate than + // the previous gcd(min, max) approximation, which can be unsound when + // the body contains lengths whose GCD is smaller than gcd(min, max). if (seq.re.is_star(re, r1)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - // When the body has unbounded length (mx == UINT_MAX), different - // iterations can produce many different lengths, and the stride of - // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if - // mn == 1). This is conservative: we use the body's min-length - // as the only available fixed quantity. - if (mx == UINT_MAX) mx = mn; - if (mn == mx) { - // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. - // When mn == 1 the stride would be 1, which gives no useful - // modular constraint, so return 0 instead. - return (mn > 1) ? mn : 0; - } - // Variable-length body: GCD of min and max lengths - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + // stride(r*) = gcd(min_length(r), stride(r)) + // when inner=0 (fixed-length body), gcd(mn, 0) = mn → stride = mn + return u_gcd(mn, inner); } // r+ — one or more: same stride analysis as r*. if (seq.re.is_plus(re, r1)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; // same conservative treatment as star - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } // r? — zero or one: lengths = {0} ∪ lengths(r) @@ -150,23 +138,18 @@ namespace seq { return g; } - // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. - // stride = len(r) when r is fixed-length; otherwise GCD. + // loop(r, lo, hi): the length of any word is a sum of lo..hi copies of + // lengths from L(r). Since all lengths in L(r) are ≡ min_len(r) mod + // stride(r), the overall stride is gcd(min_len(r), stride(r)). if (seq.re.is_loop(re, r1, lo, hi)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } if (seq.re.is_loop(re, r1, lo)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } // intersection(r1, r2): lengths must be in both languages. @@ -324,4 +307,80 @@ namespace seq { return false; } + // ----------------------------------------------------------------------- + // minterm → char_set conversion + // ----------------------------------------------------------------------- + + // Convert a regex minterm expression to a char_set. + // + // Minterms are Boolean combinations of character-class predicates + // (re.range, re.full_char, complement, intersection) produced by + // sgraph::compute_minterms(). This function converts them to a + // concrete char_set so that fresh character variables introduced by + // apply_regex_var_split can be constrained with add_char_range(). + // + // The implementation is recursive and handles: + // re.full_char → [0, max_char] (full alphabet) + // re.range(lo, hi) → [lo, hi+1) (hi inclusive in Z3) + // re.complement(r) → alphabet \ minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sides + // re.diff(r1, r2) → r1 ∩ complement(r2) + // re.to_re(str.unit(c)) → singleton {c} + // re.empty → empty set + // anything else → [0, max_char] (conservative) + char_set seq_parikh::minterm_to_char_set(expr* re_expr) { + unsigned max_c = seq.max_char(); + + if (!re_expr) + return char_set::full(max_c); + + // full_char: the whole alphabet [0, max_char] + if (seq.re.is_full_char(re_expr)) + return char_set::full(max_c); + + // range [lo, hi] (hi inclusive in Z3's regex representation) + unsigned lo = 0, hi = 0; + if (seq.re.is_range(re_expr, lo, hi)) { + if (lo > hi) return char_set(); + // char_range uses exclusive upper bound; Z3 hi is inclusive + return char_set(char_range(lo, hi + 1)); + } + + // complement: alphabet minus the inner set + expr* inner = nullptr; + if (seq.re.is_complement(re_expr, inner)) + return minterm_to_char_set(inner).complement(max_c); + + // intersection: characters present in both sets + expr* r1 = nullptr, *r2 = nullptr; + if (seq.re.is_intersection(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2)); + + // difference: r1 minus r2 = r1 ∩ complement(r2) + if (seq.re.is_diff(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with( + minterm_to_char_set(r2).complement(max_c)); + + // to_re(str.unit(c)): singleton character set + expr* str_arg = nullptr; + if (seq.re.is_to_re(re_expr, str_arg)) { + expr* ch_expr = nullptr; + unsigned char_val = 0; + if (seq.str.is_unit(str_arg, ch_expr) && + seq.is_const_char(ch_expr, char_val)) { + char_set cs; + cs.add(char_val); + return cs; + } + } + + // empty regex: no characters can appear + if (seq.re.is_empty(re_expr)) + return char_set(); + + // Conservative fallback: return the full alphabet so that + // no unsound constraints are added for unrecognised expressions. + return char_set::full(max_c); + } + } // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h index 730854832..8e8a4e178 100644 --- a/src/smt/seq/seq_parikh.h +++ b/src/smt/seq/seq_parikh.h @@ -126,6 +126,25 @@ namespace seq { // Compute the length stride of a regex expression. // Exposed for testing and external callers. unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + + // Convert a regex minterm expression to a char_set. + // + // A minterm is a Boolean combination of character-class predicates + // (re.range, re.full_char, complement, intersection) that names a + // single, indivisible character equivalence class. Minterms are + // produced by sgraph::compute_minterms and used in + // apply_regex_var_split to constrain fresh character variables. + // + // Supported expressions: + // re.full_char → full set [0, max_char] + // re.range(lo, hi) → char_set [lo, hi] (inclusive on both ends) + // re.complement(r) → complement of minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sets + // re.diff(r1, r2) → r1 minus r2 (= r1 ∩ complement(r2)) + // re.to_re(unit(c)) → singleton {c} + // re.empty → empty set + // anything else → full set (conservative, sound over-approximation) + char_set minterm_to_char_set(expr* minterm_re); }; } // namespace seq From 30c2a9ccdc44cc56dcca408b1a5e1a3176504b2d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:23:00 +0000 Subject: [PATCH 11/13] Address code review: add SASSERT for lo>hi, simplify is_to_re handling Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/seq_parikh.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp index 367412dad..3f6deba40 100644 --- a/src/smt/seq/seq_parikh.cpp +++ b/src/smt/seq/seq_parikh.cpp @@ -341,6 +341,8 @@ namespace seq { // range [lo, hi] (hi inclusive in Z3's regex representation) unsigned lo = 0, hi = 0; if (seq.re.is_range(re_expr, lo, hi)) { + // lo > hi is a degenerate range; should not arise from well-formed minterms + SASSERT(lo <= hi); if (lo > hi) return char_set(); // char_range uses exclusive upper bound; Z3 hi is inclusive return char_set(char_range(lo, hi + 1)); @@ -363,15 +365,14 @@ namespace seq { // to_re(str.unit(c)): singleton character set expr* str_arg = nullptr; - if (seq.re.is_to_re(re_expr, str_arg)) { - expr* ch_expr = nullptr; - unsigned char_val = 0; - if (seq.str.is_unit(str_arg, ch_expr) && - seq.is_const_char(ch_expr, char_val)) { - char_set cs; - cs.add(char_val); - return cs; - } + expr* ch_expr = nullptr; + unsigned char_val = 0; + if (seq.re.is_to_re(re_expr, str_arg) && + seq.str.is_unit(str_arg, ch_expr) && + seq.is_const_char(ch_expr, char_val)) { + char_set cs; + cs.add(char_val); + return cs; } // empty regex: no characters can appear From 4b2f5e2bb0f1e634efc2ffa0359dea51713aee86 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 18:36:28 +0000 Subject: [PATCH 12/13] Add seq_parikh unit tests: 32 tests covering stride, constraints, conflict, char_set Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/test/CMakeLists.txt | 1 + src/test/main.cpp | 1 + src/test/seq_parikh.cpp | 883 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 885 insertions(+) create mode 100644 src/test/seq_parikh.cpp diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index acc598edf..1620bd790 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -132,6 +132,7 @@ add_executable(test-z3 sls_test.cpp sls_seq_plugin.cpp seq_nielsen.cpp + seq_parikh.cpp nseq_basic.cpp nseq_regex.cpp nseq_zipt.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index 9fc8bf1ec..0f378df67 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -287,6 +287,7 @@ int main(int argc, char ** argv) { TST(scoped_vector); TST(sls_seq_plugin); TST(seq_nielsen); + TST(seq_parikh); TST(nseq_basic); TST(nseq_regex); TST(nseq_zipt); diff --git a/src/test/seq_parikh.cpp b/src/test/seq_parikh.cpp new file mode 100644 index 000000000..83bbc3da4 --- /dev/null +++ b/src/test/seq_parikh.cpp @@ -0,0 +1,883 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + seq_parikh.cpp + +Abstract: + + Unit tests for seq_parikh (Parikh image filter for the ZIPT Nielsen solver). + + Tests cover: + - compute_length_stride / get_length_stride for all regex forms + - generate_parikh_constraints: constraint shape, count, and dependency + - apply_to_node: integration with nielsen_node + - check_parikh_conflict: lightweight feasibility pre-check + - minterm_to_char_set: regex-minterm to char_set conversion + +Author: + + Clemens Eisenhofer 2026-03-11 + Nikolaj Bjorner (nbjorner) 2026-03-11 + +--*/ + +#include "util/util.h" +#include "util/zstring.h" +#include "ast/euf/euf_egraph.h" +#include "ast/euf/euf_sgraph.h" +#include "smt/seq/seq_nielsen.h" +#include "smt/seq/seq_parikh.h" +#include "ast/arith_decl_plugin.h" +#include "ast/reg_decl_plugins.h" +#include "ast/ast_pp.h" +#include + +// --------------------------------------------------------------------------- +// Minimal solver stub (no-op) +// --------------------------------------------------------------------------- +class parikh_test_solver : public seq::simple_solver { +public: + void push() override {} + void pop(unsigned) override {} + void assert_expr(expr*) override {} + lbool check() override { return l_true; } +}; + +// --------------------------------------------------------------------------- +// Helpers to build common regex expressions +// --------------------------------------------------------------------------- + +// build to_re("AB") — a fixed two-character string regex +static expr_ref mk_to_re_ab(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref ch_b(seq.str.mk_char('B'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref unit_b(seq.str.mk_unit(ch_b), m); + expr_ref ab(seq.str.mk_concat(unit_a, unit_b), m); + return expr_ref(seq.re.mk_to_re(ab), m); +} + +// build (ab)* — star of the two-character sequence +static expr_ref mk_ab_star(ast_manager& m, seq_util& seq) { + return expr_ref(seq.re.mk_star(mk_to_re_ab(m, seq)), m); +} + +// build (abc)* — star of a three-character sequence +static expr_ref mk_abc_star(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref ch_b(seq.str.mk_char('B'), m); + expr_ref ch_c(seq.str.mk_char('C'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref unit_b(seq.str.mk_unit(ch_b), m); + expr_ref unit_c(seq.str.mk_unit(ch_c), m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + expr_ref abc(seq.str.mk_concat(unit_a, seq.str.mk_concat(unit_b, unit_c)), m); + return expr_ref(seq.re.mk_star(seq.re.mk_to_re(abc)), m); +} + +// build to_re("A") — single-character regex +static expr_ref mk_to_re_a(ast_manager& m, seq_util& seq) { + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + return expr_ref(seq.re.mk_to_re(unit_a), m); +} + +// --------------------------------------------------------------------------- +// Stride tests: compute_length_stride / get_length_stride +// --------------------------------------------------------------------------- + +// stride(to_re("AB")) == 0 (fixed length) +static void test_stride_fixed_length() { + std::cout << "test_stride_fixed_length\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_to_re_ab(m, seq); + SASSERT(parikh.get_length_stride(re) == 0); +} + +// stride((ab)*) == 2 +static void test_stride_star_fixed_body() { + std::cout << "test_stride_star_fixed_body\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_ab_star(m, seq); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride((abc)*) == 3 +static void test_stride_star_three_char() { + std::cout << "test_stride_star_three_char\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re = mk_abc_star(m, seq); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((abc)*) = " << stride << "\n"; + SASSERT(stride == 3); +} + +// stride((ab)+) == 2 +static void test_stride_plus() { + std::cout << "test_stride_plus\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref re_body = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_plus(re_body), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)+) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(a* b*) == 1 — union of independent stars → every length possible +static void test_stride_concat_stars() { + std::cout << "test_stride_concat_stars\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref a_star(seq.re.mk_star(mk_to_re_a(m, seq)), m); + expr_ref b_star(seq.re.mk_star(mk_to_re_a(m, seq)), m); + expr_ref re(seq.re.mk_concat(a_star, b_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(a* b*) = " << stride << "\n"; + // both stars have stride 1 (single-char body → gcd(1,0)=1) → gcd(1,1)=1 + SASSERT(stride == 1); +} + +// stride((ab)* | (abc)*) == gcd(2,3) = 1 +static void test_stride_union_no_common_period() { + std::cout << "test_stride_union_no_common_period\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab_star = mk_ab_star(m, seq); + expr_ref abc_star = mk_abc_star(m, seq); + expr_ref re(seq.re.mk_union(ab_star, abc_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*|(abc)*) = " << stride << "\n"; + // lengths: {0,2,4,...} union {0,3,6,...} → GCD(2,3)=1 + SASSERT(stride == 1); +} + +// stride((ab)*|(de)*) == gcd(2,2) = 2 +static void test_stride_union_same_period() { + std::cout << "test_stride_union_same_period\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab_star = mk_ab_star(m, seq); + // de_star: (de)* — same stride 2 + expr_ref ch_d(seq.str.mk_char('D'), m); + expr_ref ch_e(seq.str.mk_char('E'), m); + expr_ref unit_d(seq.str.mk_unit(ch_d), m); + expr_ref unit_e(seq.str.mk_unit(ch_e), m); + expr_ref de(seq.str.mk_concat(unit_d, unit_e), m); + expr_ref de_star(seq.re.mk_star(seq.re.mk_to_re(de)), m); + expr_ref re(seq.re.mk_union(ab_star, de_star), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)*|(de)*) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(loop((ab), 1, 3)) == 2 — loop with fixed-length body +static void test_stride_loop() { + std::cout << "test_stride_loop\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_loop(ab, 1, 3), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(loop(ab,1,3)) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// stride(re.full_seq) == 1 (every length possible) +static void test_stride_full_seq() { + std::cout << "test_stride_full_seq\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_full_seq(str_sort), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(.*) = " << stride << "\n"; + SASSERT(stride == 1); +} + +// stride(re.empty) == 0 +static void test_stride_empty_regex() { + std::cout << "test_stride_empty_regex\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_empty(str_sort), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(empty) = " << stride << "\n"; + SASSERT(stride == 0); +} + +// stride(re.epsilon) == 0 +static void test_stride_epsilon() { + std::cout << "test_stride_epsilon\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + // epsilon is to_re("") — empty string + sort_ref str_s(seq.str.mk_string_sort(), m); + expr_ref empty_str(seq.str.mk_empty(str_s), m); + expr_ref re(seq.re.mk_to_re(empty_str), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride(epsilon) = " << stride << "\n"; + SASSERT(stride == 0); +} + +// stride((ab)?) == 2 (gcd(2, 0) = 2 via opt handling; min_len(ab)=2) +static void test_stride_opt() { + std::cout << "test_stride_opt\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_opt(ab), m); + unsigned stride = parikh.get_length_stride(re); + std::cout << " stride((ab)?) = " << stride << "\n"; + SASSERT(stride == 2); +} + +// --------------------------------------------------------------------------- +// generate_parikh_constraints tests +// --------------------------------------------------------------------------- + +// (ab)* → len(x) = 0 + 2·k, k ≥ 0 (stride 2, min_len 0) +static void test_generate_constraints_ab_star() { + std::cout << "test_generate_constraints_ab_star\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + arith_util arith(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // expect at least: len(x)=0+2k and k>=0 + // (no upper bound because max_length is UINT_MAX for unbounded star) + std::cout << " generated " << out.size() << " constraints\n"; + SASSERT(out.size() >= 2); + + // check that one constraint is an equality (len(x) = 0 + 2·k) + bool has_eq = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::eq) has_eq = true; + SASSERT(has_eq); + + // check that one constraint is k >= 0 + bool has_ge = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::ge) has_ge = true; + SASSERT(has_ge); + + // should NOT have an upper bound (star is unbounded) + bool has_le = false; + for (auto const& ic : out) + if (ic.m_kind == seq::int_constraint_kind::le) has_le = true; + SASSERT(!has_le); +} + +// loop((ab), 1, 3): bounded → k ≤ floor((6-2)/2) = 2 +static void test_generate_constraints_bounded_loop() { + std::cout << "test_generate_constraints_bounded_loop\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + // loop("ab", 1, 3): min_len=2, max_len=6, stride=2 + expr_ref ab = mk_to_re_ab(m, seq); + expr_ref re(seq.re.mk_loop(ab, 1, 3), m); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // expect: eq + ge + le = 3 constraints + std::cout << " generated " << out.size() << " constraints\n"; + SASSERT(out.size() == 3); + + bool has_eq = false, has_ge = false, has_le = false; + for (auto const& ic : out) { + if (ic.m_kind == seq::int_constraint_kind::eq) has_eq = true; + if (ic.m_kind == seq::int_constraint_kind::ge) has_ge = true; + if (ic.m_kind == seq::int_constraint_kind::le) has_le = true; + } + SASSERT(has_eq); + SASSERT(has_ge); + SASSERT(has_le); +} + +// stride == 1 → no constraints generated +static void test_generate_constraints_stride_one() { + std::cout << "test_generate_constraints_stride_one\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + euf::snode* x = sg.mk_var(symbol("x")); + // full_seq: stride=1 → no modular constraint + expr_ref re(seq.re.mk_full_seq(str_sort), m); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + std::cout << " generated " << out.size() << " constraints (expect 0)\n"; + SASSERT(out.empty()); +} + +// fixed-length regex (min == max) → no constraints generated +static void test_generate_constraints_fixed_length() { + std::cout << "test_generate_constraints_fixed_length\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_to_re_ab(m, seq); // fixed len 2 + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(0); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + std::cout << " generated " << out.size() << " constraints (expect 0)\n"; + SASSERT(out.empty()); +} + +// dependency is propagated to all generated constraints +static void test_generate_constraints_dep_propagated() { + std::cout << "test_generate_constraints_dep_propagated\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); + euf::snode* regex = sg.mk(re); + seq::dep_tracker dep; dep.insert(7); + seq::str_mem mem(x, regex, nullptr, 0, dep); + + vector out; + parikh.generate_parikh_constraints(mem, out); + + // all generated constraints must carry bit 7 in their dependency + for (auto const& ic : out) { + SASSERT(!ic.m_dep.empty()); + seq::dep_tracker d7; d7.insert(7); + SASSERT(d7.subset_of(ic.m_dep)); + } + std::cout << " all constraints carry dep bit 7\n"; +} + +// --------------------------------------------------------------------------- +// apply_to_node tests +// --------------------------------------------------------------------------- + +// applying to a node with one membership adds constraints to node +static void test_apply_to_node_adds_constraints() { + std::cout << "test_apply_to_node_adds_constraints\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2 → generates constraints + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // root node should have no int_constraints initially + SASSERT(ng.root() != nullptr); + unsigned before = ng.root()->int_constraints().size(); + + parikh.apply_to_node(*ng.root()); + + unsigned after = ng.root()->int_constraints().size(); + std::cout << " before=" << before << " after=" << after << "\n"; + SASSERT(after > before); +} + +// applying twice is idempotent (m_parikh_applied would prevent double-add +// via nielsen_graph::apply_parikh_to_node, but seq_parikh::apply_to_node +// itself does not guard — so calling apply_to_node directly adds again; +// this test verifies the direct call does add, not the idempotency guard) +static void test_apply_to_node_stride_one_no_constraints() { + std::cout << "test_apply_to_node_stride_one_no_constraints\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re(seq.re.mk_full_seq(str_sort), m); // stride 1 → no constraints + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + unsigned before = ng.root()->int_constraints().size(); + parikh.apply_to_node(*ng.root()); + unsigned after = ng.root()->int_constraints().size(); + std::cout << " before=" << before << " after=" << after << " (expect no change)\n"; + SASSERT(after == before); +} + +// --------------------------------------------------------------------------- +// check_parikh_conflict tests +// --------------------------------------------------------------------------- + +// no conflict when var_lb=0, var_ub=UINT_MAX (unconstrained) +static void test_check_conflict_unconstrained_no_conflict() { + std::cout << "test_check_conflict_unconstrained_no_conflict\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0 + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // no bounds set → default lb=0, ub=UINT_MAX → no conflict + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0)\n"; + SASSERT(!conflict); +} + +// conflict: lb=3, ub=5, stride=2, min_len=0 +// valid lengths: 0,2,4,6,... ∩ [3,5] = {4} → no conflict +static void test_check_conflict_valid_k_exists() { + std::cout << "test_check_conflict_valid_k_exists\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0; lengths 0,2,4,... + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=3, ub=5: length 4 is achievable (k=2) → no conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 3, dep); + ng.root()->add_upper_int_bound(x, 5, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0)\n"; + SASSERT(!conflict); +} + +// conflict: lb=3, ub=3, stride=2, min_len=0 +// valid lengths: {0,2,4,...} ∩ [3,3] = {} → conflict +static void test_check_conflict_no_valid_k() { + std::cout << "test_check_conflict_no_valid_k\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_ab_star(m, seq); // stride 2, min_len 0; lengths {0,2,4,...} + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=3, ub=3: only odd length 3 — never a multiple of 2 → conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 3, dep); + ng.root()->add_upper_int_bound(x, 3, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 1)\n"; + SASSERT(conflict); +} + +// conflict: lb=5, ub=5, stride=3, min_len=0 +// valid lengths of (abc)*: {0,3,6,...} ∩ {5} = {} → conflict +static void test_check_conflict_abc_star() { + std::cout << "test_check_conflict_abc_star\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re = mk_abc_star(m, seq); // stride 3, min_len 0; lengths {0,3,6,...} + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + // lb=5, ub=5 → no valid k (5 is not a multiple of 3) → conflict + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 5, dep); + ng.root()->add_upper_int_bound(x, 5, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 1)\n"; + SASSERT(conflict); +} + +// no conflict for stride==1 regex even with narrow bounds +static void test_check_conflict_stride_one_never_conflicts() { + std::cout << "test_check_conflict_stride_one_never_conflicts\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + sort_ref str_sort(seq.str.mk_string_sort(), m); + parikh_test_solver solver; + seq::nielsen_graph ng(sg, solver); + seq::seq_parikh parikh(sg); + + euf::snode* x = sg.mk_var(symbol("x")); + expr_ref re(seq.re.mk_full_seq(str_sort), m); // stride 1 → no constraint + euf::snode* regex = sg.mk(re); + ng.add_str_mem(x, regex); + + seq::dep_tracker dep; dep.insert(0); + ng.root()->add_lower_int_bound(x, 7, dep); + ng.root()->add_upper_int_bound(x, 7, dep); + + bool conflict = parikh.check_parikh_conflict(*ng.root()); + std::cout << " conflict = " << conflict << " (expect 0: stride=1 skipped)\n"; + SASSERT(!conflict); +} + +// --------------------------------------------------------------------------- +// minterm_to_char_set tests +// --------------------------------------------------------------------------- + +// re.full_char → full alphabet [0, max_char] +static void test_minterm_full_char() { + std::cout << "test_minterm_full_char\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_full_char(str_sort), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " full_char char_count = " << cs.char_count() << "\n"; + SASSERT(cs.is_full(seq.max_char())); +} + +// re.empty → empty char_set +static void test_minterm_empty_regex() { + std::cout << "test_minterm_empty_regex\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref re(seq.re.mk_empty(str_sort), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " empty regex → char_set empty: " << cs.is_empty() << "\n"; + SASSERT(cs.is_empty()); +} + +// re.range('A','Z') → 26 characters +static void test_minterm_range() { + std::cout << "test_minterm_range\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + // Z3 re.range takes string arguments "A" and "Z" + expr_ref lo_str(seq.str.mk_string(zstring("A")), m); + expr_ref hi_str(seq.str.mk_string(zstring("Z")), m); + expr_ref re(seq.re.mk_range(lo_str, hi_str), m); + char_set cs = parikh.minterm_to_char_set(re); + std::cout << " range(A,Z) char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 26); + SASSERT(cs.contains('A')); + SASSERT(cs.contains('Z')); + SASSERT(!cs.contains('a')); +} + +// complement of re.range('A','Z') should not contain A-Z +static void test_minterm_complement() { + std::cout << "test_minterm_complement\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + sort_ref str_sort(seq.str.mk_string_sort(), m); + + expr_ref lo_str(seq.str.mk_string(zstring("A")), m); + expr_ref hi_str(seq.str.mk_string(zstring("Z")), m); + expr_ref range(seq.re.mk_range(lo_str, hi_str), m); + expr_ref re(seq.re.mk_complement(range), m); + char_set cs = parikh.minterm_to_char_set(re); + + // complement of [A-Z] should not contain any letter in A-Z + for (unsigned c = 'A'; c <= 'Z'; ++c) + SASSERT(!cs.contains(c)); + // but should contain e.g. '0' + SASSERT(cs.contains('0')); + std::cout << " complement ok: A-Z excluded, '0' included\n"; +} + +// intersection of range('A','Z') and range('M','Z') == range('M','Z') +static void test_minterm_intersection() { + std::cout << "test_minterm_intersection\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref lo_az(seq.str.mk_string(zstring("A")), m); + expr_ref hi_az(seq.str.mk_string(zstring("Z")), m); + expr_ref lo_mz(seq.str.mk_string(zstring("M")), m); + + expr_ref range_az(seq.re.mk_range(lo_az, hi_az), m); + expr_ref range_mz(seq.re.mk_range(lo_mz, hi_az), m); + expr_ref re(seq.re.mk_inter(range_az, range_mz), m); + char_set cs = parikh.minterm_to_char_set(re); + + // intersection [A-Z] ∩ [M-Z] = [M-Z]: 14 characters + std::cout << " intersection [A-Z]∩[M-Z] char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 14); // M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z + SASSERT(!cs.contains('A')); + SASSERT(cs.contains('M')); + SASSERT(cs.contains('Z')); +} + +// diff(range('A','Z'), range('A','M')) == range('N','Z') +static void test_minterm_diff() { + std::cout << "test_minterm_diff\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref lo_az(seq.str.mk_string(zstring("A")), m); + expr_ref hi_az(seq.str.mk_string(zstring("Z")), m); + expr_ref lo_am(seq.str.mk_string(zstring("A")), m); + expr_ref hi_am(seq.str.mk_string(zstring("M")), m); + + expr_ref range_az(seq.re.mk_range(lo_az, hi_az), m); + expr_ref range_am(seq.re.mk_range(lo_am, hi_am), m); + expr_ref re(seq.re.mk_diff(range_az, range_am), m); + char_set cs = parikh.minterm_to_char_set(re); + + // diff [A-Z] \ [A-M] = [N-Z]: 13 characters + std::cout << " diff [A-Z]\\[A-M] char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 13); // N..Z + SASSERT(!cs.contains('A')); + SASSERT(!cs.contains('M')); + SASSERT(cs.contains('N')); + SASSERT(cs.contains('Z')); +} + +// to_re(unit('A')) → singleton {'A'} +static void test_minterm_singleton() { + std::cout << "test_minterm_singleton\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq_util seq(m); + seq::seq_parikh parikh(sg); + + expr_ref ch_a(seq.str.mk_char('A'), m); + expr_ref unit_a(seq.str.mk_unit(ch_a), m); + expr_ref re(seq.re.mk_to_re(unit_a), m); + char_set cs = parikh.minterm_to_char_set(re); + + std::cout << " singleton char_count = " << cs.char_count() << "\n"; + SASSERT(cs.char_count() == 1); + SASSERT(cs.contains('A')); + SASSERT(!cs.contains('B')); +} + +// nullptr → full set (conservative fallback) +static void test_minterm_nullptr_is_full() { + std::cout << "test_minterm_nullptr_is_full\n"; + ast_manager m; + reg_decl_plugins(m); + euf::egraph eg(m); + euf::sgraph sg(m, eg); + seq::seq_parikh parikh(sg); + seq_util seq(m); + + char_set cs = parikh.minterm_to_char_set(nullptr); + SASSERT(cs.is_full(seq.max_char())); + std::cout << " nullptr → full set ok\n"; +} + +// --------------------------------------------------------------------------- +// Entry point +// --------------------------------------------------------------------------- + +void tst_seq_parikh() { + // stride tests + test_stride_fixed_length(); + test_stride_star_fixed_body(); + test_stride_star_three_char(); + test_stride_plus(); + test_stride_concat_stars(); + test_stride_union_no_common_period(); + test_stride_union_same_period(); + test_stride_loop(); + test_stride_full_seq(); + test_stride_empty_regex(); + test_stride_epsilon(); + test_stride_opt(); + + // generate_parikh_constraints tests + test_generate_constraints_ab_star(); + test_generate_constraints_bounded_loop(); + test_generate_constraints_stride_one(); + test_generate_constraints_fixed_length(); + test_generate_constraints_dep_propagated(); + + // apply_to_node tests + test_apply_to_node_adds_constraints(); + test_apply_to_node_stride_one_no_constraints(); + + // check_parikh_conflict tests + test_check_conflict_unconstrained_no_conflict(); + test_check_conflict_valid_k_exists(); + test_check_conflict_no_valid_k(); + test_check_conflict_abc_star(); + test_check_conflict_stride_one_never_conflicts(); + + // minterm_to_char_set tests + test_minterm_full_char(); + test_minterm_empty_regex(); + test_minterm_range(); + test_minterm_complement(); + test_minterm_intersection(); + test_minterm_diff(); + test_minterm_singleton(); + test_minterm_nullptr_is_full(); +} From a51ba544ea3e90d9d78dbddebc27c0829187f031 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 19:08:11 +0000 Subject: [PATCH 13/13] Fix typo: rename m_parith to m_parikh in seq_nielsen.h/.cpp Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/seq_nielsen.cpp | 10 +++++----- src/smt/seq/seq_nielsen.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 58fd96692..d2f3a9eec 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -433,11 +433,11 @@ namespace seq { nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver): m_sg(sg), m_solver(solver), - m_parith(alloc(seq_parikh, sg)) { + m_parikh(alloc(seq_parikh, sg)) { } nielsen_graph::~nielsen_graph() { - dealloc(m_parith); + dealloc(m_parikh); reset(); } @@ -1041,12 +1041,12 @@ namespace seq { // Generate modular length constraints (len(str) = min_len + stride·k, etc.) // and append them to the node's integer constraint list. - m_parith->apply_to_node(node); + m_parikh->apply_to_node(node); // Lightweight feasibility pre-check: does the Parikh modular constraint // contradict the variable's current integer bounds? If so, mark this // node as a Parikh-image conflict immediately (avoids a solver call). - if (!node.is_currently_conflict() && m_parith->check_parikh_conflict(node)) { + if (!node.is_currently_conflict() && m_parikh->check_parikh_conflict(node)) { node.set_general_conflict(true); node.set_reason(backtrack_reason::parikh_image); } @@ -2306,7 +2306,7 @@ namespace seq { // generated from minterm m_i, ?c must belong to the character // class described by m_i so that str ∈ derivative(R, m_i). if (mt->get_expr()) { - char_set cs = m_parith->minterm_to_char_set(mt->get_expr()); + char_set cs = m_parikh->minterm_to_char_set(mt->get_expr()); if (!cs.is_empty()) child->add_char_range(fresh_char, cs); } diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 0be8ae274..36adcc9ae 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -718,7 +718,7 @@ namespace seq { // Parikh image filter: generates modular length constraints from regex // memberships. Allocated in the constructor; owned by this graph. - seq_parikh* m_parith = nullptr; + seq_parikh* m_parikh = nullptr; public: // Construct with a caller-supplied solver. Ownership is NOT transferred;