3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-03-17 10:33:48 +00:00

Add nseq_parith.h and nseq_parikh.cpp: Parikh filter for ZIPT string solver

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-03-11 00:05:26 +00:00
parent 3054f0cb41
commit 334df71b11
3 changed files with 387 additions and 0 deletions

View file

@ -1,5 +1,6 @@
z3_add_component(smt_seq
SOURCES
nseq_parikh.cpp
seq_nielsen.cpp
COMPONENT_DEPENDENCIES
euf

274
src/smt/seq/nseq_parikh.cpp Normal file
View file

@ -0,0 +1,274 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
nseq_parikh.cpp
Abstract:
Parikh image filter implementation for the ZIPT-based Nielsen string
solver. See nseq_parith.h for the full design description.
The key operation is compute_length_stride(re), which performs a
structural traversal of the regex to find the period k such that all
string lengths in L(re) are congruent to min_length(re) modulo k.
The stride is used to generate modular length constraints that help
the integer subsolver prune infeasible Nielsen graph nodes.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-10
--*/
#include "smt/seq/nseq_parith.h"
#include "ast/arith_decl_plugin.h"
#include "ast/seq_decl_plugin.h"
#include <string>
namespace seq {
// -----------------------------------------------------------------------
// Helpers
// -----------------------------------------------------------------------
// GCD via Euclidean algorithm. gcd(0, x) = x, gcd(0, 0) = 0.
unsigned nseq_parith::gcd(unsigned a, unsigned b) {
if (a == 0 && b == 0) return 0;
while (b != 0) {
unsigned t = b;
b = a % b;
a = t;
}
return a;
}
nseq_parith::nseq_parith(euf::sgraph& sg)
: m_sg(sg), m_fresh_cnt(0) {}
expr_ref nseq_parith::mk_fresh_int_var() {
ast_manager& m = m_sg.get_manager();
arith_util arith(m);
std::string name = "pk!" + std::to_string(m_fresh_cnt++);
return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m);
}
// -----------------------------------------------------------------------
// Stride computation
// -----------------------------------------------------------------------
// compute_length_stride: structural traversal of regex expression.
//
// Return value semantics:
// 0 — fixed length (or empty language): no modular constraint needed
// beyond the min == max bounds.
// 1 — all integer lengths ≥ min_len are achievable: no useful modular
// constraint.
// k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k):
// modular constraint len(str) = min_len + k·j is useful.
unsigned nseq_parith::compute_length_stride(expr* re) {
if (!re) return 1;
seq_util& seq = m_sg.get_seq_util();
expr* r1 = nullptr, *r2 = nullptr, *s = nullptr;
unsigned lo = 0, hi = 0;
// Empty language: no strings exist; stride is irrelevant.
if (seq.re.is_empty(re))
return 0;
// Epsilon regex {""}: single fixed length 0.
if (seq.re.is_epsilon(re))
return 0;
// to_re(concrete_string): fixed-length, no modular constraint needed.
if (seq.re.is_to_re(re, s)) {
// min_length == max_length, covered by bounds.
return 0;
}
// Single character: range, full_char — fixed length 1.
if (seq.re.is_range(re) || seq.re.is_full_char(re))
return 0;
// full_seq (.* / Σ*): every length ≥ 0 is possible.
if (seq.re.is_full_seq(re))
return 1;
// r* — Kleene star.
// L(r*) = {ε} L(r) L(r)·L(r) ...
// If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k.
// If r has variable length, strides from different iterations combine
// by GCD.
if (seq.re.is_star(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned mx = seq.re.max_length(r1);
// When the body has unbounded length (mx == UINT_MAX), different
// iterations can produce many different lengths, and the stride of
// the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if
// mn == 1). This is conservative: we use the body's min-length
// as the only available fixed quantity.
if (mx == UINT_MAX) mx = mn;
if (mn == mx) {
// Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn.
// When mn == 1 the stride would be 1, which gives no useful
// modular constraint, so return 0 instead.
return (mn > 1) ? mn : 0;
}
// Variable-length body: GCD of min and max lengths
return gcd(mn, mx);
}
// r+ — one or more: same stride analysis as r*.
if (seq.re.is_plus(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned mx = seq.re.max_length(r1);
if (mx == UINT_MAX) mx = mn; // same conservative treatment as star
if (mn == mx)
return (mn > 1) ? mn : 0;
return gcd(mn, mx);
}
// r? — zero or one: lengths = {0} lengths(r)
// stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length).
if (seq.re.is_opt(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
// L(r?) includes length 0 and all lengths of L(r).
// GCD(stride(r), min_len(r)) is a valid stride because:
// - the gap from 0 to min_len(r) is min_len(r) itself, and
// - subsequent lengths grow in steps governed by stride(r).
// A result > 1 gives a useful modular constraint; result == 1
// means every non-negative integer is achievable (no constraint).
if (inner == 0)
return gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1
return gcd(inner, mn);
}
// concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)).
if (seq.re.is_concat(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
// 0 (fixed) on either side: result is governed by the other.
if (s1 == 0 && s2 == 0) return 0;
if (s1 == 0) return s2;
if (s2 == 0) return s1;
return gcd(s1, s2);
}
// union(r1, r2): lengths from either branch → need GCD of both
// strides and the difference between their minimum lengths.
if (seq.re.is_union(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
unsigned m1 = seq.re.min_length(r1);
unsigned m2 = seq.re.min_length(r2);
unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1);
// Replace 0-strides with d for GCD computation:
// a fixed-length branch only introduces constraint via its offset.
unsigned g = gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2);
g = gcd(g, d);
return g;
}
// loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed.
// stride = len(r) when r is fixed-length; otherwise GCD.
if (seq.re.is_loop(re, r1, lo, hi)) {
unsigned mn = seq.re.min_length(r1);
unsigned mx = seq.re.max_length(r1);
if (mx == UINT_MAX) mx = mn;
if (mn == mx)
return (mn > 1) ? mn : 0;
return gcd(mn, mx);
}
if (seq.re.is_loop(re, r1, lo)) {
unsigned mn = seq.re.min_length(r1);
unsigned mx = seq.re.max_length(r1);
if (mx == UINT_MAX) mx = mn;
if (mn == mx)
return (mn > 1) ? mn : 0;
return gcd(mn, mx);
}
// intersection(r1, r2): lengths must be in both languages.
// A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid
// stride for the intersection (every length in the intersection is
// also in r1 and in r2).
if (seq.re.is_intersection(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
if (s1 == 0 && s2 == 0) return 0;
if (s1 == 0) return s2;
if (s2 == 0) return s1;
return gcd(s1, s2);
}
// For complement, diff, reverse, derivative, of_pred, and anything
// else we cannot analyse statically: be conservative and return 1
// (no useful modular constraint rather than an unsound one).
return 1;
}
// -----------------------------------------------------------------------
// Constraint generation
// -----------------------------------------------------------------------
void nseq_parith::generate_parikh_constraints(str_mem const& mem,
vector<int_constraint>& out) {
if (!mem.m_regex || !mem.m_str)
return;
ast_manager& m = m_sg.get_manager();
seq_util& seq = m_sg.get_seq_util();
arith_util arith(m);
expr* re_expr = mem.m_regex->get_expr();
if (!re_expr || !seq.is_re(re_expr))
return;
// Length bounds from the regex.
unsigned min_len = seq.re.min_length(re_expr);
unsigned max_len = seq.re.max_length(re_expr);
// If min_len == max_len the bounds already pin the length exactly;
// no modular constraint is needed.
if (min_len == max_len)
return;
unsigned stride = compute_length_stride(re_expr);
// stride == 1: every integer length is possible — no useful constraint.
// stride == 0: fixed length or empty — handled by bounds.
if (stride <= 1)
return;
// Build len(str) as an arithmetic expression.
expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m);
// Introduce fresh integer variable k ≥ 0.
expr_ref k_var = mk_fresh_int_var();
// Constraint 1: len(str) = min_len + stride · k
expr_ref min_expr(arith.mk_int(min_len), m);
expr_ref stride_expr(arith.mk_int(stride), m);
expr_ref stride_k(arith.mk_mul(stride_expr, k_var), m);
expr_ref rhs(arith.mk_add(min_expr, stride_k), m);
out.push_back(int_constraint(len_str, rhs,
int_constraint_kind::eq, mem.m_dep, m));
// Constraint 2: k ≥ 0
expr_ref zero(arith.mk_int(0), m);
out.push_back(int_constraint(k_var, zero,
int_constraint_kind::ge, mem.m_dep, m));
}
void nseq_parith::apply_to_node(nielsen_node& node) {
vector<int_constraint> constraints;
for (str_mem const& mem : node.str_mems())
generate_parikh_constraints(mem, constraints);
for (auto& ic : constraints)
node.add_int_constraint(ic);
}
} // namespace seq

112
src/smt/seq/nseq_parith.h Normal file
View file

@ -0,0 +1,112 @@
/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
nseq_parith.h
Abstract:
Parikh image filter for the ZIPT-based Nielsen string solver.
Implements Parikh-based arithmetic constraint generation for
nielsen_node instances. For a regex membership constraint str r,
the Parikh image of r constrains the multiset of characters in str.
This module computes the "length stride" (period) of the length
language of r and generates modular arithmetic constraints of the form
len(str) = min_len + stride · k (k 0, k fresh integer)
which tighten the arithmetic subproblem beyond the simple min/max
length bounds already produced by nielsen_node::init_var_bounds_from_mems().
For example:
str (ab)* min_len = 0, stride = 2 len(str) = 2·k
str a(bc)* min_len = 1, stride = 2 len(str) = 1 + 2·k
str ab|abc stride = 1 (no useful modular constraint)
The generated int_constraints are added to the node's integer constraint
set and discharged by the integer subsolver (see seq_nielsen.h,
simple_solver).
Implements the Parikh filter described in ZIPT
(https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints)
replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic.
Author:
Nikolaj Bjorner (nbjorner) 2026-03-10
--*/
#pragma once
#include "ast/euf/euf_sgraph.h"
#include "smt/seq/seq_nielsen.h"
namespace seq {
/**
* Parikh image filter: generates modular length constraints from
* regex membership constraints in a nielsen_node.
*
* Usage:
* nseq_parith parith(sg);
* parith.apply_to_node(node); // adds constraints to node
*
* Or per-membership:
* vector<int_constraint> out;
* parith.generate_parikh_constraints(mem, out);
*/
class nseq_parith {
euf::sgraph& m_sg;
unsigned m_fresh_cnt; // counter for fresh variable names
// Compute GCD of a and b. gcd(0, x) = x by convention.
// Returns 0 only when both arguments are 0.
static unsigned gcd(unsigned a, unsigned b);
// Compute the stride (period) of the length language of a regex.
//
// The stride k satisfies: all lengths in L(re) are congruent to
// min_length(re) modulo k. A stride of 1 means every integer
// length is possible (no useful modular constraint). A stride of
// 0 is a sentinel meaning the language is empty or has a single
// fixed length (already captured by bounds).
//
// Examples:
// stride(to_re("ab")) = 0 (fixed length 2)
// stride((ab)*) = 2 (lengths 0, 2, 4, ...)
// stride((abc)*) = 3 (lengths 0, 3, 6, ...)
// stride(a*b*) = 1 (all lengths possible)
// stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...)
// stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...)
unsigned compute_length_stride(expr* re);
// Create a fresh integer variable (name "pk!N") for use as the
// Parikh multiplier variable k in len(str) = min_len + stride·k.
expr_ref mk_fresh_int_var();
public:
explicit nseq_parith(euf::sgraph& sg);
// Generate Parikh modular length constraints for one membership.
//
// When stride > 1 and min_len < max_len (bounds don't pin length):
// adds: len(str) = min_len + stride · k (equality)
// k ≥ 0 (non-negativity)
// These tighten the integer constraint set for the subsolver.
// Dependencies are copied from mem.m_dep.
void generate_parikh_constraints(str_mem const& mem,
vector<int_constraint>& out);
// Apply Parikh constraints to all memberships at a node.
// Calls generate_parikh_constraints for each str_mem in the node
// and appends the resulting int_constraints to node.int_constraints().
void apply_to_node(nielsen_node& node);
// Compute the length stride of a regex expression.
// Exposed for testing and external callers.
unsigned get_length_stride(expr* re) { return compute_length_stride(re); }
};
} // namespace seq