3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-05-18 07:59:29 +00:00
z3/src/smt/seq/seq_parikh.cpp

314 lines
13 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*++
Copyright (c) 2026 Microsoft Corporation
Module Name:
seq_parikh.cpp
Abstract:
Parikh image filter implementation for the ZIPT-based Nielsen string
solver. See seq_parikh.h for the full design description.
The key operation is compute_length_stride(re), which performs a
structural traversal of the regex to find the period k such that all
string lengths in L(re) are congruent to min_length(re) modulo k.
The stride is used to generate modular length constraints that help
the integer subsolver prune infeasible Nielsen graph nodes.
Author:
Clemens Eisenhofer 2026-03-10
Nikolaj Bjorner (nbjorner) 2026-03-10
--*/
#include "smt/seq/seq_parikh.h"
#include "util/mpz.h"
#include <string>
namespace seq {
seq_parikh::seq_parikh(euf::sgraph& sg)
: m(sg.get_manager()), seq(m), a(m), m_fresh_cnt(0) {}
expr_ref seq_parikh::mk_fresh_int_var() {
std::string name = "pk!" + std::to_string(m_fresh_cnt++);
return expr_ref(m.mk_fresh_const(name.c_str(), a.mk_int()), m);
}
// -----------------------------------------------------------------------
// Stride computation
// -----------------------------------------------------------------------
// compute_length_stride: structural traversal of regex expression.
//
// Return value semantics:
// 0 — fixed length (or empty language): no modular constraint needed
// beyond the min == max bounds.
// 1 — all integer lengths ≥ min_len are achievable: no useful modular
// constraint.
// k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k):
// modular constraint len(str) = min_len + k·j is useful.
unsigned seq_parikh::compute_length_stride(expr* re) {
if (!re) return 1;
expr* r1 = nullptr, *r2 = nullptr, *s = nullptr;
unsigned lo = 0, hi = 0;
// Empty language: no strings exist; stride is irrelevant.
if (seq.re.is_empty(re))
return 0;
// Epsilon regex {""}: single fixed length 0.
if (seq.re.is_epsilon(re))
return 0;
// to_re(concrete_string): fixed-length, no modular constraint needed.
if (seq.re.is_to_re(re, s)) {
// min_length == max_length, covered by bounds.
return 0;
}
// Single character: range, full_char — fixed length 1.
if (seq.re.is_range(re) || seq.re.is_full_char(re))
return 0;
// full_seq (.* / Σ*): every length ≥ 0 is possible.
if (seq.re.is_full_seq(re))
return 1;
// r* — Kleene star.
// L(r*) = {ε} L(r) L(r)·L(r) ...
// If all lengths in L(r) are congruent to c modulo s (c = min_len, s = stride),
// then L(r*) includes lengths {0, c, c+s, 2c, 2c+s, 2c+2s, ...} and
// the overall GCD is gcd(c, s). This is strictly more accurate than
// the previous gcd(min, max) approximation, which can be unsound when
// the body contains lengths whose GCD is smaller than gcd(min, max).
if (seq.re.is_star(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
// stride(r*) = gcd(min_length(r), stride(r))
// when inner=0 (fixed-length body), gcd(mn, 0) = mn → stride = mn
return u_gcd(mn, inner);
}
// r+ — one or more: same stride analysis as r*.
if (seq.re.is_plus(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
return u_gcd(mn, inner);
}
// r? — zero or one: lengths = {0} lengths(r)
// stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length).
if (seq.re.is_opt(re, r1)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
// L(r?) includes length 0 and all lengths of L(r).
// GCD(stride(r), min_len(r)) is a valid stride because:
// - the gap from 0 to min_len(r) is min_len(r) itself, and
// - subsequent lengths grow in steps governed by stride(r).
// A result > 1 gives a useful modular constraint; result == 1
// means every non-negative integer is achievable (no constraint).
if (inner == 0)
return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1
return u_gcd(inner, mn);
}
// concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)).
if (seq.re.is_concat(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
return u_gcd(s1, s2);
}
// union(r1, r2): lengths from either branch → need GCD of both
// strides and the difference between their minimum lengths.
if (seq.re.is_union(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
unsigned m1 = seq.re.min_length(r1);
unsigned m2 = seq.re.min_length(r2);
unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1);
// Replace 0-strides with d for GCD computation:
// a fixed-length branch only introduces constraint via its offset.
unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2);
g = u_gcd(g, d);
return g;
}
// loop(r, lo, hi): the length of any word is a sum of lo..hi copies of
// lengths from L(r). Since all lengths in L(r) are ≡ min_len(r) mod
// stride(r), the overall stride is gcd(min_len(r), stride(r)).
if (seq.re.is_loop(re, r1, lo, hi)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
return u_gcd(mn, inner);
}
if (seq.re.is_loop(re, r1, lo)) {
unsigned mn = seq.re.min_length(r1);
unsigned inner = compute_length_stride(r1);
return u_gcd(mn, inner);
}
// intersection(r1, r2): lengths must be in both languages.
// A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid
// stride for the intersection (every length in the intersection is
// also in r1 and in r2).
if (seq.re.is_intersection(re, r1, r2)) {
unsigned s1 = compute_length_stride(r1);
unsigned s2 = compute_length_stride(r2);
return u_gcd(s1, s2);
}
// For complement, diff, reverse, derivative, of_pred, and anything
// else we cannot analyse statically: be conservative and return 1
// (no useful modular constraint rather than an unsound one).
return 1;
}
// -----------------------------------------------------------------------
// Constraint generation
// -----------------------------------------------------------------------
void seq_parikh::generate_parikh_constraints(str_mem const& mem,
vector<constraint>& out) {
if (!mem.m_regex || !mem.m_str)
return;
expr* re_expr = mem.m_regex->get_expr();
if (!re_expr || !seq.is_re(re_expr))
return;
// Length bounds from the regex.
unsigned min_len = seq.re.min_length(re_expr);
unsigned max_len = seq.re.max_length(re_expr);
// If min_len >= max_len the bounds already pin the length exactly
// (or the language is empty — empty language is detected by simplify_and_init
// via Brzozowski derivative / is_empty checks, not here).
// We only generate modular constraints when the length is variable.
if (min_len >= max_len)
return;
unsigned stride = compute_length_stride(re_expr);
// stride == 1: every integer length is possible — no useful constraint.
// stride == 0: fixed length or empty — handled by bounds.
if (stride <= 1)
return;
// Build len(str) as an arithmetic expression.
expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m);
// Introduce fresh integer variable k ≥ 0.
expr_ref k_var = mk_fresh_int_var();
// Constraint 1: len(str) = min_len + stride · k
expr_ref min_expr(a.mk_int(min_len), m);
expr_ref stride_expr(a.mk_int(stride), m);
expr_ref stride_k(a.mk_mul(stride_expr, k_var), m);
expr_ref rhs(a.mk_add(min_expr, stride_k), m);
out.push_back(constraint(m.mk_eq(len_str, rhs), mem.m_dep, m));
// Constraint 2: k ≥ 0
expr_ref zero(a.mk_int(0), m);
out.push_back(constraint(a.mk_ge(k_var, zero), mem.m_dep, m));
// Constraint 3 (optional): k ≤ max_k when max_len is bounded.
// max_k = floor((max_len - min_len) / stride)
// This gives the solver an explicit upper bound on k.
// The subtraction is safe because min_len < max_len is guaranteed
// by the early return above.
if (max_len != UINT_MAX) {
SASSERT(max_len > min_len);
unsigned range = max_len - min_len;
unsigned max_k = range / stride;
expr_ref max_k_expr(a.mk_int(max_k), m);
out.push_back(constraint(a.mk_le(k_var, max_k_expr), mem.m_dep, m));
}
}
void seq_parikh::apply_to_node(nielsen_node& node) {
vector<constraint> constraints;
for (str_mem const& mem : node.str_mems())
generate_parikh_constraints(mem, constraints);
for (auto& ic : constraints)
node.add_constraint(ic);
}
// -----------------------------------------------------------------------
// Quick Parikh feasibility check (no solver call)
// -----------------------------------------------------------------------
// Returns true if a Parikh conflict is detected: there exists a membership
// str ∈ re for a single-variable str where the modular length constraint
// len(str) = min_len + stride * k (k ≥ 0)
// is inconsistent with the variable's current integer bounds [lb, ub].
//
// This check is lightweight — it uses only modular arithmetic on the already-
// known regex min/max lengths and the per-variable bounds stored in the node.
bool seq_parikh::check_parikh_conflict(nielsen_node& node) {
for (str_mem const& mem : node.str_mems()) {
if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var())
continue;
expr* re_expr = mem.m_regex->get_expr();
if (!re_expr || !seq.is_re(re_expr))
continue;
unsigned min_len = seq.re.min_length(re_expr);
unsigned max_len = seq.re.max_length(re_expr);
if (min_len >= max_len) continue; // fixed or empty — no stride constraint
unsigned stride = compute_length_stride(re_expr);
if (stride <= 1) continue; // no useful modular constraint
// stride > 1 guaranteed from here onward.
SASSERT(stride > 1);
rational lb_r, ub_r;
if (!node.lower_bound(mem.m_str->get_expr(), lb_r) || !node.upper_bound(mem.m_str->get_expr(), ub_r))
continue;
SASSERT(lb_r <= ub_r);
if (ub_r > INT_MAX)
continue;
const unsigned lb = (unsigned)lb_r.get_int32();
const unsigned ub = (unsigned)ub_r.get_int32();
// Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ?
//
// First find the smallest k satisfying the lower bound:
// k_min = 0 if min_len ≥ lb
// k_min = ⌈(lb - min_len) / stride⌉ otherwise
//
// Then verify min_len + stride * k_min ≤ ub.
unsigned k_min = 0;
if (lb > min_len) {
unsigned gap = lb - min_len;
// Ceiling division: k_min = ceil(gap / stride).
// Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX.
// In that case k_min would be huge, and min_len + stride*k_min would
// also overflow ub → treat as a conflict immediately.
if (gap > UINT_MAX - (stride - 1)) {
return true; // ceiling division would overflow → k_min too large
}
k_min = (gap + stride - 1) / stride;
}
// Overflow guard: stride * k_min may overflow unsigned.
unsigned len_at_k_min;
if (k_min > (UINT_MAX - min_len) / stride) {
// Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict.
return true;
}
len_at_k_min = min_len + stride * k_min;
if (ub != UINT_MAX && len_at_k_min > ub)
return true; // no valid k exists → Parikh conflict
}
return false;
}
} // namespace seq