mirror of
https://github.com/Z3Prover/z3
synced 2026-07-03 13:56:08 +00:00
Rename nseq_parikh→seq_parikh; add m/seq/a member attributes to seq_parikh
Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
parent
4ac5315846
commit
213ddd36ba
7 changed files with 474 additions and 447 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
z3_add_component(smt_seq
|
z3_add_component(smt_seq
|
||||||
SOURCES
|
SOURCES
|
||||||
nseq_parikh.cpp
|
seq_parikh.cpp
|
||||||
seq_nielsen.cpp
|
seq_nielsen.cpp
|
||||||
COMPONENT_DEPENDENCIES
|
COMPONENT_DEPENDENCIES
|
||||||
euf
|
euf
|
||||||
|
|
|
||||||
|
|
@ -5,334 +5,11 @@ Module Name:
|
||||||
|
|
||||||
nseq_parikh.cpp
|
nseq_parikh.cpp
|
||||||
|
|
||||||
Abstract:
|
Note:
|
||||||
|
|
||||||
Parikh image filter implementation for the ZIPT-based Nielsen string
|
This file is retained for backwards compatibility.
|
||||||
solver. See nseq_parith.h for the full design description.
|
The canonical implementation is now smt/seq/seq_parikh.cpp.
|
||||||
|
|
||||||
The key operation is compute_length_stride(re), which performs a
|
|
||||||
structural traversal of the regex to find the period k such that all
|
|
||||||
string lengths in L(re) are congruent to min_length(re) modulo k.
|
|
||||||
The stride is used to generate modular length constraints that help
|
|
||||||
the integer subsolver prune infeasible Nielsen graph nodes.
|
|
||||||
|
|
||||||
Author:
|
|
||||||
|
|
||||||
Clemens Eisenhofer 2026-03-10
|
|
||||||
Nikolaj Bjorner (nbjorner) 2026-03-10
|
|
||||||
|
|
||||||
--*/
|
--*/
|
||||||
|
// intentionally empty — see seq_parikh.cpp
|
||||||
|
|
||||||
#include "smt/seq/nseq_parith.h"
|
|
||||||
#include "ast/arith_decl_plugin.h"
|
|
||||||
#include "ast/seq_decl_plugin.h"
|
|
||||||
#include "util/mpz.h"
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace seq {
|
|
||||||
|
|
||||||
nseq_parith::nseq_parith(euf::sgraph& sg)
|
|
||||||
: m_sg(sg), m_fresh_cnt(0) {}
|
|
||||||
|
|
||||||
expr_ref nseq_parith::mk_fresh_int_var() {
|
|
||||||
ast_manager& m = m_sg.get_manager();
|
|
||||||
arith_util arith(m);
|
|
||||||
std::string name = "pk!" + std::to_string(m_fresh_cnt++);
|
|
||||||
return expr_ref(m.mk_fresh_const(name.c_str(), arith.mk_int()), m);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// Stride computation
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
|
|
||||||
// compute_length_stride: structural traversal of regex expression.
|
|
||||||
//
|
|
||||||
// Return value semantics:
|
|
||||||
// 0 — fixed length (or empty language): no modular constraint needed
|
|
||||||
// beyond the min == max bounds.
|
|
||||||
// 1 — all integer lengths ≥ min_len are achievable: no useful modular
|
|
||||||
// constraint.
|
|
||||||
// k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k):
|
|
||||||
// modular constraint len(str) = min_len + k·j is useful.
|
|
||||||
unsigned nseq_parith::compute_length_stride(expr* re) {
|
|
||||||
if (!re) return 1;
|
|
||||||
|
|
||||||
seq_util& seq = m_sg.get_seq_util();
|
|
||||||
expr* r1 = nullptr, *r2 = nullptr, *s = nullptr;
|
|
||||||
unsigned lo = 0, hi = 0;
|
|
||||||
|
|
||||||
// Empty language: no strings exist; stride is irrelevant.
|
|
||||||
if (seq.re.is_empty(re))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// Epsilon regex {""}: single fixed length 0.
|
|
||||||
if (seq.re.is_epsilon(re))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// to_re(concrete_string): fixed-length, no modular constraint needed.
|
|
||||||
if (seq.re.is_to_re(re, s)) {
|
|
||||||
// min_length == max_length, covered by bounds.
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Single character: range, full_char — fixed length 1.
|
|
||||||
if (seq.re.is_range(re) || seq.re.is_full_char(re))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// full_seq (.* / Σ*): every length ≥ 0 is possible.
|
|
||||||
if (seq.re.is_full_seq(re))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
// r* — Kleene star.
|
|
||||||
// L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ...
|
|
||||||
// If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k.
|
|
||||||
// If r has variable length, strides from different iterations combine
|
|
||||||
// by GCD.
|
|
||||||
if (seq.re.is_star(re, r1)) {
|
|
||||||
unsigned mn = seq.re.min_length(r1);
|
|
||||||
unsigned mx = seq.re.max_length(r1);
|
|
||||||
// When the body has unbounded length (mx == UINT_MAX), different
|
|
||||||
// iterations can produce many different lengths, and the stride of
|
|
||||||
// the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if
|
|
||||||
// mn == 1). This is conservative: we use the body's min-length
|
|
||||||
// as the only available fixed quantity.
|
|
||||||
if (mx == UINT_MAX) mx = mn;
|
|
||||||
if (mn == mx) {
|
|
||||||
// Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn.
|
|
||||||
// When mn == 1 the stride would be 1, which gives no useful
|
|
||||||
// modular constraint, so return 0 instead.
|
|
||||||
return (mn > 1) ? mn : 0;
|
|
||||||
}
|
|
||||||
// Variable-length body: GCD of min and max lengths
|
|
||||||
return u_gcd(mn, mx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// r+ — one or more: same stride analysis as r*.
|
|
||||||
if (seq.re.is_plus(re, r1)) {
|
|
||||||
unsigned mn = seq.re.min_length(r1);
|
|
||||||
unsigned mx = seq.re.max_length(r1);
|
|
||||||
if (mx == UINT_MAX) mx = mn; // same conservative treatment as star
|
|
||||||
if (mn == mx)
|
|
||||||
return (mn > 1) ? mn : 0;
|
|
||||||
return u_gcd(mn, mx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// r? — zero or one: lengths = {0} ∪ lengths(r)
|
|
||||||
// stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length).
|
|
||||||
if (seq.re.is_opt(re, r1)) {
|
|
||||||
unsigned mn = seq.re.min_length(r1);
|
|
||||||
unsigned inner = compute_length_stride(r1);
|
|
||||||
// L(r?) includes length 0 and all lengths of L(r).
|
|
||||||
// GCD(stride(r), min_len(r)) is a valid stride because:
|
|
||||||
// - the gap from 0 to min_len(r) is min_len(r) itself, and
|
|
||||||
// - subsequent lengths grow in steps governed by stride(r).
|
|
||||||
// A result > 1 gives a useful modular constraint; result == 1
|
|
||||||
// means every non-negative integer is achievable (no constraint).
|
|
||||||
if (inner == 0)
|
|
||||||
return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1
|
|
||||||
return u_gcd(inner, mn);
|
|
||||||
}
|
|
||||||
|
|
||||||
// concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)).
|
|
||||||
if (seq.re.is_concat(re, r1, r2)) {
|
|
||||||
unsigned s1 = compute_length_stride(r1);
|
|
||||||
unsigned s2 = compute_length_stride(r2);
|
|
||||||
return u_gcd(s1, s2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// union(r1, r2): lengths from either branch → need GCD of both
|
|
||||||
// strides and the difference between their minimum lengths.
|
|
||||||
if (seq.re.is_union(re, r1, r2)) {
|
|
||||||
unsigned s1 = compute_length_stride(r1);
|
|
||||||
unsigned s2 = compute_length_stride(r2);
|
|
||||||
unsigned m1 = seq.re.min_length(r1);
|
|
||||||
unsigned m2 = seq.re.min_length(r2);
|
|
||||||
unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1);
|
|
||||||
// Replace 0-strides with d for GCD computation:
|
|
||||||
// a fixed-length branch only introduces constraint via its offset.
|
|
||||||
unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2);
|
|
||||||
g = u_gcd(g, d);
|
|
||||||
return g;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed.
|
|
||||||
// stride = len(r) when r is fixed-length; otherwise GCD.
|
|
||||||
if (seq.re.is_loop(re, r1, lo, hi)) {
|
|
||||||
unsigned mn = seq.re.min_length(r1);
|
|
||||||
unsigned mx = seq.re.max_length(r1);
|
|
||||||
if (mx == UINT_MAX) mx = mn;
|
|
||||||
if (mn == mx)
|
|
||||||
return (mn > 1) ? mn : 0;
|
|
||||||
return u_gcd(mn, mx);
|
|
||||||
}
|
|
||||||
if (seq.re.is_loop(re, r1, lo)) {
|
|
||||||
unsigned mn = seq.re.min_length(r1);
|
|
||||||
unsigned mx = seq.re.max_length(r1);
|
|
||||||
if (mx == UINT_MAX) mx = mn;
|
|
||||||
if (mn == mx)
|
|
||||||
return (mn > 1) ? mn : 0;
|
|
||||||
return u_gcd(mn, mx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// intersection(r1, r2): lengths must be in both languages.
|
|
||||||
// A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid
|
|
||||||
// stride for the intersection (every length in the intersection is
|
|
||||||
// also in r1 and in r2).
|
|
||||||
if (seq.re.is_intersection(re, r1, r2)) {
|
|
||||||
unsigned s1 = compute_length_stride(r1);
|
|
||||||
unsigned s2 = compute_length_stride(r2);
|
|
||||||
return u_gcd(s1, s2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For complement, diff, reverse, derivative, of_pred, and anything
|
|
||||||
// else we cannot analyse statically: be conservative and return 1
|
|
||||||
// (no useful modular constraint rather than an unsound one).
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// Constraint generation
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
|
|
||||||
void nseq_parith::generate_parikh_constraints(str_mem const& mem,
|
|
||||||
vector<int_constraint>& out) {
|
|
||||||
if (!mem.m_regex || !mem.m_str)
|
|
||||||
return;
|
|
||||||
|
|
||||||
ast_manager& m = m_sg.get_manager();
|
|
||||||
seq_util& seq = m_sg.get_seq_util();
|
|
||||||
arith_util arith(m);
|
|
||||||
|
|
||||||
expr* re_expr = mem.m_regex->get_expr();
|
|
||||||
if (!re_expr || !seq.is_re(re_expr))
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Length bounds from the regex.
|
|
||||||
unsigned min_len = seq.re.min_length(re_expr);
|
|
||||||
unsigned max_len = seq.re.max_length(re_expr);
|
|
||||||
|
|
||||||
// If min_len >= max_len the bounds already pin the length exactly
|
|
||||||
// (or the language is empty — empty language is detected by simplify_and_init
|
|
||||||
// via Brzozowski derivative / is_empty checks, not here).
|
|
||||||
// We only generate modular constraints when the length is variable.
|
|
||||||
if (min_len >= max_len)
|
|
||||||
return;
|
|
||||||
|
|
||||||
unsigned stride = compute_length_stride(re_expr);
|
|
||||||
|
|
||||||
// stride == 1: every integer length is possible — no useful constraint.
|
|
||||||
// stride == 0: fixed length or empty — handled by bounds.
|
|
||||||
if (stride <= 1)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Build len(str) as an arithmetic expression.
|
|
||||||
expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m);
|
|
||||||
|
|
||||||
// Introduce fresh integer variable k ≥ 0.
|
|
||||||
expr_ref k_var = mk_fresh_int_var();
|
|
||||||
|
|
||||||
// Constraint 1: len(str) = min_len + stride · k
|
|
||||||
expr_ref min_expr(arith.mk_int(min_len), m);
|
|
||||||
expr_ref stride_expr(arith.mk_int(stride), m);
|
|
||||||
expr_ref stride_k(arith.mk_mul(stride_expr, k_var), m);
|
|
||||||
expr_ref rhs(arith.mk_add(min_expr, stride_k), m);
|
|
||||||
out.push_back(int_constraint(len_str, rhs,
|
|
||||||
int_constraint_kind::eq, mem.m_dep, m));
|
|
||||||
|
|
||||||
// Constraint 2: k ≥ 0
|
|
||||||
expr_ref zero(arith.mk_int(0), m);
|
|
||||||
out.push_back(int_constraint(k_var, zero,
|
|
||||||
int_constraint_kind::ge, mem.m_dep, m));
|
|
||||||
|
|
||||||
// Constraint 3 (optional): k ≤ max_k when max_len is bounded.
|
|
||||||
// max_k = floor((max_len - min_len) / stride)
|
|
||||||
// This gives the solver an explicit upper bound on k.
|
|
||||||
// The subtraction is safe because min_len < max_len is guaranteed
|
|
||||||
// by the early return above.
|
|
||||||
if (max_len != UINT_MAX) {
|
|
||||||
SASSERT(max_len > min_len);
|
|
||||||
unsigned range = max_len - min_len;
|
|
||||||
unsigned max_k = range / stride;
|
|
||||||
expr_ref max_k_expr(arith.mk_int(max_k), m);
|
|
||||||
out.push_back(int_constraint(k_var, max_k_expr,
|
|
||||||
int_constraint_kind::le, mem.m_dep, m));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void nseq_parith::apply_to_node(nielsen_node& node) {
|
|
||||||
vector<int_constraint> constraints;
|
|
||||||
for (str_mem const& mem : node.str_mems())
|
|
||||||
generate_parikh_constraints(mem, constraints);
|
|
||||||
for (auto& ic : constraints)
|
|
||||||
node.add_int_constraint(ic);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// Quick Parikh feasibility check (no solver call)
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
|
|
||||||
// Returns true if a Parikh conflict is detected: there exists a membership
|
|
||||||
// str ∈ re for a single-variable str where the modular length constraint
|
|
||||||
// len(str) = min_len + stride * k (k ≥ 0)
|
|
||||||
// is inconsistent with the variable's current integer bounds [lb, ub].
|
|
||||||
//
|
|
||||||
// This check is lightweight — it uses only modular arithmetic on the already-
|
|
||||||
// known regex min/max lengths and the per-variable bounds stored in the node.
|
|
||||||
bool nseq_parith::check_parikh_conflict(nielsen_node& node) {
|
|
||||||
seq_util& seq = m_sg.get_seq_util();
|
|
||||||
|
|
||||||
for (str_mem const& mem : node.str_mems()) {
|
|
||||||
if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
expr* re_expr = mem.m_regex->get_expr();
|
|
||||||
if (!re_expr || !seq.is_re(re_expr))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
unsigned min_len = seq.re.min_length(re_expr);
|
|
||||||
unsigned max_len = seq.re.max_length(re_expr);
|
|
||||||
if (min_len >= max_len) continue; // fixed or empty — no stride constraint
|
|
||||||
|
|
||||||
unsigned stride = compute_length_stride(re_expr);
|
|
||||||
if (stride <= 1) continue; // no useful modular constraint
|
|
||||||
// stride > 1 guaranteed from here onward.
|
|
||||||
SASSERT(stride > 1);
|
|
||||||
|
|
||||||
unsigned lb = node.var_lb(mem.m_str);
|
|
||||||
unsigned ub = node.var_ub(mem.m_str);
|
|
||||||
|
|
||||||
// Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ?
|
|
||||||
//
|
|
||||||
// First find the smallest k satisfying the lower bound:
|
|
||||||
// k_min = 0 if min_len ≥ lb
|
|
||||||
// k_min = ⌈(lb - min_len) / stride⌉ otherwise
|
|
||||||
//
|
|
||||||
// Then verify min_len + stride * k_min ≤ ub.
|
|
||||||
unsigned k_min = 0;
|
|
||||||
if (lb > min_len) {
|
|
||||||
unsigned gap = lb - min_len;
|
|
||||||
// Ceiling division: k_min = ceil(gap / stride).
|
|
||||||
// Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX.
|
|
||||||
// In that case k_min would be huge, and min_len + stride*k_min would
|
|
||||||
// also overflow ub → treat as a conflict immediately.
|
|
||||||
if (gap > UINT_MAX - (stride - 1)) {
|
|
||||||
return true; // ceiling division would overflow → k_min too large
|
|
||||||
}
|
|
||||||
k_min = (gap + stride - 1) / stride;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Overflow guard: stride * k_min may overflow unsigned.
|
|
||||||
unsigned len_at_k_min;
|
|
||||||
if (k_min > (UINT_MAX - min_len) / stride) {
|
|
||||||
// Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict.
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
len_at_k_min = min_len + stride * k_min;
|
|
||||||
|
|
||||||
if (ub != UINT_MAX && len_at_k_min > ub)
|
|
||||||
return true; // no valid k exists → Parikh conflict
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace seq
|
|
||||||
|
|
|
||||||
|
|
@ -5,124 +5,16 @@ Module Name:
|
||||||
|
|
||||||
nseq_parith.h
|
nseq_parith.h
|
||||||
|
|
||||||
Abstract:
|
Note:
|
||||||
|
|
||||||
Parikh image filter for the ZIPT-based Nielsen string solver.
|
This file is retained for backwards compatibility.
|
||||||
|
The canonical header is now smt/seq/seq_parikh.h.
|
||||||
Implements Parikh-based arithmetic constraint generation for
|
|
||||||
nielsen_node instances. For a regex membership constraint str ∈ r,
|
|
||||||
the Parikh image of r constrains the multiset of characters in str.
|
|
||||||
This module computes the "length stride" (period) of the length
|
|
||||||
language of r and generates modular arithmetic constraints of the form
|
|
||||||
|
|
||||||
len(str) = min_len + stride · k (k ≥ 0, k fresh integer)
|
|
||||||
|
|
||||||
which tighten the arithmetic subproblem beyond the simple min/max
|
|
||||||
length bounds already produced by nielsen_node::init_var_bounds_from_mems().
|
|
||||||
|
|
||||||
For example:
|
|
||||||
• str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k
|
|
||||||
• str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k
|
|
||||||
• str ∈ ab|abc → stride = 1 (no useful modular constraint)
|
|
||||||
|
|
||||||
The generated int_constraints are added to the node's integer constraint
|
|
||||||
set and discharged by the integer subsolver (see seq_nielsen.h,
|
|
||||||
simple_solver).
|
|
||||||
|
|
||||||
Implements the Parikh filter described in ZIPT
|
|
||||||
(https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints)
|
|
||||||
replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic.
|
|
||||||
|
|
||||||
Author:
|
|
||||||
|
|
||||||
Clemens Eisenhofer 2026-03-10
|
|
||||||
Nikolaj Bjorner (nbjorner) 2026-03-10
|
|
||||||
|
|
||||||
--*/
|
--*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "smt/seq/seq_parikh.h"
|
||||||
#include "ast/euf/euf_sgraph.h"
|
|
||||||
#include "smt/seq/seq_nielsen.h"
|
|
||||||
|
|
||||||
namespace seq {
|
namespace seq {
|
||||||
|
// backwards-compat alias
|
||||||
/**
|
using nseq_parith = seq_parikh;
|
||||||
* Parikh image filter: generates modular length constraints from
|
}
|
||||||
* regex membership constraints in a nielsen_node.
|
|
||||||
*
|
|
||||||
* Usage:
|
|
||||||
* nseq_parith parith(sg);
|
|
||||||
* parith.apply_to_node(node); // adds constraints to node
|
|
||||||
*
|
|
||||||
* Or per-membership:
|
|
||||||
* vector<int_constraint> out;
|
|
||||||
* parith.generate_parikh_constraints(mem, out);
|
|
||||||
*/
|
|
||||||
class nseq_parith {
|
|
||||||
euf::sgraph& m_sg;
|
|
||||||
unsigned m_fresh_cnt; // counter for fresh variable names
|
|
||||||
|
|
||||||
// Compute the stride (period) of the length language of a regex.
|
|
||||||
//
|
|
||||||
// The stride k satisfies: all lengths in L(re) are congruent to
|
|
||||||
// min_length(re) modulo k. A stride of 1 means every integer
|
|
||||||
// length is possible (no useful modular constraint). A stride of
|
|
||||||
// 0 is a sentinel meaning the language is empty or has a single
|
|
||||||
// fixed length (already captured by bounds).
|
|
||||||
//
|
|
||||||
// Examples:
|
|
||||||
// stride(to_re("ab")) = 0 (fixed length 2)
|
|
||||||
// stride((ab)*) = 2 (lengths 0, 2, 4, ...)
|
|
||||||
// stride((abc)*) = 3 (lengths 0, 3, 6, ...)
|
|
||||||
// stride(a*b*) = 1 (all lengths possible)
|
|
||||||
// stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...)
|
|
||||||
// stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...)
|
|
||||||
unsigned compute_length_stride(expr* re);
|
|
||||||
|
|
||||||
// Create a fresh integer variable (name "pk!N") for use as the
|
|
||||||
// Parikh multiplier variable k in len(str) = min_len + stride·k.
|
|
||||||
expr_ref mk_fresh_int_var();
|
|
||||||
|
|
||||||
public:
|
|
||||||
explicit nseq_parith(euf::sgraph& sg);
|
|
||||||
|
|
||||||
// Generate Parikh modular length constraints for one membership.
|
|
||||||
//
|
|
||||||
// When stride > 1 and min_len < max_len (bounds don't pin length exactly,
|
|
||||||
// and the language is non-empty):
|
|
||||||
// adds: len(str) = min_len + stride · k (equality)
|
|
||||||
// k ≥ 0 (non-negativity)
|
|
||||||
// k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded)
|
|
||||||
// These tighten the integer constraint set for the subsolver.
|
|
||||||
// Dependencies are copied from mem.m_dep.
|
|
||||||
// Does nothing when min_len ≥ max_len (empty or fixed-length language).
|
|
||||||
void generate_parikh_constraints(str_mem const& mem,
|
|
||||||
vector<int_constraint>& out);
|
|
||||||
|
|
||||||
// Apply Parikh constraints to all memberships at a node.
|
|
||||||
// Calls generate_parikh_constraints for each str_mem in the node
|
|
||||||
// and appends the resulting int_constraints to node.int_constraints().
|
|
||||||
void apply_to_node(nielsen_node& node);
|
|
||||||
|
|
||||||
// Quick Parikh feasibility check (no solver call).
|
|
||||||
//
|
|
||||||
// For each single-variable membership str ∈ re, checks whether the
|
|
||||||
// modular constraint len(str) = min_len + stride · k (k ≥ 0)
|
|
||||||
// has any solution given the current per-variable bounds stored in
|
|
||||||
// node.var_lb(str) and node.var_ub(str).
|
|
||||||
//
|
|
||||||
// Returns true when a conflict is detected (no valid k exists for
|
|
||||||
// some membership). The caller should then mark the node with
|
|
||||||
// backtrack_reason::parikh_image.
|
|
||||||
//
|
|
||||||
// This is a lightweight pre-check that avoids calling the integer
|
|
||||||
// subsolver. It is sound (never returns true for a satisfiable node)
|
|
||||||
// but incomplete (may miss conflicts that require the full solver).
|
|
||||||
bool check_parikh_conflict(nielsen_node& node);
|
|
||||||
|
|
||||||
// Compute the length stride of a regex expression.
|
|
||||||
// Exposed for testing and external callers.
|
|
||||||
unsigned get_length_stride(expr* re) { return compute_length_stride(re); }
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace seq
|
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ Author:
|
||||||
--*/
|
--*/
|
||||||
|
|
||||||
#include "smt/seq/seq_nielsen.h"
|
#include "smt/seq/seq_nielsen.h"
|
||||||
#include "smt/seq/nseq_parith.h"
|
#include "smt/seq/seq_parikh.h"
|
||||||
#include "ast/arith_decl_plugin.h"
|
#include "ast/arith_decl_plugin.h"
|
||||||
#include "ast/ast_pp.h"
|
#include "ast/ast_pp.h"
|
||||||
#include "util/hashtable.h"
|
#include "util/hashtable.h"
|
||||||
|
|
@ -433,7 +433,7 @@ namespace seq {
|
||||||
nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver):
|
nielsen_graph::nielsen_graph(euf::sgraph& sg, simple_solver& solver):
|
||||||
m_sg(sg),
|
m_sg(sg),
|
||||||
m_solver(solver),
|
m_solver(solver),
|
||||||
m_parith(alloc(nseq_parith, sg)) {
|
m_parith(alloc(seq_parikh, sg)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
nielsen_graph::~nielsen_graph() {
|
nielsen_graph::~nielsen_graph() {
|
||||||
|
|
|
||||||
|
|
@ -248,7 +248,7 @@ namespace seq {
|
||||||
class nielsen_node;
|
class nielsen_node;
|
||||||
class nielsen_edge;
|
class nielsen_edge;
|
||||||
class nielsen_graph;
|
class nielsen_graph;
|
||||||
class nseq_parith; // Parikh image filter (defined in nseq_parith.h)
|
class seq_parikh; // Parikh image filter (see seq_parikh.h)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Abstract interface for an incremental solver used by nielsen_graph
|
* Abstract interface for an incremental solver used by nielsen_graph
|
||||||
|
|
@ -718,7 +718,7 @@ namespace seq {
|
||||||
|
|
||||||
// Parikh image filter: generates modular length constraints from regex
|
// Parikh image filter: generates modular length constraints from regex
|
||||||
// memberships. Allocated in the constructor; owned by this graph.
|
// memberships. Allocated in the constructor; owned by this graph.
|
||||||
nseq_parith* m_parith = nullptr;
|
seq_parikh* m_parith = nullptr;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Construct with a caller-supplied solver. Ownership is NOT transferred;
|
// Construct with a caller-supplied solver. Ownership is NOT transferred;
|
||||||
|
|
|
||||||
327
src/smt/seq/seq_parikh.cpp
Normal file
327
src/smt/seq/seq_parikh.cpp
Normal file
|
|
@ -0,0 +1,327 @@
|
||||||
|
/*++
|
||||||
|
Copyright (c) 2026 Microsoft Corporation
|
||||||
|
|
||||||
|
Module Name:
|
||||||
|
|
||||||
|
seq_parikh.cpp
|
||||||
|
|
||||||
|
Abstract:
|
||||||
|
|
||||||
|
Parikh image filter implementation for the ZIPT-based Nielsen string
|
||||||
|
solver. See seq_parikh.h for the full design description.
|
||||||
|
|
||||||
|
The key operation is compute_length_stride(re), which performs a
|
||||||
|
structural traversal of the regex to find the period k such that all
|
||||||
|
string lengths in L(re) are congruent to min_length(re) modulo k.
|
||||||
|
The stride is used to generate modular length constraints that help
|
||||||
|
the integer subsolver prune infeasible Nielsen graph nodes.
|
||||||
|
|
||||||
|
Author:
|
||||||
|
|
||||||
|
Clemens Eisenhofer 2026-03-10
|
||||||
|
Nikolaj Bjorner (nbjorner) 2026-03-10
|
||||||
|
|
||||||
|
--*/
|
||||||
|
|
||||||
|
#include "smt/seq/seq_parikh.h"
|
||||||
|
#include "util/mpz.h"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace seq {
|
||||||
|
|
||||||
|
seq_parikh::seq_parikh(euf::sgraph& sg)
|
||||||
|
: m(sg.get_manager()), seq(m), a(m), m_fresh_cnt(0) {}
|
||||||
|
|
||||||
|
expr_ref seq_parikh::mk_fresh_int_var() {
|
||||||
|
std::string name = "pk!" + std::to_string(m_fresh_cnt++);
|
||||||
|
return expr_ref(m.mk_fresh_const(name.c_str(), a.mk_int()), m);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
// Stride computation
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// compute_length_stride: structural traversal of regex expression.
|
||||||
|
//
|
||||||
|
// Return value semantics:
|
||||||
|
// 0 — fixed length (or empty language): no modular constraint needed
|
||||||
|
// beyond the min == max bounds.
|
||||||
|
// 1 — all integer lengths ≥ min_len are achievable: no useful modular
|
||||||
|
// constraint.
|
||||||
|
// k > 1 — all lengths in L(re) satisfy len ≡ min_len (mod k):
|
||||||
|
// modular constraint len(str) = min_len + k·j is useful.
|
||||||
|
unsigned seq_parikh::compute_length_stride(expr* re) {
|
||||||
|
if (!re) return 1;
|
||||||
|
|
||||||
|
expr* r1 = nullptr, *r2 = nullptr, *s = nullptr;
|
||||||
|
unsigned lo = 0, hi = 0;
|
||||||
|
|
||||||
|
// Empty language: no strings exist; stride is irrelevant.
|
||||||
|
if (seq.re.is_empty(re))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// Epsilon regex {""}: single fixed length 0.
|
||||||
|
if (seq.re.is_epsilon(re))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// to_re(concrete_string): fixed-length, no modular constraint needed.
|
||||||
|
if (seq.re.is_to_re(re, s)) {
|
||||||
|
// min_length == max_length, covered by bounds.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Single character: range, full_char — fixed length 1.
|
||||||
|
if (seq.re.is_range(re) || seq.re.is_full_char(re))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// full_seq (.* / Σ*): every length ≥ 0 is possible.
|
||||||
|
if (seq.re.is_full_seq(re))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
// r* — Kleene star.
|
||||||
|
// L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ...
|
||||||
|
// If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k.
|
||||||
|
// If r has variable length, strides from different iterations combine
|
||||||
|
// by GCD.
|
||||||
|
if (seq.re.is_star(re, r1)) {
|
||||||
|
unsigned mn = seq.re.min_length(r1);
|
||||||
|
unsigned mx = seq.re.max_length(r1);
|
||||||
|
// When the body has unbounded length (mx == UINT_MAX), different
|
||||||
|
// iterations can produce many different lengths, and the stride of
|
||||||
|
// the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if
|
||||||
|
// mn == 1). This is conservative: we use the body's min-length
|
||||||
|
// as the only available fixed quantity.
|
||||||
|
if (mx == UINT_MAX) mx = mn;
|
||||||
|
if (mn == mx) {
|
||||||
|
// Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn.
|
||||||
|
// When mn == 1 the stride would be 1, which gives no useful
|
||||||
|
// modular constraint, so return 0 instead.
|
||||||
|
return (mn > 1) ? mn : 0;
|
||||||
|
}
|
||||||
|
// Variable-length body: GCD of min and max lengths
|
||||||
|
return u_gcd(mn, mx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// r+ — one or more: same stride analysis as r*.
|
||||||
|
if (seq.re.is_plus(re, r1)) {
|
||||||
|
unsigned mn = seq.re.min_length(r1);
|
||||||
|
unsigned mx = seq.re.max_length(r1);
|
||||||
|
if (mx == UINT_MAX) mx = mn; // same conservative treatment as star
|
||||||
|
if (mn == mx)
|
||||||
|
return (mn > 1) ? mn : 0;
|
||||||
|
return u_gcd(mn, mx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// r? — zero or one: lengths = {0} ∪ lengths(r)
|
||||||
|
// stride = GCD(mn_r, stride(r)) unless stride(r) is 0 (fixed length).
|
||||||
|
if (seq.re.is_opt(re, r1)) {
|
||||||
|
unsigned mn = seq.re.min_length(r1);
|
||||||
|
unsigned inner = compute_length_stride(r1);
|
||||||
|
// L(r?) includes length 0 and all lengths of L(r).
|
||||||
|
// GCD(stride(r), min_len(r)) is a valid stride because:
|
||||||
|
// - the gap from 0 to min_len(r) is min_len(r) itself, and
|
||||||
|
// - subsequent lengths grow in steps governed by stride(r).
|
||||||
|
// A result > 1 gives a useful modular constraint; result == 1
|
||||||
|
// means every non-negative integer is achievable (no constraint).
|
||||||
|
if (inner == 0)
|
||||||
|
return u_gcd(mn, 0); // gcd(mn, 0) = mn; useful when mn > 1
|
||||||
|
return u_gcd(inner, mn);
|
||||||
|
}
|
||||||
|
|
||||||
|
// concat(r1, r2): lengths add → stride = GCD(stride(r1), stride(r2)).
|
||||||
|
if (seq.re.is_concat(re, r1, r2)) {
|
||||||
|
unsigned s1 = compute_length_stride(r1);
|
||||||
|
unsigned s2 = compute_length_stride(r2);
|
||||||
|
return u_gcd(s1, s2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// union(r1, r2): lengths from either branch → need GCD of both
|
||||||
|
// strides and the difference between their minimum lengths.
|
||||||
|
if (seq.re.is_union(re, r1, r2)) {
|
||||||
|
unsigned s1 = compute_length_stride(r1);
|
||||||
|
unsigned s2 = compute_length_stride(r2);
|
||||||
|
unsigned m1 = seq.re.min_length(r1);
|
||||||
|
unsigned m2 = seq.re.min_length(r2);
|
||||||
|
unsigned d = (m1 >= m2) ? (m1 - m2) : (m2 - m1);
|
||||||
|
// Replace 0-strides with d for GCD computation:
|
||||||
|
// a fixed-length branch only introduces constraint via its offset.
|
||||||
|
unsigned g = u_gcd(s1 == 0 ? d : s1, s2 == 0 ? d : s2);
|
||||||
|
g = u_gcd(g, d);
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
|
// loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed.
|
||||||
|
// stride = len(r) when r is fixed-length; otherwise GCD.
|
||||||
|
if (seq.re.is_loop(re, r1, lo, hi)) {
|
||||||
|
unsigned mn = seq.re.min_length(r1);
|
||||||
|
unsigned mx = seq.re.max_length(r1);
|
||||||
|
if (mx == UINT_MAX) mx = mn;
|
||||||
|
if (mn == mx)
|
||||||
|
return (mn > 1) ? mn : 0;
|
||||||
|
return u_gcd(mn, mx);
|
||||||
|
}
|
||||||
|
if (seq.re.is_loop(re, r1, lo)) {
|
||||||
|
unsigned mn = seq.re.min_length(r1);
|
||||||
|
unsigned mx = seq.re.max_length(r1);
|
||||||
|
if (mx == UINT_MAX) mx = mn;
|
||||||
|
if (mn == mx)
|
||||||
|
return (mn > 1) ? mn : 0;
|
||||||
|
return u_gcd(mn, mx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// intersection(r1, r2): lengths must be in both languages.
|
||||||
|
// A conservative safe choice: GCD(stride(r1), stride(r2)) is a valid
|
||||||
|
// stride for the intersection (every length in the intersection is
|
||||||
|
// also in r1 and in r2).
|
||||||
|
if (seq.re.is_intersection(re, r1, r2)) {
|
||||||
|
unsigned s1 = compute_length_stride(r1);
|
||||||
|
unsigned s2 = compute_length_stride(r2);
|
||||||
|
return u_gcd(s1, s2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For complement, diff, reverse, derivative, of_pred, and anything
|
||||||
|
// else we cannot analyse statically: be conservative and return 1
|
||||||
|
// (no useful modular constraint rather than an unsound one).
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
// Constraint generation
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
void seq_parikh::generate_parikh_constraints(str_mem const& mem,
|
||||||
|
vector<int_constraint>& out) {
|
||||||
|
if (!mem.m_regex || !mem.m_str)
|
||||||
|
return;
|
||||||
|
|
||||||
|
expr* re_expr = mem.m_regex->get_expr();
|
||||||
|
if (!re_expr || !seq.is_re(re_expr))
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Length bounds from the regex.
|
||||||
|
unsigned min_len = seq.re.min_length(re_expr);
|
||||||
|
unsigned max_len = seq.re.max_length(re_expr);
|
||||||
|
|
||||||
|
// If min_len >= max_len the bounds already pin the length exactly
|
||||||
|
// (or the language is empty — empty language is detected by simplify_and_init
|
||||||
|
// via Brzozowski derivative / is_empty checks, not here).
|
||||||
|
// We only generate modular constraints when the length is variable.
|
||||||
|
if (min_len >= max_len)
|
||||||
|
return;
|
||||||
|
|
||||||
|
unsigned stride = compute_length_stride(re_expr);
|
||||||
|
|
||||||
|
// stride == 1: every integer length is possible — no useful constraint.
|
||||||
|
// stride == 0: fixed length or empty — handled by bounds.
|
||||||
|
if (stride <= 1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Build len(str) as an arithmetic expression.
|
||||||
|
expr_ref len_str(seq.str.mk_length(mem.m_str->get_expr()), m);
|
||||||
|
|
||||||
|
// Introduce fresh integer variable k ≥ 0.
|
||||||
|
expr_ref k_var = mk_fresh_int_var();
|
||||||
|
|
||||||
|
// Constraint 1: len(str) = min_len + stride · k
|
||||||
|
expr_ref min_expr(a.mk_int(min_len), m);
|
||||||
|
expr_ref stride_expr(a.mk_int(stride), m);
|
||||||
|
expr_ref stride_k(a.mk_mul(stride_expr, k_var), m);
|
||||||
|
expr_ref rhs(a.mk_add(min_expr, stride_k), m);
|
||||||
|
out.push_back(int_constraint(len_str, rhs,
|
||||||
|
int_constraint_kind::eq, mem.m_dep, m));
|
||||||
|
|
||||||
|
// Constraint 2: k ≥ 0
|
||||||
|
expr_ref zero(a.mk_int(0), m);
|
||||||
|
out.push_back(int_constraint(k_var, zero,
|
||||||
|
int_constraint_kind::ge, mem.m_dep, m));
|
||||||
|
|
||||||
|
// Constraint 3 (optional): k ≤ max_k when max_len is bounded.
|
||||||
|
// max_k = floor((max_len - min_len) / stride)
|
||||||
|
// This gives the solver an explicit upper bound on k.
|
||||||
|
// The subtraction is safe because min_len < max_len is guaranteed
|
||||||
|
// by the early return above.
|
||||||
|
if (max_len != UINT_MAX) {
|
||||||
|
SASSERT(max_len > min_len);
|
||||||
|
unsigned range = max_len - min_len;
|
||||||
|
unsigned max_k = range / stride;
|
||||||
|
expr_ref max_k_expr(a.mk_int(max_k), m);
|
||||||
|
out.push_back(int_constraint(k_var, max_k_expr,
|
||||||
|
int_constraint_kind::le, mem.m_dep, m));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void seq_parikh::apply_to_node(nielsen_node& node) {
|
||||||
|
vector<int_constraint> constraints;
|
||||||
|
for (str_mem const& mem : node.str_mems())
|
||||||
|
generate_parikh_constraints(mem, constraints);
|
||||||
|
for (auto& ic : constraints)
|
||||||
|
node.add_int_constraint(ic);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
// Quick Parikh feasibility check (no solver call)
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Returns true if a Parikh conflict is detected: there exists a membership
|
||||||
|
// str ∈ re for a single-variable str where the modular length constraint
|
||||||
|
// len(str) = min_len + stride * k (k ≥ 0)
|
||||||
|
// is inconsistent with the variable's current integer bounds [lb, ub].
|
||||||
|
//
|
||||||
|
// This check is lightweight — it uses only modular arithmetic on the already-
|
||||||
|
// known regex min/max lengths and the per-variable bounds stored in the node.
|
||||||
|
bool seq_parikh::check_parikh_conflict(nielsen_node& node) {
|
||||||
|
for (str_mem const& mem : node.str_mems()) {
|
||||||
|
if (!mem.m_str || !mem.m_regex || !mem.m_str->is_var())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
expr* re_expr = mem.m_regex->get_expr();
|
||||||
|
if (!re_expr || !seq.is_re(re_expr))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
unsigned min_len = seq.re.min_length(re_expr);
|
||||||
|
unsigned max_len = seq.re.max_length(re_expr);
|
||||||
|
if (min_len >= max_len) continue; // fixed or empty — no stride constraint
|
||||||
|
|
||||||
|
unsigned stride = compute_length_stride(re_expr);
|
||||||
|
if (stride <= 1) continue; // no useful modular constraint
|
||||||
|
// stride > 1 guaranteed from here onward.
|
||||||
|
SASSERT(stride > 1);
|
||||||
|
|
||||||
|
unsigned lb = node.var_lb(mem.m_str);
|
||||||
|
unsigned ub = node.var_ub(mem.m_str);
|
||||||
|
|
||||||
|
// Check: ∃k ≥ 0 such that lb ≤ min_len + stride * k ≤ ub ?
|
||||||
|
//
|
||||||
|
// First find the smallest k satisfying the lower bound:
|
||||||
|
// k_min = 0 if min_len ≥ lb
|
||||||
|
// k_min = ⌈(lb - min_len) / stride⌉ otherwise
|
||||||
|
//
|
||||||
|
// Then verify min_len + stride * k_min ≤ ub.
|
||||||
|
unsigned k_min = 0;
|
||||||
|
if (lb > min_len) {
|
||||||
|
unsigned gap = lb - min_len;
|
||||||
|
// Ceiling division: k_min = ceil(gap / stride).
|
||||||
|
// Guard: (gap + stride - 1) may overflow if gap is close to UINT_MAX.
|
||||||
|
// In that case k_min would be huge, and min_len + stride*k_min would
|
||||||
|
// also overflow ub → treat as a conflict immediately.
|
||||||
|
if (gap > UINT_MAX - (stride - 1)) {
|
||||||
|
return true; // ceiling division would overflow → k_min too large
|
||||||
|
}
|
||||||
|
k_min = (gap + stride - 1) / stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Overflow guard: stride * k_min may overflow unsigned.
|
||||||
|
unsigned len_at_k_min;
|
||||||
|
if (k_min > (UINT_MAX - min_len) / stride) {
|
||||||
|
// Overflow: min_len + stride * k_min > UINT_MAX ≥ ub → conflict.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
len_at_k_min = min_len + stride * k_min;
|
||||||
|
|
||||||
|
if (ub != UINT_MAX && len_at_k_min > ub)
|
||||||
|
return true; // no valid k exists → Parikh conflict
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace seq
|
||||||
131
src/smt/seq/seq_parikh.h
Normal file
131
src/smt/seq/seq_parikh.h
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
/*++
|
||||||
|
Copyright (c) 2026 Microsoft Corporation
|
||||||
|
|
||||||
|
Module Name:
|
||||||
|
|
||||||
|
seq_parikh.h
|
||||||
|
|
||||||
|
Abstract:
|
||||||
|
|
||||||
|
Parikh image filter for the ZIPT-based Nielsen string solver.
|
||||||
|
|
||||||
|
Implements Parikh-based arithmetic constraint generation for
|
||||||
|
nielsen_node instances. For a regex membership constraint str ∈ r,
|
||||||
|
the Parikh image of r constrains the multiset of characters in str.
|
||||||
|
This module computes the "length stride" (period) of the length
|
||||||
|
language of r and generates modular arithmetic constraints of the form
|
||||||
|
|
||||||
|
len(str) = min_len + stride · k (k ≥ 0, k fresh integer)
|
||||||
|
|
||||||
|
which tighten the arithmetic subproblem beyond the simple min/max
|
||||||
|
length bounds already produced by nielsen_node::init_var_bounds_from_mems().
|
||||||
|
|
||||||
|
For example:
|
||||||
|
• str ∈ (ab)* → min_len = 0, stride = 2 → len(str) = 2·k
|
||||||
|
• str ∈ a(bc)* → min_len = 1, stride = 2 → len(str) = 1 + 2·k
|
||||||
|
• str ∈ ab|abc → stride = 1 (no useful modular constraint)
|
||||||
|
|
||||||
|
The generated int_constraints are added to the node's integer constraint
|
||||||
|
set and discharged by the integer subsolver (see seq_nielsen.h,
|
||||||
|
simple_solver).
|
||||||
|
|
||||||
|
Implements the Parikh filter described in ZIPT
|
||||||
|
(https://github.com/CEisenhofer/ZIPT/tree/parikh/ZIPT/Constraints)
|
||||||
|
replacing ZIPT's PDD-based Parikh subsolver with Z3's linear arithmetic.
|
||||||
|
|
||||||
|
Author:
|
||||||
|
|
||||||
|
Clemens Eisenhofer 2026-03-10
|
||||||
|
Nikolaj Bjorner (nbjorner) 2026-03-10
|
||||||
|
|
||||||
|
--*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ast/arith_decl_plugin.h"
|
||||||
|
#include "ast/seq_decl_plugin.h"
|
||||||
|
#include "smt/seq/seq_nielsen.h"
|
||||||
|
|
||||||
|
namespace seq {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parikh image filter: generates modular length constraints from
|
||||||
|
* regex membership constraints in a nielsen_node.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* seq_parikh parikh(sg);
|
||||||
|
* parikh.apply_to_node(node); // adds constraints to node
|
||||||
|
*
|
||||||
|
* Or per-membership:
|
||||||
|
* vector<int_constraint> out;
|
||||||
|
* parikh.generate_parikh_constraints(mem, out);
|
||||||
|
*/
|
||||||
|
class seq_parikh {
|
||||||
|
ast_manager& m;
|
||||||
|
seq_util seq;
|
||||||
|
arith_util a;
|
||||||
|
unsigned m_fresh_cnt; // counter for fresh variable names
|
||||||
|
|
||||||
|
// Compute the stride (period) of the length language of a regex.
|
||||||
|
//
|
||||||
|
// The stride k satisfies: all lengths in L(re) are congruent to
|
||||||
|
// min_length(re) modulo k. A stride of 1 means every integer
|
||||||
|
// length is possible (no useful modular constraint). A stride of
|
||||||
|
// 0 is a sentinel meaning the language is empty or has a single
|
||||||
|
// fixed length (already captured by bounds).
|
||||||
|
//
|
||||||
|
// Examples:
|
||||||
|
// stride(to_re("ab")) = 0 (fixed length 2)
|
||||||
|
// stride((ab)*) = 2 (lengths 0, 2, 4, ...)
|
||||||
|
// stride((abc)*) = 3 (lengths 0, 3, 6, ...)
|
||||||
|
// stride(a*b*) = 1 (all lengths possible)
|
||||||
|
// stride((ab)*(cd)*) = 2 (lengths 0, 2, 4, ...)
|
||||||
|
// stride((ab)*|(abc)*) = 1 (lengths 0, 2, 3, 4, ...)
|
||||||
|
unsigned compute_length_stride(expr* re);
|
||||||
|
|
||||||
|
// Create a fresh integer variable (name "pk!N") for use as the
|
||||||
|
// Parikh multiplier variable k in len(str) = min_len + stride·k.
|
||||||
|
expr_ref mk_fresh_int_var();
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit seq_parikh(euf::sgraph& sg);
|
||||||
|
|
||||||
|
// Generate Parikh modular length constraints for one membership.
|
||||||
|
//
|
||||||
|
// When stride > 1 and min_len < max_len (bounds don't pin length exactly,
|
||||||
|
// and the language is non-empty):
|
||||||
|
// adds: len(str) = min_len + stride · k (equality)
|
||||||
|
// k ≥ 0 (non-negativity)
|
||||||
|
// k ≤ (max_len - min_len) / stride (upper bound, when max_len bounded)
|
||||||
|
// These tighten the integer constraint set for the subsolver.
|
||||||
|
// Dependencies are copied from mem.m_dep.
|
||||||
|
// Does nothing when min_len ≥ max_len (empty or fixed-length language).
|
||||||
|
void generate_parikh_constraints(str_mem const& mem,
|
||||||
|
vector<int_constraint>& out);
|
||||||
|
|
||||||
|
// Apply Parikh constraints to all memberships at a node.
|
||||||
|
// Calls generate_parikh_constraints for each str_mem in the node
|
||||||
|
// and appends the resulting int_constraints to node.int_constraints().
|
||||||
|
void apply_to_node(nielsen_node& node);
|
||||||
|
|
||||||
|
// Quick Parikh feasibility check (no solver call).
|
||||||
|
//
|
||||||
|
// For each single-variable membership str ∈ re, checks whether the
|
||||||
|
// modular constraint len(str) = min_len + stride · k (k ≥ 0)
|
||||||
|
// has any solution given the current per-variable bounds stored in
|
||||||
|
// node.var_lb(str) and node.var_ub(str).
|
||||||
|
//
|
||||||
|
// Returns true when a conflict is detected (no valid k exists for
|
||||||
|
// some membership). The caller should then mark the node with
|
||||||
|
// backtrack_reason::parikh_image.
|
||||||
|
//
|
||||||
|
// This is a lightweight pre-check that avoids calling the integer
|
||||||
|
// subsolver. It is sound (never returns true for a satisfiable node)
|
||||||
|
// but incomplete (may miss conflicts that require the full solver).
|
||||||
|
bool check_parikh_conflict(nielsen_node& node);
|
||||||
|
|
||||||
|
// Compute the length stride of a regex expression.
|
||||||
|
// Exposed for testing and external callers.
|
||||||
|
unsigned get_length_stride(expr* re) { return compute_length_stride(re); }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace seq
|
||||||
Loading…
Add table
Add a link
Reference in a new issue