From 194dfe6c610d15780fb2c077be94cb32a2a22e6a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:21:27 +0000 Subject: [PATCH] Port ZIPT Parikh features: minterm_to_char_set, char range constraints, fix stride soundness Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- src/smt/seq/seq_nielsen.cpp | 11 ++- src/smt/seq/seq_parikh.cpp | 129 ++++++++++++++++++++++++++---------- src/smt/seq/seq_parikh.h | 19 ++++++ 3 files changed, 122 insertions(+), 37 deletions(-) diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 81fd1b2ba..58fd96692 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -2301,8 +2301,15 @@ namespace seq { nielsen_subst s(first, replacement, mem.m_dep); e->add_subst(s); child->apply_subst(m_sg, s); - // TODO: derive char_set from minterm and add as range constraint - // child->add_char_range(fresh_char, minterm_to_char_set(mt)); + // Constrain fresh_char to the character class of this minterm. + // This is the key Parikh pruning step: when x → ?c · x' is + // generated from minterm m_i, ?c must belong to the character + // class described by m_i so that str ∈ derivative(R, m_i). + if (mt->get_expr()) { + char_set cs = m_parith->minterm_to_char_set(mt->get_expr()); + if (!cs.is_empty()) + child->add_char_range(fresh_char, cs); + } created = true; } diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp index 255d09d72..367412dad 100644 --- a/src/smt/seq/seq_parikh.cpp +++ b/src/smt/seq/seq_parikh.cpp @@ -80,36 +80,24 @@ namespace seq { // r* — Kleene star. // L(r*) = {ε} ∪ L(r) ∪ L(r)·L(r) ∪ ... - // If r has a fixed length k, then L(r*) = {0, k, 2k, ...} → stride k. - // If r has variable length, strides from different iterations combine - // by GCD. + // If all lengths in L(r) are congruent to c modulo s (c = min_len, s = stride), + // then L(r*) includes lengths {0, c, c+s, 2c, 2c+s, 2c+2s, ...} and + // the overall GCD is gcd(c, s). This is strictly more accurate than + // the previous gcd(min, max) approximation, which can be unsound when + // the body contains lengths whose GCD is smaller than gcd(min, max). if (seq.re.is_star(re, r1)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - // When the body has unbounded length (mx == UINT_MAX), different - // iterations can produce many different lengths, and the stride of - // the star as a whole degenerates to gcd(mn, mn) = mn (or 1 if - // mn == 1). This is conservative: we use the body's min-length - // as the only available fixed quantity. - if (mx == UINT_MAX) mx = mn; - if (mn == mx) { - // Fixed-length body: L(r*) = {0, mn, 2·mn, ...} → stride = mn. - // When mn == 1 the stride would be 1, which gives no useful - // modular constraint, so return 0 instead. - return (mn > 1) ? mn : 0; - } - // Variable-length body: GCD of min and max lengths - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + // stride(r*) = gcd(min_length(r), stride(r)) + // when inner=0 (fixed-length body), gcd(mn, 0) = mn → stride = mn + return u_gcd(mn, inner); } // r+ — one or more: same stride analysis as r*. if (seq.re.is_plus(re, r1)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; // same conservative treatment as star - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } // r? — zero or one: lengths = {0} ∪ lengths(r) @@ -150,23 +138,18 @@ namespace seq { return g; } - // loop(r, lo, hi): lengths = {lo·len(r), ..., hi·len(r)} if r is fixed. - // stride = len(r) when r is fixed-length; otherwise GCD. + // loop(r, lo, hi): the length of any word is a sum of lo..hi copies of + // lengths from L(r). Since all lengths in L(r) are ≡ min_len(r) mod + // stride(r), the overall stride is gcd(min_len(r), stride(r)). if (seq.re.is_loop(re, r1, lo, hi)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } if (seq.re.is_loop(re, r1, lo)) { unsigned mn = seq.re.min_length(r1); - unsigned mx = seq.re.max_length(r1); - if (mx == UINT_MAX) mx = mn; - if (mn == mx) - return (mn > 1) ? mn : 0; - return u_gcd(mn, mx); + unsigned inner = compute_length_stride(r1); + return u_gcd(mn, inner); } // intersection(r1, r2): lengths must be in both languages. @@ -324,4 +307,80 @@ namespace seq { return false; } + // ----------------------------------------------------------------------- + // minterm → char_set conversion + // ----------------------------------------------------------------------- + + // Convert a regex minterm expression to a char_set. + // + // Minterms are Boolean combinations of character-class predicates + // (re.range, re.full_char, complement, intersection) produced by + // sgraph::compute_minterms(). This function converts them to a + // concrete char_set so that fresh character variables introduced by + // apply_regex_var_split can be constrained with add_char_range(). + // + // The implementation is recursive and handles: + // re.full_char → [0, max_char] (full alphabet) + // re.range(lo, hi) → [lo, hi+1) (hi inclusive in Z3) + // re.complement(r) → alphabet \ minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sides + // re.diff(r1, r2) → r1 ∩ complement(r2) + // re.to_re(str.unit(c)) → singleton {c} + // re.empty → empty set + // anything else → [0, max_char] (conservative) + char_set seq_parikh::minterm_to_char_set(expr* re_expr) { + unsigned max_c = seq.max_char(); + + if (!re_expr) + return char_set::full(max_c); + + // full_char: the whole alphabet [0, max_char] + if (seq.re.is_full_char(re_expr)) + return char_set::full(max_c); + + // range [lo, hi] (hi inclusive in Z3's regex representation) + unsigned lo = 0, hi = 0; + if (seq.re.is_range(re_expr, lo, hi)) { + if (lo > hi) return char_set(); + // char_range uses exclusive upper bound; Z3 hi is inclusive + return char_set(char_range(lo, hi + 1)); + } + + // complement: alphabet minus the inner set + expr* inner = nullptr; + if (seq.re.is_complement(re_expr, inner)) + return minterm_to_char_set(inner).complement(max_c); + + // intersection: characters present in both sets + expr* r1 = nullptr, *r2 = nullptr; + if (seq.re.is_intersection(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2)); + + // difference: r1 minus r2 = r1 ∩ complement(r2) + if (seq.re.is_diff(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with( + minterm_to_char_set(r2).complement(max_c)); + + // to_re(str.unit(c)): singleton character set + expr* str_arg = nullptr; + if (seq.re.is_to_re(re_expr, str_arg)) { + expr* ch_expr = nullptr; + unsigned char_val = 0; + if (seq.str.is_unit(str_arg, ch_expr) && + seq.is_const_char(ch_expr, char_val)) { + char_set cs; + cs.add(char_val); + return cs; + } + } + + // empty regex: no characters can appear + if (seq.re.is_empty(re_expr)) + return char_set(); + + // Conservative fallback: return the full alphabet so that + // no unsound constraints are added for unrecognised expressions. + return char_set::full(max_c); + } + } // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h index 730854832..8e8a4e178 100644 --- a/src/smt/seq/seq_parikh.h +++ b/src/smt/seq/seq_parikh.h @@ -126,6 +126,25 @@ namespace seq { // Compute the length stride of a regex expression. // Exposed for testing and external callers. unsigned get_length_stride(expr* re) { return compute_length_stride(re); } + + // Convert a regex minterm expression to a char_set. + // + // A minterm is a Boolean combination of character-class predicates + // (re.range, re.full_char, complement, intersection) that names a + // single, indivisible character equivalence class. Minterms are + // produced by sgraph::compute_minterms and used in + // apply_regex_var_split to constrain fresh character variables. + // + // Supported expressions: + // re.full_char → full set [0, max_char] + // re.range(lo, hi) → char_set [lo, hi] (inclusive on both ends) + // re.complement(r) → complement of minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sets + // re.diff(r1, r2) → r1 minus r2 (= r1 ∩ complement(r2)) + // re.to_re(unit(c)) → singleton {c} + // re.empty → empty set + // anything else → full set (conservative, sound over-approximation) + char_set minterm_to_char_set(expr* minterm_re); }; } // namespace seq