3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-19 23:26:30 +00:00

Smart constructors for regex ranges: canonical form at construction time (#9814)

Regex range expressions (`re.range`) and Boolean operations over them
were left in unsimplified form, defeating downstream optimisations
(bisimulation classical fast-path, derivative engine) and producing
semantically-empty terms not syntactically equal to `re.none`.

## Changes

### `seq_decl_plugin.h` / `seq_decl_plugin.cpp`

- **`seq_util::rex::mk_range(sort*, unsigned lo, unsigned hi)`** — new
smart constructor that normalises at call time:
  - `lo > hi` → `re.empty`
  - `lo == hi` → `str.to_re` (singleton string)
  - `lo < hi` → `re.range`
- **`mk_info_rec` `OP_RE_RANGE`** — concrete non-empty ranges (both
bounds are single-char literals with `lo ≤ hi`) now return `classical =
true`, enabling the XOR-bisimulation `classical_distinguishing`
fast-path on character-predicate leaves. Symbolic/unknown ranges retain
`classical = false`.

### `seq_rewriter.cpp`

- **`mk_re_range`** — singleton collapse: `(re.range "a" "a")` →
`(str.to_re "a")`
- **`mk_regex_inter_normalize`** — range × range intersection: `[a,b] ∩
[c,d]` → `[max(a,c), min(b,d)]`, or `re.none` (disjoint), or `str.to_re`
(boundary singleton); now delegates to `re().mk_range(sort*, lo, hi)`
- **`mk_regex_union_normalize`** — range × range union for
overlapping/adjacent ranges: `[a,b] ∪ [c,d]` → `[min(a,c), max(b,d)]`;
disjoint ranges fall through to existing `merge_regex_sets`; now
delegates to `re().mk_range(sort*, lo, hi)`
- **`mk_re_complement`** — range complement expands to one or two
concrete ranges instead of an opaque `re.comp` node; now delegates to
`re().mk_range(sort*, lo, hi)`:
  - `comp([0, b])` → `[b+1, max]`
  - `comp([a, max])` → `[0, a-1]`
  - `comp([a, b])` → `[0, a-1] ∪ [b+1, max]`

```
(simplify (re.range "z" "a"))                              ; → re.none
(simplify (re.range "a" "a"))                              ; → (str.to_re "a")
(simplify (re.inter (re.range "a" "z") (re.range "f" "k"))); → (re.range "f" "k")
(simplify (re.union (re.range "a" "f") (re.range "g" "k"))); → (re.range "a" "k")
(simplify (re.comp  (re.range "b" "y")))                   ; → (re.union [0,a] [z,max])
```

### Tests

New `src/test/seq_rewriter.cpp` with 14 cases covering all the above
reductions plus downstream propagation (star/concat/union/inter
absorbing empty ranges).

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
This commit is contained in:
Copilot 2026-06-16 13:58:56 -06:00 committed by GitHub
parent 897c4475af
commit 8c2a425e4b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 272 additions and 4 deletions

View file

@ -3336,6 +3336,18 @@ expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) {
result = r1;
else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0)
result = r2;
else {
// Range Range: [a,b] [c,d] = [min(a,c), max(b,d)] when overlapping or adjacent
unsigned lo1_v = 0, hi1_v = 0, lo2_v = 0, hi2_v = 0;
if (re().is_range(r1, lo1_v, hi1_v) && re().is_range(r2, lo2_v, hi2_v) &&
lo2_v <= hi1_v + 1 && lo1_v <= hi2_v + 1) {
unsigned new_lo = std::min(lo1_v, lo2_v);
unsigned new_hi = std::max(hi1_v, hi2_v);
result = re().mk_range(r1->get_sort(), new_lo, new_hi);
}
else
result = merge_regex_sets(r1, r2, re().mk_full_seq(r1->get_sort()), test, compose);
}
// (R1 \ R2) U (R2 \ R1) = R1 xor R2
else if (false && re().is_intersection(r1, a1, a2) && re().is_intersection(r2, b1, b2) &&
is_complement(a1, b2) && is_complement(a2, b1)) {
@ -3375,8 +3387,17 @@ expr_ref seq_rewriter::mk_regex_inter_normalize(expr* r1, expr* r2) {
result = r2;
else if (re().is_dot_plus(r2) && re().get_info(r1).min_length > 0)
result = r1;
else
result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose);
else {
// Range ∩ Range: [a,b] ∩ [c,d] = [max(a,c), min(b,d)] or empty
unsigned lo1_v = 0, hi1_v = 0, lo2_v = 0, hi2_v = 0;
if (re().is_range(r1, lo1_v, hi1_v) && re().is_range(r2, lo2_v, hi2_v)) {
unsigned new_lo = std::max(lo1_v, lo2_v);
unsigned new_hi = std::min(hi1_v, hi2_v);
result = re().mk_range(r1->get_sort(), new_lo, new_hi);
}
else
result = merge_regex_sets(r1, r2, re().mk_empty(r1->get_sort()), test, compose);
}
return result;
}
@ -4805,6 +4826,34 @@ br_status seq_rewriter::mk_re_complement(expr* a, expr_ref& result) {
result = re().mk_plus(re().mk_full_char(a->get_sort()));
return BR_DONE;
}
// Range complement: comp([a,b]) → [0,a-1] [b+1,max] (or one half when a=0 or b=max)
unsigned lo_v = 0, hi_v = 0;
if (re().is_range(a, lo_v, hi_v)) {
unsigned max_c = u().max_char();
sort* srt = a->get_sort();
bool has_left = (lo_v > 0);
bool has_right = (hi_v < max_c);
if (!has_left && !has_right) {
// [0, max_c]: complement is empty
result = re().mk_empty(srt);
return BR_DONE;
}
if (!has_left) {
// [0, b]: complement is [b+1, max]
result = re().mk_range(srt, hi_v + 1, max_c);
return BR_REWRITE1;
}
if (!has_right) {
// [a, max]: complement is [0, a-1]
result = re().mk_range(srt, 0u, lo_v - 1);
return BR_REWRITE1;
}
// General: [a, b] → [0, a-1] [b+1, max]
auto left = re().mk_range(srt, 0u, lo_v - 1);
auto right = re().mk_range(srt, hi_v + 1, max_c);
result = re().mk_union(left, right);
return BR_REWRITE1;
}
return BR_FAILED;
}
@ -5102,6 +5151,11 @@ br_status seq_rewriter::mk_re_range(expr* lo, expr* hi, expr_ref& result) {
result = re().mk_empty(srt);
return BR_DONE;
}
// Singleton: re.range "a" "a" → str.to_re "a"
if (slo.length() == 1 && shi.length() == 1 && slo[0] == shi[0]) {
result = re().mk_to_re(lo);
return BR_DONE;
}
return BR_FAILED;
}

View file

@ -1208,6 +1208,15 @@ app* seq_util::rex::mk_of_pred(expr* p) {
return m.mk_app(m_fid, OP_RE_OF_PRED, 0, nullptr, 1, &p);
}
app* seq_util::rex::mk_range(sort* re_sort, unsigned lo, unsigned hi) {
if (lo > hi)
return mk_empty(re_sort);
app* lo_str = u.str.mk_string(zstring(lo));
if (lo == hi)
return mk_to_re(lo_str);
return mk_range(lo_str, u.str.mk_string(zstring(hi)));
}
bool seq_util::rex::is_loop(expr const* n, expr*& body, unsigned& lo, unsigned& hi) const {
if (is_loop(n)) {
app const* a = to_app(n);
@ -1671,11 +1680,19 @@ seq_util::rex::info seq_util::rex::mk_info_rec(app* e) const {
case OP_RE_OPTION:
i1 = get_info_rec(e->get_arg(0));
return i1.opt();
case OP_RE_RANGE:
case OP_RE_RANGE: {
// A concrete range [lo, hi] with lo <= hi is non-empty and classical.
zstring slo, shi;
if (u.str.is_string(e->get_arg(0), slo) && slo.length() == 1 &&
u.str.is_string(e->get_arg(1), shi) && shi.length() == 1 &&
slo[0] <= shi[0])
return info(true, l_false, 1, true);
// Symbolic or unknown: not classical
return info(true, l_false, 1, false);
}
case OP_RE_FULL_CHAR_SET:
case OP_RE_OF_PRED:
//TBD: check if the character predicate contains uninterpreted symbols or is nonground or is unsat
//TBD: check if the range is unsat
return info(true, l_false, 1, false);
case OP_RE_CONCAT:
i1 = get_info_rec(e->get_arg(0));

View file

@ -521,6 +521,8 @@ public:
app* mk_to_re(expr* s) { return m.mk_app(m_fid, OP_SEQ_TO_RE, 1, &s); }
app* mk_in_re(expr* s, expr* r) { return m.mk_app(m_fid, OP_SEQ_IN_RE, s, r); }
app* mk_range(expr* s1, expr* s2) { return m.mk_app(m_fid, OP_RE_RANGE, s1, s2); }
// Smart constructor: returns re.empty / str.to_re / re.range based on lo vs hi.
app* mk_range(sort* re_sort, unsigned lo, unsigned hi);
app* mk_concat(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_CONCAT, r1, r2); }
app* mk_union(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_UNION, r1, r2); }
app* mk_inter(expr* r1, expr* r2) { return m.mk_app(m_fid, OP_RE_INTERSECT, r1, r2); }