diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index cfcc179bc..0956fef48 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -35,6 +35,7 @@ z3_add_component(rewriter pb2bv_rewriter.cpp push_app_ite.cpp quant_hoist.cpp + range_predicate.cpp recfun_rewriter.cpp rewriter.cpp seq_axioms.cpp diff --git a/src/ast/rewriter/range_predicate.cpp b/src/ast/rewriter/range_predicate.cpp new file mode 100644 index 000000000..53b31a84a --- /dev/null +++ b/src/ast/rewriter/range_predicate.cpp @@ -0,0 +1,292 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + range_predicate.cpp + +Abstract: + + Implementation of the specialized range-algebra used by symbolic + derivative computation and regex rewriting. See range_predicate.h + for the algebraic specification. + + All Boolean operations are implemented as single linear sweeps over + the canonical sorted range vectors and produce canonical output + (sorted, disjoint, non-adjacent). + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/range_predicate.h" +#include "util/debug.h" +#include +#include + +namespace seq { + + // ----------------------------------------------------------------------- + // Factories + // ----------------------------------------------------------------------- + + range_predicate range_predicate::empty(unsigned max_char) { + return range_predicate(max_char); + } + + range_predicate range_predicate::top(unsigned max_char) { + range_predicate r(max_char); + r.m_ranges.push_back({0u, max_char}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::singleton(unsigned c, unsigned max_char) { + SASSERT(c <= max_char); + range_predicate r(max_char); + r.m_ranges.push_back({c, c}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::range(unsigned lo, unsigned hi, unsigned max_char) { + range_predicate r(max_char); + if (lo <= hi && lo <= max_char) { + unsigned clipped_hi = hi <= max_char ? hi : max_char; + r.m_ranges.push_back({lo, clipped_hi}); + } + SASSERT(r.well_formed()); + return r; + } + + // ----------------------------------------------------------------------- + // Invariants and observers + // ----------------------------------------------------------------------- + + bool range_predicate::well_formed() const { + for (unsigned i = 0; i < m_ranges.size(); ++i) { + auto [lo, hi] = m_ranges[i]; + if (lo > hi) return false; + if (hi > m_max_char) return false; + if (i > 0) { + unsigned prev_hi = m_ranges[i - 1].second; + // Non-adjacent and sorted: prev_hi + 1 < lo, with care + // around prev_hi == UINT_MAX which we never expect because + // hi <= m_max_char. + if (prev_hi + 1 >= lo) return false; + } + } + return true; + } + + bool range_predicate::contains(unsigned c) const { + // Binary search on first element of pairs. + unsigned lo = 0, hi = m_ranges.size(); + while (lo < hi) { + unsigned mid = lo + (hi - lo) / 2; + auto [a, b] = m_ranges[mid]; + if (c < a) hi = mid; + else if (c > b) lo = mid + 1; + else return true; + } + return false; + } + + uint64_t range_predicate::cardinality() const { + uint64_t n = 0; + for (auto [lo, hi] : m_ranges) + n += static_cast(hi) - static_cast(lo) + 1u; + return n; + } + + // ----------------------------------------------------------------------- + // Equality, ordering, hashing + // ----------------------------------------------------------------------- + + bool range_predicate::equals(range_predicate const& o) const { + if (m_max_char != o.m_max_char) return false; + if (m_ranges.size() != o.m_ranges.size()) return false; + for (unsigned i = 0; i < m_ranges.size(); ++i) + if (m_ranges[i] != o.m_ranges[i]) return false; + return true; + } + + bool range_predicate::operator<(range_predicate const& o) const { + if (m_max_char != o.m_max_char) + return m_max_char < o.m_max_char; + unsigned n = std::min(m_ranges.size(), o.m_ranges.size()); + for (unsigned i = 0; i < n; ++i) { + auto a = m_ranges[i]; + auto b = o.m_ranges[i]; + if (a.first != b.first) return a.first < b.first; + if (a.second != b.second) return a.second < b.second; + } + return m_ranges.size() < o.m_ranges.size(); + } + + unsigned range_predicate::hash() const { + // FNV-1a 32-bit over (max_char, then each (lo, hi)). + uint32_t h = 2166136261u; + auto step = [&](uint32_t x) { + h ^= x; + h *= 16777619u; + }; + step(m_max_char); + for (auto [lo, hi] : m_ranges) { + step(lo); + step(hi); + } + return h; + } + + // ----------------------------------------------------------------------- + // Boolean operations + // ----------------------------------------------------------------------- + + namespace { + // Append (lo, hi) to result, merging with the previous range if + // adjacent or overlapping. Maintains canonical form. + inline void append_merged(svector>& result, + unsigned lo, unsigned hi) { + SASSERT(lo <= hi); + if (!result.empty() && result.back().second + 1 >= lo) { + if (result.back().second < hi) + result.back().second = hi; + } else { + result.push_back({lo, hi}); + } + } + } + + range_predicate range_predicate::operator|(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + range_predicate r(m_max_char); + unsigned i = 0, j = 0; + const unsigned n = m_ranges.size(); + const unsigned m = o.m_ranges.size(); + while (i < n && j < m) { + auto a = m_ranges[i]; + auto b = o.m_ranges[j]; + if (a.first <= b.first) { + append_merged(r.m_ranges, a.first, a.second); + ++i; + } else { + append_merged(r.m_ranges, b.first, b.second); + ++j; + } + } + while (i < n) { + auto a = m_ranges[i++]; + append_merged(r.m_ranges, a.first, a.second); + } + while (j < m) { + auto b = o.m_ranges[j++]; + append_merged(r.m_ranges, b.first, b.second); + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator&(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + range_predicate r(m_max_char); + unsigned i = 0, j = 0; + const unsigned n = m_ranges.size(); + const unsigned m = o.m_ranges.size(); + while (i < n && j < m) { + auto [a_lo, a_hi] = m_ranges[i]; + auto [b_lo, b_hi] = o.m_ranges[j]; + unsigned lo = std::max(a_lo, b_lo); + unsigned hi = std::min(a_hi, b_hi); + if (lo <= hi) + r.m_ranges.push_back({lo, hi}); + // Advance the range that ends first. + if (a_hi < b_hi) ++i; + else if (b_hi < a_hi) ++j; + else { ++i; ++j; } + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator~() const { + range_predicate r(m_max_char); + unsigned cursor = 0; + for (auto [lo, hi] : m_ranges) { + if (cursor < lo) + r.m_ranges.push_back({cursor, lo - 1}); + // Step past hi without overflow: hi <= m_max_char and we + // only step when more characters remain. + if (hi >= m_max_char) { + cursor = m_max_char + 1; // sentinel: no more characters + break; + } + cursor = hi + 1; + } + if (cursor <= m_max_char) + r.m_ranges.push_back({cursor, m_max_char}); + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator-(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + // A - B by linear sweep: for each range of A, subtract overlapping + // ranges of B. Both inputs are sorted so we advance j monotonically. + range_predicate r(m_max_char); + unsigned j = 0; + const unsigned m = o.m_ranges.size(); + for (auto [a_lo, a_hi] : m_ranges) { + unsigned cursor = a_lo; + while (j < m && o.m_ranges[j].second < cursor) + ++j; + unsigned k = j; + while (k < m && o.m_ranges[k].first <= a_hi) { + auto [b_lo, b_hi] = o.m_ranges[k]; + if (cursor < b_lo) + r.m_ranges.push_back({cursor, std::min(a_hi, b_lo - 1)}); + if (b_hi >= a_hi) { + cursor = a_hi + 1; + break; + } + cursor = b_hi + 1; + ++k; + } + if (cursor <= a_hi) + r.m_ranges.push_back({cursor, a_hi}); + } + SASSERT(r.well_formed()); + return r; + } + + range_predicate range_predicate::operator^(range_predicate const& o) const { + SASSERT(m_max_char == o.m_max_char); + // (A | B) - (A & B), but implemented directly with one linear sweep + // over the union of breakpoints. + return (*this | o) - (*this & o); + } + + // ----------------------------------------------------------------------- + // Display + // ----------------------------------------------------------------------- + + std::ostream& range_predicate::display(std::ostream& out) const { + if (m_ranges.empty()) { + return out << "[]"; + } + out << "["; + bool first = true; + for (auto [lo, hi] : m_ranges) { + if (!first) out << ","; + first = false; + if (lo == hi) + out << lo; + else + out << lo << "-" << hi; + } + return out << "]"; + } + +} diff --git a/src/ast/rewriter/range_predicate.h b/src/ast/rewriter/range_predicate.h new file mode 100644 index 000000000..31226e4a9 --- /dev/null +++ b/src/ast/rewriter/range_predicate.h @@ -0,0 +1,127 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + range_predicate.h + +Abstract: + + Specialized range-algebra over an unsigned character domain [0, max_char]. + + A range_predicate represents a subset of the character domain as a + sorted sequence of non-overlapping, non-adjacent, non-empty ranges: + + [(lo_0, hi_0), (lo_1, hi_1), ...] with hi_i + 1 < lo_{i+1}. + + The representation is canonical, so two range_predicates over the same + domain are extensionally equivalent iff their internal vectors are + elementwise equal. + + All Boolean operations (union, intersection, complement, difference) + are linear in the total number of ranges and produce the canonical + representation. + + Intended use: + * path conditions for symbolic derivative computation, + * OneStep predicates capturing length-1 acceptance, + * smart-constructor side conditions for regex rewrites such as + R & psi --> toregex(OneStep(R) & psi). + + The type is a pure value: no ast_manager allocation occurs in its + construction or its Boolean operations. Conversion to and from + expr* is the responsibility of a separate translator (see callers + in seq_derive / seq_rewriter). + +Authors: + + Margus Veanes (veanes) 2026 + +--*/ +#pragma once + +#include "util/vector.h" +#include +#include + +namespace seq { + + class range_predicate { + using range_t = std::pair; + using ranges_t = svector; + + // Sorted by first; ranges are disjoint and non-adjacent; + // every range satisfies lo <= hi <= m_max_char. + ranges_t m_ranges; + unsigned m_max_char { 0 }; + + // Invariant check used in debug builds. + bool well_formed() const; + + public: + range_predicate() = default; + explicit range_predicate(unsigned max_char) : m_max_char(max_char) {} + + // ---------------- Factory functions ---------------- + + static range_predicate empty(unsigned max_char); + static range_predicate top(unsigned max_char); + static range_predicate singleton(unsigned c, unsigned max_char); + static range_predicate range(unsigned lo, unsigned hi, unsigned max_char); + + // ---------------- Observers ---------------- + + unsigned max_char() const { return m_max_char; } + unsigned num_ranges() const { return m_ranges.size(); } + range_t operator[](unsigned i) const { return m_ranges[i]; } + ranges_t const& ranges() const { return m_ranges; } + + bool is_empty() const { return m_ranges.empty(); } + bool is_top() const { + return m_ranges.size() == 1 + && m_ranges[0].first == 0 + && m_ranges[0].second == m_max_char; + } + bool is_singleton(unsigned& c) const { + if (m_ranges.size() != 1) return false; + if (m_ranges[0].first != m_ranges[0].second) return false; + c = m_ranges[0].first; + return true; + } + bool contains(unsigned c) const; + + // Number of characters in the predicate (well-defined for any domain). + uint64_t cardinality() const; + + // ---------------- Equality, ordering, hashing ---------------- + + bool equals(range_predicate const& o) const; + bool operator==(range_predicate const& o) const { return equals(o); } + bool operator!=(range_predicate const& o) const { return !equals(o); } + + // Total order: lexicographic on the canonical range sequence, + // with shorter sequences ordered before longer prefixes. + // Predicates over different domains compare by max_char first. + bool operator<(range_predicate const& o) const; + bool less_than(range_predicate const& o) const { return *this < o; } + + unsigned hash() const; + + // ---------------- Boolean operations ---------------- + + range_predicate operator|(range_predicate const& o) const; // union + range_predicate operator&(range_predicate const& o) const; // intersection + range_predicate operator-(range_predicate const& o) const; // difference + range_predicate operator^(range_predicate const& o) const; // symmetric diff + range_predicate operator~() const; // complement + + // ---------------- Display ---------------- + + std::ostream& display(std::ostream& out) const; + }; + + inline std::ostream& operator<<(std::ostream& out, range_predicate const& p) { + return p.display(out); + } + +} diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index ef5902c0c..d29b47d9b 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -120,6 +120,7 @@ add_executable(test-z3 quant_elim.cpp quant_solve.cpp random.cpp + range_predicate.cpp rational.cpp rcf.cpp region.cpp diff --git a/src/test/main.cpp b/src/test/main.cpp index 4a5239f19..104e3596b 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -113,6 +113,7 @@ X(api_bug) \ X(api_special_relations) \ X(arith_rewriter) \ + X(range_predicate) \ X(check_assumptions) \ X(smt_context) \ X(theory_dl) \ diff --git a/src/test/range_predicate.cpp b/src/test/range_predicate.cpp new file mode 100644 index 000000000..7d332f413 --- /dev/null +++ b/src/test/range_predicate.cpp @@ -0,0 +1,260 @@ +/*++ +Copyright (c) 2026 Microsoft Corporation + +Module Name: + + test/range_predicate.cpp + +Abstract: + + Unit tests for the range-algebra value type seq::range_predicate. + + The tests exercise: + * factory constructors and canonical-form invariants, + * extensional equality and total ordering, + * Boolean operations (|, &, ~, -, ^) on hand-picked instances, + * exhaustive verification of de-Morgan and lattice laws on a + small character domain, by enumerating every subset. + +Author: + + Margus Veanes (veanes) 2026 + +--*/ + +#include "ast/rewriter/range_predicate.h" +#include "util/debug.h" +#include +#include +#include + +using seq::range_predicate; + +namespace { + + // Build a range_predicate from a bitmask over [0, max_char] for testing. + range_predicate from_mask(uint64_t mask, unsigned max_char) { + range_predicate r = range_predicate::empty(max_char); + for (unsigned c = 0; c <= max_char; ++c) + if ((mask >> c) & 1u) + r = r | range_predicate::singleton(c, max_char); + return r; + } + + // Convert a range_predicate back to a bitmask for cross-checking. + uint64_t to_mask(range_predicate const& r) { + uint64_t mask = 0; + for (unsigned c = 0; c <= r.max_char(); ++c) + if (r.contains(c)) + mask |= (uint64_t(1) << c); + return mask; + } + + void test_factories() { + auto e = range_predicate::empty(255); + ENSURE(e.is_empty()); + ENSURE(!e.is_top()); + ENSURE(e.num_ranges() == 0); + ENSURE(e.cardinality() == 0); + + auto t = range_predicate::top(255); + ENSURE(!t.is_empty()); + ENSURE(t.is_top()); + ENSURE(t.num_ranges() == 1); + ENSURE(t.cardinality() == 256); + ENSURE(t.contains(0)); + ENSURE(t.contains(255)); + + auto s = range_predicate::singleton(42, 255); + ENSURE(s.num_ranges() == 1); + ENSURE(s.cardinality() == 1); + ENSURE(s.contains(42)); + ENSURE(!s.contains(41)); + unsigned c = 0; + ENSURE(s.is_singleton(c)); + ENSURE(c == 42); + + auto r = range_predicate::range(10, 20, 255); + ENSURE(r.num_ranges() == 1); + ENSURE(r.cardinality() == 11); + ENSURE(r.contains(10)); + ENSURE(r.contains(20)); + ENSURE(!r.contains(9)); + ENSURE(!r.contains(21)); + + // Reversed bounds produce empty. + auto bad = range_predicate::range(20, 10, 255); + ENSURE(bad.is_empty()); + + // Clipping at max_char. + auto clipped = range_predicate::range(200, 1000, 255); + ENSURE(clipped.num_ranges() == 1); + ENSURE(clipped[0] == std::make_pair(200u, 255u)); + } + + void test_equality_and_order() { + auto a = range_predicate::range(1, 5, 31); + auto b = range_predicate::range(1, 5, 31); + auto c = range_predicate::range(1, 6, 31); + ENSURE(a == b); + ENSURE(a != c); + ENSURE(a.hash() == b.hash()); + ENSURE(a < c || c < a); + ENSURE(!(a < a)); + + auto empty = range_predicate::empty(31); + ENSURE(empty < a); + + // Canonical merging of adjacent ranges. + auto d = range_predicate::range(0, 4, 31) | range_predicate::range(5, 10, 31); + auto e = range_predicate::range(0, 10, 31); + ENSURE(d == e); + } + + void test_union_intersection_hand() { + unsigned const M = 31; + auto a = range_predicate::range(0, 4, M) | range_predicate::range(10, 14, M); + auto b = range_predicate::range(3, 11, M); + + auto u = a | b; // [0,14] + ENSURE(u.num_ranges() == 1); + ENSURE(u[0] == std::make_pair(0u, 14u)); + + auto i = a & b; // [3,4] U [10,11] + ENSURE(i.num_ranges() == 2); + ENSURE(i[0] == std::make_pair(3u, 4u)); + ENSURE(i[1] == std::make_pair(10u, 11u)); + + auto d = a - b; // [0,2] U [12,14] + ENSURE(d.num_ranges() == 2); + ENSURE(d[0] == std::make_pair(0u, 2u)); + ENSURE(d[1] == std::make_pair(12u, 14u)); + + auto x = a ^ b; // [0,2] U [5,9] U [12,14] + ENSURE(x.num_ranges() == 3); + ENSURE(x[0] == std::make_pair(0u, 2u)); + ENSURE(x[1] == std::make_pair(5u, 9u)); + ENSURE(x[2] == std::make_pair(12u, 14u)); + } + + void test_complement_hand() { + unsigned const M = 10; + auto e = range_predicate::empty(M); + ENSURE((~e).is_top()); + auto t = range_predicate::top(M); + ENSURE((~t).is_empty()); + + // ~([2,3] U [7,8]) = [0,1] U [4,6] U [9,10] + auto a = range_predicate::range(2, 3, M) | range_predicate::range(7, 8, M); + auto na = ~a; + ENSURE(na.num_ranges() == 3); + ENSURE(na[0] == std::make_pair(0u, 1u)); + ENSURE(na[1] == std::make_pair(4u, 6u)); + ENSURE(na[2] == std::make_pair(9u, 10u)); + + // ~([0,4]) = [5,10] + auto b = range_predicate::range(0, 4, M); + auto nb = ~b; + ENSURE(nb.num_ranges() == 1); + ENSURE(nb[0] == std::make_pair(5u, 10u)); + + // ~([5,10]) = [0,4] + auto cnb = ~nb; + ENSURE(cnb == b); + } + + // Exhaustively verify the lattice / de-Morgan laws on a small domain + // by enumerating every possible subset (bitmask). + void test_exhaustive_laws() { + unsigned const M = 5; // 6 characters -> 64 subsets + unsigned const N = 1u << (M + 1); + for (unsigned i = 0; i < N; ++i) { + range_predicate A = from_mask(i, M); + ENSURE(to_mask(A) == i); + // ~ ~ A == A + ENSURE(~~A == A); + // A | ~A == top + ENSURE((A | ~A).is_top()); + // A & ~A == empty + ENSURE((A & ~A).is_empty()); + // cardinality matches popcount + unsigned pop = 0; + for (unsigned k = 0; k <= M; ++k) if ((i >> k) & 1u) ++pop; + ENSURE(A.cardinality() == pop); + } + for (unsigned i = 0; i < N; ++i) { + range_predicate A = from_mask(i, M); + for (unsigned j = 0; j < N; ++j) { + range_predicate B = from_mask(j, M); + // Bitmask reference semantics. + ENSURE(to_mask(A | B) == (i | j)); + ENSURE(to_mask(A & B) == (i & j)); + ENSURE(to_mask(A - B) == (i & ~j & ((1u << (M + 1)) - 1u))); + ENSURE(to_mask(A ^ B) == (i ^ j)); + // de-Morgan + ENSURE(~(A | B) == (~A & ~B)); + ENSURE(~(A & B) == (~A | ~B)); + // Commutativity + ENSURE((A | B) == (B | A)); + ENSURE((A & B) == (B & A)); + // (A - B) == A & ~B + ENSURE((A - B) == (A & ~B)); + // (A ^ B) == (A | B) - (A & B) + ENSURE((A ^ B) == ((A | B) - (A & B))); + // Extensional equality is reflexive on equal masks. + if (i == j) { + ENSURE(A == B); + ENSURE(A.hash() == B.hash()); + } + } + } + } + + void test_total_order_strict() { + unsigned const M = 5; + unsigned const N = 1u << (M + 1); + // Strict total order: for any distinct A, B exactly one of A