diff --git a/src/ast/euf/euf_seq_plugin.cpp b/src/ast/euf/euf_seq_plugin.cpp index 0a025c5c7..ab20120df 100644 --- a/src/ast/euf/euf_seq_plugin.cpp +++ b/src/ast/euf/euf_seq_plugin.cpp @@ -15,8 +15,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-01 Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ diff --git a/src/ast/euf/euf_seq_plugin.h b/src/ast/euf/euf_seq_plugin.h index 8f72a6837..1bbb58630 100644 --- a/src/ast/euf/euf_seq_plugin.h +++ b/src/ast/euf/euf_seq_plugin.h @@ -32,8 +32,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-01 Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp index d12e16d7c..6e8827a79 100644 --- a/src/ast/euf/euf_sgraph.cpp +++ b/src/ast/euf/euf_sgraph.cpp @@ -11,8 +11,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-01 Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ diff --git a/src/ast/euf/euf_sgraph.h b/src/ast/euf/euf_sgraph.h index 89a1c0eee..545f21c38 100644 --- a/src/ast/euf/euf_sgraph.h +++ b/src/ast/euf/euf_sgraph.h @@ -49,8 +49,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-01 Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ diff --git a/src/ast/euf/euf_snode.h b/src/ast/euf/euf_snode.h index e54d276f6..95bf1f64e 100644 --- a/src/ast/euf/euf_snode.h +++ b/src/ast/euf/euf_snode.h @@ -16,8 +16,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-01 Clemens Eisenhofer 2026-03-01 + Nikolaj Bjorner (nbjorner) 2026-03-01 --*/ diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index ea75ecc17..0c3a6342c 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -14,8 +14,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-02 Clemens Eisenhofer 2026-03-02 + Nikolaj Bjorner (nbjorner) 2026-03-02 --*/ @@ -23,7 +23,6 @@ Author: #include "ast/arith_decl_plugin.h" #include "ast/ast_pp.h" #include "math/lp/lar_solver.h" -#include "util/bit_util.h" #include "util/hashtable.h" #include #include @@ -321,149 +320,6 @@ namespace seq { return true; } - // ----------------------------------------------- - // char_set - // ----------------------------------------------- - - unsigned char_set::char_count() const { - unsigned count = 0; - for (auto const& r : m_ranges) - count += r.length(); - return count; - } - - bool char_set::contains(unsigned c) const { - // binary search over sorted non-overlapping ranges - int lo = 0, hi = static_cast(m_ranges.size()) - 1; - while (lo <= hi) { - int mid = lo + (hi - lo) / 2; - if (c < m_ranges[mid].m_lo) - hi = mid - 1; - else if (c >= m_ranges[mid].m_hi) - lo = mid + 1; - else - return true; - } - return false; - } - - void char_set::add(unsigned c) { - if (m_ranges.empty()) { - m_ranges.push_back(char_range(c)); - return; - } - // binary search for insertion point - int lo = 0, hi = static_cast(m_ranges.size()) - 1; - while (lo <= hi) { - int mid = lo + (hi - lo) / 2; - if (c < m_ranges[mid].m_lo) - hi = mid - 1; - else if (c >= m_ranges[mid].m_hi) - lo = mid + 1; - else - return; // already contained - } - // lo is the insertion point - unsigned idx = static_cast(lo); - bool merge_left = idx > 0 && m_ranges[idx - 1].m_hi == c; - bool merge_right = idx < m_ranges.size() && m_ranges[idx].m_lo == c + 1; - if (merge_left && merge_right) { - m_ranges[idx - 1].m_hi = m_ranges[idx].m_hi; - m_ranges.erase(m_ranges.begin() + idx); - } else if (merge_left) { - m_ranges[idx - 1].m_hi = c + 1; - } else if (merge_right) { - m_ranges[idx].m_lo = c; - } else { - // positional insert: shift elements right and place new element - m_ranges.push_back(char_range()); - for (unsigned k = m_ranges.size() - 1; k > idx; --k) - m_ranges[k] = m_ranges[k - 1]; - m_ranges[idx] = char_range(c); - } - } - - void char_set::add(char_set const& other) { - for (auto const& r : other.m_ranges) { - for (unsigned c = r.m_lo; c < r.m_hi; ++c) - add(c); - } - } - - char_set char_set::intersect_with(char_set const& other) const { - char_set result; - unsigned i = 0, j = 0; - while (i < m_ranges.size() && j < other.m_ranges.size()) { - unsigned lo = std::max(m_ranges[i].m_lo, other.m_ranges[j].m_lo); - unsigned hi = std::min(m_ranges[i].m_hi, other.m_ranges[j].m_hi); - if (lo < hi) - result.m_ranges.push_back(char_range(lo, hi)); - if (m_ranges[i].m_hi < other.m_ranges[j].m_hi) - ++i; - else - ++j; - } - return result; - } - - char_set char_set::complement(unsigned max_char) const { - char_set result; - if (m_ranges.empty()) { - result.m_ranges.push_back(char_range(0, max_char + 1)); - return result; - } - unsigned from = 0; - for (auto const& r : m_ranges) { - if (from < r.m_lo) - result.m_ranges.push_back(char_range(from, r.m_lo)); - from = r.m_hi; - } - if (from <= max_char) - result.m_ranges.push_back(char_range(from, max_char + 1)); - return result; - } - - bool char_set::is_disjoint(char_set const& other) const { - unsigned i = 0, j = 0; - while (i < m_ranges.size() && j < other.m_ranges.size()) { - if (m_ranges[i].m_hi <= other.m_ranges[j].m_lo) - ++i; - else if (other.m_ranges[j].m_hi <= m_ranges[i].m_lo) - ++j; - else - return false; - } - return true; - } - - std::ostream& char_set::display(std::ostream& out) const { - if (m_ranges.empty()) { - out << "{}"; - return out; - } - out << "{ "; - bool first = true; - for (auto const& r : m_ranges) { - if (!first) out << ", "; - first = false; - if (r.is_unit()) { - unsigned c = r.m_lo; - if (c >= 'a' && c <= 'z') - out << (char)c; - else if (c >= 'A' && c <= 'Z') - out << (char)c; - else if (c >= '0' && c <= '9') - out << (char)c; - else - out << "#[" << c << "]"; - } else { - out << "[" << r.m_lo << "-" << (r.m_hi - 1) << "]"; - } - } - out << " }"; - return out; - } - // ----------------------------------------------- // nielsen_edge // ----------------------------------------------- @@ -511,7 +367,7 @@ namespace seq { str_eq& eq = m_str_eq[i]; eq.m_lhs = sg.subst(eq.m_lhs, s.m_var, s.m_replacement); eq.m_rhs = sg.subst(eq.m_rhs, s.m_var, s.m_replacement); - eq.m_dep.merge(s.m_dep); + eq.m_dep |= s.m_dep; eq.sort(); } for (unsigned i = 0; i < m_str_mem.size(); ++i) { @@ -519,7 +375,7 @@ namespace seq { mem.m_str = sg.subst(mem.m_str, s.m_var, s.m_replacement); // regex is typically ground, but apply subst for generality mem.m_regex = sg.subst(mem.m_regex, s.m_var, s.m_replacement); - mem.m_dep.merge(s.m_dep); + mem.m_dep |= s.m_dep; } } @@ -654,8 +510,8 @@ namespace seq { void nielsen_graph::add_str_eq(euf::snode* lhs, euf::snode* rhs) { if (!m_root) m_root = mk_node(); - dep_tracker dep(m_root->str_eqs().size() + m_root->str_mems().size() + 1, - m_root->str_eqs().size()); + dep_tracker dep; + dep.insert(m_root->str_eqs().size()); str_eq eq(lhs, rhs, dep); eq.sort(); m_root->add_str_eq(eq); @@ -665,8 +521,8 @@ namespace seq { void nielsen_graph::add_str_mem(euf::snode* str, euf::snode* regex) { if (!m_root) m_root = mk_node(); - dep_tracker dep(m_root->str_eqs().size() + m_root->str_mems().size() + 1, - m_root->str_eqs().size() + m_root->str_mems().size()); + dep_tracker dep; + dep.insert(m_root->str_eqs().size() + m_root->str_mems().size()); euf::snode* history = m_sg.mk_empty(); unsigned id = next_mem_id(); m_root->add_str_mem(str_mem(str, regex, history, id, dep)); @@ -2574,9 +2430,9 @@ namespace seq { if (!n->is_currently_conflict()) continue; for (str_eq const& eq : n->str_eqs()) - deps.merge(eq.m_dep); + deps |= eq.m_dep; for (str_mem const& mem : n->str_mems()) - deps.merge(mem.m_dep); + deps |= mem.m_dep; } } @@ -2584,9 +2440,7 @@ namespace seq { SASSERT(m_root); dep_tracker deps; collect_conflict_deps(deps); - unsigned_vector bits; - deps.get_set_bits(bits); - for (unsigned b : bits) { + for (unsigned b : deps) { if (b < m_num_input_eqs) eq_indices.push_back(b); else diff --git a/src/smt/seq/seq_nielsen.h b/src/smt/seq/seq_nielsen.h index 9711beeea..7b571b108 100644 --- a/src/smt/seq/seq_nielsen.h +++ b/src/smt/seq/seq_nielsen.h @@ -98,10 +98,8 @@ Abstract: ------------------------------------ 1. dep_tracker (DependencyTracker): ZIPT's DependencyTracker is a .NET class using a BitArray-like structure for tracking constraint origins. - Z3's dep_tracker uses a dense bitvector stored as svector - (32-bit words). The merge/is_superset/empty semantics are equivalent, - but the representation is more cache-friendly and avoids managed-heap - allocation. + Z3 uses uint_set (a dense bitvector from util/uint_set.h) for the same + purpose. The |=/subset_of/empty semantics are equivalent. 2. Substitution application (nielsen_node::apply_subst): ZIPT uses an immutable, functional style -- Apply() returns a new constraint if @@ -223,8 +221,8 @@ Abstract: Author: - Nikolaj Bjorner (nbjorner) 2026-03-02 Clemens Eisenhofer 2026-03-02 + Nikolaj Bjorner (nbjorner) 2026-03-02 --*/ @@ -293,96 +291,7 @@ namespace seq { // dependency tracker: bitvector tracking which input constraints // contributed to deriving a given constraint // mirrors ZIPT's DependencyTracker - class dep_tracker { - svector m_bits; - public: - dep_tracker() = default; - explicit dep_tracker(unsigned num_bits); - dep_tracker(unsigned num_bits, unsigned set_bit); - - void merge(dep_tracker const& other); - bool is_superset(dep_tracker const& other) const; - bool empty() const; - - // collect indices of all set bits into 'indices' - void get_set_bits(unsigned_vector& indices) const; - - bool operator==(dep_tracker const& other) const { return m_bits == other.m_bits; } - bool operator!=(dep_tracker const& other) const { return !(*this == other); } - }; - - // ----------------------------------------------- - // character range and set types - // mirrors ZIPT's CharacterRange and CharacterSet - // ----------------------------------------------- - - // half-open character interval [lo, hi) - // mirrors ZIPT's CharacterRange - struct char_range { - unsigned m_lo; - unsigned m_hi; // exclusive - - char_range(): m_lo(0), m_hi(0) {} - char_range(unsigned c): m_lo(c), m_hi(c + 1) {} - char_range(unsigned lo, unsigned hi): m_lo(lo), m_hi(hi) { SASSERT(lo <= hi); } - - bool is_empty() const { return m_lo == m_hi; } - bool is_unit() const { return m_hi == m_lo + 1; } - unsigned length() const { return m_hi - m_lo; } - bool contains(unsigned c) const { return c >= m_lo && c < m_hi; } - - bool operator==(char_range const& o) const { return m_lo == o.m_lo && m_hi == o.m_hi; } - bool operator<(char_range const& o) const { - return m_lo < o.m_lo || (m_lo == o.m_lo && m_hi < o.m_hi); - } - }; - - // sorted list of non-overlapping character intervals - // mirrors ZIPT's CharacterSet - class char_set { - svector m_ranges; - public: - char_set() = default; - explicit char_set(char_range const& r) { if (!r.is_empty()) m_ranges.push_back(r); } - - static char_set full(unsigned max_char) { return char_set(char_range(0, max_char + 1)); } - - bool is_empty() const { return m_ranges.empty(); } - bool is_full(unsigned max_char) const { - return m_ranges.size() == 1 && m_ranges[0].m_lo == 0 && m_ranges[0].m_hi == max_char + 1; - } - bool is_unit() const { return m_ranges.size() == 1 && m_ranges[0].is_unit(); } - unsigned first_char() const { SASSERT(!is_empty()); return m_ranges[0].m_lo; } - - svector const& ranges() const { return m_ranges; } - - // total number of characters in the set - unsigned char_count() const; - - // membership test via binary search - bool contains(unsigned c) const; - - // add a single character - void add(unsigned c); - - // union with another char_set - void add(char_set const& other); - - // intersect with another char_set, returns the result - char_set intersect_with(char_set const& other) const; - - // complement relative to [0, max_char] - char_set complement(unsigned max_char) const; - - // check if two sets are disjoint - bool is_disjoint(char_set const& other) const; - - bool operator==(char_set const& other) const { return m_ranges == other.m_ranges; } - - char_set clone() const { char_set r; r.m_ranges = m_ranges; return r; } - - std::ostream& display(std::ostream& out) const; - }; + using dep_tracker = uint_set; // ----------------------------------------------- // character-level substitution diff --git a/src/smt/theory_nseq.cpp b/src/smt/theory_nseq.cpp index e231ea82a..5e11fe7cf 100644 --- a/src/smt/theory_nseq.cpp +++ b/src/smt/theory_nseq.cpp @@ -450,10 +450,8 @@ namespace smt { void theory_nseq::deps_to_lits(seq::dep_tracker const& deps, enode_pair_vector& eqs, literal_vector& lits) { context& ctx = get_context(); - unsigned_vector bits; - deps.get_set_bits(bits); unsigned num_input_eqs = m_nielsen.num_input_eqs(); - for (unsigned b : bits) { + for (unsigned b : deps) { if (b < num_input_eqs) { eq_source const& src = m_state.get_eq_source(b); if (src.m_n1->get_root() == src.m_n2->get_root()) diff --git a/src/test/seq_nielsen.cpp b/src/test/seq_nielsen.cpp index 1a6321ab2..92d5bb72e 100644 --- a/src/test/seq_nielsen.cpp +++ b/src/test/seq_nielsen.cpp @@ -22,7 +22,7 @@ Abstract: #include "ast/ast_pp.h" #include -// test dep_tracker basic operations +// test dep_tracker (uint_set) basic operations static void test_dep_tracker() { std::cout << "test_dep_tracker\n"; @@ -31,23 +31,26 @@ static void test_dep_tracker() { SASSERT(d0.empty()); // tracker with one bit set - seq::dep_tracker d1(8, 3); + seq::dep_tracker d1; + d1.insert(3); SASSERT(!d1.empty()); // tracker with another bit - seq::dep_tracker d2(8, 5); + seq::dep_tracker d2; + d2.insert(5); SASSERT(!d2.empty()); // merge seq::dep_tracker d3 = d1; - d3.merge(d2); + d3 |= d2; SASSERT(!d3.empty()); - SASSERT(d3.is_superset(d1)); - SASSERT(d3.is_superset(d2)); - SASSERT(!d1.is_superset(d2)); + SASSERT(d1.subset_of(d3)); + SASSERT(d2.subset_of(d3)); + SASSERT(!d2.subset_of(d1)); // equality - seq::dep_tracker d4(8, 3); + seq::dep_tracker d4; + d4.insert(3); SASSERT(d1 == d4); SASSERT(d1 != d2); } @@ -65,7 +68,7 @@ static void test_str_eq() { euf::snode* a = sg.mk_char('A'); euf::snode* e = sg.mk_empty(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); // basic equality seq::str_eq eq1(x, y, dep); @@ -112,7 +115,7 @@ static void test_str_mem() { expr_ref star_fc(seq.re.mk_full_seq(str_sort), m); euf::snode* regex = sg.mk(star_fc); - seq::dep_tracker dep(4, 1); + seq::dep_tracker dep; dep.insert(1); seq::str_mem mem(x, regex, e, 0, dep); // x in regex is primitive (x is a single variable) @@ -1322,37 +1325,40 @@ static void test_solve_conflict_deps() { SASSERT(!deps.empty()); } -// test dep_tracker::get_set_bits +// test dep_tracker (uint_set) iteration static void test_dep_tracker_get_set_bits() { std::cout << "test_dep_tracker_get_set_bits\n"; // empty tracker has no bits seq::dep_tracker d0; unsigned_vector bits0; - d0.get_set_bits(bits0); + for (unsigned b : d0) bits0.push_back(b); SASSERT(bits0.empty()); // single bit set at position 5 - seq::dep_tracker d1(16, 5); + seq::dep_tracker d1; + d1.insert(5); unsigned_vector bits1; - d1.get_set_bits(bits1); + for (unsigned b : d1) bits1.push_back(b); SASSERT(bits1.size() == 1); SASSERT(bits1[0] == 5); // two bits merged - seq::dep_tracker d2(16, 3); - d2.merge(seq::dep_tracker(16, 11)); + seq::dep_tracker d2; + d2.insert(3); + d2.insert(11); unsigned_vector bits2; - d2.get_set_bits(bits2); + for (unsigned b : d2) bits2.push_back(b); SASSERT(bits2.size() == 2); SASSERT(bits2[0] == 3); SASSERT(bits2[1] == 11); // test across word boundary (bit 31 and 32) - seq::dep_tracker d3(64, 31); - d3.merge(seq::dep_tracker(64, 32)); + seq::dep_tracker d3; + d3.insert(31); + d3.insert(32); unsigned_vector bits3; - d3.get_set_bits(bits3); + for (unsigned b : d3) bits3.push_back(b); SASSERT(bits3.size() == 2); SASSERT(bits3[0] == 31); SASSERT(bits3[1] == 32); @@ -1543,7 +1549,7 @@ static void test_simplify_prefix_cancel() { euf::snode* aby = sg.mk_concat(a, sg.mk_concat(b, y)); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(abx, aby, dep)); auto sr = node->simplify_and_init(ng); @@ -1573,7 +1579,7 @@ static void test_simplify_symbol_clash() { euf::snode* by = sg.mk_concat(b, y); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(ax, by, dep)); auto sr = node->simplify_and_init(ng); @@ -1598,7 +1604,7 @@ static void test_simplify_empty_propagation() { // ε = x·y → forces x=ε, y=ε → all trivial → satisfied seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(e, xy, dep)); auto sr = node->simplify_and_init(ng); @@ -1619,7 +1625,7 @@ static void test_simplify_empty_vs_char() { // ε = A → rhs has non-variable token → conflict seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(e, a, dep)); auto sr = node->simplify_and_init(ng); @@ -1645,7 +1651,7 @@ static void test_simplify_multi_pass_clash() { euf::snode* ac = sg.mk_concat(a, c); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(ab, ac, dep)); auto sr = node->simplify_and_init(ng); @@ -1667,7 +1673,7 @@ static void test_simplify_trivial_removal() { euf::snode* e = sg.mk_empty(); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(e, e, dep)); // trivial node->add_str_eq(seq::str_eq(x, y, dep)); // non-trivial @@ -1689,7 +1695,7 @@ static void test_simplify_all_trivial_satisfied() { euf::snode* e = sg.mk_empty(); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_eq(seq::str_eq(x, x, dep)); // trivial: same pointer node->add_str_eq(seq::str_eq(e, e, dep)); // trivial: both empty @@ -1716,7 +1722,7 @@ static void test_simplify_regex_infeasible() { // ε ∈ to_re("A") → non-nullable → conflict seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_mem(seq::str_mem(e, regex, e, 0, dep)); auto sr = node->simplify_and_init(ng); @@ -1744,7 +1750,7 @@ static void test_simplify_nullable_removal() { // ε ∈ star(to_re("A")) → nullable → satisfied, mem removed seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_mem(seq::str_mem(e, regex, e, 0, dep)); auto sr = node->simplify_and_init(ng); @@ -1772,7 +1778,7 @@ static void test_simplify_brzozowski_sat() { // "A" ∈ to_re("A") → derivative consumes 'A' → ε ∈ ε-regex → satisfied seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(4, 0); + seq::dep_tracker dep; dep.insert(0); node->add_str_mem(seq::str_mem(a, regex, e, 0, dep)); auto sr = node->simplify_and_init(ng); @@ -1795,7 +1801,7 @@ static void test_simplify_multiple_eqs() { euf::snode* e = sg.mk_empty(); seq::nielsen_node* node = ng.mk_node(); - seq::dep_tracker dep(8, 0); + seq::dep_tracker dep; dep.insert(0); // eq1: ε = ε (trivial → removed) node->add_str_eq(seq::str_eq(e, e, dep)); @@ -2234,7 +2240,7 @@ static void test_length_constraints_deps() { for (auto const& c : constraints) { SASSERT(!c.m_dep.empty()); unsigned_vector bits; - c.m_dep.get_set_bits(bits); + for (unsigned b : c.m_dep) bits.push_back(b); SASSERT(bits.size() == 1); SASSERT(bits[0] == 0); } diff --git a/src/util/zstring.cpp b/src/util/zstring.cpp index f60b8d946..fa6dde45a 100644 --- a/src/util/zstring.cpp +++ b/src/util/zstring.cpp @@ -286,3 +286,146 @@ bool operator<(const zstring& lhs, const zstring& rhs) { // so decide based on the relative lengths return lhs.length() < rhs.length(); } + +// ----------------------------------------------- +// char_set +// ----------------------------------------------- + +unsigned char_set::char_count() const { + unsigned count = 0; + for (auto const& r : m_ranges) + count += r.length(); + return count; +} + +bool char_set::contains(unsigned c) const { + // binary search over sorted non-overlapping ranges + int lo = 0, hi = static_cast(m_ranges.size()) - 1; + while (lo <= hi) { + int mid = lo + (hi - lo) / 2; + if (c < m_ranges[mid].m_lo) + hi = mid - 1; + else if (c >= m_ranges[mid].m_hi) + lo = mid + 1; + else + return true; + } + return false; +} + +void char_set::add(unsigned c) { + if (m_ranges.empty()) { + m_ranges.push_back(char_range(c)); + return; + } + // binary search for insertion point + int lo = 0, hi = static_cast(m_ranges.size()) - 1; + while (lo <= hi) { + int mid = lo + (hi - lo) / 2; + if (c < m_ranges[mid].m_lo) + hi = mid - 1; + else if (c >= m_ranges[mid].m_hi) + lo = mid + 1; + else + return; // already contained + } + // lo is the insertion point + unsigned idx = static_cast(lo); + bool merge_left = idx > 0 && m_ranges[idx - 1].m_hi == c; + bool merge_right = idx < m_ranges.size() && m_ranges[idx].m_lo == c + 1; + if (merge_left && merge_right) { + m_ranges[idx - 1].m_hi = m_ranges[idx].m_hi; + m_ranges.erase(m_ranges.begin() + idx); + } else if (merge_left) { + m_ranges[idx - 1].m_hi = c + 1; + } else if (merge_right) { + m_ranges[idx].m_lo = c; + } else { + // positional insert: shift elements right and place new element + m_ranges.push_back(char_range()); + for (unsigned k = m_ranges.size() - 1; k > idx; --k) + m_ranges[k] = m_ranges[k - 1]; + m_ranges[idx] = char_range(c); + } +} + +void char_set::add(char_set const& other) { + for (auto const& r : other.m_ranges) { + for (unsigned c = r.m_lo; c < r.m_hi; ++c) + add(c); + } +} + +char_set char_set::intersect_with(char_set const& other) const { + char_set result; + unsigned i = 0, j = 0; + while (i < m_ranges.size() && j < other.m_ranges.size()) { + unsigned lo = std::max(m_ranges[i].m_lo, other.m_ranges[j].m_lo); + unsigned hi = std::min(m_ranges[i].m_hi, other.m_ranges[j].m_hi); + if (lo < hi) + result.m_ranges.push_back(char_range(lo, hi)); + if (m_ranges[i].m_hi < other.m_ranges[j].m_hi) + ++i; + else + ++j; + } + return result; +} + +char_set char_set::complement(unsigned max_char) const { + char_set result; + if (m_ranges.empty()) { + result.m_ranges.push_back(char_range(0, max_char + 1)); + return result; + } + unsigned from = 0; + for (auto const& r : m_ranges) { + if (from < r.m_lo) + result.m_ranges.push_back(char_range(from, r.m_lo)); + from = r.m_hi; + } + if (from <= max_char) + result.m_ranges.push_back(char_range(from, max_char + 1)); + return result; +} + +bool char_set::is_disjoint(char_set const& other) const { + unsigned i = 0, j = 0; + while (i < m_ranges.size() && j < other.m_ranges.size()) { + if (m_ranges[i].m_hi <= other.m_ranges[j].m_lo) + ++i; + else if (other.m_ranges[j].m_hi <= m_ranges[i].m_lo) + ++j; + else + return false; + } + return true; +} + +std::ostream& char_set::display(std::ostream& out) const { + if (m_ranges.empty()) { + out << "{}"; + return out; + } + out << "{ "; + bool first = true; + for (auto const& r : m_ranges) { + if (!first) out << ", "; + first = false; + if (r.is_unit()) { + unsigned c = r.m_lo; + if (c >= 'a' && c <= 'z') + out << (char)c; + else if (c >= 'A' && c <= 'Z') + out << (char)c; + else if (c >= '0' && c <= '9') + out << (char)c; + else + out << "#[" << c << "]"; + } else { + out << "[" << r.m_lo << "-" << (r.m_hi - 1) << "]"; + } + } + out << " }"; + return out; +} diff --git a/src/util/zstring.h b/src/util/zstring.h index 44dfcd36c..f7df66455 100644 --- a/src/util/zstring.h +++ b/src/util/zstring.h @@ -20,6 +20,7 @@ Author: #include #include "util/buffer.h" #include "util/rational.h" +#include "util/vector.h" enum class string_encoding { ascii, // exactly 8 bits @@ -93,3 +94,71 @@ public: friend std::ostream& operator<<(std::ostream &os, const zstring &str); friend bool operator<(const zstring& lhs, const zstring& rhs); }; + +// half-open character interval [lo, hi) +// mirrors ZIPT's CharacterRange +struct char_range { + unsigned m_lo; + unsigned m_hi; // exclusive + + char_range(): m_lo(0), m_hi(0) {} + char_range(unsigned c): m_lo(c), m_hi(c + 1) {} + char_range(unsigned lo, unsigned hi): m_lo(lo), m_hi(hi) { SASSERT(lo <= hi); } + + bool is_empty() const { return m_lo == m_hi; } + bool is_unit() const { return m_hi == m_lo + 1; } + unsigned length() const { return m_hi - m_lo; } + bool contains(unsigned c) const { return c >= m_lo && c < m_hi; } + + bool operator==(char_range const& o) const { return m_lo == o.m_lo && m_hi == o.m_hi; } + bool operator<(char_range const& o) const { + return m_lo < o.m_lo || (m_lo == o.m_lo && m_hi < o.m_hi); + } +}; + +// sorted list of non-overlapping character intervals +// mirrors ZIPT's CharacterSet +class char_set { + svector m_ranges; +public: + char_set() = default; + explicit char_set(char_range const& r) { if (!r.is_empty()) m_ranges.push_back(r); } + + static char_set full(unsigned max_char) { return char_set(char_range(0, max_char + 1)); } + + bool is_empty() const { return m_ranges.empty(); } + bool is_full(unsigned max_char) const { + return m_ranges.size() == 1 && m_ranges[0].m_lo == 0 && m_ranges[0].m_hi == max_char + 1; + } + bool is_unit() const { return m_ranges.size() == 1 && m_ranges[0].is_unit(); } + unsigned first_char() const { SASSERT(!is_empty()); return m_ranges[0].m_lo; } + + svector const& ranges() const { return m_ranges; } + + // total number of characters in the set + unsigned char_count() const; + + // membership test via binary search + bool contains(unsigned c) const; + + // add a single character + void add(unsigned c); + + // union with another char_set + void add(char_set const& other); + + // intersect with another char_set, returns the result + char_set intersect_with(char_set const& other) const; + + // complement relative to [0, max_char] + char_set complement(unsigned max_char) const; + + // check if two sets are disjoint + bool is_disjoint(char_set const& other) const; + + bool operator==(char_set const& other) const { return m_ranges == other.m_ranges; } + + char_set clone() const { char_set r; r.m_ranges = m_ranges; return r; } + + std::ostream& display(std::ostream& out) const; +};