diff --git a/src/ast/CMakeLists.txt b/src/ast/CMakeLists.txt index 6a50c3b05..14a522313 100644 --- a/src/ast/CMakeLists.txt +++ b/src/ast/CMakeLists.txt @@ -43,6 +43,7 @@ z3_add_component(ast polymorphism_util.cpp pp.cpp quantifier_stat.cpp + range_predicate.cpp recfun_decl_plugin.cpp reg_decl_plugins.cpp seq_decl_plugin.cpp diff --git a/src/ast/rewriter/range_predicate.cpp b/src/ast/range_predicate.cpp similarity index 99% rename from src/ast/rewriter/range_predicate.cpp rename to src/ast/range_predicate.cpp index 53b31a84a..e68ddff5f 100644 --- a/src/ast/rewriter/range_predicate.cpp +++ b/src/ast/range_predicate.cpp @@ -21,7 +21,7 @@ Authors: --*/ -#include "ast/rewriter/range_predicate.h" +#include "ast/range_predicate.h" #include "util/debug.h" #include #include diff --git a/src/ast/rewriter/range_predicate.h b/src/ast/range_predicate.h similarity index 100% rename from src/ast/rewriter/range_predicate.h rename to src/ast/range_predicate.h diff --git a/src/ast/rewriter/CMakeLists.txt b/src/ast/rewriter/CMakeLists.txt index 0956fef48..cfcc179bc 100644 --- a/src/ast/rewriter/CMakeLists.txt +++ b/src/ast/rewriter/CMakeLists.txt @@ -35,7 +35,6 @@ z3_add_component(rewriter pb2bv_rewriter.cpp push_app_ite.cpp quant_hoist.cpp - range_predicate.cpp recfun_rewriter.cpp rewriter.cpp seq_axioms.cpp diff --git a/src/ast/rewriter/seq_rewriter.cpp b/src/ast/rewriter/seq_rewriter.cpp index ac0261f61..c8b68bad2 100644 --- a/src/ast/rewriter/seq_rewriter.cpp +++ b/src/ast/rewriter/seq_rewriter.cpp @@ -2659,7 +2659,8 @@ expr_ref seq_rewriter::is_nullable_rec(expr* r) { else if (re().is_full_char(r) || re().is_empty(r) || re().is_of_pred(r) || - re().is_range(r)) { + re().is_range(r) || + re().is_charclass(r)) { result = m().mk_false(); } else if (re().is_plus(r, r1) || @@ -3084,6 +3085,47 @@ void seq_rewriter::mk_antimirov_deriv_rec(expr* e, expr* r, expr* path, expr_ref else result = re().mk_ite_simplify(c, mk_antimirov_deriv(e, r1, c1), mk_antimirov_deriv(e, r2, c2)); } + else if (re().is_charclass(r)) { + // CHARCLASS = disjoint union of ranges [lo_0,hi_0] | ... | [lo_{N-1},hi_{N-1}]. + // D(e, CHARCLASS) = ite(P(e), epsilon, []) where P(e) is the disjunction + // of (lo_i <= e <= hi_i). We build a chain of nested ITEs, one per range, + // so the result is in antimirov BDD normal form. + app const* a = to_app(r); + func_decl* d = a->get_decl(); + unsigned np = d->get_num_parameters(); + if (np == 0) { + result = nothing(); + } + else { + unsigned mx = u().max_char(); + result = nothing(); + // Iterate ranges in reverse so the first range becomes the outermost ITE. + for (unsigned k = np; k >= 2; k -= 2) { + unsigned i = k - 2; + unsigned lo_p = static_cast(d->get_parameter(i).get_int()); + unsigned hi_p = static_cast(d->get_parameter(i + 1).get_int()); + if (lo_p == 0 && hi_p == mx) { + // Range covers the entire char domain: derivative is unconditionally epsilon. + result = epsilon(); + continue; + } + expr_ref lo_c(m_util.mk_char(lo_p), m()); + expr_ref hi_c(m_util.mk_char(hi_p), m()); + expr_ref cond(m()); + if (lo_p == 0) + cond = u().mk_le(e, hi_c); + else if (hi_p == mx) + cond = u().mk_le(lo_c, e); + else + cond = m().mk_and(u().mk_le(lo_c, e), u().mk_le(e, hi_c)); + expr_ref range(simplify_path(e, cond), m()); + expr_ref psi(simplify_path(e, m().mk_and(path, range)), m()); + if (m().is_false(psi)) + continue; + result = re().mk_ite_simplify(range, epsilon(), result); + } + } + } else if (re().is_range(r, r1, r2)) { expr_ref range(m()); expr_ref psi(m().mk_false(), m()); @@ -3318,6 +3360,15 @@ expr_ref seq_rewriter::mk_regex_union_normalize(expr* r1, expr* r2) { SASSERT(m_util.is_re(r1)); SASSERT(m_util.is_re(r2)); expr_ref result(m()); + // Char-class fast path: combine in O(1) via range_predicate when both + // operands are equivalent to a single character class. + seq::range_predicate pa, pb; + if (re().as_charclass(r1, pa) && re().as_charclass(r2, pb)) { + sort* seq_sort = nullptr; + VERIFY(u().is_re(r1, seq_sort)); + result = re().mk_charclass(pa | pb, seq_sort); + return result; + } std::function test = [&](expr* t, expr*& a, expr*& b) { return re().is_union(t, a, b); }; std::function compose = [&](expr* r1, expr* r2) { return (is_subset(r1, r2) ? r2 : (is_subset(r2, r1) ? r1 : re().mk_union(r1, r2))); }; std::function is_complement = [&](expr *a, expr *b) { @@ -3355,6 +3406,13 @@ expr_ref seq_rewriter::mk_regex_inter_normalize(expr* r1, expr* r2) { SASSERT(m_util.is_re(r1)); SASSERT(m_util.is_re(r2)); expr_ref result(m()); + seq::range_predicate pa, pb; + if (re().as_charclass(r1, pa) && re().as_charclass(r2, pb)) { + sort* seq_sort = nullptr; + VERIFY(u().is_re(r1, seq_sort)); + result = re().mk_charclass(pa & pb, seq_sort); + return result; + } if (re().is_epsilon(r2)) std::swap(r1, r2); std::function test = [&](expr* t, expr*& a, expr*& b) { return re().is_intersection(t, a, b); }; @@ -4165,8 +4223,50 @@ expr_ref seq_rewriter::mk_derivative_rec(expr* ele, expr* r) { } } } + else if (re().is_charclass(r)) { + // Derivative of a char-class cc(P) = ite(P(ele), epsilon, []). + // We build it in transition-regex form: a disjunction over each + // range (lo, hi) of mk_der_inter(le(lo, ele), le(ele, hi)), with + // mk_der_cond normalization on each leaf so the result is a clean + // t-regex of ITE / and / or over single-character constraints. + app const* a = to_app(r); + func_decl* d = a->get_decl(); + unsigned np = d->get_num_parameters(); + if (np == 0) + return mk_empty(); + expr_ref acc(m()); + unsigned mx = u().max_char(); + for (unsigned i = 0; i + 1 < np; i += 2) { + unsigned lo_p = static_cast(d->get_parameter(i).get_int()); + unsigned hi_p = static_cast(d->get_parameter(i + 1).get_int()); + expr_ref leaf(m()); + if (lo_p == 0 && hi_p == mx) { + leaf = expr_ref(re().mk_to_re(str().mk_empty(seq_sort)), m()); + } + else { + expr_ref p1(m()), p2(m()); + if (lo_p > 0) { + p1 = u().mk_le(m_util.mk_char(lo_p), ele); + p1 = mk_der_cond(p1, ele, seq_sort); + } + if (hi_p < mx) { + p2 = u().mk_le(ele, m_util.mk_char(hi_p)); + p2 = mk_der_cond(p2, ele, seq_sort); + } + if (!p1 && !p2) + leaf = expr_ref(re().mk_to_re(str().mk_empty(seq_sort)), m()); + else if (!p1) + leaf = p2; + else if (!p2) + leaf = p1; + else + leaf = mk_der_inter(p1, p2); + } + acc = i == 0 ? leaf : mk_der_union(acc, leaf); + } + return acc; + } else if (re().is_range(r, r1, r2)) { - // r1, r2 are sequences. zstring s1, s2; if (str().is_string(r1, s1) && str().is_string(r2, s2)) { if (s1.length() == 1 && s2.length() == 1) { @@ -4763,6 +4863,16 @@ br_status seq_rewriter::mk_re_union0(expr* a, expr* b, expr_ref& result) { result = b; return BR_DONE; } + // O(1) char-class collapse: if both operands are equivalent to a + // char-class (CHARCLASS, full_char, empty-char, range with concrete + // bounds), return the union as a single CHARCLASS node. + seq::range_predicate pa, pb; + if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) { + sort* seq_sort = nullptr; + VERIFY(u().is_re(a, seq_sort)); + result = re().mk_charclass(pa | pb, seq_sort); + return BR_DONE; + } return BR_FAILED; } @@ -4830,6 +4940,13 @@ br_status seq_rewriter::mk_re_inter0(expr* a, expr* b, expr_ref& result) { result = a; return BR_DONE; } + seq::range_predicate pa, pb; + if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) { + sort* seq_sort = nullptr; + VERIFY(u().is_re(a, seq_sort)); + result = re().mk_charclass(pa & pb, seq_sort); + return BR_DONE; + } return BR_FAILED; } @@ -4840,6 +4957,13 @@ br_status seq_rewriter::mk_re_inter(expr* a, expr* b, expr_ref& result) { } br_status seq_rewriter::mk_re_diff(expr* a, expr* b, expr_ref& result) { + seq::range_predicate pa, pb; + if (re().as_charclass(a, pa) && re().as_charclass(b, pb)) { + sort* seq_sort = nullptr; + VERIFY(u().is_re(a, seq_sort)); + result = re().mk_charclass(pa - pb, seq_sort); + return BR_DONE; + } result = mk_regex_inter_normalize(a, re().mk_complement(b)); return BR_REWRITE2; } @@ -5103,6 +5227,19 @@ br_status seq_rewriter::mk_re_range(expr* lo, expr* hi, expr_ref& result) { return BR_DONE; } + // Lift concrete-bound ranges into a single canonical CHARCLASS node so + // that boolean combinations with other char-class regexes collapse in + // O(1) via the smart constructors below. Only fires when the element + // sort is Char (i.e. the standard string regex domain). + sort* ele_sort = nullptr; + if (slo.length() == 1 && shi.length() == 1 && slo[0] <= shi[0] + && u().is_seq(lo->get_sort(), ele_sort) && u().is_char(ele_sort)) { + unsigned mx = u().max_char(); + seq::range_predicate p = seq::range_predicate::range(slo[0], shi[0], mx); + result = re().mk_charclass(p, lo->get_sort()); + return BR_DONE; + } + return BR_FAILED; } diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp index 59f5d046c..b50616439 100644 --- a/src/ast/seq_decl_plugin.cpp +++ b/src/ast/seq_decl_plugin.cpp @@ -240,6 +240,7 @@ void seq_decl_plugin::init() { m_sigs[OP_RE_OF_PRED] = alloc(psig, m, "re.of.pred", 1, 1, &predA, reA); m_sigs[OP_RE_REVERSE] = alloc(psig, m, "re.reverse", 1, 1, &reA, reA); m_sigs[OP_RE_DERIVATIVE] = alloc(psig, m, "re.derivative", 1, 2, AreA, reA); + m_sigs[OP_RE_CHARCLASS] = alloc(psig, m, "re.charclass", 1, 0, nullptr, reA); m_sigs[_OP_RE_ANTIMIROV_UNION] = alloc(psig, m, "re.union", 1, 2, reAreA, reA); m_sigs[OP_SEQ_TO_RE] = alloc(psig, m, "seq.to.re", 1, 1, &seqA, reA); m_sigs[OP_SEQ_IN_RE] = alloc(psig, m, "seq.in.re", 1, 2, seqAreA, boolT); @@ -497,6 +498,29 @@ func_decl* seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, p } m.raise_exception("Incorrect arguments used for re.^. Expected one non-negative integer parameter"); + case OP_RE_CHARCLASS: { + m_has_re = true; + if (arity != 0) + m.raise_exception("re.charclass takes no arguments"); + if ((num_parameters % 2) != 0) + m.raise_exception("re.charclass expects an even number of int parameters (lo, hi pairs)"); + unsigned mx = get_char_plugin().max_char(); + int prev_hi = -1; + for (unsigned i = 0; i + 1 < num_parameters; i += 2) { + if (!parameters[i].is_int() || !parameters[i + 1].is_int()) + m.raise_exception("re.charclass parameters must be integers"); + int lo = parameters[i].get_int(); + int hi = parameters[i + 1].get_int(); + if (lo < 0 || hi < 0 || lo > hi || static_cast(hi) > mx) + m.raise_exception("re.charclass range out of character domain"); + if (lo <= prev_hi + 1) + m.raise_exception("re.charclass ranges must be sorted, disjoint, and non-adjacent"); + prev_hi = hi; + } + if (!range) range = mk_reglan(); + return m.mk_func_decl(m_sigs[k]->m_name, arity, domain, range, + func_decl_info(m_family_id, k, num_parameters, parameters)); + } case OP_STRING_CONST: if (!(num_parameters == 1 && arity == 0 && parameters[0].is_zstring())) { m.raise_exception("invalid string declaration"); @@ -1140,6 +1164,74 @@ bool seq_util::rex::is_range(expr const* n, unsigned& lo, unsigned& hi) const { return true; } +app* seq_util::rex::mk_charclass(seq::range_predicate const& p, sort* seq_sort) { + sort* re_sort = mk_re(seq_sort); + if (p.is_empty()) + return mk_empty(re_sort); + unsigned n = p.num_ranges(); + vector params; + for (unsigned i = 0; i < n; ++i) { + auto r = p[i]; + params.push_back(parameter(static_cast(r.first))); + params.push_back(parameter(static_cast(r.second))); + } + return m.mk_app(m_fid, OP_RE_CHARCLASS, params.size(), params.data(), 0, nullptr, re_sort); +} + +bool seq_util::rex::is_charclass(expr const* n, seq::range_predicate& p) const { + if (!is_charclass(n)) + return false; + app const* a = to_app(n); + func_decl* d = a->get_decl(); + unsigned np = d->get_num_parameters(); + p = seq::range_predicate::empty(u.max_char()); + unsigned mx = u.max_char(); + for (unsigned i = 0; i + 1 < np; i += 2) { + unsigned lo = static_cast(d->get_parameter(i).get_int()); + unsigned hi = static_cast(d->get_parameter(i + 1).get_int()); + p = p | seq::range_predicate::range(lo, hi, mx); + } + return true; +} + +bool seq_util::rex::as_charclass(expr* r, seq::range_predicate& p) const { + unsigned mx = u.max_char(); + if (is_charclass(r, p)) + return true; + if (is_full_char(r)) { + p = seq::range_predicate::top(mx); + return true; + } + if (is_empty(r)) { + sort* seq_sort = nullptr; + sort* ele_sort = nullptr; + if (u.is_re(r, seq_sort) && u.is_seq(seq_sort, ele_sort) && u.is_char(ele_sort)) { + p = seq::range_predicate::empty(mx); + return true; + } + return false; + } + expr* e1 = nullptr, *e2 = nullptr; + if (is_range(r, e1, e2)) { + zstring s1, s2; + if (u.str.is_string(e1, s1) && u.str.is_string(e2, s2) + && s1.length() == 1 && s2.length() == 1 && s1[0] <= s2[0]) { + p = seq::range_predicate::range(s1[0], s2[0], mx); + return true; + } + unsigned ch1 = 0, ch2 = 0; + expr* c1 = nullptr, *c2 = nullptr; + if (u.str.is_unit(e1, c1) && u.str.is_unit(e2, c2) + && u.is_const_char(c1, ch1) && u.is_const_char(c2, ch2) + && ch1 <= ch2) { + p = seq::range_predicate::range(ch1, ch2, mx); + return true; + } + return false; + } + return false; +} + sort* seq_util::rex::to_seq(sort* re) { (void)u; @@ -1347,7 +1439,7 @@ std::ostream& seq_util::rex::pp::print_range(std::ostream& out, expr* s1, expr* */ bool seq_util::rex::pp::can_skip_parenth(expr* r) const { expr* s; - return ((re.is_to_re(r, s) && re.u.str.is_unit(s)) || re.is_range(r) || re.is_empty(r) || re.is_epsilon(r) || re.is_full_char(r)); + return ((re.is_to_re(r, s) && re.u.str.is_unit(s)) || re.is_range(r) || re.is_empty(r) || re.is_epsilon(r) || re.is_full_char(r) || re.is_charclass(r)); } /* @@ -1443,6 +1535,26 @@ std::ostream& seq_util::rex::pp::print(std::ostream& out, expr* e) const { print(out, r2); out << ")"; } + else if (re.is_charclass(e)) { + app const* a = to_app(e); + func_decl* d = a->get_decl(); + unsigned np = d->get_num_parameters(); + if (np == 0) { + out << (html_encode ? "∅" : "[]"); + } + else { + out << "["; + for (unsigned i = 0; i + 1 < np; i += 2) { + int lo_p = d->get_parameter(i).get_int(); + int hi_p = d->get_parameter(i + 1).get_int(); + if (lo_p == hi_p) + out << "\\x" << std::hex << lo_p << std::dec; + else + out << "\\x" << std::hex << lo_p << "-\\x" << hi_p << std::dec; + } + out << "]"; + } + } else if (re.is_intersection(e, r1, r2)) { out << "("; print(out, r1); @@ -1674,6 +1786,7 @@ seq_util::rex::info seq_util::rex::mk_info_rec(app* e) const { case OP_RE_RANGE: case OP_RE_FULL_CHAR_SET: case OP_RE_OF_PRED: + case OP_RE_CHARCLASS: //TBD: check if the character predicate contains uninterpreted symbols or is nonground or is unsat //TBD: check if the range is unsat return info(true, l_false, 1, false); diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h index 75995ef7b..9b2bc4c2b 100644 --- a/src/ast/seq_decl_plugin.h +++ b/src/ast/seq_decl_plugin.h @@ -24,6 +24,7 @@ Revision History: #include "ast/ast.h" #include "ast/char_decl_plugin.h" +#include "ast/range_predicate.h" #include "util/lbool.h" #include "util/zstring.h" @@ -78,6 +79,13 @@ enum seq_op_kind { OP_RE_OF_PRED, OP_RE_REVERSE, OP_RE_DERIVATIVE, // Char -> RegEx -> RegEx + OP_RE_CHARCLASS, // 2N int parameters (lo_0, hi_0, ..., lo_{N-1}, hi_{N-1}) -> RegEx + // Canonical char-class regex: union of disjoint, sorted, non-adjacent + // ranges [lo_i, hi_i] over the character domain. N = 0 denotes the + // empty regex; the predicate covering the full domain is also + // representable. Equivalent in language to the union of re.range + // nodes, but a single hash-consed AST node enabling O(1) boolean + // combinations on regex char-classes. // string specific operators. @@ -544,6 +552,13 @@ public: app* mk_derivative(expr* ele, expr* r) { return m.mk_app(m_fid, OP_RE_DERIVATIVE, ele, r); } app* mk_antimirov_union(expr* r1, expr* r2) { return m.mk_app(m_fid, _OP_RE_ANTIMIROV_UNION, r1, r2); } + // Construct a canonical char-class regex over the character domain. + // The seq_sort argument identifies the element sort of the regex + // (must be (Seq Char)). The predicate is encoded as 2*N int + // parameters [lo_0, hi_0, ..., lo_{N-1}, hi_{N-1}]. An empty + // predicate produces re.none of the same sort. + app* mk_charclass(seq::range_predicate const& p, sort* seq_sort); + bool is_to_re(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_TO_RE); } bool is_concat(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CONCAT); } bool is_union(expr const* n) const { return is_app_of(n, m_fid, OP_RE_UNION); } @@ -577,6 +592,16 @@ public: bool is_reverse(expr const* n) const { return is_app_of(n, m_fid, OP_RE_REVERSE); } bool is_derivative(expr const* n) const { return is_app_of(n, m_fid, OP_RE_DERIVATIVE); } bool is_antimirov_union(expr const* n) const { return is_app_of(n, m_fid, _OP_RE_ANTIMIROV_UNION); } + bool is_charclass(expr const* n) const { return is_app_of(n, m_fid, OP_RE_CHARCLASS); } + bool is_charclass(expr const* n, seq::range_predicate& p) const; + + // True if `r` is semantically equivalent to a single char-class regex + // over Char. Handles OP_RE_CHARCLASS, re.full_char, re.empty (over a + // sequence sort whose element sort is Char), and re.range with two + // single-char concrete bounds. Returns the canonical predicate in `p`. + // This is the O(1) fast-path check used by smart constructors to + // collapse boolean combinations to a single CHARCLASS node. + bool as_charclass(expr* r, seq::range_predicate& p) const; MATCH_UNARY(is_to_re); MATCH_BINARY(is_concat); MATCH_BINARY(is_union); diff --git a/src/test/range_predicate.cpp b/src/test/range_predicate.cpp index 7d332f413..7109d2fd8 100644 --- a/src/test/range_predicate.cpp +++ b/src/test/range_predicate.cpp @@ -22,7 +22,7 @@ Author: --*/ -#include "ast/rewriter/range_predicate.h" +#include "ast/range_predicate.h" #include "util/debug.h" #include #include