From 1def58bc9f2f3f8ee3695dc936cbe2469c13a707 Mon Sep 17 00:00:00 2001 From: Nikolaj Bjorner Date: Sun, 17 May 2020 19:06:34 -0700 Subject: [PATCH] optional unicode mode Signed-off-by: Nikolaj Bjorner --- src/ast/rewriter/seq_rewriter.h | 1 + src/ast/seq_decl_plugin.cpp | 73 ++++++++++++++++++++++++++++----- src/ast/seq_decl_plugin.h | 24 ++++++++++- src/smt/seq_unicode.cpp | 28 +++++++------ src/smt/seq_unicode.h | 2 +- src/smt/theory_seq.cpp | 50 +++++++++++++++++++++- src/smt/theory_seq.h | 3 ++ 7 files changed, 155 insertions(+), 26 deletions(-) diff --git a/src/ast/rewriter/seq_rewriter.h b/src/ast/rewriter/seq_rewriter.h index 098b2f82d..b52167f9d 100644 --- a/src/ast/rewriter/seq_rewriter.h +++ b/src/ast/rewriter/seq_rewriter.h @@ -212,6 +212,7 @@ public: void set_solver(expr_solver* solver) { m_re2aut.set_solver(solver); } bool has_solver() { return m_re2aut.has_solver(); } + bool coalesce_chars() const { return m_coalesce_chars; } br_status mk_app_core(func_decl * f, unsigned num_args, expr * const * args, expr_ref & result); br_status mk_eq_core(expr * lhs, expr * rhs, expr_ref & result); diff --git a/src/ast/seq_decl_plugin.cpp b/src/ast/seq_decl_plugin.cpp index 8b1453af7..26c05faf4 100644 --- a/src/ast/seq_decl_plugin.cpp +++ b/src/ast/seq_decl_plugin.cpp @@ -82,8 +82,7 @@ static bool is_escape_char(char const *& s, unsigned& result) { return true; } -#if 0 - /* unicode */ +#if _USE_UNICODE if (*(s+1) == 'u' && *(s+2) == '{') { result = 0; for (unsigned i = 0; i < 5; ++i) { @@ -570,6 +569,7 @@ void seq_decl_plugin::init() { sort* str2TintT[3] = { strT, strT, intT }; sort* seqAintT[2] = { seqA, intT }; sort* seq3A[3] = { seqA, seqA, seqA }; + sort* charTcharT[2] = { m_char, m_char }; m_sigs.resize(LAST_SEQ_OP); // TBD: have (par ..) construct and load parameterized signature from premable. m_sigs[OP_SEQ_UNIT] = alloc(psig, m, "seq.unit", 1, 1, &A, seqA); @@ -607,6 +607,10 @@ void seq_decl_plugin::init() { m_sigs[OP_SEQ_REPLACE_RE] = alloc(psig, m, "str.replace_re", 1, 3, seqAreAseqA, seqA); m_sigs[OP_SEQ_REPLACE_ALL] = alloc(psig, m, "str.replace_all", 1, 3, seqAseqAseqA, seqA); m_sigs[OP_STRING_CONST] = nullptr; +#if _USE_UNICODE + m_sigs[OP_CHAR_CONST] = nullptr; + m_sigs[OP_CHAR_LE] = alloc(psig, m, "char.<=", 0, 2, charTcharT, boolT); +#endif m_sigs[_OP_STRING_STRIDOF] = alloc(psig, m, "str.indexof", 0, 3, str2TintT, intT); m_sigs[_OP_STRING_STRREPL] = alloc(psig, m, "str.replace", 0, 3, str3T, strT); m_sigs[OP_STRING_ITOS] = alloc(psig, m, "str.from_int", 0, 1, &intT, strT); @@ -632,8 +636,11 @@ void seq_decl_plugin::init() { void seq_decl_plugin::set_manager(ast_manager* m, family_id id) { decl_plugin::set_manager(m, id); bv_util bv(*m); - // to be changed to Unicode (abstract sort) +#if _USE_UNICODE + m_char = m->mk_sort(symbol("Unicode"), sort_info(m_family_id, _CHAR_SORT, 0, nullptr)); +#else m_char = bv.mk_sort(8); +#endif m->inc_ref(m_char); parameter param(m_char); m_string = m->mk_sort(symbol("String"), sort_info(m_family_id, SEQ_SORT, 1, ¶m)); @@ -667,8 +674,10 @@ sort * seq_decl_plugin::mk_sort(decl_kind k, unsigned num_parameters, parameter } return m.mk_sort(symbol("RegEx"), sort_info(m_family_id, RE_SORT, num_parameters, parameters)); } +#if _USE_UNICODE case _CHAR_SORT: return m_char; +#endif case _STRING_SORT: return m_string; case _REGLAN_SORT: @@ -744,7 +753,6 @@ func_decl * seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, case OP_STRING_IS_DIGIT: case OP_STRING_TO_CODE: case OP_STRING_FROM_CODE: - m.raise_exception("character function is not yet supported"); match(*m_sigs[k], arity, domain, range, rng); return m.mk_func_decl(m_sigs[k]->m_name, arity, domain, rng, func_decl_info(m_family_id, k)); @@ -831,7 +839,6 @@ func_decl * seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, case OP_SEQ_REPLACE_RE: m_has_re = true; case OP_SEQ_REPLACE_ALL: - m.raise_exception("replace-all operation is not yet supported"); return mk_str_fun(k, arity, domain, range, k); case OP_SEQ_CONCAT: @@ -895,6 +902,22 @@ func_decl * seq_decl_plugin::mk_func_decl(decl_kind k, unsigned num_parameters, m_has_re = true; return mk_str_fun(k, arity, domain, range, OP_SEQ_TO_RE); +#if _USE_UNICODE + case OP_CHAR_LE: + if (arity == 2 && domain[0] == m_char && domain[1] == m_char) { + return m.mk_func_decl(m_sigs[k]->m_name, arity, domain, m.mk_bool_sort(), func_decl_info(m_family_id, k, 0, nullptr)); + } + m.raise_exception("Incorrect parameters passed to character comparison"); + case OP_CHAR_CONST: + if (!(num_parameters == 1 && arity == 0 && + parameters[0].is_int() && + 0 <= parameters[0].get_int() && + parameters[0].get_int() < static_cast(zstring::max_char()))) { + m.raise_exception("invalid character declaration"); + } + return m.mk_const_decl(m_charc_sym, m_char, func_decl_info(m_family_id, OP_CHAR_CONST, num_parameters, parameters)); +#endif + case OP_SEQ_IN_RE: m_has_re = true; return mk_seq_fun(k, arity, domain, range, _OP_STRING_IN_REGEXP); @@ -983,6 +1006,17 @@ app* seq_decl_plugin::mk_string(zstring const& s) { return m_manager->mk_const(f); } +app* seq_decl_plugin::mk_char(unsigned u) { +#if _USE_UNICODE + parameter param(u); + func_decl* f = m_manager->mk_const_decl(m_charc_sym, m_char, func_decl_info(m_family_id, OP_CHAR_CONST, 1, ¶m)); + return m_manager->mk_const(f); +#else + UNREACHABLE(); + return nullptr; +#endif +} + bool seq_decl_plugin::is_considered_uninterpreted(func_decl * f) { seq_util util(*m_manager); @@ -1045,6 +1079,12 @@ bool seq_decl_plugin::are_distinct(app* a, app* b) const { is_app_of(a, m_family_id, OP_SEQ_UNIT)) { return true; } +#if _USE_UNICODE + if (is_app_of(a, m_family_id, OP_CHAR_CONST) && + is_app_of(b, m_family_id, OP_CHAR_CONST)) { + return true; + } +#endif return false; } @@ -1074,13 +1114,11 @@ app* seq_util::str::mk_string(zstring const& s) const { } app* seq_util::str::mk_char(zstring const& s, unsigned idx) const { - // TBD for unicode - return u.bv().mk_numeral(s[idx], 8); + return u.mk_char(s[idx]); } -app* seq_util::str::mk_char(char ch) const { - zstring s(ch); - return mk_char(s, 0); +app* seq_util::str::mk_char(unsigned ch) const { + return u.mk_char(ch); } bv_util& seq_util::bv() const { @@ -1089,21 +1127,34 @@ bv_util& seq_util::bv() const { } bool seq_util::is_const_char(expr* e, unsigned& c) const { +#if _USE_UNICODE + return is_app_of(e, m_fid, OP_CHAR_CONST) && (c = to_app(e)->get_parameter(0).get_int(), true); +#else rational r; unsigned sz; return bv().is_numeral(e, r, sz) && sz == 8 && r.is_unsigned() && (c = r.get_unsigned(), true); +#endif } app* seq_util::mk_char(unsigned ch) const { +#if _USE_UNICODE + return seq.mk_char(ch); +#else return bv().mk_numeral(rational(ch), 8); +#endif } app* seq_util::mk_le(expr* ch1, expr* ch2) const { +#if _USE_UNICODE + expr* es[2] = { ch1, ch2 }; + return m.mk_app(m_fid, OP_CHAR_LE, 2, es); +#else return bv().mk_ule(ch1, ch2); +#endif } app* seq_util::mk_lt(expr* ch1, expr* ch2) const { - return m.mk_not(bv().mk_ule(ch2, ch1)); + return m.mk_not(mk_le(ch2, ch1)); } bool seq_util::str::is_string(func_decl const* f, zstring& s) const { diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h index 578e76a4d..35a0e86a9 100644 --- a/src/ast/seq_decl_plugin.h +++ b/src/ast/seq_decl_plugin.h @@ -26,11 +26,14 @@ Revision History: #include "ast/ast.h" #include "ast/bv_decl_plugin.h" +#define _USE_UNICODE 0 enum seq_sort_kind { SEQ_SORT, RE_SORT, +#if _USE_UNICODE _CHAR_SORT, // internal only +#endif _STRING_SORT, _REGLAN_SORT }; @@ -83,6 +86,11 @@ enum seq_op_kind { OP_STRING_IS_DIGIT, OP_STRING_TO_CODE, OP_STRING_FROM_CODE, + +#if _USE_UNICODE + OP_CHAR_CONST, // constant character + OP_CHAR_LE, // Unicode comparison +#endif // internal only operators. Converted to SEQ variants. _OP_STRING_STRREPL, _OP_STRING_CONCAT, @@ -210,6 +218,7 @@ public: app* mk_string(symbol const& s); app* mk_string(zstring const& s); + app* mk_char(unsigned ch); bool has_re() const { return m_has_re; } bool has_seq() const { return m_has_seq; } @@ -238,6 +247,11 @@ public: bool is_re(expr* e, sort*& seq) const { return is_re(m.get_sort(e), seq); } bool is_char(expr* e) const { return is_char(m.get_sort(e)); } bool is_const_char(expr* e, unsigned& c) const; +#if _USE_UNICODE + bool is_char_le(expr const* e) const { return is_app_of(e, m_fid, OP_CHAR_LE); } +#else + bool is_char_le(expr const* e) const { return false; } +#endif app* mk_char(unsigned ch) const; app* mk_le(expr* ch1, expr* ch2) const; app* mk_lt(expr* ch1, expr* ch2) const; @@ -245,6 +259,8 @@ public: app* mk_skolem(symbol const& name, unsigned n, expr* const* args, sort* range); bool is_skolem(expr const* e) const { return is_app_of(e, m_fid, _OP_SEQ_SKOLEM); } + MATCH_BINARY(is_char_le); + bool has_re() const { return seq.has_re(); } bool has_seq() const { return seq.has_seq(); } @@ -265,7 +281,7 @@ public: app* mk_empty(sort* s) const { return m.mk_const(m.mk_func_decl(m_fid, OP_SEQ_EMPTY, 0, nullptr, 0, (expr*const*)nullptr, s)); } app* mk_string(zstring const& s) const; app* mk_string(symbol const& s) const { return u.seq.mk_string(s); } - app* mk_char(char ch) const; + app* mk_char(unsigned ch) const; app* mk_concat(expr* a, expr* b) const { expr* es[2] = { a, b }; return m.mk_app(m_fid, OP_SEQ_CONCAT, 2, es); } app* mk_concat(expr* a, expr* b, expr* c) const { return mk_concat(a, mk_concat(b, c)); } expr* mk_concat(unsigned n, expr* const* es, sort* s) const { @@ -323,6 +339,9 @@ public: bool is_index(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_INDEX); } bool is_last_index(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_LAST_INDEX); } bool is_replace(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_REPLACE); } + bool is_replace_re(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_REPLACE_RE); } + bool is_replace_re_all(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_REPLACE_RE_ALL); } + bool is_replace_all(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_REPLACE_ALL); } bool is_prefix(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_PREFIX); } bool is_suffix(expr const* n) const { return is_app_of(n, m_fid, OP_SEQ_SUFFIX); } bool is_itos(expr const* n) const { return is_app_of(n, m_fid, OP_STRING_ITOS); } @@ -356,6 +375,9 @@ public: MATCH_TERNARY(is_index); MATCH_BINARY(is_last_index); MATCH_TERNARY(is_replace); + MATCH_TERNARY(is_replace_re); + MATCH_TERNARY(is_replace_re_all); + MATCH_TERNARY(is_replace_all); MATCH_BINARY(is_prefix); MATCH_BINARY(is_suffix); MATCH_BINARY(is_lt); diff --git a/src/smt/seq_unicode.cpp b/src/smt/seq_unicode.cpp index 82dbd9712..0a947b1c6 100644 --- a/src/smt/seq_unicode.cpp +++ b/src/smt/seq_unicode.cpp @@ -38,8 +38,6 @@ namespace smt { void seq_unicode::assign_le(theory_var v1, theory_var v2, literal lit) { dl.init_var(v1); dl.init_var(v2); - // dl.set_assignment(v1, 'a' + ctx().random(24)); - // dl.set_assignment(v2, 'a' + ctx().random(24)); ctx().push_trail(push_back_vector>(m_asserted_edges)); m_asserted_edges.push_back(dl.add_edge(v1, v2, s_integer(0), lit)); } @@ -79,11 +77,12 @@ namespace smt { adapt_eq(v1, v2); } - final_check_status seq_unicode::final_check() { + bool seq_unicode::final_check() { // ensure all variables are above 0 and less than zstring::max_char() bool added_constraint = false; - // TBD: shift assignments on variables that are not lower-bounded - // TBD: set zero to a zero value. + // TBD: shift assignments on variables that are not lower-bounded, so that they are "nice" (have values 'a', 'b', ...) + // TBD: set "zero" to a zero value. + // TBD: ensure that unicode constants have the right values arith_util a(m); arith_value avalue(m); avalue.init(&ctx()); @@ -111,10 +110,15 @@ namespace smt { continue; } // ensure str.to_code(unit(v)) = val - expr_ref ch1(seq.str.mk_unit(seq.str.mk_char(val)), m); - // or - // expr_ref ch2(seq.str.mk_string(zstring(val)), m); - expr_ref code(seq.str.mk_to_code(ch1), m); + expr_ref ch(m); + if (false) { + /// m_rewrite.coalesce_chars(); + ch = seq.str.mk_string(zstring(val)); + } + else { + ch = seq.str.mk_unit(seq.str.mk_char(val)); + } + expr_ref code(seq.str.mk_to_code(ch), m); rational val2; if (avalue.get_value(code, val2) && val2 == rational(val)) continue; @@ -122,13 +126,13 @@ namespace smt { added_constraint = true; } if (added_constraint) - return FC_CONTINUE; + return false; // ensure equalities over shared symbols if (th.assume_eqs(m_var_value_table)) - return FC_CONTINUE; + return false; - return FC_DONE; + return true; } void seq_unicode::propagate() { diff --git a/src/smt/seq_unicode.h b/src/smt/seq_unicode.h index a89f431ee..91578d3a6 100644 --- a/src/smt/seq_unicode.h +++ b/src/smt/seq_unicode.h @@ -102,7 +102,7 @@ namespace smt { void new_diseq_eh(theory_var v1, theory_var v2); // ensure coherence for character codes and equalities of shared symbols. - final_check_status final_check(); + bool final_check(); unsigned get_value(theory_var v); diff --git a/src/smt/theory_seq.cpp b/src/smt/theory_seq.cpp index 6b03a0c4d..2bf867cc0 100644 --- a/src/smt/theory_seq.cpp +++ b/src/smt/theory_seq.cpp @@ -298,12 +298,14 @@ theory_seq::theory_seq(context& ctx): m_autil(m), m_sk(m, m_rewrite), m_ax(*this, m_rewrite), + m_unicode(*this), m_arith_value(m), m_trail_stack(*this), m_ls(m), m_rs(m), m_lhs(m), m_rhs(m), m_new_eqs(m), m_has_seq(m_util.has_seq()), + m_unhandled_expr(nullptr), m_res(m), m_max_unfolding_depth(1), m_max_unfolding_lit(null_literal), @@ -346,6 +348,7 @@ final_check_status theory_seq::final_check_eh() { if (!m_has_seq) { return FC_DONE; } + m_new_propagation = false; TRACE("seq", display(tout << "level: " << ctx.get_scope_level() << "\n");); TRACE("seq_verbose", ctx.display(tout);); @@ -374,6 +377,9 @@ final_check_status theory_seq::final_check_eh() { TRACEFIN("zero_length"); return FC_CONTINUE; } + if (!m_unicode.final_check()) { + return FC_CONTINUE; + } if (get_fparams().m_split_w_len && len_based_split()) { ++m_stats.m_branch_variable; TRACEFIN("split_based_on_length"); @@ -424,6 +430,9 @@ final_check_status theory_seq::final_check_eh() { TRACEFIN("branch_ne"); return FC_CONTINUE; } + if (m_unhandled_expr) { + return FC_GIVEUP; + } if (is_solved()) { //scoped_enable_trace _se; TRACEFIN("is_solved"); @@ -2309,7 +2318,7 @@ theory_var theory_seq::mk_var(enode* n) { } bool theory_seq::can_propagate() { - return m_axioms_head < m_axioms.size() || !m_replay.empty() || m_new_solution; + return m_axioms_head < m_axioms.size() || !m_replay.empty() || m_new_solution || m_unicode.can_propagate(); } bool theory_seq::canonize(expr* e, dependency*& eqs, expr_ref& result) { @@ -2501,6 +2510,7 @@ void theory_seq::add_dependency(dependency*& dep, enode* a, enode* b) { void theory_seq::propagate() { + m_unicode.propagate(); while (m_axioms_head < m_axioms.size() && !ctx.inconsistent()) { expr_ref e(m); e = m_axioms[m_axioms_head].get(); @@ -2578,6 +2588,15 @@ void theory_seq::deque_axiom(expr* n) { else if (m_util.str.is_unit(n)) { m_ax.add_unit_axiom(n); } + else if (m_util.str.is_is_digit(n)) { + m_ax.add_is_digit_axiom(n); + } + else if (m_util.str.is_from_code(n)) { + m_ax.add_str_from_code_axiom(n); + } + else if (m_util.str.is_to_code(n)) { + m_ax.add_str_to_code_axiom(n); + } } expr_ref theory_seq::add_elim_string_axiom(expr* n) { @@ -3053,7 +3072,16 @@ void theory_seq::assign_eh(bool_var v, bool is_true) { else if (m_util.str.is_nth_i(e) || m_util.str.is_nth_u(e)) { // no-op } + else if (m_util.is_char_le(e, e1, e2)) { + theory_var v1 = get_th_var(ctx.get_enode(e1)); + theory_var v2 = get_th_var(ctx.get_enode(e2)); + if (is_true) + m_unicode.assign_le(v1, v2, lit); + else + m_unicode.assign_lt(v2, v1, lit); + } else if (m_util.is_skolem(e)) { + // no-op } else { @@ -3065,6 +3093,10 @@ void theory_seq::assign_eh(bool_var v, bool is_true) { void theory_seq::new_eq_eh(theory_var v1, theory_var v2) { enode* n1 = get_enode(v1); enode* n2 = get_enode(v2); + if (m_util.is_char(n1->get_owner())) { + m_unicode.new_eq_eh(v1, v2); + return; + } dependency* deps = m_dm.mk_leaf(assumption(n1, n2)); new_eq_eh(deps, n1, n2); } @@ -3156,6 +3188,10 @@ void theory_seq::new_diseq_eh(theory_var v1, theory_var v2) { throw default_exception("convert regular expressions into automata"); } } + if (m_util.is_char(n1->get_owner())) { + m_unicode.new_diseq_eh(v1, v2); + return; + } m_exclude.update(e1, e2); expr_ref eq(m.mk_eq(e1, e2), m); TRACE("seq", tout << "new disequality " << ctx.get_scope_level() << ": " << mk_bounded_pp(eq, m, 2) << "\n";); @@ -3219,6 +3255,9 @@ void theory_seq::relevant_eh(app* n) { m_util.str.is_itos(n) || m_util.str.is_stoi(n) || m_util.str.is_lt(n) || + m_util.str.is_is_digit(n) || + m_util.str.is_from_code(n) || + m_util.str.is_to_code(n) || m_util.str.is_unit(n) || m_util.str.is_le(n)) { enque_axiom(n); @@ -3237,6 +3276,15 @@ void theory_seq::relevant_eh(app* n) { if (m_util.str.is_length(n, arg) && !has_length(arg) && ctx.e_internalized(arg)) { add_length_to_eqc(arg); } + + if (m_util.str.is_replace_all(n) || + m_util.str.is_replace_re(n) || + m_util.str.is_replace_re_all(n)) { + if (!m_unhandled_expr) { + ctx.push_trail(value_trail(m_unhandled_expr)); + m_unhandled_expr = n; + } + } } diff --git a/src/smt/theory_seq.h b/src/smt/theory_seq.h index 3bf28ecd9..a344d8831 100644 --- a/src/smt/theory_seq.h +++ b/src/smt/theory_seq.h @@ -33,6 +33,7 @@ Revision History: #include "smt/theory_seq_empty.h" #include "smt/seq_skolem.h" #include "smt/seq_axioms.h" +#include "smt/seq_unicode.h" #include "smt/seq_offset_eq.h" namespace smt { @@ -402,6 +403,7 @@ namespace smt { arith_util m_autil; seq_skolem m_sk; seq_axioms m_ax; + seq_unicode m_unicode; arith_value m_arith_value; th_trail_stack m_trail_stack; stats m_stats; @@ -409,6 +411,7 @@ namespace smt { expr_ref_vector m_ls, m_rs, m_lhs, m_rhs; expr_ref_pair_vector m_new_eqs; bool m_has_seq; + expr* m_unhandled_expr; // maintain automata with regular expressions. scoped_ptr_vector m_automata;