mirror of
https://github.com/Z3Prover/z3
synced 2026-04-03 02:18:58 +00:00
Added classical regex factorization
This commit is contained in:
parent
3ca960d679
commit
a81ce477f5
8 changed files with 258 additions and 6 deletions
|
|
@ -154,6 +154,7 @@ namespace euf {
|
|||
n->m_ground = l->is_ground() && r->is_ground();
|
||||
n->m_regex_free = l->is_regex_free() && r->is_regex_free();
|
||||
n->m_nullable = l->is_nullable() && r->is_nullable();
|
||||
n->m_is_classical = l->is_classical() && r->is_classical();
|
||||
n->m_level = std::max(l->level(), r->level()) + 1;
|
||||
n->m_length = l->length() + r->length();
|
||||
++m_stats.m_num_concat;
|
||||
|
|
@ -163,13 +164,14 @@ namespace euf {
|
|||
case snode_kind::s_power: {
|
||||
// s^n: nullable follows base, consistent with ZIPT's PowerToken
|
||||
// the exponent n is assumed to be a symbolic integer, may or may not be zero
|
||||
// NSB review: SASSERT(n->num_args() == 2); and simplify code
|
||||
// NSB review: SASSERT(n->num_args() == 2); and simplify code
|
||||
// NSB review: is this the correct definition of ground what about the exponent?
|
||||
SASSERT(n->num_args() >= 1);
|
||||
snode* base = n->arg(0);
|
||||
n->m_ground = base->is_ground();
|
||||
n->m_regex_free = base->is_regex_free();
|
||||
n->m_nullable = base->is_nullable();
|
||||
n->m_is_classical = base->is_classical();
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
++m_stats.m_num_power;
|
||||
|
|
@ -181,6 +183,7 @@ namespace euf {
|
|||
n->m_ground = n->arg(0)->is_ground();
|
||||
n->m_regex_free = false;
|
||||
n->m_nullable = true;
|
||||
n->m_is_classical = n->arg(0)->is_classical();
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
break;
|
||||
|
|
@ -189,6 +192,7 @@ namespace euf {
|
|||
n->m_ground = n->num_args() > 0 ? n->arg(0)->is_ground() : true;
|
||||
n->m_regex_free = false;
|
||||
// nullable iff lower bound is 0: r{0,n} accepts the empty string
|
||||
n->m_is_classical = n->arg(0)->is_classical();
|
||||
// default lo=1 (non-nullable) in case extraction fails
|
||||
unsigned lo = 1, hi = 1;
|
||||
expr* loop_body = nullptr;
|
||||
|
|
@ -207,15 +211,17 @@ namespace euf {
|
|||
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
|
||||
n->m_regex_free = false;
|
||||
n->m_nullable = n->arg(0)->is_nullable() || n->arg(1)->is_nullable();
|
||||
n->m_is_classical = n->arg(0)->is_classical() && n->arg(1)->is_classical();
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
break;
|
||||
|
||||
case snode_kind::s_intersect:
|
||||
SASSERT(n->num_args() == 2);
|
||||
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
|
||||
n->m_ground = n->arg(0)->is_ground() && n->arg(1)->is_ground();
|
||||
n->m_regex_free = false;
|
||||
n->m_nullable = n->arg(0)->is_nullable() && n->arg(1)->is_nullable();
|
||||
n->m_is_classical = false;
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
break;
|
||||
|
|
@ -225,6 +231,7 @@ namespace euf {
|
|||
n->m_ground = n->arg(0)->is_ground();
|
||||
n->m_regex_free = false;
|
||||
n->m_nullable = !n->arg(0)->is_nullable();
|
||||
n->m_is_classical = false;
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
break;
|
||||
|
|
@ -233,6 +240,7 @@ namespace euf {
|
|||
n->m_ground = true;
|
||||
n->m_regex_free = false;
|
||||
n->m_nullable = false;
|
||||
n->m_is_classical = false;
|
||||
n->m_level = 1;
|
||||
n->m_length = 1;
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -63,10 +63,11 @@ namespace euf {
|
|||
unsigned m_num_args = 0;
|
||||
|
||||
// metadata flags, analogous to ZIPT's Str/StrToken properties
|
||||
bool m_ground = true; // no uninterpreted string variables
|
||||
bool m_regex_free = true; // no regex constructs
|
||||
bool m_nullable = false; // accepts the empty string
|
||||
unsigned m_level = 0; // tree depth/level (0 for empty, 1 for singletons)
|
||||
bool m_ground = true; // no uninterpreted string variables
|
||||
bool m_regex_free = true; // no regex constructs
|
||||
bool m_nullable = false; // accepts the empty string
|
||||
bool m_is_classical = true; // classical regular expression
|
||||
unsigned m_level = 0; // tree depth/level (0 for empty, 1 for singletons)
|
||||
unsigned m_length = 0; // token count, number of leaf tokens in the tree
|
||||
|
||||
// hash matrix for associativity-respecting hashing (2x2 polynomial hash matrix)
|
||||
|
|
@ -104,6 +105,7 @@ namespace euf {
|
|||
bool is_ground() const { return m_ground; }
|
||||
bool is_regex_free() const { return m_regex_free; }
|
||||
bool is_nullable() const { return m_nullable; }
|
||||
bool is_classical() const { return m_is_classical; }
|
||||
unsigned level() const { return m_level; }
|
||||
unsigned length() const { return m_length; }
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ void smt_params::updt_local_params(params_ref const & _p) {
|
|||
m_nseq_max_nodes = p.nseq_max_nodes();
|
||||
m_nseq_parikh = p.nseq_parikh();
|
||||
m_nseq_regex_precheck = p.nseq_regex_precheck();
|
||||
m_nseq_regex_factorization = p.nseq_regex_factorization();
|
||||
m_nseq_signature = p.nseq_signature();
|
||||
m_up_persist_clauses = p.up_persist_clauses();
|
||||
validate_string_solver(m_string_solver);
|
||||
|
|
@ -169,6 +170,9 @@ void smt_params::display(std::ostream & out) const {
|
|||
DISPLAY_PARAM(m_lemmas2console);
|
||||
DISPLAY_PARAM(m_logic);
|
||||
DISPLAY_PARAM(m_string_solver);
|
||||
DISPLAY_PARAM(m_nseq_parikh);
|
||||
DISPLAY_PARAM(m_nseq_regex_precheck);
|
||||
DISPLAY_PARAM(m_nseq_regex_factorization);
|
||||
|
||||
DISPLAY_PARAM(m_profile_res_sub);
|
||||
DISPLAY_PARAM(m_display_bool_var2expr);
|
||||
|
|
|
|||
|
|
@ -252,6 +252,7 @@ struct smt_params : public preprocessor_params,
|
|||
unsigned m_nseq_max_nodes = 0;
|
||||
bool m_nseq_parikh = false;
|
||||
bool m_nseq_regex_precheck = true;
|
||||
bool m_nseq_regex_factorization = true;
|
||||
bool m_nseq_signature = false;
|
||||
|
||||
smt_params(params_ref const & p = params_ref()):
|
||||
|
|
|
|||
|
|
@ -129,6 +129,7 @@ def_module_params(module_name='smt',
|
|||
('nseq.max_nodes', UINT, 0, 'maximum number of DFS nodes explored by theory_nseq per solve() call (0 = unlimited)'),
|
||||
('nseq.parikh', BOOL, False, 'enable Parikh image checks in nseq solver'),
|
||||
('nseq.regex_precheck', BOOL, True, 'enable regex membership pre-check before DFS in theory_nseq: checks intersection emptiness per-variable and short-circuits SAT/UNSAT for regex-only problems'),
|
||||
('nseq.regex_factorization', BOOL, True, 'enable syntactic regex factorization in theory_nseq: decomposes Boolean closure of regular expressions into primitive membership constraints'),
|
||||
('nseq.signature', BOOL, False, 'enable heuristic signature-based string equation splitting in Nielsen solver'),
|
||||
('core.validate', BOOL, False, '[internal] validate unsat core produced by SMT context. This option is intended for debugging'),
|
||||
('seq.split_w_len', BOOL, True, 'enable splitting guided by length constraints'),
|
||||
|
|
|
|||
|
|
@ -2218,6 +2218,10 @@ namespace seq {
|
|||
if (apply_gpower_intr(node))
|
||||
return ++m_stats.m_mod_gpower_intr, true;
|
||||
|
||||
// Priority 7b: Regex Factorization (Boolean Closure)
|
||||
if (apply_regex_factorization(node))
|
||||
return ++m_stats.m_mod_regex_factorization, true;
|
||||
|
||||
// Priority 8: ConstNielsen - char vs var (2 children)
|
||||
if (apply_const_nielsen(node))
|
||||
return ++m_stats.m_mod_const_nielsen, true;
|
||||
|
|
@ -2810,6 +2814,196 @@ namespace seq {
|
|||
return false;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Modifier: apply_regex_factorization (Boolean Closure)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
struct tau_pair {
|
||||
expr_ref m_p;
|
||||
expr_ref m_q;
|
||||
tau_pair(expr* p, expr* q, ast_manager& m) : m_p(p, m), m_q(q, m) {
|
||||
SASSERT(p);
|
||||
SASSERT(q);
|
||||
}
|
||||
};
|
||||
typedef vector<tau_pair> tau_pairs;
|
||||
|
||||
// Appends to `result` a finite set of splits (p, q) of the regular
// expression `r`. The pairs are consumed by apply_regex_factorization, which
// constrains the first token of a string by p and the remainder by q.
//
// Rules, by shape of r:
//   epsilon            -> (eps, eps)
//   to_re("c1..cn"),
//     n > 1            -> (to_re(prefix_i), to_re(suffix_i)) for i = 0..n
//   other atoms        -> (eps, r), (r, eps)
//   empty set          -> no splits
//   r1 | ... | rn      -> union of the splits of every ri
//   r1 ... rn          -> for every i and every (p, q) in tau(ri):
//                         (r1..r(i-1) . p,  q . r(i+1)..rn)
//   s*                 -> (eps, eps) and (r . p, q . r) for (p, q) in tau(s)
//   s+                 -> handled as s . s*
//   s?                 -> (eps, eps) plus the splits of s
//   anything else      -> treated as an atom: (eps, r), (r, eps)
// Epsilon factors are dropped instead of being concatenated.
//
// Nothing is appended when `r` is not of regex sort.
// `sg` is not used by this function itself; it is only threaded through the
// recursive calls.
static void compute_tau(ast_manager& m, seq_util& seq, euf::sgraph& sg, expr* r, tau_pairs& result) {
    SASSERT(r);
    sort* str_sort = nullptr;
    if (!seq.is_re(r, str_sort)) return;
    expr *body = nullptr;

    if (seq.re.is_epsilon(r)) {
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));
    }
    // NOTE(review): r has regex sort at this point, so the seq.str.is_unit /
    // seq.str.is_string tests presumably never fire - confirm before removing.
    else if (seq.str.is_unit(r) || seq.str.is_string(r) || seq.re.is_range(r) || seq.re.is_full_char(r) ||
             (seq.re.is_to_re(r) && seq.str.is_string(to_app(r)->get_arg(0)))) {
        if (seq.re.is_to_re(r)) {
            expr* arg = to_app(r)->get_arg(0);
            zstring s;
            if (seq.str.is_string(arg, s) && s.length() > 1) {
                // a literal of length n can be cut at each of its n + 1 positions
                for (unsigned i = 0; i <= s.length(); ++i) {
                    expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m);
                    expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m);
                    result.push_back(tau_pair(p, q, m));
                }
                return;
            }
        }
        // atom (or literal of length <= 1): it goes entirely left or entirely right
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, r, m));
        result.push_back(tau_pair(r, eps, m));
    }
    else if (seq.re.is_empty(r)) {
        // empty set has no splits
    }
    else if (seq.re.is_union(r)) {
        for (expr* arg : *to_app(r)) {
            compute_tau(m, seq, sg, arg, result);
        }
    }
    else if (seq.re.is_concat(r)) {
        app* cat = to_app(r);
        unsigned num_args = cat->get_num_args();
        if (num_args == 0) {
            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
            result.push_back(tau_pair(eps, eps, m));
            return;
        }

        // concatenation of the arguments [lo, hi) of r; epsilon for an empty range
        auto mk_slice = [&](unsigned lo, unsigned hi) {
            expr_ref res(m);
            if (lo == hi)
                res = seq.re.mk_epsilon(str_sort);
            else if (hi - lo == 1)
                res = cat->get_arg(lo);
            else {
                expr_ref_vector args(m);
                for (unsigned j = lo; j < hi; ++j) args.push_back(cat->get_arg(j));
                res = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, args.size(), args.data());
            }
            return res;
        };

        for (unsigned i = 0; i < num_args; ++i) {
            tau_pairs tau_arg;
            compute_tau(m, seq, sg, cat->get_arg(i), tau_arg);

            expr_ref left  = mk_slice(0, i);              // everything before argument i
            expr_ref right = mk_slice(i + 1, num_args);   // everything after argument i

            for (auto const& pair : tau_arg) {
                expr_ref p(m), q(m);
                if (seq.re.is_epsilon(left)) p = pair.m_p;
                else if (seq.re.is_epsilon(pair.m_p)) p = left;
                // general case: without this branch p stayed null and
                // violated the non-null requirement of tau_pair
                else p = seq.re.mk_concat(left, pair.m_p);

                if (seq.re.is_epsilon(right)) q = pair.m_q;
                else if (seq.re.is_epsilon(pair.m_q)) q = right;
                else q = seq.re.mk_concat(pair.m_q, right);

                result.push_back(tau_pair(p, q, m));
            }
        }
    }
    else if (seq.re.is_star(r, body) || seq.re.is_plus(r, body)) {
        if (seq.re.is_plus(r)) {
            // s+ is split as s . s*
            expr_ref star(seq.re.mk_star(body), m);
            expr_ref concat(seq.re.mk_concat(body, star), m);
            compute_tau(m, seq, sg, concat, result);
            return;
        }

        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));

        // a cut inside one iteration: r before it, r after it
        tau_pairs tau_body;
        compute_tau(m, seq, sg, body, tau_body);
        for (auto const& pair : tau_body) {
            expr_ref p(m), q(m);
            if (seq.re.is_epsilon(pair.m_p)) p = r;
            else p = seq.re.mk_concat(r, pair.m_p);

            if (seq.re.is_epsilon(pair.m_q)) q = r;
            else q = seq.re.mk_concat(pair.m_q, r);

            result.push_back(tau_pair(p, q, m));
        }
    }
    else if (seq.re.is_opt(r, body)) {
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));
        compute_tau(m, seq, sg, body, result);
    }
    else {
        // unrecognized operator: treat r as an atom
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, r, m));
        result.push_back(tau_pair(r, eps, m));
    }
}
|
||||
|
||||
// Modifier: regex factorization.
// Picks the first membership of `node` that is neither primitive nor
// non-classical, splits its regex with compute_tau, and creates one child per
// surviving split (p, q). In each child the original membership is replaced
// by `first in p` and `tail in q`, where `first` is the first token of the
// string and `tail` is the string without it.
// Returns true as soon as one membership has been processed (even when every
// split was pruned), false when the rule is disabled or nothing is eligible.
bool nielsen_graph::apply_regex_factorization(nielsen_node* node) {
    if (!m_regex_factorization)
        return false;

    // state budget handed to the eager emptiness checks below
    unsigned const max_states = 100;

    for (str_mem const& mem : node->str_mems()) {
        SASSERT(mem.m_str && mem.m_regex);

        if (mem.is_primitive())
            continue;
        if (!mem.m_regex->is_classical())
            continue;

        euf::snode* first = mem.m_str->first();
        SASSERT(first);
        euf::snode* tail = m_sg.drop_first(mem.m_str);
        SASSERT(tail);

        tau_pairs splits;
        compute_tau(m, m_seq, m_sg, mem.m_regex->get_expr(), splits);

        for (auto const& split : splits) {
            euf::snode* head_re = m_sg.mk(split.m_p);
            euf::snode* rest_re = m_sg.mk(split.m_q);

            // Eagerly eliminate contradictory cases:
            // a split with an empty component cannot be satisfied
            if (m_seq_regex->is_empty_bfs(head_re, max_states) == l_true ||
                m_seq_regex->is_empty_bfs(rest_re, max_states) == l_true)
                continue;

            // the prefix regex must also be compatible with every
            // constraint the node already places on `first`
            ptr_vector<euf::snode> on_first;
            on_first.push_back(head_re);
            for (auto const& other : node->str_mems()) {
                if (other.m_str == first)
                    on_first.push_back(other.m_regex);
            }
            bool const has_others = on_first.size() > 1;
            if (has_others && m_seq_regex->check_intersection_emptiness(on_first, max_states) == l_true)
                continue;

            nielsen_node* child = mk_child(node);
            mk_edge(node, child, true);

            // drop the factored membership from the child (swap with last, then pop)
            auto& child_mems = child->str_mems();
            for (unsigned idx = 0; idx < child_mems.size(); ++idx) {
                if (child_mems[idx].m_id != mem.m_id)
                    continue;
                child_mems[idx] = child_mems.back();
                child_mems.pop_back();
                break;
            }

            // the two memberships that replace it
            child->add_str_mem(str_mem(first, head_re, mem.m_history, next_mem_id(), mem.m_dep));
            child->add_str_mem(str_mem(tail, rest_re, mem.m_history, next_mem_id(), mem.m_dep));
        }

        return true;
    }

    return false;
}
|
||||
|
||||
bool nielsen_graph::fire_gpower_intro(
|
||||
nielsen_node* node, str_eq const& eq,
|
||||
euf::snode* var, euf::snode_vector const& ground_prefix_orig, bool fwd) {
|
||||
|
|
@ -4141,6 +4335,7 @@ namespace seq {
|
|||
st.update("nseq mod eq split", m_stats.m_mod_eq_split);
|
||||
st.update("nseq mod star intr", m_stats.m_mod_star_intr);
|
||||
st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr);
|
||||
st.update("nseq mod regex fact", m_stats.m_mod_regex_factorization);
|
||||
st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen);
|
||||
st.update("nseq mod signature split", m_stats.m_mod_signature_split);
|
||||
st.update("nseq mod regex var", m_stats.m_mod_regex_var_split);
|
||||
|
|
|
|||
|
|
@ -734,6 +734,7 @@ namespace seq {
|
|||
unsigned m_mod_eq_split = 0;
|
||||
unsigned m_mod_star_intr = 0;
|
||||
unsigned m_mod_gpower_intr = 0;
|
||||
unsigned m_mod_regex_factorization = 0;
|
||||
unsigned m_mod_const_nielsen = 0;
|
||||
unsigned m_mod_regex_var_split = 0;
|
||||
unsigned m_mod_signature_split = 0;
|
||||
|
|
@ -763,6 +764,7 @@ namespace seq {
|
|||
unsigned m_max_nodes = 0; // 0 = unlimited
|
||||
bool m_parikh_enabled = true;
|
||||
bool m_signature_split = false;
|
||||
bool m_regex_factorization = true;
|
||||
unsigned m_next_mem_id = 0;
|
||||
unsigned m_fresh_cnt = 0;
|
||||
nielsen_stats m_stats;
|
||||
|
|
@ -886,6 +888,8 @@ namespace seq {
|
|||
void set_parikh_enabled(bool e) { m_parikh_enabled = e; }
|
||||
|
||||
void set_signature_split(bool e) { m_signature_split = e; }
|
||||
|
||||
void set_regex_factorization(bool e) { m_regex_factorization = e; }
|
||||
|
||||
// generate next unique regex membership id
|
||||
unsigned next_mem_id() { return m_next_mem_id++; }
|
||||
|
|
@ -1072,6 +1076,9 @@ namespace seq {
|
|||
// mirrors ZIPT's GPowerIntrModifier
|
||||
bool apply_gpower_intr(nielsen_node* node);
|
||||
|
||||
// generalized regex factorization (Boolean closure derivation rule)
|
||||
bool apply_regex_factorization(nielsen_node* node);
|
||||
|
||||
// helper for apply_gpower_intr: fires the substitution.
|
||||
// `fwd=true` uses left-to-right decomposition; `fwd=false` mirrors ZIPT's
|
||||
// backward (right-to-left) direction.
|
||||
|
|
|
|||
|
|
@ -390,6 +390,38 @@ namespace smt {
|
|||
expr* s_expr = mem.m_str->get_expr();
|
||||
if (s_expr)
|
||||
ensure_length_var(s_expr);
|
||||
|
||||
if (!get_fparams().m_nseq_regex_factorization)
|
||||
return;
|
||||
|
||||
// Boolean Closure Propagations
|
||||
expr* re_expr = mem.m_regex->get_expr();
|
||||
if (m_seq.re.is_intersection(re_expr)) {
|
||||
for (expr* arg : *to_app(re_expr)) {
|
||||
expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
|
||||
literal_vector lits;
|
||||
lits.push_back(~mem.lit);
|
||||
lits.push_back(mk_literal(in_r));
|
||||
ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
|
||||
}
|
||||
}
|
||||
else if (m_seq.re.is_union(re_expr)) {
|
||||
literal_vector lits;
|
||||
lits.push_back(~mem.lit);
|
||||
for (expr* arg : *to_app(re_expr)) {
|
||||
expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
|
||||
lits.push_back(mk_literal(in_r));
|
||||
}
|
||||
ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
|
||||
}
|
||||
else if (m_seq.re.is_complement(re_expr)) {
|
||||
expr* arg = to_app(re_expr)->get_arg(0);
|
||||
expr_ref in_r(m_seq.re.mk_in_re(s_expr, arg), m);
|
||||
literal_vector lits;
|
||||
lits.push_back(~mem.lit);
|
||||
lits.push_back(~mk_literal(in_r));
|
||||
ctx.mk_th_axiom(get_id(), lits.size(), lits.data());
|
||||
}
|
||||
}
|
||||
|
||||
void theory_nseq::ensure_length_var(expr* e) {
|
||||
|
|
@ -578,6 +610,7 @@ namespace smt {
|
|||
m_nielsen.set_max_nodes(get_fparams().m_nseq_max_nodes);
|
||||
m_nielsen.set_parikh_enabled(get_fparams().m_nseq_parikh);
|
||||
m_nielsen.set_signature_split(get_fparams().m_nseq_signature);
|
||||
m_nielsen.set_regex_factorization(get_fparams().m_nseq_regex_factorization);
|
||||
|
||||
// Regex membership pre-check: before running DFS, check intersection
|
||||
// emptiness for each variable's regex constraints. This handles
|
||||
|
|
@ -1221,6 +1254,7 @@ namespace smt {
|
|||
lbool result = m_regex.check_intersection_emptiness(regexes);
|
||||
|
||||
if (result == l_true) {
|
||||
// TODO: Incorporate that we might know the maximum length generated by a regex [in those cases, the gradients will never work]
|
||||
// It is empty. Try gradient.
|
||||
regexes.pop_back(); // Remove loop_l
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue