3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-05-21 09:29:35 +00:00

Added classical regex factorization

This commit is contained in:
CEisenhofer 2026-04-02 20:03:22 +02:00
parent 3ca960d679
commit a81ce477f5
8 changed files with 258 additions and 6 deletions

View file

@ -2218,6 +2218,10 @@ namespace seq {
if (apply_gpower_intr(node))
return ++m_stats.m_mod_gpower_intr, true;
// Priority 7b: Regex Factorization (Boolean Closure)
if (apply_regex_factorization(node))
return ++m_stats.m_mod_regex_factorization, true;
// Priority 8: ConstNielsen - char vs var (2 children)
if (apply_const_nielsen(node))
return ++m_stats.m_mod_const_nielsen, true;
@ -2810,6 +2814,196 @@ namespace seq {
return false;
}
// -----------------------------------------------------------------------
// Modifier: apply_regex_factorization (Boolean Closure)
// -----------------------------------------------------------------------
// One split (m_p, m_q) of a regular expression, as produced by compute_tau:
// m_p is the regex for the left part of a word, m_q the regex for the right
// part. Both components are held through expr_ref so they stay referenced
// for the lifetime of the pair.
struct tau_pair {
    expr_ref m_p;   // regex for the prefix part
    expr_ref m_q;   // regex for the suffix part

    // Both p and q must be non-null (checked in debug builds).
    tau_pair(expr* p, expr* q, ast_manager& m)
        : m_p(p, m),
          m_q(q, m)
    {
        SASSERT(p);
        SASSERT(q);
    }
};
// Collection of splits produced by compute_tau for one regex.
using tau_pairs = vector<tau_pair>;
// Appends to `result` a set of splits (p, q) of the regex `r`.
//
// Intended reading of a split: a word u·v belongs to L(r) iff there is some
// pair (p, q) in the computed set with u in L(p) and v in L(q).
//
// Parameters:
//   m      - AST manager used to hold references to the created expressions.
//   seq    - sequence/regex utilities used to inspect and build regexes.
//   sg     - string graph; only forwarded to recursive calls, not used otherwise.
//   r      - regex to split; if it is not of regex sort nothing is appended.
//   result - output vector; pairs are appended, existing content is kept.
//
// Handled shapes: epsilon, single-character languages (range / full char),
// literal strings, the empty language, the full-sequence language, union,
// concatenation, star, plus and option. Everything else goes to a fallback
// that only produces the two trivial splits (eps, r) and (r, eps).
static void compute_tau(ast_manager& m, seq_util& seq, euf::sgraph& sg, expr* r, tau_pairs& result) {
    SASSERT(r);
    sort* str_sort = nullptr;
    if (!seq.is_re(r, str_sort)) return;
    expr* body = nullptr;
    if (seq.re.is_epsilon(r)) {
        // The empty word can only be split into two empty words.
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));
    }
    else if (seq.str.is_unit(r) || seq.str.is_string(r) || seq.re.is_range(r) || seq.re.is_full_char(r) ||
             (seq.re.is_to_re(r) && seq.str.is_string(to_app(r)->get_arg(0)))) {
        if (seq.re.is_to_re(r)) {
            expr* arg = to_app(r)->get_arg(0);
            zstring s;
            if (seq.str.is_string(arg, s) && s.length() > 1) {
                // A literal of length n has exactly n + 1 cut positions.
                for (unsigned i = 0; i <= s.length(); ++i) {
                    expr_ref p(seq.re.mk_to_re(seq.str.mk_string(s.extract(0, i))), m);
                    expr_ref q(seq.re.mk_to_re(seq.str.mk_string(s.extract(i, s.length() - i))), m);
                    result.push_back(tau_pair(p, q, m));
                }
                return;
            }
        }
        // Remaining atoms are not cut: the whole atom goes left or right.
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, r, m));
        result.push_back(tau_pair(r, eps, m));
    }
    else if (seq.re.is_empty(r)) {
        // empty set has no splits
    }
    else if (seq.re.is_full_seq(r)) {
        // Every word is accepted, so every prefix and every suffix is too.
        // The generic fallback would only yield (eps, r) and (r, eps) and
        // thereby lose all splits with two non-empty halves.
        result.push_back(tau_pair(r, r, m));
    }
    else if (seq.re.is_union(r)) {
        // Splits of a union are the union of the splits of its members.
        for (expr* arg : *to_app(r)) {
            compute_tau(m, seq, sg, arg, result);
        }
    }
    else if (seq.re.is_concat(r)) {
        unsigned num_args = to_app(r)->get_num_args();
        if (num_args == 0) {
            expr_ref eps(seq.re.mk_epsilon(str_sort), m);
            result.push_back(tau_pair(eps, eps, m));
            return;
        }
        // The cut falls inside exactly one argument i: everything before it
        // is prepended to the left half, everything after it is appended to
        // the right half.
        for (unsigned i = 0; i < num_args; ++i) {
            tau_pairs tau_arg;
            compute_tau(m, seq, sg, to_app(r)->get_arg(i), tau_arg);
            expr_ref left(m);
            expr_ref right(m);
            if (i == 0) left = seq.re.mk_epsilon(str_sort);
            else {
                expr_ref_vector left_args(m);
                for (unsigned j = 0; j < i; ++j) left_args.push_back(to_app(r)->get_arg(j));
                if (left_args.size() == 1) left = left_args.get(0);
                else left = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, left_args.size(), left_args.data());
            }
            if (i == num_args - 1) right = seq.re.mk_epsilon(str_sort);
            else {
                expr_ref_vector right_args(m);
                for (unsigned j = i + 1; j < num_args; ++j) right_args.push_back(to_app(r)->get_arg(j));
                if (right_args.size() == 1) right = right_args.get(0);
                else right = m.mk_app(seq.get_family_id(), OP_RE_CONCAT, right_args.size(), right_args.data());
            }
            for (auto const& pair : tau_arg) {
                expr_ref p(m), q(m);
                if (seq.re.is_epsilon(left)) p = pair.m_p;
                else if (seq.re.is_epsilon(pair.m_p)) p = left;
                // Neither side is epsilon: the left half is the prefix
                // arguments followed by the left part of argument i.
                // Without this case p would stay null.
                else p = seq.re.mk_concat(left, pair.m_p);
                if (seq.re.is_epsilon(right)) q = pair.m_q;
                else if (seq.re.is_epsilon(pair.m_q)) q = right;
                else q = seq.re.mk_concat(pair.m_q, right);
                result.push_back(tau_pair(p, q, m));
            }
        }
    }
    else if (seq.re.is_star(r, body) || seq.re.is_plus(r, body)) {
        if (seq.re.is_plus(r)) {
            // body+ is handled as body · body*.
            expr_ref star(seq.re.mk_star(body), m);
            expr_ref concat(seq.re.mk_concat(body, star), m);
            compute_tau(m, seq, sg, concat, result);
            return;
        }
        // Zero iterations: the empty word.
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));
        // Otherwise the cut falls inside one iteration of body, with r
        // (= body*) absorbing the iterations before and after it.
        tau_pairs tau_body;
        compute_tau(m, seq, sg, body, tau_body);
        for (auto const& pair : tau_body) {
            expr_ref p(m), q(m);
            if (seq.re.is_epsilon(pair.m_p)) p = r;
            else p = seq.re.mk_concat(r, pair.m_p);
            if (seq.re.is_epsilon(pair.m_q)) q = r;
            else q = seq.re.mk_concat(pair.m_q, r);
            result.push_back(tau_pair(p, q, m));
        }
    }
    else if (seq.re.is_opt(r, body)) {
        // body? is handled as (eps | body).
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, eps, m));
        compute_tau(m, seq, sg, body, result);
    }
    else {
        // NOTE(review): fallback for operators not handled above. Only the
        // two trivial splits are produced, which is exact only if L(r)
        // contains nothing longer than one character - confirm that callers
        // never reach this branch with other regexes (e.g. loops,
        // complement, intersection, to_re of a non-literal).
        expr_ref eps(seq.re.mk_epsilon(str_sort), m);
        result.push_back(tau_pair(eps, r, m));
        result.push_back(tau_pair(r, eps, m));
    }
}
// Modifier: regex factorization.
//
// Picks the first membership of `node` that is neither primitive nor
// non-classical, cuts its string into its first token and the remainder, and
// creates one child per split (p, q) returned by compute_tau in which the
// original membership is replaced by `first in p` and `remainder in q`.
// Splits that are detected as empty are not turned into children.
//
// Returns false if the modifier is disabled or no membership qualifies.
// Returns true once a qualifying membership has been processed, even when
// every split was discarded and no child was created.
bool nielsen_graph::apply_regex_factorization(nielsen_node* node) {
    if (!m_regex_factorization)
        return false;

    for (str_mem const& mem : node->str_mems()) {
        SASSERT(mem.m_str && mem.m_regex);
        bool const eligible = !mem.is_primitive() && mem.m_regex->is_classical();
        if (!eligible)
            continue;

        euf::snode* head = mem.m_str->first();
        SASSERT(head);
        euf::snode* rest = m_sg.drop_first(mem.m_str);
        SASSERT(rest);

        tau_pairs splits;
        compute_tau(m, m_seq, m_sg, mem.m_regex->get_expr(), splits);

        for (auto const& split : splits) {
            euf::snode* re_head = m_sg.mk(split.m_p);
            euf::snode* re_rest = m_sg.mk(split.m_q);

            // Cheap pruning: skip the split when either half is found to be
            // empty (the second argument, 100, bounds the search).
            if (m_seq_regex->is_empty_bfs(re_head, 100) == l_true ||
                m_seq_regex->is_empty_bfs(re_rest, 100) == l_true)
                continue;

            // Skip the split when the new constraint on `head` is
            // incompatible with the memberships the node already has on it.
            ptr_vector<euf::snode> head_regexes;
            head_regexes.push_back(re_head);
            for (auto const& other : node->str_mems())
                if (other.m_str == head)
                    head_regexes.push_back(other.m_regex);
            bool const head_unsat =
                head_regexes.size() > 1 &&
                m_seq_regex->check_intersection_emptiness(head_regexes, 100) == l_true;
            if (head_unsat)
                continue;

            nielsen_node* child = mk_child(node);
            mk_edge(node, child, true);

            // Drop the factored membership from the child (swap with the
            // last entry, then shrink), identified by its id.
            auto& child_mems = child->str_mems();
            unsigned idx = 0;
            while (idx < child_mems.size() && child_mems[idx].m_id != mem.m_id)
                ++idx;
            if (idx < child_mems.size()) {
                child_mems[idx] = child_mems.back();
                child_mems.pop_back();
            }

            // Replace it by the two memberships of this split; both keep the
            // history and dependency of the original.
            child->add_str_mem(str_mem(head, re_head, mem.m_history, next_mem_id(), mem.m_dep));
            child->add_str_mem(str_mem(rest, re_rest, mem.m_history, next_mem_id(), mem.m_dep));
        }
        return true;
    }
    return false;
}
bool nielsen_graph::fire_gpower_intro(
nielsen_node* node, str_eq const& eq,
euf::snode* var, euf::snode_vector const& ground_prefix_orig, bool fwd) {
@ -4141,6 +4335,7 @@ namespace seq {
st.update("nseq mod eq split", m_stats.m_mod_eq_split);
st.update("nseq mod star intr", m_stats.m_mod_star_intr);
st.update("nseq mod gpower intr", m_stats.m_mod_gpower_intr);
st.update("nseq mod regex fact", m_stats.m_mod_regex_factorization);
st.update("nseq mod const nielsen", m_stats.m_mod_const_nielsen);
st.update("nseq mod signature split", m_stats.m_mod_signature_split);
st.update("nseq mod regex var", m_stats.m_mod_regex_var_split);

View file

@ -734,6 +734,7 @@ namespace seq {
unsigned m_mod_eq_split = 0;
unsigned m_mod_star_intr = 0;
unsigned m_mod_gpower_intr = 0;
unsigned m_mod_regex_factorization = 0;
unsigned m_mod_const_nielsen = 0;
unsigned m_mod_regex_var_split = 0;
unsigned m_mod_signature_split = 0;
@ -763,6 +764,7 @@ namespace seq {
unsigned m_max_nodes = 0; // 0 = unlimited
bool m_parikh_enabled = true;
bool m_signature_split = false;
bool m_regex_factorization = true;
unsigned m_next_mem_id = 0;
unsigned m_fresh_cnt = 0;
nielsen_stats m_stats;
@ -886,6 +888,8 @@ namespace seq {
// setter for the m_parikh_enabled flag
void set_parikh_enabled(bool e) { m_parikh_enabled = e; }
// setter for the m_signature_split flag
void set_signature_split(bool e) { m_signature_split = e; }
// enable/disable the regex factorization modifier; while the flag is false
// apply_regex_factorization returns false without inspecting the node
void set_regex_factorization(bool e) { m_regex_factorization = e; }
// generate next unique regex membership id: returns the current value of
// m_next_mem_id and then increments the counter
unsigned next_mem_id() { return m_next_mem_id++; }
@ -1072,6 +1076,9 @@ namespace seq {
// mirrors ZIPT's GPowerIntrModifier
bool apply_gpower_intr(nielsen_node* node);
// generalized regex factorization (Boolean closure derivation rule)
bool apply_regex_factorization(nielsen_node* node);
// helper for apply_gpower_intr: fires the substitution.
// `fwd=true` uses left-to-right decomposition; `fwd=false` mirrors ZIPT's
// backward (right-to-left) direction.