From b288c2e7dce1bd5d23321eac85237106d098a5e2 Mon Sep 17 00:00:00 2001 From: CEisenhofer Date: Wed, 18 Mar 2026 14:54:12 +0100 Subject: [PATCH] Some more bug fixes --- src/smt/seq/seq_nielsen.cpp | 25 +++++++------ src/smt/seq/seq_parikh.cpp | 71 ------------------------------------- src/smt/seq/seq_parikh.h | 11 ------ src/smt/seq/seq_regex.cpp | 64 +++++++++++++++++++++++++++++++++ src/smt/seq/seq_regex.h | 15 ++++++++ src/test/seq_parikh.cpp | 2 ++ 6 files changed, 96 insertions(+), 92 deletions(-) diff --git a/src/smt/seq/seq_nielsen.cpp b/src/smt/seq/seq_nielsen.cpp index 3d7ccfc2d..b52201da6 100644 --- a/src/smt/seq/seq_nielsen.cpp +++ b/src/smt/seq/seq_nielsen.cpp @@ -1872,7 +1872,7 @@ namespace seq { for (euf::snode* mt : minterms) { if (!mt || mt->is_fail()) continue; if (!mt->get_expr()) continue; - char_set mt_cs = m_graph.m_parikh->minterm_to_char_set(mt->get_expr()); + char_set mt_cs = m_graph.m_seq_regex->minterm_to_char_set(mt->get_expr()); if (cs.is_subset(mt_cs)) { euf::snode* deriv = sg.brzozowski_deriv(mem.m_regex, mt); if (!deriv) { found = false; break; } @@ -3628,6 +3628,7 @@ namespace seq { VERIFY(!minterms.empty()); bool created = false; + std::cout << "Considering regex: " << mk_pp(mem.m_regex->get_expr(), m_sg.get_manager()) << std::endl; // Branch 1: x → ε (progress) { @@ -3642,7 +3643,9 @@ namespace seq { // Branch 2+: for each minterm m_i, x → ?c · x // where ?c is a symbolic char constrained by the minterm for (euf::snode* mt : minterms) { - // std::cout << "Processing minterm: " << mk_pp(mt->get_expr(), m_sg.get_manager()) << std::endl; + std::cout << "Processing minterm: " << mk_pp(mt->get_expr(), m_sg.get_manager()) << std::endl; + SASSERT(mt); + SASSERT(mt->get_expr()); SASSERT(!mt->is_fail()); // Try Brzozowski derivative with respect to this minterm @@ -3651,19 +3654,22 @@ namespace seq { SASSERT(deriv); if (deriv->is_fail()) continue; - // std::cout << "Result: " << mk_pp(deriv->get_expr(), m_sg.get_manager()) << std::endl; + std::cout << "Result: " << mk_pp(deriv->get_expr(), m_sg.get_manager()) << std::endl; - char_set cs; - if (m_parikh && mt->get_expr()) - cs = m_parikh->minterm_to_char_set(mt->get_expr()); + SASSERT(m_seq_regex); + char_set cs = m_seq_regex->minterm_to_char_set(mt->get_expr()); + std::cout << "char_set:\n"; + for (auto& r : cs.ranges()) { + std::cout << "\t[" << r.m_lo << "; " << r.m_hi - 1 << "]" << std::endl; + } euf::snode* fresh_char = nullptr; if (cs.is_unit()) { expr_ref char_expr(m_sg.get_seq_util().str.mk_string(zstring(cs.first_char())), m_sg.get_manager()); fresh_char = m_sg.mk(char_expr); - } else { - fresh_char = mk_fresh_char_var(); } + else + fresh_char = mk_fresh_char_var(); euf::snode* replacement = m_sg.mk_concat(fresh_char, first); nielsen_node* child = mk_child(node); @@ -3675,9 +3681,8 @@ namespace seq { // This is the key Parikh pruning step: when x → ?c · x' is // generated from minterm m_i, ?c must belong to the character // class described by m_i so that str ∈ derivative(R, m_i). - if (!cs.is_unit() && !cs.is_empty()) { + if (!cs.is_unit() && !cs.is_empty()) child->add_char_range(fresh_char, cs); - } created = true; } diff --git a/src/smt/seq/seq_parikh.cpp b/src/smt/seq/seq_parikh.cpp index 26587322b..80450da55 100644 --- a/src/smt/seq/seq_parikh.cpp +++ b/src/smt/seq/seq_parikh.cpp @@ -318,77 +318,6 @@ namespace seq { // sgraph::compute_minterms(). This function converts them to a // concrete char_set so that fresh character variables introduced by // apply_regex_var_split can be constrained with add_char_range(). - // - // The implementation is recursive and handles: - // re.full_char → [0, max_char] (full alphabet) - // re.range(lo, hi) → [lo, hi+1) (hi inclusive in Z3) - // re.complement(r) → alphabet \ minterm_to_char_set(r) - // re.inter(r1, r2) → intersection of both sides - // re.diff(r1, r2) → r1 ∩ complement(r2) - // re.to_re(str.unit(c)) → singleton {c} - // re.empty → empty set - // anything else → [0, max_char] (conservative) - char_set seq_parikh::minterm_to_char_set(expr* re_expr) { - unsigned max_c = seq.max_char(); - if (!re_expr) - return char_set::full(max_c); - - // full_char: the whole alphabet [0, max_char] - if (seq.re.is_full_char(re_expr)) - return char_set::full(max_c); - - // range [lo, hi] (hi inclusive in Z3's regex representation) - unsigned lo = 0, hi = 0; - if (seq.re.is_range(re_expr, lo, hi)) { - // lo > hi is a degenerate range; should not arise from well-formed minterms - SASSERT(lo <= hi); - if (lo > hi) return char_set(); - // char_range uses exclusive upper bound; Z3 hi is inclusive - return char_set(char_range(lo, hi + 1)); - } - - // complement: alphabet minus the inner set - expr* inner = nullptr; - if (seq.re.is_complement(re_expr, inner)) - return minterm_to_char_set(inner).complement(max_c); - - // union: characters present in either set - expr* r1 = nullptr, *r2 = nullptr; - if (seq.re.is_union(re_expr, r1, r2)) { - char_set cs = minterm_to_char_set(r1); - cs.add(minterm_to_char_set(r2)); - return cs; - } - - // intersection: characters present in both sets - if (seq.re.is_intersection(re_expr, r1, r2)) - return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2)); - - // difference: r1 minus r2 = r1 ∩ complement(r2) - if (seq.re.is_diff(re_expr, r1, r2)) - return minterm_to_char_set(r1).intersect_with( - minterm_to_char_set(r2).complement(max_c)); - - // to_re(str.unit(c)): singleton character set - expr* str_arg = nullptr; - expr* ch_expr = nullptr; - unsigned char_val = 0; - if (seq.re.is_to_re(re_expr, str_arg) && - seq.str.is_unit(str_arg, ch_expr) && - seq.is_const_char(ch_expr, char_val)) { - char_set cs; - cs.add(char_val); - return cs; - } - - // empty regex: no characters can appear - if (seq.re.is_empty(re_expr)) - return char_set(); - - // Conservative fallback: return the full alphabet so that - // no unsound constraints are added for unrecognised expressions. - return char_set::full(max_c); - } } // namespace seq diff --git a/src/smt/seq/seq_parikh.h b/src/smt/seq/seq_parikh.h index 8e8a4e178..47ec26424 100644 --- a/src/smt/seq/seq_parikh.h +++ b/src/smt/seq/seq_parikh.h @@ -134,17 +134,6 @@ namespace seq { // single, indivisible character equivalence class. Minterms are // produced by sgraph::compute_minterms and used in // apply_regex_var_split to constrain fresh character variables. - // - // Supported expressions: - // re.full_char → full set [0, max_char] - // re.range(lo, hi) → char_set [lo, hi] (inclusive on both ends) - // re.complement(r) → complement of minterm_to_char_set(r) - // re.inter(r1, r2) → intersection of both sets - // re.diff(r1, r2) → r1 minus r2 (= r1 ∩ complement(r2)) - // re.to_re(unit(c)) → singleton {c} - // re.empty → empty set - // anything else → full set (conservative, sound over-approximation) - char_set minterm_to_char_set(expr* minterm_re); }; } // namespace seq diff --git a/src/smt/seq/seq_regex.cpp b/src/smt/seq/seq_regex.cpp index ab23cad42..1787b7416 100644 --- a/src/smt/seq/seq_regex.cpp +++ b/src/smt/seq/seq_regex.cpp @@ -1098,4 +1098,68 @@ namespace seq { return result == l_true; } + char_set seq_regex::minterm_to_char_set(expr* re_expr) { + seq_util& seq = m_sg.get_seq_util(); + unsigned max_c = seq.max_char(); + + if (!re_expr) + return char_set::full(max_c); + + // full_char: the whole alphabet [0, max_char] + if (seq.re.is_full_char(re_expr)) + return char_set::full(max_c); + + // range [lo, hi] (hi inclusive in Z3's regex representation) + unsigned lo = 0, hi = 0; + if (seq.re.is_range(re_expr, lo, hi)) { + // lo > hi is a degenerate range; should not arise from well-formed minterms + SASSERT(lo <= hi); + if (lo > hi) return char_set(); + // char_range uses exclusive upper bound; Z3 hi is inclusive + return char_set(char_range(lo, hi + 1)); + } + + // complement: alphabet minus the inner set + expr* inner = nullptr; + if (seq.re.is_complement(re_expr, inner)) + return minterm_to_char_set(inner).complement(max_c); + + // union: characters present in either set + expr* r1 = nullptr, *r2 = nullptr; + if (seq.re.is_union(re_expr, r1, r2)) { + char_set cs = minterm_to_char_set(r1); + cs.add(minterm_to_char_set(r2)); + return cs; + } + + // intersection: characters present in both sets + if (seq.re.is_intersection(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with(minterm_to_char_set(r2)); + + // difference: r1 minus r2 = r1 ∩ complement(r2) + if (seq.re.is_diff(re_expr, r1, r2)) + return minterm_to_char_set(r1).intersect_with( + minterm_to_char_set(r2).complement(max_c)); + + // to_re(str.unit(c)): singleton character set + expr* str_arg = nullptr; + expr* ch_expr = nullptr; + unsigned char_val = 0; + if (seq.re.is_to_re(re_expr, str_arg) && + seq.str.is_unit(str_arg, ch_expr) && + seq.is_const_char(ch_expr, char_val)) { + char_set cs; + cs.add(char_val); + return cs; + } + + // empty regex: no characters can appear + if (seq.re.is_empty(re_expr)) + return char_set(); + + // Conservative fallback: return the full alphabet so that + // no unsound constraints are added for unrecognised expressions. + return char_set::full(max_c); + } + } diff --git a/src/smt/seq/seq_regex.h b/src/smt/seq/seq_regex.h index 9ef24e665..58a77d64d 100644 --- a/src/smt/seq/seq_regex.h +++ b/src/smt/seq/seq_regex.h @@ -72,6 +72,21 @@ namespace seq { void get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps); public: + // Convert a regex minterm expression to a char_set. + // Used to transform symbolic boundaries (re.range, re.complement, etc.) + // into exact character sets (char_set). + // + // Supported expressions: + // re.full_char → full set [0, max_char] + // re.range(lo, hi) → char_set [lo, hi] (inclusive on both ends) + // re.complement(r) → complement of minterm_to_char_set(r) + // re.inter(r1, r2) → intersection of both sets + // re.diff(r1, r2) → r1 minus r2 (= r1 ∩ complement(r2)) + // re.to_re(unit(c)) → singleton {c} + // re.empty → empty set + // anything else → full set (conservative, sound over-approximation) + char_set minterm_to_char_set(expr* minterm_re); + seq_regex(euf::sgraph& sg) : m_sg(sg) {} euf::sgraph& sg() { return m_sg; } diff --git a/src/test/seq_parikh.cpp b/src/test/seq_parikh.cpp index d8a7d0b6a..cb237a28f 100644 --- a/src/test/seq_parikh.cpp +++ b/src/test/seq_parikh.cpp @@ -29,6 +29,7 @@ Author: #include "ast/euf/euf_sgraph.h" #include "smt/seq/seq_nielsen.h" #include "smt/seq/seq_parikh.h" +#include "smt/seq/seq_regex.h" #include "ast/arith_decl_plugin.h" #include "ast/reg_decl_plugins.h" #include "ast/ast_pp.h" @@ -890,3 +891,4 @@ void tst_seq_parikh() { test_minterm_singleton(); test_minterm_nullptr_is_full(); } +