diff --git a/src/ast/euf/euf_sgraph.cpp b/src/ast/euf/euf_sgraph.cpp index 23c35336c..2fc385688 100644 --- a/src/ast/euf/euf_sgraph.cpp +++ b/src/ast/euf/euf_sgraph.cpp @@ -617,6 +617,14 @@ namespace euf { return; if (m_seq.re.is_empty(e)) return; + + // Expected compound regex operators are handled by recursion below. + // If a leaf survives to this point, it is an unhandled regex form. + if (re->num_args() == 0) { + UNREACHABLE(); + return; + } + // recurse into compound regex operators for (unsigned i = 0; i < re->num_args(); ++i) { collect_re_predicates(re->arg(i), preds); @@ -669,8 +677,10 @@ namespace euf { } else if (m_seq.re.is_full_char(p)) p_set = char_set::full(max_c); - else + else { + UNREACHABLE(); continue; + } if (p_set.is_empty() || p_set.is_full(max_c)) continue; diff --git a/src/smt/seq/seq_regex.cpp b/src/smt/seq/seq_regex.cpp index c7c22ae4f..537b38c6a 100644 --- a/src/smt/seq/seq_regex.cpp +++ b/src/smt/seq/seq_regex.cpp @@ -20,6 +20,55 @@ Author: namespace seq { + // Collect possible first characters of a syntactically known *string* + // expression (the body of to_re). Regex operators (union, complement, + // intersection, ...) are not expected here. + void collect_possible_first_chars(seq_util& seq, euf::sgraph const& sg, expr* str, + unsigned_vector& bounds, bool& may_be_empty) { + may_be_empty = false; + VERIFY(str); + sort* re_sort = nullptr; + VERIFY(!seq.is_re(str, re_sort)); + + unsigned ch = 0; + if (sg.decode_re_char(str, ch)) { + bounds.push_back(ch); + if (ch < zstring::max_char()) + bounds.push_back(ch + 1); + return; + } + + zstring s; + if (seq.str.is_string(str, s)) { + if (s.length() == 0) { + may_be_empty = true; + return; + } + unsigned first_ch = s[0]; + bounds.push_back(first_ch); + if (first_ch < zstring::max_char()) + bounds.push_back(first_ch + 1); + return; + } + + expr* a = nullptr; + expr* b = nullptr; + if (seq.str.is_concat(str, a, b)) { + bool a_may_be_empty = false; + collect_possible_first_chars(seq, sg, a, bounds, a_may_be_empty); + + if (a_may_be_empty) { + bool b_may_be_empty = false; + collect_possible_first_chars(seq, sg, b, bounds, b_may_be_empty); + may_be_empty = b_may_be_empty; + } + return; + } + + UNREACHABLE(); + } + + // ----------------------------------------------------------------------- // Stabilizer store // ----------------------------------------------------------------------- @@ -375,22 +424,11 @@ namespace seq { return; } - // to_re(s): boundary at first character and first+1 + // to_re(s): boundary at possible first characters of s expr* body = nullptr; if (seq.re.is_to_re(e, body)) { - zstring s; - unsigned ch = 0; - if (m_sg.decode_re_char(body, ch)) { - bounds.push_back(ch); - if (ch < zstring::max_char()) - bounds.push_back(ch + 1); - } - else if (seq.str.is_string(body, s) && s.length() > 0) { - unsigned first_ch = s[0]; - bounds.push_back(first_ch); - if (first_ch < zstring::max_char()) - bounds.push_back(first_ch + 1); - } + bool may_be_empty = false; + collect_possible_first_chars(seq, m_sg, body, bounds, may_be_empty); return; } @@ -398,6 +436,14 @@ namespace seq { if (re->is_fail() || re->is_full_char() || re->is_full_seq()) return; + // If we reached a leaf and none of the expected leaf forms matched, + // this is a regex constructor we did not account for in boundary + // extraction and should fail loudly in debug builds. + if (re->num_args() == 0) { + UNREACHABLE(); + return; + } + // Recurse into children (handles union, concat, star, loop, etc.) for (unsigned i = 0; i < re->num_args(); ++i) { collect_char_boundaries(re->arg(i), bounds); @@ -408,9 +454,9 @@ namespace seq { // BFS regex emptiness check — helper: alphabet representatives // Faster alternative of computing all min-terms and taking representatives of them // ----------------------------------------------------------------------- - void seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) { + bool seq_regex::get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps) { if (!re || !re->get_expr()) - return; + return false; seq_util& seq = m_sg.get_seq_util(); unsigned max_c = seq.max_char(); @@ -441,6 +487,7 @@ namespace seq { // Defensive fallback for degenerate inputs. if (reps.empty()) reps.push_back(m_sg.mk_char(0)); + return true; } // ----------------------------------------------------------------------- @@ -495,7 +542,8 @@ namespace seq { // character snode whose equivalence class has identical // derivative behavior. euf::snode_vector reps; - get_alphabet_representatives(current, reps); + if (!get_alphabet_representatives(current, reps)) + return l_undef; if (reps.empty()) // Nothing found = dead-end @@ -1117,8 +1165,7 @@ namespace seq { seq_util& seq = m_sg.get_seq_util(); unsigned max_c = seq.max_char(); - if (!re_expr) - return char_set::full(max_c); + VERIFY(re_expr); // full_char: the whole alphabet [0, max_char] if (seq.re.is_full_char(re_expr)) @@ -1186,8 +1233,9 @@ namespace seq { if (seq.re.is_empty(re_expr)) return char_set(); - // Conservative fallback: return the full alphabet so that - // no unsound constraints are added for unrecognised expressions. + // Unexpected minterm shape: we should fail loudly instead of silently + // returning a conservative approximation. + UNREACHABLE(); return char_set::full(max_c); } diff --git a/src/smt/seq/seq_regex.h b/src/smt/seq/seq_regex.h index 5c3bf2843..72f4b71f5 100644 --- a/src/smt/seq/seq_regex.h +++ b/src/smt/seq/seq_regex.h @@ -69,7 +69,7 @@ namespace seq { // Build a set of representative character snodes, one per // alphabet equivalence class, derived from the boundary points // of the given regex. - void get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps); + bool get_alphabet_representatives(euf::snode* re, euf::snode_vector& reps); public: