mirror of
https://github.com/Z3Prover/z3
synced 2026-03-07 22:04:53 +00:00
Continue nseq: regex synthesis, model generation, length propagation
Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
This commit is contained in:
parent
a65b64ae63
commit
faeb936a83
2 changed files with 305 additions and 31 deletions
|
|
@ -94,8 +94,11 @@ namespace smt {
|
|||
if (check_disequalities())
|
||||
return FC_CONTINUE;
|
||||
|
||||
if (all_eqs_solved() && m_state.mems().empty())
|
||||
return FC_DONE;
|
||||
if (all_eqs_solved()) {
|
||||
// All word equations solved; check if remaining memberships are handled
|
||||
if (all_mems_checked())
|
||||
return FC_DONE;
|
||||
}
|
||||
|
||||
return FC_GIVEUP;
|
||||
}
|
||||
|
|
@ -195,6 +198,12 @@ namespace smt {
|
|||
if (m_find.find(v1) != m_find.find(v2))
|
||||
m_find.merge(v1, v2);
|
||||
m_state.add_eq(o1, o2, dep);
|
||||
// Propagate length equality: len(o1) = len(o2)
|
||||
ensure_length_axiom(o1);
|
||||
ensure_length_axiom(o2);
|
||||
expr_ref len1 = mk_len(o1);
|
||||
expr_ref len2 = mk_len(o2);
|
||||
add_axiom(mk_eq(len1, len2, false));
|
||||
IF_VERBOSE(3, verbose_stream() << "new_eq_eh: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m)
|
||||
<< " (total eqs: " << m_state.eqs().size() << ")\n");
|
||||
TRACE(seq, tout << "new eq: " << mk_bounded_pp(o1, m) << " = " << mk_bounded_pp(o2, m) << "\n";);
|
||||
|
|
@ -1397,6 +1406,84 @@ namespace smt {
|
|||
return r;
|
||||
}
|
||||
|
||||
// DFS: find a string of exactly target_len in regex r.
|
||||
// On success, appends characters to 'result' and returns true.
|
||||
// cur_len is how many characters have already been appended.
|
||||
bool theory_nseq::dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result) {
|
||||
if (cur_len == target_len) {
|
||||
expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m));
|
||||
m_rewrite(nullable);
|
||||
return m.is_true(nullable);
|
||||
}
|
||||
// Try printable characters: lowercase, uppercase, digits, then common punctuation
|
||||
static const unsigned char priority[] = {
|
||||
'a','b','c','d','e','f','g','h','i','j','k','l','m',
|
||||
'n','o','p','q','r','s','t','u','v','w','x','y','z',
|
||||
'0','1','2','3','4','5','6','7','8','9',
|
||||
'A','B','C','D','E','F','G','H','I','J','K','L','M',
|
||||
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
|
||||
' ','.', ',', '!', '?', '-', '_', '@', '#', '$', '%', '&', '*'
|
||||
};
|
||||
for (unsigned c : priority) {
|
||||
expr_ref ch(m_util.mk_char(c), m);
|
||||
expr_ref dr = m_seq_rewrite.mk_derivative(ch, expr_ref(r, m));
|
||||
m_rewrite(dr);
|
||||
if (m_util.re.is_empty(dr))
|
||||
continue;
|
||||
result = result + zstring(c);
|
||||
if (dfs_regex(dr, target_len, cur_len + 1, result))
|
||||
return true;
|
||||
// Backtrack: remove last character
|
||||
result = result.extract(0, result.length() - 1);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Synthesize a concrete string of exactly `len` accepted by `regex`.
|
||||
// Returns true and sets `result` on success.
|
||||
bool theory_nseq::synthesize_regex_string(expr* regex, unsigned len, zstring& result) {
|
||||
result = zstring();
|
||||
return dfs_regex(regex, len, 0, result);
|
||||
}
|
||||
|
||||
// Add length-bound axioms from regex structure: len(s) >= min_length(r).
|
||||
// Also adds: len(s) = 0 → nullable(r) (otherwise conflict).
|
||||
bool theory_nseq::add_regex_length_axioms(nseq_mem const& mem) {
|
||||
expr* s = mem.str();
|
||||
expr* r = mem.regex();
|
||||
bool sign = mem.sign();
|
||||
nseq_dependency* dep = mem.dep();
|
||||
bool progress = false;
|
||||
|
||||
if (sign) {
|
||||
// s ∈ r → len(s) >= min_length(r)
|
||||
unsigned minlen = m_util.re.min_length(r);
|
||||
if (minlen > 0) {
|
||||
expr_ref len_s = mk_len(s);
|
||||
ensure_length_axiom(s);
|
||||
literal lb = mk_literal(m_autil.mk_ge(len_s, m_autil.mk_int(minlen)));
|
||||
if (ctx.get_assignment(lb) != l_true) {
|
||||
add_axiom(lb);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
// If r is not nullable and len(s) = 0, conflict
|
||||
expr_ref nullable = m_seq_rewrite.is_nullable(expr_ref(r, m));
|
||||
m_rewrite(nullable);
|
||||
if (m.is_false(nullable)) {
|
||||
// s ∈ r → s ≠ ""
|
||||
expr_ref emp(m_util.str.mk_empty(s->get_sort()), m);
|
||||
ensure_enode(emp);
|
||||
literal s_empty = mk_eq(s, emp, false);
|
||||
if (ctx.get_assignment(~s_empty) != l_true) {
|
||||
add_axiom(~s_empty);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool theory_nseq::check_regex_memberships() {
|
||||
bool progress = false;
|
||||
for (auto const& mem : m_state.mems()) {
|
||||
|
|
@ -1414,43 +1501,99 @@ namespace smt {
|
|||
bool sign = mem.sign();
|
||||
nseq_dependency* dep = mem.dep();
|
||||
|
||||
// Add structural length axioms from the regex
|
||||
if (add_regex_length_axioms(mem))
|
||||
return true;
|
||||
|
||||
// Try to determine the string value from the e-graph
|
||||
zstring sval;
|
||||
if (!is_ground_string(s, sval))
|
||||
if (is_ground_string(s, sval)) {
|
||||
// Compute derivatives for the known string
|
||||
expr_ref derived = derive_regex(r, sval);
|
||||
|
||||
// Check nullable: does the derived regex accept the empty word?
|
||||
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
|
||||
m_rewrite(nullable);
|
||||
|
||||
if (sign) {
|
||||
// Positive membership: s must be in r
|
||||
if (m.is_false(nullable) || m_util.re.is_empty(derived)) {
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Negative membership: s must NOT be in r
|
||||
if (m.is_true(nullable)) {
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Compute derivatives for the known string
|
||||
expr_ref derived = derive_regex(r, sval);
|
||||
|
||||
// Check nullable: does the derived regex accept the empty word?
|
||||
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
|
||||
m_rewrite(nullable);
|
||||
|
||||
// String is not yet ground. For positive membership:
|
||||
// if we know the exact length, synthesize a concrete string.
|
||||
if (sign) {
|
||||
// Positive membership: s must be in r
|
||||
if (m.is_false(nullable)) {
|
||||
// String is fully determined but regex rejects → conflict
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Negative membership: s must NOT be in r
|
||||
if (m.is_true(nullable)) {
|
||||
// String is fully determined and regex accepts → conflict
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
rational len_val;
|
||||
if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
|
||||
unsigned len = len_val.get_unsigned();
|
||||
// Feasibility check only: detect conflicts but don't force a specific value.
|
||||
// Model generation (mk_value) will synthesize the actual string.
|
||||
zstring synth;
|
||||
if (!synthesize_regex_string(r, len, synth)) {
|
||||
// No string of that length accepted by regex - conflict
|
||||
IF_VERBOSE(3, verbose_stream() << " regex conflict: no string of len=" << len
|
||||
<< " in " << mk_bounded_pp(r, m, 2) << "\n";);
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
}
|
||||
IF_VERBOSE(3, verbose_stream() << " regex feasible: len=" << len
|
||||
<< " has witness \"" << synth << "\"\n";);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the derived regex is the empty set
|
||||
if (sign && m_util.re.is_empty(derived)) {
|
||||
set_conflict(dep);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if every membership constraint is already verified
|
||||
// (ground strings checked) or provably feasible (non-ground with known length).
|
||||
bool theory_nseq::all_mems_checked() {
|
||||
for (auto const& mem : m_state.mems()) {
|
||||
expr* s = mem.str();
|
||||
expr* r = mem.regex();
|
||||
bool sign = mem.sign();
|
||||
nseq_dependency* dep = mem.dep();
|
||||
zstring sval;
|
||||
if (is_ground_string(s, sval)) {
|
||||
// Verify the ground string
|
||||
expr_ref derived = derive_regex(r, sval);
|
||||
expr_ref nullable = m_seq_rewrite.is_nullable(derived);
|
||||
m_rewrite(nullable);
|
||||
if (sign && !m.is_true(nullable))
|
||||
return false;
|
||||
if (!sign && !m.is_false(nullable))
|
||||
return false;
|
||||
}
|
||||
else if (sign) {
|
||||
// Non-ground positive membership: check feasibility by length
|
||||
rational len_val;
|
||||
if (get_length(s, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
|
||||
zstring synth;
|
||||
if (!synthesize_regex_string(r, len_val.get_unsigned(), synth))
|
||||
return false; // infeasible; conflict should have been set by check_regex_mem
|
||||
// Feasible - model generator will assign the right value
|
||||
}
|
||||
else {
|
||||
// Length unknown and regex is empty: unconditionally infeasible
|
||||
if (m_util.re.is_empty(r))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Length reasoning
|
||||
// -------------------------------------------------------
|
||||
|
|
@ -1495,7 +1638,34 @@ namespace smt {
|
|||
|
||||
bool theory_nseq::get_length(expr* e, rational& val) {
|
||||
expr_ref len = mk_len(e);
|
||||
return m_arith_value.get_value(len, val);
|
||||
if (m_arith_value.get_value(len, val))
|
||||
return true;
|
||||
// Structural fallback: for str.substr(s, off, n) with concrete off and n,
|
||||
// use the explicit length argument if it's a numeral and s has sufficient length.
|
||||
expr* s = nullptr, *offset = nullptr, *len_arg = nullptr;
|
||||
if (m_util.str.is_extract(e, s, offset, len_arg)) {
|
||||
rational len_n;
|
||||
if (m_autil.is_numeral(len_arg, len_n) && len_n.is_nonneg()) {
|
||||
rational off_n;
|
||||
if (m_autil.is_numeral(offset, off_n) && off_n.is_nonneg()) {
|
||||
rational parent_len;
|
||||
if (get_length(s, parent_len) &&
|
||||
off_n + len_n <= parent_len) {
|
||||
val = len_n;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fallback via lower/upper bound: if lo == hi, that's the value
|
||||
rational lo, hi;
|
||||
bool s_lo, s_hi;
|
||||
if (m_arith_value.get_lo(len, lo, s_lo) && !s_lo &&
|
||||
m_arith_value.get_up(len, hi, s_hi) && !s_hi && lo == hi) {
|
||||
val = lo;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool theory_nseq::lower_bound(expr* e, rational& lo) {
|
||||
|
|
@ -1603,8 +1773,60 @@ namespace smt {
|
|||
st.update("nseq axioms", s.m_num_axioms);
|
||||
}
|
||||
|
||||
// Find the most constrained regex for a given string expression
|
||||
// by scanning memberships that apply to e or its equivalence class root.
|
||||
expr* theory_nseq::find_regex_for(expr* e) {
|
||||
if (!ctx.e_internalized(e))
|
||||
return nullptr;
|
||||
enode* root = ctx.get_enode(e)->get_root();
|
||||
for (auto const& mem : m_state.mems()) {
|
||||
if (!mem.sign()) continue;
|
||||
expr* ms = mem.str();
|
||||
if (!ctx.e_internalized(ms)) continue;
|
||||
if (ctx.get_enode(ms)->get_root() == root)
|
||||
return mem.regex();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Find any positive membership for a string expression whose base
|
||||
// is `base` and which starts at position `pos`. Returns the regex
|
||||
// and sets `cover_len` to how many characters it covers.
|
||||
// Used to synthesize sub-range values during model generation.
|
||||
expr* theory_nseq::find_substr_regex(expr* base, unsigned pos,
|
||||
unsigned total_len, unsigned& cover_len) {
|
||||
if (!ctx.e_internalized(base))
|
||||
return nullptr;
|
||||
enode* base_root = ctx.get_enode(base)->get_root();
|
||||
for (auto const& mem : m_state.mems()) {
|
||||
if (!mem.sign()) continue;
|
||||
expr* ms = mem.str();
|
||||
if (!ctx.e_internalized(ms)) continue;
|
||||
// Check if ms is str.substr(base, pos, k) for some k
|
||||
expr* s = nullptr, *off = nullptr, *len_arg = nullptr;
|
||||
if (!m_util.str.is_extract(ms, s, off, len_arg)) continue;
|
||||
if (!ctx.e_internalized(s)) continue;
|
||||
if (ctx.get_enode(s)->get_root() != base_root) continue;
|
||||
// Check offset matches
|
||||
rational off_v;
|
||||
if (!m_autil.is_numeral(off, off_v) || !off_v.is_nonneg() ||
|
||||
!off_v.is_unsigned() || off_v.get_unsigned() != pos) continue;
|
||||
// Get the length (may be symbolic, use get_length)
|
||||
rational len_v;
|
||||
if (m_autil.is_numeral(len_arg, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) {
|
||||
cover_len = len_v.get_unsigned();
|
||||
return mem.regex();
|
||||
}
|
||||
if (get_length(ms, len_v) && len_v.is_nonneg() && len_v.is_unsigned()) {
|
||||
cover_len = len_v.get_unsigned();
|
||||
return mem.regex();
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Model generation (stub)
|
||||
// Model generation
|
||||
// -------------------------------------------------------
|
||||
|
||||
model_value_proc* theory_nseq::mk_value(enode* n, model_generator& mg) {
|
||||
|
|
@ -1636,11 +1858,57 @@ namespace smt {
|
|||
curr = curr->get_next();
|
||||
} while (curr != root);
|
||||
|
||||
// Try to construct a string of the correct length
|
||||
// Try to construct a string satisfying any regex membership constraint
|
||||
rational len_val;
|
||||
if (get_length(e, len_val) && len_val.is_nonneg() && len_val.is_unsigned()) {
|
||||
unsigned len = len_val.get_unsigned();
|
||||
// Check if there's a direct regex constraint for this string
|
||||
expr* regex = find_regex_for(e);
|
||||
if (regex) {
|
||||
zstring synth;
|
||||
if (synthesize_regex_string(regex, len, synth)) {
|
||||
expr_ref val(m_util.str.mk_string(synth), m);
|
||||
m_fresh_values.push_back(val);
|
||||
return alloc(expr_wrapper_proc, to_app(val));
|
||||
}
|
||||
}
|
||||
// Try to synthesize character by character from str.at(e, i) memberships.
|
||||
// Preprocessors often split x ∈ r into (str.at x 0) ∈ r0, (str.substr x 1 k) ∈ r1, etc.
|
||||
zstring s;
|
||||
bool from_chars = true;
|
||||
unsigned pos = 0;
|
||||
while (pos < len && from_chars) {
|
||||
// Option A: there's a membership for str.at(e, pos) (length-1 char)
|
||||
expr_ref at_i(m_util.str.mk_at(e, m_autil.mk_int(pos)), m);
|
||||
expr* r = find_regex_for(at_i);
|
||||
if (r) {
|
||||
zstring ch;
|
||||
if (synthesize_regex_string(r, 1, ch)) {
|
||||
s = s + ch;
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Option B: find a membership for str.substr(e, pos, k) and synthesize k chars
|
||||
unsigned cover_len = 0;
|
||||
expr* rs = find_substr_regex(e, pos, len, cover_len);
|
||||
if (rs && cover_len > 0) {
|
||||
zstring piece;
|
||||
if (synthesize_regex_string(rs, cover_len, piece)) {
|
||||
s = s + piece;
|
||||
pos += cover_len;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
from_chars = false;
|
||||
}
|
||||
if (from_chars && s.length() == len) {
|
||||
expr_ref val(m_util.str.mk_string(s), m);
|
||||
m_fresh_values.push_back(val);
|
||||
return alloc(expr_wrapper_proc, to_app(val));
|
||||
}
|
||||
// No regex constraint (or synthesis failed): use default fill
|
||||
s = zstring();
|
||||
for (unsigned i = 0; i < len; i++)
|
||||
s = s + zstring("A");
|
||||
expr_ref val(m_util.str.mk_string(s), m);
|
||||
|
|
|
|||
|
|
@ -115,8 +115,14 @@ namespace smt {
|
|||
// Regex membership
|
||||
bool check_regex_memberships();
|
||||
bool check_regex_mem(nseq_mem const& mem);
|
||||
bool add_regex_length_axioms(nseq_mem const& mem);
|
||||
bool synthesize_regex_string(expr* regex, unsigned len, zstring& result);
|
||||
bool dfs_regex(expr* r, unsigned target_len, unsigned cur_len, zstring& result);
|
||||
bool is_ground_string(expr* e, zstring& s);
|
||||
expr_ref derive_regex(expr* regex, zstring const& prefix);
|
||||
bool all_mems_checked();
|
||||
expr* find_regex_for(expr* e);
|
||||
expr* find_substr_regex(expr* base, unsigned pos, unsigned total_len, unsigned& cover_len);
|
||||
|
||||
// String operation reductions
|
||||
void reduce_op(expr* e);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue