3
0
Fork 0
mirror of https://github.com/Z3Prover/z3 synced 2026-06-22 16:40:29 +00:00

attempt to fix lease cancel/reslimit race condition in worker

This commit is contained in:
Ilana Shapiro 2026-06-20 16:58:11 -07:00
parent 41471bbc3e
commit f86feab060

View file

@ -228,6 +228,12 @@ class parallel_solver {
m_bb_cv.notify_all();
}
void set_canceled_unlocked() {
if (m_state != state::is_running)
return;
cancel_background_threads();
}
void release_lease_unlocked(unsigned worker_id, search_tree::node<solver_cube_config>* n) {
if (worker_id >= m_worker_leases.size()) return;
auto& lease = m_worker_leases[worker_id];
@ -236,6 +242,34 @@ class parallel_solver {
lease = {};
}
void release_worker_lease_unlocked(unsigned worker_id, node_lease& lease) {
if (worker_id >= m_worker_leases.size()) {
lease = {};
return;
}
auto& stored_lease = m_worker_leases[worker_id];
bool cancel_signaled = stored_lease.leased_node == lease.leased_node && stored_lease.cancel_signaled;
release_lease_unlocked(worker_id, lease.leased_node);
lease = {};
if (cancel_signaled)
p.m_workers[worker_id]->limit().dec_cancel();
}
bool attempt_release_canceled_lease_unlocked(unsigned worker_id, node_lease& lease) {
if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size())
return false;
auto& stored_lease = m_worker_leases[worker_id];
if (stored_lease.leased_node != lease.leased_node)
return false;
if (!m_search_tree.is_lease_canceled(stored_lease.leased_node))
return false;
release_worker_lease_unlocked(worker_id, lease);
return true;
}
void cancel_closed_leases_unlocked(unsigned source_worker_id) {
unsigned n = std::min(m_worker_leases.size(), p.m_workers.size());
for (unsigned id = 0; id < n; ++id) {
@ -387,7 +421,7 @@ class parallel_solver {
void backtrack_unlocked(ast_translation& l2g, unsigned worker_id,
expr_ref_vector const& core,
node_lease const* lease = nullptr,
node_lease* lease = nullptr,
vector<node_lease> const* targets = nullptr) {
if (m_state != state::is_running)
return;
@ -402,9 +436,13 @@ class parallel_solver {
if (lease && !m_search_tree.is_lease_canceled(lease->leased_node)) {
// we close/backtrack regardless of whether this lease is stale or not, as long as the lease isn't canceled
// i.e. worker 1 splits this node, but then worker 2 determines UNSAT --> worker 2 is stale but we still close this node and backtrack
auto* leased_node = lease->leased_node;
did_backtrack = true;
release_lease_unlocked(worker_id, lease->leased_node);
m_search_tree.backtrack(lease->leased_node, g_core);
release_worker_lease_unlocked(worker_id, *lease);
m_search_tree.backtrack(leased_node, g_core);
}
else if (lease) {
attempt_release_canceled_lease_unlocked(worker_id, *lease);
}
if (targets) {
for (auto const& target : *targets) {
@ -476,9 +514,7 @@ class parallel_solver {
void set_canceled() {
std::scoped_lock lock(mux);
if (m_state != state::is_running)
return;
cancel_background_threads();
set_canceled_unlocked();
}
void set_unsat(ast_translation& l2g,
@ -539,7 +575,7 @@ class parallel_solver {
void backtrack(ast_translation& l2g, unsigned worker_id,
expr_ref_vector const& core,
node_lease const& lease) {
node_lease& lease) {
std::scoped_lock lock(mux);
if (m_state != state::is_running) return;
vector<solver_cube_config::literal> g_core;
@ -683,28 +719,30 @@ class parallel_solver {
}
void try_split(ast_translation& l2g, unsigned worker_id,
node_lease const& lease,
expr* atom, unsigned effort) {
node_lease& lease, expr* atom, unsigned effort) {
std::scoped_lock lock(mux);
if (m_state != state::is_running) return;
if (m_search_tree.is_lease_canceled(lease.leased_node)) return;
if (m_search_tree.is_lease_canceled(lease.leased_node)) {
attempt_release_canceled_lease_unlocked(worker_id, lease);
return;
}
expr_ref lit(m), nlit(m);
auto* leased_node = lease.leased_node;
lit = l2g(atom);
nlit = mk_not(m, lit);
VERIFY(!lease.leased_node->path_contains_atom(lit));
VERIFY(!lease.leased_node->path_contains_atom(nlit));
VERIFY(!leased_node->path_contains_atom(lit));
VERIFY(!leased_node->path_contains_atom(nlit));
bool did_split = m_search_tree.try_split(
lease.leased_node, lit, nlit, effort);
bool did_split = m_search_tree.try_split(leased_node, lit, nlit, effort);
release_lease_unlocked(worker_id, lease.leased_node);
release_worker_lease_unlocked(worker_id, lease);
if (did_split) {
++m_stats.m_num_cubes;
m_stats.m_max_cube_depth = std::max(
m_stats.m_max_cube_depth,
lease.leased_node->depth() + 1);
leased_node->depth() + 1);
IF_VERBOSE(1, verbose_stream() << "Batch manager splitting on literal: " << mk_bounded_pp(lit, m, 3) << "\n");
}
}
@ -715,22 +753,31 @@ class parallel_solver {
release_lease_unlocked(worker_id, lease.leased_node);
}
bool release_canceled_lease(unsigned worker_id, node_lease const& lease, bool& cancel_signaled) {
bool attempt_release_canceled_lease(unsigned worker_id, node_lease& lease) {
std::scoped_lock lock(mux);
cancel_signaled = false;
if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size())
return false;
return attempt_release_canceled_lease_unlocked(worker_id, lease);
}
auto& stored_lease = m_worker_leases[worker_id];
if (stored_lease.leased_node != lease.leased_node)
return false;
bool checkpoint_worker(unsigned worker_id, node_lease& lease, bool& lease_canceled) {
std::scoped_lock lock(mux);
lease_canceled = false;
SASSERT(worker_id < p.m_workers.size());
if (!m_search_tree.is_lease_canceled(stored_lease.leased_node))
return false;
if (attempt_release_canceled_lease_unlocked(worker_id, lease)) {
lease_canceled = true;
return true;
}
cancel_signaled = stored_lease.cancel_signaled;
release_lease_unlocked(worker_id, stored_lease.leased_node);
return true;
if (p.m_workers[worker_id]->limit().inc())
return true;
if (attempt_release_canceled_lease_unlocked(worker_id, lease)) {
lease_canceled = true;
return true;
}
set_canceled_unlocked();
return false;
}
void collect_clause(ast_translation& l2g,
@ -1088,7 +1135,7 @@ class parallel_solver {
}
}
catch (...) {
if (!m.inc())
if (m.limit().is_canceled())
return expr_ref(nullptr, m);
throw;
}
@ -1116,7 +1163,7 @@ class parallel_solver {
try {
s->get_backbone_candidates(cands, s->get_num_bool_vars());
} catch (z3_exception &ex) {
if (!m.inc())
if (m.limit().is_canceled())
return result;
throw;
}
@ -1174,26 +1221,24 @@ class parallel_solver {
if (m_config.m_global_backbones) {
bb_candidates local_candidates = find_backbone_candidates(cube);
b.collect_backbone_candidates(m_l2g, local_candidates);
bool lease_canceled = false;
if (!b.checkpoint_worker(id, lease, lease_canceled))
return;
if (lease_canceled) {
LOG_WORKER(1, " abandoning canceled lease\n");
continue;
}
}
lbool r = check_cube(cube);
bool cancel_signaled = false;
if (b.release_canceled_lease(id, lease, cancel_signaled)) {
bool lease_canceled = false;
if (!b.checkpoint_worker(id, lease, lease_canceled))
return;
if (lease_canceled) {
LOG_WORKER(1, " abandoning canceled lease\n");
lease = {};
if (cancel_signaled)
m.limit().dec_cancel();
continue;
}
// NSB - at this point it shouldn't be possible to call inc_cancel.
// Is this ensured? I am not sure.
if (!m.inc()) {
b.set_canceled();
return;
}
switch (r) {
case l_undef: {
@ -1205,7 +1250,6 @@ class parallel_solver {
if (atom) {
b.try_split(m_l2g, id, lease, atom.get(),
m_config.m_threads_max_conflicts);
lease = {};
}
else {
goto check_cube_start;
@ -1241,7 +1285,6 @@ class parallel_solver {
if (m_config.m_core_minimize)
b.enqueue_core_minimization(m_l2g, source, unsat_core);
lease = {};
if (m_config.m_share_conflicts)
b.collect_clause(m_l2g, id, mk_not(mk_and(unsat_core)));