mirror of
https://github.com/Z3Prover/z3
synced 2026-06-22 16:40:29 +00:00
attempt to fix lease cancel/reslimit race condition in worker
This commit is contained in:
parent
41471bbc3e
commit
f86feab060
1 changed files with 88 additions and 45 deletions
|
|
@ -228,6 +228,12 @@ class parallel_solver {
|
|||
m_bb_cv.notify_all();
|
||||
}
|
||||
|
||||
void set_canceled_unlocked() {
|
||||
if (m_state != state::is_running)
|
||||
return;
|
||||
cancel_background_threads();
|
||||
}
|
||||
|
||||
void release_lease_unlocked(unsigned worker_id, search_tree::node<solver_cube_config>* n) {
|
||||
if (worker_id >= m_worker_leases.size()) return;
|
||||
auto& lease = m_worker_leases[worker_id];
|
||||
|
|
@ -236,6 +242,34 @@ class parallel_solver {
|
|||
lease = {};
|
||||
}
|
||||
|
||||
void release_worker_lease_unlocked(unsigned worker_id, node_lease& lease) {
|
||||
if (worker_id >= m_worker_leases.size()) {
|
||||
lease = {};
|
||||
return;
|
||||
}
|
||||
auto& stored_lease = m_worker_leases[worker_id];
|
||||
bool cancel_signaled = stored_lease.leased_node == lease.leased_node && stored_lease.cancel_signaled;
|
||||
release_lease_unlocked(worker_id, lease.leased_node);
|
||||
lease = {};
|
||||
if (cancel_signaled)
|
||||
p.m_workers[worker_id]->limit().dec_cancel();
|
||||
}
|
||||
|
||||
bool attempt_release_canceled_lease_unlocked(unsigned worker_id, node_lease& lease) {
|
||||
if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size())
|
||||
return false;
|
||||
|
||||
auto& stored_lease = m_worker_leases[worker_id];
|
||||
if (stored_lease.leased_node != lease.leased_node)
|
||||
return false;
|
||||
|
||||
if (!m_search_tree.is_lease_canceled(stored_lease.leased_node))
|
||||
return false;
|
||||
|
||||
release_worker_lease_unlocked(worker_id, lease);
|
||||
return true;
|
||||
}
|
||||
|
||||
void cancel_closed_leases_unlocked(unsigned source_worker_id) {
|
||||
unsigned n = std::min(m_worker_leases.size(), p.m_workers.size());
|
||||
for (unsigned id = 0; id < n; ++id) {
|
||||
|
|
@ -387,7 +421,7 @@ class parallel_solver {
|
|||
|
||||
void backtrack_unlocked(ast_translation& l2g, unsigned worker_id,
|
||||
expr_ref_vector const& core,
|
||||
node_lease const* lease = nullptr,
|
||||
node_lease* lease = nullptr,
|
||||
vector<node_lease> const* targets = nullptr) {
|
||||
if (m_state != state::is_running)
|
||||
return;
|
||||
|
|
@ -402,9 +436,13 @@ class parallel_solver {
|
|||
if (lease && !m_search_tree.is_lease_canceled(lease->leased_node)) {
|
||||
// we close/backtrack regardless of whether this lease is stale or not, as long as the lease isn't canceled
|
||||
// i.e. worker 1 splits this node, but then worker 2 determines UNSAT --> worker 2 is stale but we still close this node and backtrack
|
||||
auto* leased_node = lease->leased_node;
|
||||
did_backtrack = true;
|
||||
release_lease_unlocked(worker_id, lease->leased_node);
|
||||
m_search_tree.backtrack(lease->leased_node, g_core);
|
||||
release_worker_lease_unlocked(worker_id, *lease);
|
||||
m_search_tree.backtrack(leased_node, g_core);
|
||||
}
|
||||
else if (lease) {
|
||||
attempt_release_canceled_lease_unlocked(worker_id, *lease);
|
||||
}
|
||||
if (targets) {
|
||||
for (auto const& target : *targets) {
|
||||
|
|
@ -476,9 +514,7 @@ class parallel_solver {
|
|||
|
||||
void set_canceled() {
|
||||
std::scoped_lock lock(mux);
|
||||
if (m_state != state::is_running)
|
||||
return;
|
||||
cancel_background_threads();
|
||||
set_canceled_unlocked();
|
||||
}
|
||||
|
||||
void set_unsat(ast_translation& l2g,
|
||||
|
|
@ -539,7 +575,7 @@ class parallel_solver {
|
|||
|
||||
void backtrack(ast_translation& l2g, unsigned worker_id,
|
||||
expr_ref_vector const& core,
|
||||
node_lease const& lease) {
|
||||
node_lease& lease) {
|
||||
std::scoped_lock lock(mux);
|
||||
if (m_state != state::is_running) return;
|
||||
vector<solver_cube_config::literal> g_core;
|
||||
|
|
@ -683,28 +719,30 @@ class parallel_solver {
|
|||
}
|
||||
|
||||
void try_split(ast_translation& l2g, unsigned worker_id,
|
||||
node_lease const& lease,
|
||||
expr* atom, unsigned effort) {
|
||||
node_lease& lease, expr* atom, unsigned effort) {
|
||||
std::scoped_lock lock(mux);
|
||||
if (m_state != state::is_running) return;
|
||||
if (m_search_tree.is_lease_canceled(lease.leased_node)) return;
|
||||
if (m_search_tree.is_lease_canceled(lease.leased_node)) {
|
||||
attempt_release_canceled_lease_unlocked(worker_id, lease);
|
||||
return;
|
||||
}
|
||||
|
||||
expr_ref lit(m), nlit(m);
|
||||
auto* leased_node = lease.leased_node;
|
||||
lit = l2g(atom);
|
||||
nlit = mk_not(m, lit);
|
||||
VERIFY(!lease.leased_node->path_contains_atom(lit));
|
||||
VERIFY(!lease.leased_node->path_contains_atom(nlit));
|
||||
VERIFY(!leased_node->path_contains_atom(lit));
|
||||
VERIFY(!leased_node->path_contains_atom(nlit));
|
||||
|
||||
bool did_split = m_search_tree.try_split(
|
||||
lease.leased_node, lit, nlit, effort);
|
||||
bool did_split = m_search_tree.try_split(leased_node, lit, nlit, effort);
|
||||
|
||||
release_lease_unlocked(worker_id, lease.leased_node);
|
||||
release_worker_lease_unlocked(worker_id, lease);
|
||||
|
||||
if (did_split) {
|
||||
++m_stats.m_num_cubes;
|
||||
m_stats.m_max_cube_depth = std::max(
|
||||
m_stats.m_max_cube_depth,
|
||||
lease.leased_node->depth() + 1);
|
||||
leased_node->depth() + 1);
|
||||
IF_VERBOSE(1, verbose_stream() << "Batch manager splitting on literal: " << mk_bounded_pp(lit, m, 3) << "\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -715,22 +753,31 @@ class parallel_solver {
|
|||
release_lease_unlocked(worker_id, lease.leased_node);
|
||||
}
|
||||
|
||||
bool release_canceled_lease(unsigned worker_id, node_lease const& lease, bool& cancel_signaled) {
|
||||
bool attempt_release_canceled_lease(unsigned worker_id, node_lease& lease) {
|
||||
std::scoped_lock lock(mux);
|
||||
cancel_signaled = false;
|
||||
if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size())
|
||||
return false;
|
||||
return attempt_release_canceled_lease_unlocked(worker_id, lease);
|
||||
}
|
||||
|
||||
auto& stored_lease = m_worker_leases[worker_id];
|
||||
if (stored_lease.leased_node != lease.leased_node)
|
||||
return false;
|
||||
bool checkpoint_worker(unsigned worker_id, node_lease& lease, bool& lease_canceled) {
|
||||
std::scoped_lock lock(mux);
|
||||
lease_canceled = false;
|
||||
SASSERT(worker_id < p.m_workers.size());
|
||||
|
||||
if (!m_search_tree.is_lease_canceled(stored_lease.leased_node))
|
||||
return false;
|
||||
if (attempt_release_canceled_lease_unlocked(worker_id, lease)) {
|
||||
lease_canceled = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
cancel_signaled = stored_lease.cancel_signaled;
|
||||
release_lease_unlocked(worker_id, stored_lease.leased_node);
|
||||
return true;
|
||||
if (p.m_workers[worker_id]->limit().inc())
|
||||
return true;
|
||||
|
||||
if (attempt_release_canceled_lease_unlocked(worker_id, lease)) {
|
||||
lease_canceled = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
set_canceled_unlocked();
|
||||
return false;
|
||||
}
|
||||
|
||||
void collect_clause(ast_translation& l2g,
|
||||
|
|
@ -1088,7 +1135,7 @@ class parallel_solver {
|
|||
}
|
||||
}
|
||||
catch (...) {
|
||||
if (!m.inc())
|
||||
if (m.limit().is_canceled())
|
||||
return expr_ref(nullptr, m);
|
||||
throw;
|
||||
}
|
||||
|
|
@ -1116,7 +1163,7 @@ class parallel_solver {
|
|||
try {
|
||||
s->get_backbone_candidates(cands, s->get_num_bool_vars());
|
||||
} catch (z3_exception &ex) {
|
||||
if (!m.inc())
|
||||
if (m.limit().is_canceled())
|
||||
return result;
|
||||
throw;
|
||||
}
|
||||
|
|
@ -1174,26 +1221,24 @@ class parallel_solver {
|
|||
if (m_config.m_global_backbones) {
|
||||
bb_candidates local_candidates = find_backbone_candidates(cube);
|
||||
b.collect_backbone_candidates(m_l2g, local_candidates);
|
||||
bool lease_canceled = false;
|
||||
if (!b.checkpoint_worker(id, lease, lease_canceled))
|
||||
return;
|
||||
if (lease_canceled) {
|
||||
LOG_WORKER(1, " abandoning canceled lease\n");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
lbool r = check_cube(cube);
|
||||
|
||||
bool cancel_signaled = false;
|
||||
if (b.release_canceled_lease(id, lease, cancel_signaled)) {
|
||||
bool lease_canceled = false;
|
||||
if (!b.checkpoint_worker(id, lease, lease_canceled))
|
||||
return;
|
||||
if (lease_canceled) {
|
||||
LOG_WORKER(1, " abandoning canceled lease\n");
|
||||
lease = {};
|
||||
if (cancel_signaled)
|
||||
m.limit().dec_cancel();
|
||||
continue;
|
||||
}
|
||||
|
||||
// NSB - at this point it shouldn't be possible to call inc_cancel.
|
||||
// Is this ensured? I am not sure.
|
||||
if (!m.inc()) {
|
||||
b.set_canceled();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
switch (r) {
|
||||
|
||||
case l_undef: {
|
||||
|
|
@ -1205,7 +1250,6 @@ class parallel_solver {
|
|||
if (atom) {
|
||||
b.try_split(m_l2g, id, lease, atom.get(),
|
||||
m_config.m_threads_max_conflicts);
|
||||
lease = {};
|
||||
}
|
||||
else {
|
||||
goto check_cube_start;
|
||||
|
|
@ -1241,7 +1285,6 @@ class parallel_solver {
|
|||
|
||||
if (m_config.m_core_minimize)
|
||||
b.enqueue_core_minimization(m_l2g, source, unsat_core);
|
||||
lease = {};
|
||||
|
||||
if (m_config.m_share_conflicts)
|
||||
b.collect_clause(m_l2g, id, mk_not(mk_and(unsat_core)));
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue