From f86feab060733e1b646aea969a551951de4c4c9e Mon Sep 17 00:00:00 2001 From: Ilana Shapiro Date: Sat, 20 Jun 2026 16:58:11 -0700 Subject: [PATCH] attempt to fix lease cancel/reslimit race condition in worker --- src/solver/parallel_tactical2.cpp | 133 ++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/src/solver/parallel_tactical2.cpp b/src/solver/parallel_tactical2.cpp index 262f2d96a..9ea707ec6 100644 --- a/src/solver/parallel_tactical2.cpp +++ b/src/solver/parallel_tactical2.cpp @@ -228,6 +228,12 @@ class parallel_solver { m_bb_cv.notify_all(); } + void set_canceled_unlocked() { + if (m_state != state::is_running) + return; + cancel_background_threads(); + } + void release_lease_unlocked(unsigned worker_id, search_tree::node* n) { if (worker_id >= m_worker_leases.size()) return; auto& lease = m_worker_leases[worker_id]; @@ -236,6 +242,34 @@ class parallel_solver { lease = {}; } + void release_worker_lease_unlocked(unsigned worker_id, node_lease& lease) { + if (worker_id >= m_worker_leases.size()) { + lease = {}; + return; + } + auto& stored_lease = m_worker_leases[worker_id]; + bool cancel_signaled = stored_lease.leased_node == lease.leased_node && stored_lease.cancel_signaled; + release_lease_unlocked(worker_id, lease.leased_node); + lease = {}; + if (cancel_signaled) + p.m_workers[worker_id]->limit().dec_cancel(); + } + + bool attempt_release_canceled_lease_unlocked(unsigned worker_id, node_lease& lease) { + if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size()) + return false; + + auto& stored_lease = m_worker_leases[worker_id]; + if (stored_lease.leased_node != lease.leased_node) + return false; + + if (!m_search_tree.is_lease_canceled(stored_lease.leased_node)) + return false; + + release_worker_lease_unlocked(worker_id, lease); + return true; + } + void cancel_closed_leases_unlocked(unsigned source_worker_id) { unsigned n = std::min(m_worker_leases.size(), p.m_workers.size()); for (unsigned id = 0; id < n; ++id) { @@ -387,7 +421,7 @@ class parallel_solver { void backtrack_unlocked(ast_translation& l2g, unsigned worker_id, expr_ref_vector const& core, - node_lease const* lease = nullptr, + node_lease* lease = nullptr, vector const* targets = nullptr) { if (m_state != state::is_running) return; @@ -402,9 +436,13 @@ class parallel_solver { if (lease && !m_search_tree.is_lease_canceled(lease->leased_node)) { // we close/backtrack regardless of whether this lease is stale or not, as long as the lease isn't canceled // i.e. worker 1 splits this node, but then worker 2 determines UNSAT --> worker 2 is stale but we still close this node and backtrack + auto* leased_node = lease->leased_node; did_backtrack = true; - release_lease_unlocked(worker_id, lease->leased_node); - m_search_tree.backtrack(lease->leased_node, g_core); + release_worker_lease_unlocked(worker_id, *lease); + m_search_tree.backtrack(leased_node, g_core); + } + else if (lease) { + attempt_release_canceled_lease_unlocked(worker_id, *lease); } if (targets) { for (auto const& target : *targets) { @@ -476,9 +514,7 @@ class parallel_solver { void set_canceled() { std::scoped_lock lock(mux); - if (m_state != state::is_running) - return; - cancel_background_threads(); + set_canceled_unlocked(); } void set_unsat(ast_translation& l2g, @@ -539,7 +575,7 @@ class parallel_solver { void backtrack(ast_translation& l2g, unsigned worker_id, expr_ref_vector const& core, - node_lease const& lease) { + node_lease& lease) { std::scoped_lock lock(mux); if (m_state != state::is_running) return; vector g_core; @@ -683,28 +719,30 @@ class parallel_solver { } void try_split(ast_translation& l2g, unsigned worker_id, - node_lease const& lease, - expr* atom, unsigned effort) { + node_lease& lease, expr* atom, unsigned effort) { std::scoped_lock lock(mux); if (m_state != state::is_running) return; - if (m_search_tree.is_lease_canceled(lease.leased_node)) return; + if (m_search_tree.is_lease_canceled(lease.leased_node)) { + attempt_release_canceled_lease_unlocked(worker_id, lease); + return; + } expr_ref lit(m), nlit(m); + auto* leased_node = lease.leased_node; lit = l2g(atom); nlit = mk_not(m, lit); - VERIFY(!lease.leased_node->path_contains_atom(lit)); - VERIFY(!lease.leased_node->path_contains_atom(nlit)); + VERIFY(!leased_node->path_contains_atom(lit)); + VERIFY(!leased_node->path_contains_atom(nlit)); - bool did_split = m_search_tree.try_split( - lease.leased_node, lit, nlit, effort); + bool did_split = m_search_tree.try_split(leased_node, lit, nlit, effort); - release_lease_unlocked(worker_id, lease.leased_node); + release_worker_lease_unlocked(worker_id, lease); if (did_split) { ++m_stats.m_num_cubes; m_stats.m_max_cube_depth = std::max( m_stats.m_max_cube_depth, - lease.leased_node->depth() + 1); + leased_node->depth() + 1); IF_VERBOSE(1, verbose_stream() << "Batch manager splitting on literal: " << mk_bounded_pp(lit, m, 3) << "\n"); } } @@ -715,22 +753,31 @@ class parallel_solver { release_lease_unlocked(worker_id, lease.leased_node); } - bool release_canceled_lease(unsigned worker_id, node_lease const& lease, bool& cancel_signaled) { + bool attempt_release_canceled_lease(unsigned worker_id, node_lease& lease) { std::scoped_lock lock(mux); - cancel_signaled = false; - if (m_state != state::is_running || !lease.leased_node || worker_id >= m_worker_leases.size()) - return false; + return attempt_release_canceled_lease_unlocked(worker_id, lease); + } - auto& stored_lease = m_worker_leases[worker_id]; - if (stored_lease.leased_node != lease.leased_node) - return false; + bool checkpoint_worker(unsigned worker_id, node_lease& lease, bool& lease_canceled) { + std::scoped_lock lock(mux); + lease_canceled = false; + SASSERT(worker_id < p.m_workers.size()); - if (!m_search_tree.is_lease_canceled(stored_lease.leased_node)) - return false; + if (attempt_release_canceled_lease_unlocked(worker_id, lease)) { + lease_canceled = true; + return true; + } - cancel_signaled = stored_lease.cancel_signaled; - release_lease_unlocked(worker_id, stored_lease.leased_node); - return true; + if (p.m_workers[worker_id]->limit().inc()) + return true; + + if (attempt_release_canceled_lease_unlocked(worker_id, lease)) { + lease_canceled = true; + return true; + } + + set_canceled_unlocked(); + return false; } void collect_clause(ast_translation& l2g, @@ -1088,7 +1135,7 @@ class parallel_solver { } } catch (...) { - if (!m.inc()) + if (m.limit().is_canceled()) return expr_ref(nullptr, m); throw; } @@ -1116,7 +1163,7 @@ class parallel_solver { try { s->get_backbone_candidates(cands, s->get_num_bool_vars()); } catch (z3_exception &ex) { - if (!m.inc()) + if (m.limit().is_canceled()) return result; throw; } @@ -1174,26 +1221,24 @@ class parallel_solver { if (m_config.m_global_backbones) { bb_candidates local_candidates = find_backbone_candidates(cube); b.collect_backbone_candidates(m_l2g, local_candidates); + bool lease_canceled = false; + if (!b.checkpoint_worker(id, lease, lease_canceled)) + return; + if (lease_canceled) { + LOG_WORKER(1, " abandoning canceled lease\n"); + continue; + } } lbool r = check_cube(cube); - bool cancel_signaled = false; - if (b.release_canceled_lease(id, lease, cancel_signaled)) { + bool lease_canceled = false; + if (!b.checkpoint_worker(id, lease, lease_canceled)) + return; + if (lease_canceled) { LOG_WORKER(1, " abandoning canceled lease\n"); - lease = {}; - if (cancel_signaled) - m.limit().dec_cancel(); continue; } - // NSB - at this point it shouldn't be possible to call inc_cancel. - // Is this ensured? I am not sure. - if (!m.inc()) { - b.set_canceled(); - return; - } - - switch (r) { case l_undef: { @@ -1205,7 +1250,6 @@ class parallel_solver { if (atom) { b.try_split(m_l2g, id, lease, atom.get(), m_config.m_threads_max_conflicts); - lease = {}; } else { goto check_cube_start; @@ -1241,7 +1285,6 @@ class parallel_solver { if (m_config.m_core_minimize) b.enqueue_core_minimization(m_l2g, source, unsat_core); - lease = {}; if (m_config.m_share_conflicts) b.collect_clause(m_l2g, id, mk_not(mk_and(unsat_core)));