diff --git a/src/smt/smt_parallel.cpp b/src/smt/smt_parallel.cpp index d74b71df2..07a29d777 100644 --- a/src/smt/smt_parallel.cpp +++ b/src/smt/smt_parallel.cpp @@ -111,11 +111,11 @@ namespace smt { } void parallel::worker::run() { - search_tree::node *node = nullptr; + node_lease lease; expr_ref_vector cube(m); while (true) { - if (!b.get_cube(m_g2l, id, cube, node)) { + if (!b.get_cube(m_g2l, id, cube, lease)) { LOG_WORKER(1, " no more cubes\n"); return; } @@ -125,8 +125,18 @@ namespace smt { LOG_WORKER(1, " CUBE SIZE IN MAIN LOOP: " << cube.size() << "\n"); lbool r = check_cube(cube); + if (b.lease_canceled(lease)) { + LOG_WORKER(1, " abandoning canceled lease\n"); + b.abandon_lease(id, lease); + m.limit().reset_cancel(); + lease = {}; + continue; + } + if (!m.inc()) { - b.set_exception("context cancelled"); + if (b.is_batch_running()) { + b.set_exception("context cancelled"); + } return; } @@ -140,7 +150,8 @@ namespace smt { auto atom = get_split_atom(); if (!atom) goto check_cube_start; - b.try_split(m_l2g, id, node, atom, m_config.m_threads_max_conflicts); + b.try_split(m_l2g, id, lease, atom, m_config.m_threads_max_conflicts); + lease = {}; simplify(); break; } @@ -164,7 +175,8 @@ namespace smt { } LOG_WORKER(1, " found unsat cube\n"); - b.backtrack(m_l2g, unsat_core, node); + b.backtrack(m_l2g, id, unsat_core, lease); + lease = {}; if (m_config.m_share_conflicts) b.collect_clause(m_l2g, id, mk_not(mk_and(unsat_core))); @@ -314,18 +326,48 @@ namespace smt { m.limit().cancel(); } - void parallel::batch_manager::backtrack(ast_translation &l2g, expr_ref_vector const &core, - search_tree::node *node) { + void parallel::batch_manager::release_lease_unlocked(unsigned worker_id, node* n, unsigned epoch) { + if (worker_id >= m_worker_leases.size()) + return; + auto &lease = m_worker_leases[worker_id]; + if (!lease.node || lease.node != n || lease.epoch != epoch) + return; + m_search_tree.dec_active_workers(lease.node); + lease = {}; + } + + void 
parallel::batch_manager::cancel_closed_leases_unlocked(unsigned source_worker_id) { + for (unsigned worker_id = 0; worker_id < m_worker_leases.size(); ++worker_id) { + if (worker_id == source_worker_id) + continue; + auto const& lease = m_worker_leases[worker_id]; + if (!lease.node || !m_search_tree.is_lease_canceled(lease.node, lease.cancel_epoch)) + continue; + p.m_workers[worker_id]->cancel(); + } + } + + void parallel::batch_manager::backtrack(ast_translation &l2g, unsigned worker_id, expr_ref_vector const &core, + node_lease const &lease) { std::scoped_lock lock(mux); IF_VERBOSE(1, verbose_stream() << "Batch manager backtracking.\n"); if (m_state != state::is_running) return; vector<expr_ref> g_core; for (auto c : core) { - expr_ref g_c(l2g(c), m); g_core.push_back(expr_ref(l2g(c), m)); } - m_search_tree.backtrack(node, g_core); + release_lease_unlocked(worker_id, lease.node, lease.epoch); + + // only backtrack if the lease is still valid (i.e., the worker has not been given a new node to work on and the lease has not been canceled by another worker) + // this means we can potentially delay backtracking for some unsat cubes until another worker (whose lease is valid) determines it UNSAT. + // Empirically, this is better than aggressively backtracking on every unsat cube, for now. 
+ if (lease.node && + m_search_tree.is_lease_valid(lease.node, lease.epoch) && + !m_search_tree.is_lease_canceled(lease.node, lease.cancel_epoch)) + m_search_tree.backtrack(lease.node, g_core); + + cancel_closed_leases_unlocked(worker_id); IF_VERBOSE(1, m_search_tree.display(verbose_stream() << bounded_pp_exprs(core) << "\n");); if (m_search_tree.is_closed()) { @@ -338,8 +380,8 @@ namespace smt { } } - void parallel::batch_manager::try_split(ast_translation &l2g, unsigned source_worker_id, - search_tree::node *node, expr *atom, unsigned effort) { + void parallel::batch_manager::try_split(ast_translation &l2g, unsigned worker_id, + node_lease const &lease, expr *atom, unsigned effort) { std::scoped_lock lock(mux); expr_ref lit(m), nlit(m); lit = l2g(atom); @@ -348,15 +390,34 @@ namespace smt { if (m_state != state::is_running) return; - bool did_split = m_search_tree.try_split(node, lit, nlit, effort); + release_lease_unlocked(worker_id, lease.node, lease.epoch); + if (!lease.node || m_search_tree.is_lease_canceled(lease.node, lease.cancel_epoch)) + return; + + bool did_split = m_search_tree.try_split(lease.node, lit, nlit, effort, lease.epoch); if (did_split) { ++m_stats.m_num_cubes; - m_stats.m_max_cube_depth = std::max(m_stats.m_max_cube_depth, node->depth() + 1); + m_stats.m_max_cube_depth = std::max(m_stats.m_max_cube_depth, lease.node->depth() + 1); IF_VERBOSE(1, verbose_stream() << "Batch manager splitting on literal: " << mk_bounded_pp(lit, m, 3) << "\n"); } } + void parallel::batch_manager::abandon_lease(unsigned worker_id, node_lease const &lease) { + std::scoped_lock lock(mux); + release_lease_unlocked(worker_id, lease.node, lease.epoch); + } + + bool parallel::batch_manager::lease_canceled(node_lease const &lease) { + std::scoped_lock lock(mux); + return m_search_tree.is_lease_canceled(lease.node, lease.cancel_epoch); + } + + bool parallel::batch_manager::is_batch_running() { + std::scoped_lock lock(mux); + return m_state == state::is_running; + } + void 
parallel::batch_manager::collect_clause(ast_translation &l2g, unsigned source_worker_id, expr *clause) { std::scoped_lock lock(mux); expr *g_clause = l2g(clause); @@ -400,11 +461,14 @@ namespace smt { try { r = ctx->check(asms.size(), asms.data()); } catch (z3_error &err) { - b.set_exception(err.error_code()); + if (!m.limit().is_canceled()) + b.set_exception(err.error_code()); } catch (z3_exception &ex) { - b.set_exception(ex.what()); + if (!m.limit().is_canceled()) + b.set_exception(ex.what()); } catch (...) { - b.set_exception("unknown exception"); + if (!m.limit().is_canceled()) + b.set_exception("unknown exception"); } asms.shrink(asms.size() - cube.size()); LOG_WORKER(1, " DONE checking cube " << r << "\n";); @@ -501,9 +565,10 @@ namespace smt { } } - bool parallel::batch_manager::get_cube(ast_translation &g2l, unsigned id, expr_ref_vector &cube, node *&n) { + bool parallel::batch_manager::get_cube(ast_translation &g2l, unsigned id, expr_ref_vector &cube, node_lease &lease) { cube.reset(); - std::unique_lock lock(mux); + std::scoped_lock lock(mux); + if (m_search_tree.is_closed()) { IF_VERBOSE(1, verbose_stream() << "all done\n";); return false; @@ -512,11 +577,21 @@ namespace smt { IF_VERBOSE(1, verbose_stream() << "aborting get_cube\n";); return false; } - node *t = m_search_tree.activate_node(n); + + node *t = m_search_tree.activate_node(lease.node); + if (!t) return false; + IF_VERBOSE(1, m_search_tree.display(verbose_stream()); verbose_stream() << "\n";); - n = t; + + lease.node = t; + lease.epoch = t->epoch(); + lease.cancel_epoch = t->get_cancel_epoch(); + if (id >= m_worker_leases.size()) + m_worker_leases.resize(id + 1); + m_worker_leases[id] = lease; + while (t) { if (cube_config::literal_is_null(t->get_literal())) break; @@ -525,6 +600,7 @@ namespace smt { cube.push_back(std::move(lit)); t = t->parent(); } + return true; } @@ -532,6 +608,8 @@ namespace smt { m_state = state::is_running; m_search_tree.reset(); 
m_search_tree.set_effort_unit(initial_max_thread_conflicts); + m_worker_leases.reset(); + m_worker_leases.resize(p.num_threads); } void parallel::batch_manager::collect_statistics(::statistics &st) const { @@ -556,7 +634,6 @@ namespace smt { }; scoped_clear clear(*this); - m_batch_manager.initialize(); m_workers.reset(); smt_parallel_params pp(ctx.m_params); @@ -574,6 +651,8 @@ namespace smt { sl.push_child(&(m_sls_worker->limit())); } + m_batch_manager.initialize(); + // Launch threads vector threads; threads.resize(m_should_run_sls ? num_threads + 1 : num_threads); // +1 for sls worker diff --git a/src/smt/smt_parallel.h b/src/smt/smt_parallel.h index bddb10b63..93f3214e2 100644 --- a/src/smt/smt_parallel.h +++ b/src/smt/smt_parallel.h @@ -43,6 +43,20 @@ namespace smt { expr_ref clause; }; + struct node_lease { + search_tree::node* node = nullptr; + // Version counter for structural mutations of this node (e.g., split/close). + // Used to detect stale leases: if a worker's lease.epoch != node.epoch, + // the node has changed since it was acquired and must not be mutated. + unsigned epoch = 0; + + // Cancellation generation counter for this node/subtree. + // Incremented when the node is closed; used to signal that all + // workers holding leases on this node (or its descendants) + // must abandon work immediately. 
+ unsigned cancel_epoch = 0; + }; + class batch_manager { enum state { @@ -66,6 +80,7 @@ namespace smt { stats m_stats; using node = search_tree::node; search_tree::tree m_search_tree; + vector m_worker_leases; unsigned m_exception_code = 0; std::string m_exception_msg; @@ -92,6 +107,8 @@ namespace smt { } void init_parameters_state(); + void release_lease_unlocked(unsigned worker_id, node* n, unsigned epoch); + void cancel_closed_leases_unlocked(unsigned source_worker_id); public: batch_manager(ast_manager& m, parallel& p) : m(m), p(p), m_search_tree(expr_ref(m)) { } @@ -104,9 +121,12 @@ namespace smt { void set_exception(unsigned error_code); void collect_statistics(::statistics& st) const; - bool get_cube(ast_translation& g2l, unsigned id, expr_ref_vector& cube, node*& n); - void backtrack(ast_translation& l2g, expr_ref_vector const& core, node* n); - void try_split(ast_translation& l2g, unsigned id, node* n, expr* atom, unsigned effort); + bool get_cube(ast_translation& g2l, unsigned id, expr_ref_vector& cube, node_lease& lease); + void backtrack(ast_translation& l2g, unsigned worker_id, expr_ref_vector const& core, node_lease const& lease); + void try_split(ast_translation& l2g, unsigned worker_id, node_lease const& lease, expr* atom, unsigned effort); + void abandon_lease(unsigned worker_id, node_lease const& lease); + bool lease_canceled(node_lease const& lease); + bool is_batch_running(); void collect_clause(ast_translation& l2g, unsigned source_worker_id, expr* clause); expr_ref_vector return_shared_clauses(ast_translation& g2l, unsigned& worker_limit, unsigned worker_id); diff --git a/src/util/search_tree.h b/src/util/search_tree.h index bac6ce0d0..15f86eafa 100644 --- a/src/util/search_tree.h +++ b/src/util/search_tree.h @@ -48,6 +48,9 @@ namespace search_tree { vector m_core; unsigned m_num_activations = 0; unsigned m_effort_spent = 0; + unsigned m_active_workers = 0; + unsigned m_epoch = 0; + unsigned m_cancel_epoch = 0; public: node(literal const &l, 
node *parent) : m_literal(l), m_parent(parent), m_status(status::open) {} @@ -77,6 +80,7 @@ namespace search_tree { SASSERT(!m_right); m_left = alloc(node, a, this); m_right = alloc(node, b, this); + bump_epoch(); } node* left() const { return m_left; } @@ -137,6 +141,14 @@ namespace search_tree { void mark_new_activation() { set_status(status::active); ++m_num_activations; + ++m_active_workers; + } + void dec_active_workers() { + if (m_active_workers > 0) + --m_active_workers; + } + bool has_active_workers() const { + return m_active_workers > 0; } unsigned effort_spent() const { return m_effort_spent; @@ -144,6 +156,18 @@ namespace search_tree { void add_effort(unsigned effort) { m_effort_spent += effort; } + unsigned epoch() const { + return m_epoch; + } + void bump_epoch() { + ++m_epoch; + } + unsigned get_cancel_epoch() const { + return m_cancel_epoch; + } + void bump_cancel_epoch() { + ++m_cancel_epoch; + } }; template class tree { @@ -202,6 +226,8 @@ namespace search_tree { select_next_node(cur->right(), target_status, best); } + // Try to select an open node using the select_next_node policy + // If there are no open nodes, try to select an active node for portfolio solving node* activate_best_node() { candidate best; select_next_node(m_root.get(), status::open, best); @@ -233,7 +259,7 @@ namespace search_tree { unsigned count_active_nodes(node* cur) const { if (!cur || cur->get_status() == status::closed) return 0; - return (cur->get_status() == status::active ? 1 : 0) + + return (cur->has_active_workers() ? 
1 : 0) + count_active_nodes(cur->left()) + count_active_nodes(cur->right()); } @@ -251,8 +277,8 @@ namespace search_tree { find_shallowest_timed_out_leaf_depth(cur->right(), best_depth); } - bool should_split(node* n) { - if (!n || n->get_status() != status::active || !n->is_leaf()) + bool should_split(node* n, unsigned epoch) { + if (!n || n->epoch() != epoch || n->get_status() != status::active || !n->is_leaf()) return false; unsigned num_active_nodes = count_active_nodes(m_root.get()); @@ -344,6 +370,8 @@ namespace search_tree { void close(node *n, vector const &C) { if (!n || n->get_status() == status::closed) return; + n->bump_epoch(); + n->bump_cancel_epoch(); n->set_status(status::closed); n->set_core(C); close(n->left(), C); @@ -446,14 +474,16 @@ namespace search_tree { // On timeout, either expand the current leaf or reopen the node for a // later revisit, depending on the tree-expansion heuristic. - bool try_split(node *n, literal const &a, literal const &b, unsigned effort) { - if (!n || n->get_status() != status::active) + bool try_split(node *n, literal const &a, literal const &b, unsigned effort, unsigned epoch) { + // the node could have been marked open by another thread that finished first and split + // we still want to add the current thread's effort in this case, but not if the node was closed + if (!n || n->get_status() == status::closed) return false; n->add_effort(effort); bool did_split = false; - if (should_split(n)) { + if (should_split(n, epoch)) { n->split(a, b); did_split = true; } @@ -467,7 +497,8 @@ namespace search_tree { // // Waiting for all workers would introduce per-node synchronization, delay // diversification, and let a slow worker stall progress. - n->set_status(status::open); + if (n->epoch() == epoch) + n->set_status(status::open); return did_split; } @@ -475,6 +506,8 @@ namespace search_tree { // conflict is given by a set of literals. 
// they are subsets of the literals on the path from root to n AND the external assumption literals void backtrack(node *n, vector const &conflict) { + if (!n) + return; if (conflict.empty()) { close_with_core(m_root.get(), conflict); return; @@ -508,8 +541,6 @@ namespace search_tree { } // return an active node in the tree, or nullptr if there is none - // first check if there is a node to activate under n, - // if not, go up the tree and try to activate a sibling subtree node *activate_node(node *n) { if (!n) { if (m_root->get_status() == status::active) { @@ -520,6 +551,20 @@ namespace search_tree { return activate_best_node(); } + void dec_active_workers(node* n) { + if (!n) + return; + n->dec_active_workers(); + } + + bool is_lease_valid(node* n, unsigned epoch) const { + return n && n->get_status() != status::closed && n->epoch() == epoch; + } + + bool is_lease_canceled(node* n, unsigned cancel_epoch) const { + return !n || n->get_status() == status::closed || n->get_cancel_epoch() != cancel_epoch; + } + vector const &get_core_from_root() const { return m_root->get_core(); }