From 14d24e2304dd4e2a070f9f1ae3a986bd1f4d273d Mon Sep 17 00:00:00 2001 From: Nikolaj Bjorner Date: Mon, 29 Jun 2026 12:55:34 -0700 Subject: [PATCH] add verdicts Signed-off-by: Nikolaj Bjorner --- src/cmd_context/tptp_frontend.cpp | 94 +++++++++++++++++++++++++++++-- src/test/main.cpp | 1 + src/test/tptp.cpp | 47 ++++++++++++++++ 3 files changed, 138 insertions(+), 4 deletions(-) diff --git a/src/cmd_context/tptp_frontend.cpp b/src/cmd_context/tptp_frontend.cpp index 6c8258ad75..242c1558f9 100644 --- a/src/cmd_context/tptp_frontend.cpp +++ b/src/cmd_context/tptp_frontend.cpp @@ -291,6 +291,7 @@ class tptp_parser { sort* m_univ; bool m_has_conjecture = false; bool m_last_name_quoted = false; + std::string m_expected_status; // SZS status from the input annotation, if any std::unordered_map m_sorts; sort_ref_vector m_pinned_sorts; // prevents cached sorts from being freed std::unordered_map m_decls; @@ -2135,6 +2136,7 @@ public: std::ostringstream buf; buf << in.rdbuf(); m_input = buf.str(); + extract_expected_status(m_input); m_lex = std::make_unique(m_input); next(); parse_toplevel(current_file); @@ -2161,6 +2163,61 @@ public: } bool has_conjecture() const { return m_has_conjecture; } + + std::string const& expected_status() const { return m_expected_status; } + + // Scan TPTP comments for an SZS/Status annotation, e.g. + // % Status : Unsatisfiable + // % SZS status Theorem + // Only the first annotation found (the top-level file's) is recorded. + void extract_expected_status(std::string const& text) { + if (!m_expected_status.empty()) + return; + std::istringstream in(text); + std::string line; + while (std::getline(in, line)) { + // TPTP comment lines start with '%'. + size_t i = line.find_first_not_of(" \t"); + if (i == std::string::npos || line[i] != '%') + continue; + ++i; // skip '%' + // Skip leading '%' and spaces. + i = line.find_first_not_of("% \t", i); + if (i == std::string::npos) + continue; + std::string rest = line.substr(i); + std::string status; + // Form 1: "SZS status " + if (rest.compare(0, 4, "SZS ") == 0) { + size_t p = rest.find("status"); + if (p == std::string::npos) + continue; + p += 6; // length of "status" + status = next_status_word(rest, p); + } + // Form 2: "Status : " / "Status : " + else if (rest.compare(0, 6, "Status") == 0) { + size_t p = rest.find(':', 6); + if (p == std::string::npos) + continue; + status = next_status_word(rest, p + 1); + } + if (!status.empty()) { + m_expected_status = status; + return; + } + } + } + + static std::string next_status_word(std::string const& s, size_t p) { + size_t a = s.find_first_not_of(" \t", p); + if (a == std::string::npos) + return ""; + size_t b = a; + while (b < s.size() && (isalnum((unsigned char)s[b]) || s[b] == '_')) + ++b; + return s.substr(a, b - a); + } }; expr_ref tptp_parser::parse_term() { @@ -2190,6 +2247,35 @@ expr_ref tptp_parser::parse_formula() { } +// Classify an SZS status into the coarse verdict used for cross-checking. +// unsat: a refutation/proof exists (Theorem, Unsatisfiable, ContradictoryAxioms, ...) +// sat: a model exists (Satisfiable, CounterSatisfiable, ...) +// other: no comparable verdict (Open, Unknown, Timeout, GaveUp, empty, ...) +enum class szs_verdict { unsat, sat, other }; + +static szs_verdict classify_szs(std::string const& s) { + if (s == "Theorem" || s == "Unsatisfiable" || s == "ContradictoryAxioms" || s == "Unsat") + return szs_verdict::unsat; + if (s == "Satisfiable" || s == "CounterSatisfiable" || s == "CounterTheorem" || s == "Sat") + return szs_verdict::sat; + return szs_verdict::other; +} + +// Emit the SZS status produced by z3. If the input carries an annotated status +// that contradicts the produced verdict, prepend "BUG" to flag the mismatch. +static void report_szs_status(char const* produced, std::string const& expected) { + szs_verdict pv = classify_szs(produced); + szs_verdict ev = classify_szs(expected); + bool is_bug = !expected.empty() && + (pv == szs_verdict::unsat || pv == szs_verdict::sat) && + (ev == szs_verdict::unsat || ev == szs_verdict::sat) && + pv != ev; + if (is_bug) + std::cout << "% SZS status BUG " << produced << " (expected " << expected << ")\n"; + else + std::cout << "% SZS status " << produced << "\n"; +} + static unsigned read_tptp_stream(std::istream& in, char const* current_file) { register_on_timeout_proc(on_timeout); try { @@ -2206,12 +2292,12 @@ static unsigned read_tptp_stream(std::istream& in, char const* current_file) { ctx.check_sat(0, nullptr); switch (ctx.cs_state()) { case cmd_context::css_unsat: - if (p.has_conjecture()) std::cout << "% SZS status Theorem\n"; - else std::cout << "% SZS status Unsatisfiable\n"; + if (p.has_conjecture()) report_szs_status("Theorem", p.expected_status()); + else report_szs_status("Unsatisfiable", p.expected_status()); break; case cmd_context::css_sat: - if (p.has_conjecture()) std::cout << "% SZS status CounterSatisfiable\n"; - else std::cout << "% SZS status Satisfiable\n"; + if (p.has_conjecture()) report_szs_status("CounterSatisfiable", p.expected_status()); + else report_szs_status("Satisfiable", p.expected_status()); if (g_display_model) { model_ref mdl; if (ctx.is_model_available(mdl)) diff --git a/src/test/main.cpp b/src/test/main.cpp index 453bf5eb55..8b4d9e760b 100644 --- a/src/test/main.cpp +++ b/src/test/main.cpp @@ -140,6 +140,7 @@ X(zstring) #define FOR_EACH_EXTRA_TEST(X, X_ARGV) \ + X(tptp) \ X(ext_numeral) \ X(interval) \ X(value_generator) \ diff --git a/src/test/tptp.cpp b/src/test/tptp.cpp index a96f31568a..6242aee092 100644 --- a/src/test/tptp.cpp +++ b/src/test/tptp.cpp @@ -123,4 +123,51 @@ R"(tff(c1,conjecture, $let(a: $int, a := 5, $let(b: $int, b := 3, $less(b,a)))). unsigned code = run_tptp("tff(c1,conjecture, $less(1/0,1)).", out, err); ENSURE(code == ERR_PARSER); ENSURE(err.find("denominator of rational literal cannot be zero") != std::string::npos); + + // SZS status cross-checking against the annotated input status. + + // Matching annotation: no BUG flag. + { + std::string o = run_tptp( +R"(% Status : Unsatisfiable +cnf(c1,axiom, p(X)). +cnf(c2,axiom, ~ p(a)).)"); + ENSURE(o.find("% SZS status Unsatisfiable") != std::string::npos); + ENSURE(o.find("BUG") == std::string::npos); + } + + // Contradicting annotation (says Satisfiable, z3 finds Unsatisfiable): BUG. + { + std::string o = run_tptp( +R"(% SZS status Satisfiable +cnf(c1,axiom, p(X)). +cnf(c2,axiom, ~ p(a)).)"); + ENSURE(o.find("BUG") != std::string::npos); + ENSURE(o.find("expected Satisfiable") != std::string::npos); + } + + // Contradicting annotation (says Unsatisfiable, z3 finds Satisfiable): BUG. + { + std::string o = run_tptp( +R"(% Status : Unsatisfiable +fof(a1,axiom, p(a)).)"); + ENSURE(o.find("BUG") != std::string::npos); + } + + // Theorem annotation matches z3's Theorem verdict for conjectures: no BUG. + { + std::string o = run_tptp( +R"(% SZS status Theorem +fof(a1,axiom, ! [X] : (human(X) => mortal(X))). +fof(a2,axiom, human(socrates)). +fof(c1,conjecture, mortal(socrates)).)"); + ENSURE(o.find("% SZS status Theorem") != std::string::npos); + ENSURE(o.find("BUG") == std::string::npos); + } + + // Unannotated input: nothing to check, no BUG. + { + std::string o = run_tptp("fof(a1,axiom, p(a))."); + ENSURE(o.find("BUG") == std::string::npos); + } }