From df0abab7884ecfff887fc0b5ed0c53d1d3871c4e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 13 May 2026 09:29:35 +0000
Subject: [PATCH] Add TPTP benchmark workflow (tptp-benchmark.md + lock file)

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/834244bd-f780-46e8-b66a-d0e8c88f8dc8

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
---
 .github/workflows/tptp-benchmark.lock.yml | 1317 +++++++++++++++++++++
 .github/workflows/tptp-benchmark.md       |  554 +++++++++
 2 files changed, 1871 insertions(+)
 create mode 100644 .github/workflows/tptp-benchmark.lock.yml
 create mode 100644 .github/workflows/tptp-benchmark.md

diff --git a/.github/workflows/tptp-benchmark.lock.yml b/.github/workflows/tptp-benchmark.lock.yml
new file mode 100644
index 000000000..7e682c2fb
--- /dev/null
+++ b/.github/workflows/tptp-benchmark.lock.yml
@@ -0,0 +1,1317 @@
+# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"05a6ebcfc669740744c3887e595339f183e37872d3ebbaebe34513c2a8016af8","compiler_version":"v0.72.1","strict":true,"agent_id":"copilot"}
+# gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"v0.72.1","version":"v0.72.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.41"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.6","digest":"sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c"},{"image":"ghcr.io/github/github-mcp-server:v1.0.3","digest":"sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959","pinned_image":"ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]}
+#    ___                   _   _      
+#   / _ \                 | | (_)     
+#  | |_| | __ _  ___ _ __ | |_ _  ___ 
+#  |  _  |/ _` |/ _ \ '_ \| __| |/ __|
+#  | | | | (_| |  __/ | | | |_| | (__ 
+#  \_| |_/\__, |\___|_| |_|\__|_|\___|
+#          __/ |
+#  _    _ |___/ 
+# | |  | |                / _| |
+# | |  | | ___ _ __ _  __| |_| | _____      ____
+# | |/\| |/ _ \ '__| |/ /|  _| |/ _ \ \ /\ / / ___|
+# \  /\  / (_) | | | | ( | | | | (_) \ V  V /\__ \
+#  \/  \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/
+#
+# This file was automatically generated by gh-aw (v0.72.1). DO NOT EDIT.
+#
+# To update this file, edit the corresponding .md file and run:
+#   gh aw compile
+# Not all edits will cause changes to this file.
+#
+# For more information: https://github.github.com/gh-aw/introduction/overview/
+#
+# Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion.
+#
+# Secrets used:
+#   - COPILOT_GITHUB_TOKEN
+#   - GH_AW_GITHUB_MCP_SERVER_TOKEN
+#   - GH_AW_GITHUB_TOKEN
+#   - GITHUB_TOKEN
+#
+# Custom actions used:
+#   - actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+#   - actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+#   - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
+#   - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+#   - actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+#   - actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+#   - github/gh-aw-actions/setup@v0.72.1
+#
+# Container images used:
+#   - ghcr.io/github/gh-aw-firewall/agent:0.25.41
+#   - ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41
+#   - ghcr.io/github/gh-aw-firewall/squid:0.25.41
+#   - ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c
+#   - ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959
+#   - node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f
+
+name: "TPTP Front-End Benchmark"
+"on":
+  schedule:
+  - cron: "0 6 * * 1"  # minute 0, hour 6, weekday 1 (Monday); GitHub evaluates schedule crons in UTC
+  workflow_dispatch:  # manual trigger in addition to the weekly schedule
+    inputs:
+      aw_context:
+        default: ""
+        description: Agent caller context (used internally by Agentic Workflows).
+        required: false
+        type: string
+
+permissions: {}  # no workflow-wide token scopes; each job declares its own `permissions`
+
+concurrency:
+  group: "gh-aw-${{ github.workflow }}"  # one group per workflow name; cancel-in-progress is not set here
+
+run-name: "TPTP Front-End Benchmark"
+
+jobs:
+  activation:  # first job: runs checks and builds the prompt that the agent job downloads as an artifact
+    runs-on: ubuntu-slim
+    permissions:  # read-only scopes; nothing in this job is granted write access
+      actions: read
+      contents: read
+    outputs:  # the agent job reads `model` and `setup-trace-id` via needs.activation.outputs
+      comment_id: ""  # constant empty string
+      comment_repo: ""  # constant empty string
+      engine_id: ${{ steps.generate_aw_info.outputs.engine_id }}
+      lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }}  # true only when the step output is the string 'true'
+      model: ${{ steps.generate_aw_info.outputs.model }}
+      secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }}
+      setup-trace-id: ${{ steps.setup.outputs.trace-id }}
+      stale_lock_file_failed: ${{ steps.check-lock-file.outputs.stale_lock_file_failed == 'true' }}  # true only when the step output is the string 'true'
+    steps:
+      - name: Setup Scripts  # NOTE(review): referenced by tag (v0.72.1), whereas the other actions in this job are pinned to commit SHAs
+        id: setup  # outputs.trace-id is re-exported as the job output `setup-trace-id`
+        uses: github/gh-aw-actions/setup@v0.72.1
+        with:
+          destination: ${{ runner.temp }}/gh-aw/actions  # later steps load their .cjs and .sh helpers from this directory
+          job-name: ${{ github.job }}
+        env:
+          GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+          GH_AW_INFO_VERSION: "1.0.40"
+      - name: Generate agentic run info  # runs generate_aw_info.cjs with the GH_AW_INFO_* values below in its environment
+        id: generate_aw_info  # its engine_id, model and lockdown_check_failed outputs feed the job outputs
+        env:
+          GH_AW_INFO_ENGINE_ID: "copilot"
+          GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI"
+          GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }}  # repository variable wins; the literal is the fallback when it is unset or empty
+          GH_AW_INFO_VERSION: "1.0.40"
+          GH_AW_INFO_AGENT_VERSION: "1.0.40"
+          GH_AW_INFO_CLI_VERSION: "v0.72.1"
+          GH_AW_INFO_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_INFO_EXPERIMENTAL: "false"
+          GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true"
+          GH_AW_INFO_STAGED: "false"
+          GH_AW_INFO_ALLOWED_DOMAINS: '["defaults","tptp.org"]'  # JSON array kept as a string; single quotes avoid escaping the inner double quotes
+          GH_AW_INFO_FIREWALL_ENABLED: "true"
+          GH_AW_INFO_AWF_VERSION: "v0.25.41"
+          GH_AW_INFO_AWMG_VERSION: ""
+          GH_AW_INFO_FIREWALL_TYPE: "squid"
+          GH_AW_COMPILED_STRICT: "true"
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs');
+            await main(core, context);
+      - name: Validate COPILOT_GITHUB_TOKEN secret  # runs validate_multi_secret.sh with the secret name, engine label and a docs URL as arguments
+        id: validate-secret  # outputs.verification_result is re-exported as the job output `secret_verification_result`
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh" COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}  # handed over through the environment; only the secret's name appears on the command line
+      - name: Checkout .github and .agents folders  # the sparse list below covers eight directories, not only the two named here
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false  # do not keep the token in the checkout's git configuration
+          sparse-checkout: |
+            .github
+            .agents
+            .claude
+            .codex
+            .crush
+            .gemini
+            .opencode
+            .pi
+          sparse-checkout-cone-mode: true  # entries above are treated as directories (cone mode)
+          fetch-depth: 1  # shallow: only the tip commit is fetched
+      - name: Save agent config folders for base branch restoration  # runs save_base_github_folders.sh with the two lists below
+        env:
+          GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi"  # space-separated; same directories as the sparse checkout
+          GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc"  # space-separated file names; TODO confirm how the script resolves them
+        # poutine:ignore untrusted_checkout_exec
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/save_base_github_folders.sh"
+      - name: Check workflow lock file  # runs check_workflow_timestamp_api.cjs for the lock file named below
+        id: check-lock-file  # outputs.stale_lock_file_failed feeds the job output of the same name
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_WORKFLOW_FILE: "tptp-benchmark.lock.yml"
+          GH_AW_CONTEXT_WORKFLOW_REF: "${{ github.workflow_ref }}"
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs');
+            await main();
+      - name: Check compile-agentic version  # runs check_version_updates.cjs
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_COMPILED_VERSION: "v0.72.1"  # same compiler version as recorded in the gh-aw-metadata header of this file
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/check_version_updates.cjs');
+            await main();
+      - name: Create prompt with built-in context  # writes $GH_AW_PROMPT; heredocs use a quoted delimiter, so {{...}} and __GH_AW_*__ markers are emitted literally
+        env:
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt  # the same path is handed to the later prompt steps of this job
+          GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl
+          GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+          GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+          GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+          GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+          GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+          GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+          GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+          GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+          GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+        # poutine:ignore untrusted_checkout_exec
+        run: |
+          bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh"
+          {
+          cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+          <system>
+          GH_AW_PROMPT_dbe2d26d7191f793_EOF
+          cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
+          cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
+          cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
+          cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
+          cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+          <safe-output-tools>
+          Tools: create_discussion, missing_tool, missing_data, noop
+          </safe-output-tools>
+          GH_AW_PROMPT_dbe2d26d7191f793_EOF
+          cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md"
+          cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+          <github-context>
+          The following GitHub context information is available for this workflow:
+          {{#if __GH_AW_GITHUB_ACTOR__ }}
+          - **actor**: __GH_AW_GITHUB_ACTOR__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_REPOSITORY__ }}
+          - **repository**: __GH_AW_GITHUB_REPOSITORY__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_WORKSPACE__ }}
+          - **workspace**: __GH_AW_GITHUB_WORKSPACE__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }}
+          - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }}
+          - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }}
+          - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }}
+          - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__
+          {{/if}}
+          {{#if __GH_AW_GITHUB_RUN_ID__ }}
+          - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__
+          {{/if}}
+          </github-context>
+          
+          GH_AW_PROMPT_dbe2d26d7191f793_EOF
+          cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md"
+          cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+          </system>
+          {{#runtime-import .github/workflows/tptp-benchmark.md}}
+          GH_AW_PROMPT_dbe2d26d7191f793_EOF
+          } > "$GH_AW_PROMPT"
+      - name: Interpolate variables and render templates  # runs interpolate_prompt.cjs; the prompt file is identified through GH_AW_PROMPT
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt  # file written by the "Create prompt with built-in context" step
+          GH_AW_ENGINE_ID: "copilot"
+          GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+          GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+          GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+          GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs');
+            await main();
+      - name: Substitute placeholders  # passes the GH_AW_* env values below to substitute_placeholders.cjs as its `substitutions` map
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt  # handed to the script as the `file` to process
+          GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+          GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+          GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+          GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+          GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+          GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+          GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+          GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+          GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+          GH_AW_MCP_CLI_SERVERS_LIST: '- `safeoutputs` — run `safeoutputs --help` to see available tools'  # single-quoted so the leading "- " and backticks stay literal text
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            
+            const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs');
+            
+            // Call the substitution function
+            return await substitutePlaceholders({
+              file: process.env.GH_AW_PROMPT,
+              substitutions: {
+                GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR,
+                GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID,
+                GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER,
+                GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER,
+                GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER,
+                GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY,
+                GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID,
+                GH_AW_GITHUB_WORKFLOW: process.env.GH_AW_GITHUB_WORKFLOW,
+                GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE,
+                GH_AW_MCP_CLI_SERVERS_LIST: process.env.GH_AW_MCP_CLI_SERVERS_LIST
+              }
+            });
+      - name: Validate prompt placeholders  # runs validate_prompt_placeholders.sh after the two substitution steps
+        env:
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt  # prompt file the script is pointed at
+        # poutine:ignore untrusted_checkout_exec
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh"
+      - name: Print prompt  # runs print_prompt_summary.sh; presumably reports the finished prompt in the run output - confirm in the script
+        env:
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt  # prompt file the script is pointed at
+        # poutine:ignore untrusted_checkout_exec
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh"
+      - name: Upload activation artifact  # hands the prompt and run metadata over to the agent job
+        if: success()  # only when every earlier step in this job succeeded
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: activation  # the agent job downloads the artifact by this name into /tmp/gh-aw
+          include-hidden-files: true  # presumably needed for the dot-directory path /tmp/gh-aw/.github/agents listed below
+          path: |
+            /tmp/gh-aw/aw_info.json
+            /tmp/gh-aw/aw-prompts/prompt.txt
+            /tmp/gh-aw/aw-prompts/prompt-template.txt
+            /tmp/gh-aw/aw-prompts/prompt-import-tree.json
+            /tmp/gh-aw/github_rate_limits.jsonl
+            /tmp/gh-aw/base
+            /tmp/gh-aw/.github/agents
+          if-no-files-found: ignore  # an empty match does not fail or warn
+          retention-days: 1  # short-lived: only needed for the hand-off within the same run
+
+  agent:
+    needs: activation
+    runs-on: ubuntu-latest
+    permissions: read-all
+    concurrency:
+      group: "gh-aw-copilot-${{ github.workflow }}"
+    env:
+      DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+      GH_AW_ASSETS_ALLOWED_EXTS: ""
+      GH_AW_ASSETS_BRANCH: ""
+      GH_AW_ASSETS_MAX_SIZE_KB: 0
+      GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+      GH_AW_WORKFLOW_ID_SANITIZED: tptpbenchmark
+    outputs:
+      agentic_engine_timeout: ${{ steps.detect-copilot-errors.outputs.agentic_engine_timeout || 'false' }}
+      checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }}
+      effective_tokens: ${{ steps.parse-mcp-gateway.outputs.effective_tokens }}
+      has_patch: ${{ steps.collect_output.outputs.has_patch }}
+      inference_access_error: ${{ steps.detect-copilot-errors.outputs.inference_access_error || 'false' }}
+      mcp_policy_error: ${{ steps.detect-copilot-errors.outputs.mcp_policy_error || 'false' }}
+      model: ${{ needs.activation.outputs.model }}
+      model_not_supported_error: ${{ steps.detect-copilot-errors.outputs.model_not_supported_error || 'false' }}
+      output: ${{ steps.collect_output.outputs.output }}
+      output_types: ${{ steps.collect_output.outputs.output_types }}
+      setup-trace-id: ${{ steps.setup.outputs.trace-id }}
+    steps:
+      - name: Setup Scripts
+        id: setup
+        uses: github/gh-aw-actions/setup@v0.72.1
+        with:
+          destination: ${{ runner.temp }}/gh-aw/actions
+          job-name: ${{ github.job }}
+          trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+        env:
+          GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+          GH_AW_INFO_VERSION: "1.0.40"
+      - name: Set runtime paths
+        id: set-runtime-paths
+        run: |
+          {
+            echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl"
+            echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json"
+            echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json"
+          } >> "$GITHUB_OUTPUT"
+      - name: Create gh-aw temp directory
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh"
+      - name: Configure gh CLI for GitHub Enterprise
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh"
+        env:
+          GH_TOKEN: ${{ github.token }}
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Configure Git credentials
+        env:
+          REPO_NAME: ${{ github.repository }}
+          SERVER_URL: ${{ github.server_url }}
+          GITHUB_TOKEN: ${{ github.token }}
+        run: |
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git config --global user.name "github-actions[bot]"
+          git config --global am.keepcr true
+          # Re-authenticate git with GitHub token
+          SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+          echo "Git configured with standard GitHub Actions identity"
+      - name: Checkout PR branch
+        id: checkout-pr
+        if: |
+          github.event.pull_request || github.event.issue.pull_request
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs');
+            await main();
+      - name: Install GitHub Copilot CLI
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40
+        env:
+          GH_HOST: github.com
+      - name: Install AWF binary
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41
+      - name: Determine automatic lockdown mode for GitHub MCP Server
+        id: determine-automatic-lockdown
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
+        env:
+          GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+          GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+        with:
+          script: |
+            const determineAutomaticLockdown = require('${{ runner.temp }}/gh-aw/actions/determine_automatic_lockdown.cjs');
+            await determineAutomaticLockdown(github, context, core);
+      - name: Download activation artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: activation
+          path: /tmp/gh-aw
+      - name: Restore agent config folders from base branch
+        if: steps.checkout-pr.outcome == 'success'
+        env:
+          GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi"
+          GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc"
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_base_github_folders.sh"
+      - name: Restore inline sub-agents from activation artifact
+        env:
+          GH_AW_SUB_AGENT_DIR: ".github/agents"
+          GH_AW_SUB_AGENT_EXT: ".agent.md"
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_inline_sub_agents.sh"
+      - name: Download container images
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41 ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959 node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f
+      - name: Generate Safe Outputs Config
+        run: |
+          mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs"
+          mkdir -p /tmp/gh-aw/safeoutputs
+          mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
+          cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF'
+          {"create_discussion":{"category":"agentic workflows","close_older_discussions":true,"expires":336,"fallback_to_issue":true,"max":1,"title_prefix":"[TPTP Benchmark] "},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}}
+          GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF
+      - name: Generate Safe Outputs Tools
+        env:
+          GH_AW_TOOLS_META_JSON: |
+            {
+              "description_suffixes": {
+                "create_discussion": " CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[TPTP Benchmark] \". Discussions will be created in category \"agentic workflows\"."
+              },
+              "repo_params": {},
+              "dynamic_tools": []
+            }
+          GH_AW_VALIDATION_JSON: |
+            {
+              "create_discussion": {
+                "defaultMax": 1,
+                "fields": {
+                  "body": {
+                    "required": true,
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 65000
+                  },
+                  "category": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 128
+                  },
+                  "repo": {
+                    "type": "string",
+                    "maxLength": 256
+                  },
+                  "title": {
+                    "required": true,
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 128
+                  }
+                }
+              },
+              "missing_data": {
+                "defaultMax": 20,
+                "fields": {
+                  "alternatives": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 256
+                  },
+                  "context": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 256
+                  },
+                  "data_type": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 128
+                  },
+                  "reason": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 256
+                  }
+                }
+              },
+              "missing_tool": {
+                "defaultMax": 20,
+                "fields": {
+                  "alternatives": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 512
+                  },
+                  "reason": {
+                    "required": true,
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 256
+                  },
+                  "tool": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 128
+                  }
+                }
+              },
+              "noop": {
+                "defaultMax": 1,
+                "fields": {
+                  "message": {
+                    "required": true,
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 65000
+                  }
+                }
+              },
+              "report_incomplete": {
+                "defaultMax": 5,
+                "fields": {
+                  "details": {
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 65000
+                  },
+                  "reason": {
+                    "required": true,
+                    "type": "string",
+                    "sanitize": true,
+                    "maxLength": 1024
+                  }
+                }
+              }
+            }
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_safe_outputs_tools.cjs');
+            await main();
+      - name: Generate Safe Outputs MCP Server Config
+        id: safe-outputs-config
+        run: |
+          # Generate a secure random API key (360 bits of entropy, 40+ chars)
+          # Mask immediately to prevent timing vulnerabilities
+          API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
+          echo "::add-mask::${API_KEY}"
+          
+          PORT=3001
+          
+          # Set outputs for next steps
+          {
+            echo "safe_outputs_api_key=${API_KEY}"
+            echo "safe_outputs_port=${PORT}"
+          } >> "$GITHUB_OUTPUT"
+          
+          echo "Safe Outputs MCP server will run on port ${PORT}"
+          
+      - name: Start Safe Outputs MCP HTTP Server
+        id: safe-outputs-start
+        env:
+          DEBUG: '*'
+          GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+          GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }}
+          GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }}
+          GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json
+          GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json
+          GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+        run: |
+          # Environment variables are set above to prevent template injection
+          export DEBUG
+          export GH_AW_SAFE_OUTPUTS
+          export GH_AW_SAFE_OUTPUTS_PORT
+          export GH_AW_SAFE_OUTPUTS_API_KEY
+          export GH_AW_SAFE_OUTPUTS_TOOLS_PATH
+          export GH_AW_SAFE_OUTPUTS_CONFIG_PATH
+          export GH_AW_MCP_LOG_DIR
+          
+          bash "${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh"
+          
+      - name: Start MCP Gateway  # generates a gateway API key and pipes the MCP server config below into start_mcp_gateway.cjs
+        id: start-mcp-gateway  # later steps read this step's gateway-api-key / gateway-domain / gateway-port / gateway-pid outputs
+        env:
+          GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+          GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }}
+          GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }}
+          GITHUB_MCP_GUARD_MIN_INTEGRITY: ${{ steps.determine-automatic-lockdown.outputs.min_integrity }}
+          GITHUB_MCP_GUARD_REPOS: ${{ steps.determine-automatic-lockdown.outputs.repos }}
+          GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}  # first non-empty secret in the chain is used
+        run: |
+          set -eo pipefail
+          mkdir -p "${RUNNER_TEMP}/gh-aw/mcp-config"
+          
+          # Export gateway environment variables for MCP config and gateway script
+          export MCP_GATEWAY_PORT="8080"
+          export MCP_GATEWAY_DOMAIN="host.docker.internal"
+          export MCP_GATEWAY_HOST_DOMAIN="localhost"
+          MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
+          echo "::add-mask::${MCP_GATEWAY_API_KEY}"
+          export MCP_GATEWAY_API_KEY
+          export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads"
+          mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}"
+          export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288"
+          export DEBUG="*"
+          
+          export GH_AW_ENGINE="copilot"
+          MCP_GATEWAY_UID=$(id -u 2>/dev/null || echo '0')
+          MCP_GATEWAY_GID=$(id -g 2>/dev/null || echo '0')
+          DOCKER_SOCK_GID=$(stat -c '%g' /var/run/docker.sock 2>/dev/null || echo '0')
+          export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host --add-host host.docker.internal:127.0.0.1 --user '"${MCP_GATEWAY_UID}"':'"${MCP_GATEWAY_GID}"' --group-add '"${DOCKER_SOCK_GID}"' -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.3.6'
+          
+          mkdir -p /home/runner/.copilot
+          GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node)
+          cat << GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
+          {
+            "mcpServers": {
+              "github": {
+                "type": "stdio",
+                "container": "ghcr.io/github/github-mcp-server:v1.0.3",
+                "env": {
+                  "GITHUB_HOST": "\${GITHUB_SERVER_URL}",
+                  "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}",
+                  "GITHUB_READ_ONLY": "1",
+                  "GITHUB_TOOLSETS": "context,repos,issues,pull_requests"
+                },
+                "guard-policies": {
+                  "allow-only": {
+                    "min-integrity": "$GITHUB_MCP_GUARD_MIN_INTEGRITY",
+                    "repos": "$GITHUB_MCP_GUARD_REPOS"
+                  }
+                }
+              },
+              "safeoutputs": {
+                "type": "http",
+                "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT",
+                "headers": {
+                  "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}"
+                },
+                "guard-policies": {
+                  "write-sink": {
+                    "accept": [
+                      "*"
+                    ]
+                  }
+                }
+              }
+            },
+            "gateway": {
+              "port": $MCP_GATEWAY_PORT,
+              "domain": "${MCP_GATEWAY_DOMAIN}",
+              "apiKey": "${MCP_GATEWAY_API_KEY}",
+              "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}"
+            }
+          }
+          GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF
+      - name: Mount MCP servers as CLIs  # runs mount_mcp_as_cli.cjs against the gateway started by the previous step
+        id: mount-mcp-clis
+        continue-on-error: true  # a failure here does not fail the job
+        env:
+          MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }}
+          MCP_GATEWAY_DOMAIN: ${{ steps.start-mcp-gateway.outputs.gateway-domain }}
+          MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }}
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:  # NOTE(review): this script calls setupGlobals without getOctokit, unlike the other github-script steps in this job; confirm intentional
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/mount_mcp_as_cli.cjs');
+            await main();
+      - name: Clean credentials  # runs clean_git_credentials.sh before the agent step
+        continue-on-error: true  # a failure here does not fail the job
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh"
+      - name: Audit pre-agent workspace  # runs audit_pre_agent_workspace.sh before the agent step
+        id: pre_agent_audit
+        continue-on-error: true  # a failure here does not fail the job
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/audit_pre_agent_workspace.sh"  # presumably writes /tmp/gh-aw/pre-agent-audit.txt (uploaded later); confirm in the script
+      - name: Execute GitHub Copilot CLI  # writes the awf config, then runs the Copilot CLI harness inside the awf sandbox
+        id: agentic_execution
+        # Copilot CLI tool arguments (sorted):
+        timeout-minutes: 300  # same value the conclusion job reports as GH_AW_TIMEOUT_MINUTES
+        run: |
+          set -o pipefail
+          touch /tmp/gh-aw/agent-step-summary.md
+          GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true)
+          export GH_AW_NODE_BIN
+          (umask 177 && touch /tmp/gh-aw/agent-stdio.log)
+          printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","api.snapcraft.io","archive.ubuntu.com","azure.archive.ubuntu.com","crl.geotrust.com","crl.globalsign.com","crl.identrust.com","crl.sectigo.com","crl.thawte.com","crl.usertrust.com","crl.verisign.com","crl3.digicert.com","crl4.digicert.com","crls.ssl.com","github.com","host.docker.internal","json-schema.org","json.schemastore.org","keyserver.ubuntu.com","ocsp.digicert.com","ocsp.geotrust.com","ocsp.globalsign.com","ocsp.identrust.com","ocsp.sectigo.com","ocsp.ssl.com","ocsp.thawte.com","ocsp.usertrust.com","ocsp.verisign.com","packagecloud.io","packages.cloud.google.com","packages.microsoft.com","ppa.launchpad.net","raw.githubusercontent.com","registry.npmjs.org","s.symcb.com","s.symcd.com","security.ubuntu.com","telemetry.enterprise.githubcopilot.com","tptp.org","ts-crl.ws.symantec.com","ts-ocsp.ws.symantec.com","www.googleapis.com"]},"apiProxy":{"enabled":true,"models":{"auto":["large"],"deep-research":["copilot/deep-research*","copilot/o3-deep-research*","copilot/o4-mini-deep-research*","google/deep-research*","openai/o3-deep-research*","openai/o4-mini-deep-research*"],"gemini-flash":["copilot/gemini-*flash*","google/gemini-*flash*"],"gemini-pro":["copilot/gemini-*pro*","google/gemini-*pro*"],"gpt-4.1":["copilot/gpt-4.1*","openai/gpt-4.1*"],"gpt-5":["copilot/gpt-5*","openai/gpt-5*"],"gpt-5-codex":["copilot/gpt-5*codex*","openai/gpt-5*codex*"],"gpt-5-mini":["copilot/gpt-5*mini*","openai/gpt-5*mini*"],"gpt-5-nano":["copilot/gpt-5*nano*","openai/gpt-5*nano*"],"gpt-5-pro":["copilot/gpt-5*pro*","openai/gpt-5*pro*"],"haiku":["copilot/*haiku*","anthropic/*haiku*"],"large":["sonnet","gpt-5-pro","gpt-5","gemini-pro"],"mini":["haiku","gpt-5-mini","gpt-5-nano","gemini-flash"],"opus":["copilot/*opus*","anthropic/*opus*"],"reasoning":["copilot/o1*","copilot/o3*","copilot/o4*","openai/o1*","openai/o3*","openai/o4*"],"small":["mini"],"sonnet":["copilot/*sonnet*","anthropic/*sonnet*"]}},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json
+          # shellcheck disable=SC1003
+          sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --exclude-env GITHUB_MCP_SERVER_TOKEN --exclude-env MCP_GATEWAY_API_KEY --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \
+            -- /bin/bash -c 'export PATH="${RUNNER_TEMP}/gh-aw/mcp-cli/bin:$PATH" && export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --allow-all-paths --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log
+        env:
+          AWF_REFLECT_ENABLED: 1
+          COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+          COPILOT_API_KEY: dummy-byok-key-for-offline-mode
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}  # kept out of the sandbox env by --exclude-env in the awf command
+          COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }}  # repository variable overrides the default model
+          GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json
+          GH_AW_PHASE: agent
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+          GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+          GH_AW_VERSION: v0.72.1
+          GITHUB_API_URL: ${{ github.api_url }}
+          GITHUB_AW: true
+          GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows
+          GITHUB_HEAD_REF: ${{ github.head_ref }}
+          GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}  # kept out of the sandbox env by --exclude-env in the awf command
+          GITHUB_REF_NAME: ${{ github.ref_name }}
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md  # redirected to the file the run script creates with touch
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+          GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+          GIT_AUTHOR_NAME: github-actions[bot]
+          GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+          GIT_COMMITTER_NAME: github-actions[bot]
+          XDG_CONFIG_HOME: /home/runner
+      - name: Detect Copilot errors  # runs detect_copilot_errors.cjs after the agent step
+        id: detect-copilot-errors
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        run: node "${RUNNER_TEMP}/gh-aw/actions/detect_copilot_errors.cjs"
+      - name: Configure Git credentials  # sets the bot git identity and re-points origin at a token-authenticated URL
+        env:
+          REPO_NAME: ${{ github.repository }}
+          SERVER_URL: ${{ github.server_url }}  # the script strips a leading https:// before building the remote URL
+          GITHUB_TOKEN: ${{ github.token }}  # embedded into the origin remote URL by the script below
+        run: |
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git config --global user.name "github-actions[bot]"
+          git config --global am.keepcr true
+          # Re-authenticate git with GitHub token
+          SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+          echo "Git configured with standard GitHub Actions identity"
+      - name: Copy Copilot session state files to logs  # runs copy_copilot_session_state.sh
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/copy_copilot_session_state.sh"
+      - name: Stop MCP Gateway  # invokes stop_mcp_gateway.sh with the PID reported by the start-mcp-gateway step
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        env:
+          MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }}
+          MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }}
+          GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }}  # passed as the only argument to the stop script
+        run: |
+          bash "${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh" "$GATEWAY_PID"
+      - name: Redact secrets in logs  # runs redact_secrets.cjs with the secret names and values supplied via env
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs');
+            await main();
+        env:
+          GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN'  # each listed name has a matching SECRET_<name> entry below
+          SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+          SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+          SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+          SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Append agent step summary  # runs append_agent_step_summary.sh
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh"
+      - name: Copy Safe Outputs  # copies the safe-outputs file to /tmp/gh-aw/safeoutputs.jsonl
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        env:
+          GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}  # source path; a missing file is silently ignored by the script
+        run: |
+          mkdir -p /tmp/gh-aw
+          cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true
+      - name: Ingest agent output  # runs collect_ndjson_output.cjs over the safe-outputs file
+        id: collect_output
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:  # GH_AW_ALLOWED_DOMAINS lists the same domains as network.allowDomains in the agent step's awf config
+          GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+          GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_API_URL: ${{ github.api_url }}
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs');
+            await main();
+      - name: Parse agent logs for step summary  # runs parse_copilot_log.cjs
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/  # same directory the agent step passes to copilot as --log-dir
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs');
+            await main();
+      - name: Parse MCP Gateway logs for step summary  # runs parse_mcp_gateway_log.cjs
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        id: parse-mcp-gateway
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs');
+            await main();
+      - name: Print firewall logs  # makes the firewall output readable, then appends `awf logs summary` to the step summary
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        env:
+          AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs  # not referenced by the script below; presumably read by `awf logs summary` (confirm)
+        run: |
+          # Fix permissions on firewall logs/audit dirs so they can be uploaded as artifacts
+          # AWF runs with sudo, creating files owned by root
+          sudo chmod -R a+rX /tmp/gh-aw/sandbox/firewall 2>/dev/null || true
+          # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step)
+          if command -v awf &> /dev/null; then
+            awf logs summary | tee -a "$GITHUB_STEP_SUMMARY"
+          else
+            echo 'AWF binary not installed, skipping firewall log summary'
+          fi
+      - name: Parse token usage for step summary  # runs parse_token_usage.cjs
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_token_usage.cjs');
+            await main();
+      - name: Print AWF reflect summary  # runs awf_reflect_summary.cjs
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/awf_reflect_summary.cjs');
+            await main();
+      - name: Write agent output placeholder if missing  # writes an empty items list when agent_output.json does not exist
+        if: always()  # runs even when an earlier step failed, so the upload step always finds agent_output.json
+        run: |
+          if [ ! -f /tmp/gh-aw/agent_output.json ]; then
+            echo '{"items":[]}' > /tmp/gh-aw/agent_output.json
+          fi
+      - name: Upload agent artifacts  # bundles prompt, logs, outputs and firewall data into one artifact
+        if: always()  # runs even when an earlier step failed or the run was cancelled
+        continue-on-error: true  # a failure here does not fail the job
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: agent  # the conclusion job downloads the artifact by this name
+          path: |
+            /tmp/gh-aw/aw-prompts/prompt.txt
+            /tmp/gh-aw/sandbox/agent/logs/
+            /tmp/gh-aw/redacted-urls.log
+            /tmp/gh-aw/mcp-logs/
+            /tmp/gh-aw/agent_usage.json
+            /tmp/gh-aw/agent-stdio.log
+            /tmp/gh-aw/pre-agent-audit.txt
+            /tmp/gh-aw/agent/
+            /tmp/gh-aw/github_rate_limits.jsonl
+            /tmp/gh-aw/safeoutputs.jsonl
+            /tmp/gh-aw/agent_output.json
+            /tmp/gh-aw/aw-*.patch
+            /tmp/gh-aw/aw-*.bundle
+            /tmp/gh-aw/awf-config.json
+            /tmp/gh-aw/sandbox/firewall/logs/
+            /tmp/gh-aw/sandbox/firewall/audit/
+            /tmp/gh-aw/sandbox/firewall/awf-reflect.json
+          if-no-files-found: ignore  # no warning or error when none of the listed paths exist
+
+  conclusion:  # reporting job: processes no-op, detection, missing-tool, incomplete and failure results after the other jobs
+    needs:
+      - activation
+      - agent
+      - detection
+      - safe_outputs
+    if: >
+      always() && (needs.agent.result != 'skipped' || needs.activation.outputs.lockdown_check_failed == 'true' ||
+      needs.activation.outputs.stale_lock_file_failed == 'true')
+    runs-on: ubuntu-slim
+    permissions:
+      contents: read
+      discussions: write
+      issues: write
+    concurrency:
+      group: "gh-aw-conclusion-tptp-benchmark"
+      cancel-in-progress: false  # a newer run waits for the running one instead of cancelling it
+    outputs:
+      incomplete_count: ${{ steps.report_incomplete.outputs.incomplete_count }}
+      noop_message: ${{ steps.noop.outputs.noop_message }}
+      tools_reported: ${{ steps.missing_tool.outputs.tools_reported }}
+      total_count: ${{ steps.missing_tool.outputs.total_count }}
+    steps:
+      - name: Setup Scripts
+        id: setup
+        uses: github/gh-aw-actions/setup@v0.72.1
+        with:
+          destination: ${{ runner.temp }}/gh-aw/actions  # the github-script steps below require their .cjs files from this directory
+          job-name: ${{ github.job }}
+          trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+        env:
+          GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+          GH_AW_INFO_VERSION: "1.0.40"
+      - name: Download agent output artifact
+        id: download-agent-output
+        continue-on-error: true  # a missing artifact does not fail the job; the next step checks this step's outcome
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: agent
+          path: /tmp/gh-aw/
+      - name: Setup agent output environment variable
+        id: setup-agent-output-env
+        if: steps.download-agent-output.outcome == 'success'  # when skipped, GH_AW_AGENT_OUTPUT is empty in the steps below
+        run: |
+          mkdir -p /tmp/gh-aw/
+          find "/tmp/gh-aw/" -type f -print
+          echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+      - name: Process no-op messages
+        id: noop
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_NOOP_MAX: "1"
+          GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+          GH_AW_NOOP_REPORT_AS_ISSUE: "false"
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}  # custom token if configured, else the default token
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs');
+            await main();
+      - name: Log detection run
+        id: detection_runs
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }}
+          GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }}
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_detection_runs.cjs');
+            await main();
+      - name: Record missing tool
+        id: missing_tool  # supplies the job outputs tools_reported and total_count
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_MISSING_TOOL_CREATE_ISSUE: "true"
+          GH_AW_MISSING_TOOL_TITLE_PREFIX: "[missing tool]"
+          GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs');
+            await main();
+      - name: Record incomplete
+        id: report_incomplete  # supplies the job output incomplete_count
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_REPORT_INCOMPLETE_CREATE_ISSUE: "true"
+          GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/report_incomplete_handler.cjs');
+            await main();
+      - name: Handle agent failure
+        id: handle_agent_failure
+        if: always()  # runs even when an earlier step in this job failed
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+          GH_AW_WORKFLOW_ID: "tptp-benchmark"
+          GH_AW_ACTION_FAILURE_ISSUE_EXPIRES_HOURS: "168"
+          GH_AW_ENGINE_ID: "copilot"
+          GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }}
+          GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }}
+          GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }}
+          GH_AW_MCP_POLICY_ERROR: ${{ needs.agent.outputs.mcp_policy_error }}
+          GH_AW_AGENTIC_ENGINE_TIMEOUT: ${{ needs.agent.outputs.agentic_engine_timeout }}
+          GH_AW_MODEL_NOT_SUPPORTED_ERROR: ${{ needs.agent.outputs.model_not_supported_error }}
+          GH_AW_ENGINE_API_HOSTS: "api.enterprise.githubcopilot.com,api.githubcopilot.com,api.business.githubcopilot.com,api.individual.githubcopilot.com"
+          GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }}
+          GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }}
+          GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }}
+          GH_AW_STALE_LOCK_FILE_FAILED: ${{ needs.activation.outputs.stale_lock_file_failed }}
+          GH_AW_GROUP_REPORTS: "false"
+          GH_AW_FAILURE_REPORT_AS_ISSUE: "true"
+          GH_AW_MISSING_TOOL_REPORT_AS_FAILURE: "true"
+          GH_AW_MISSING_DATA_REPORT_AS_FAILURE: "true"
+          GH_AW_TIMEOUT_MINUTES: "300"  # same value as timeout-minutes on the agent job's Copilot CLI step
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs');
+            await main();
+
+  detection:
+    needs:
+      - activation
+      - agent
+    if: >
+      always() && needs.agent.result != 'skipped' && (needs.agent.outputs.output_types != '' || needs.agent.outputs.has_patch == 'true')
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    outputs:
+      detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }}
+      detection_reason: ${{ steps.detection_conclusion.outputs.reason }}
+      detection_success: ${{ steps.detection_conclusion.outputs.success }}
+    steps:
+      - name: Setup Scripts
+        id: setup
+        uses: github/gh-aw-actions/setup@v0.72.1
+        with:
+          destination: ${{ runner.temp }}/gh-aw/actions
+          job-name: ${{ github.job }}
+          trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+        env:
+          GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+          GH_AW_INFO_VERSION: "1.0.40"
+      - name: Download agent output artifact
+        id: download-agent-output
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: agent
+          path: /tmp/gh-aw/
+      - name: Setup agent output environment variable
+        id: setup-agent-output-env
+        if: steps.download-agent-output.outcome == 'success'
+        run: |
+          mkdir -p /tmp/gh-aw/
+          find "/tmp/gh-aw/" -type f -print
+          echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+      - name: Checkout repository for patch context
+        if: needs.agent.outputs.has_patch == 'true'
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+      # --- Threat Detection ---
+      - name: Clean stale firewall files from agent artifact
+        run: |
+          rm -rf /tmp/gh-aw/sandbox/firewall/logs
+          rm -rf /tmp/gh-aw/sandbox/firewall/audit
+      - name: Download container images
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41
+      - name: Check if detection needed
+        id: detection_guard
+        if: always()
+        env:
+          OUTPUT_TYPES: ${{ needs.agent.outputs.output_types }}
+          HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+        run: |
+          if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then
+            echo "run_detection=true" >> "$GITHUB_OUTPUT"
+            echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH"
+          else
+            echo "run_detection=false" >> "$GITHUB_OUTPUT"
+            echo "Detection skipped: no agent outputs or patches to analyze"
+          fi
+      - name: Clear MCP Config for detection
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        run: |
+          rm -f "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json"
+          rm -f /home/runner/.copilot/mcp-config.json
+          rm -f "$GITHUB_WORKSPACE/.gemini/settings.json"
+      - name: Prepare threat detection files
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        run: |
+          mkdir -p /tmp/gh-aw/threat-detection/aw-prompts
+          cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true
+          cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true
+          for f in /tmp/gh-aw/aw-*.patch; do
+            [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+          done
+          for f in /tmp/gh-aw/aw-*.bundle; do
+            [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+          done
+          echo "Prepared threat detection files:"
+          ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+      - name: Setup threat detection
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          WORKFLOW_DESCRIPTION: "Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion."
+          HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+        with:
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs');
+            await main();
+      - name: Ensure threat-detection directory and log
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        run: |
+          mkdir -p /tmp/gh-aw/threat-detection
+          touch /tmp/gh-aw/threat-detection/detection.log
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: '24'
+          package-manager-cache: false
+      - name: Install GitHub Copilot CLI
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40
+        env:
+          GH_HOST: github.com
+      - name: Install AWF binary
+        run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41
+      - name: Execute GitHub Copilot CLI
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        continue-on-error: true
+        id: detection_agentic_execution
+        # Copilot CLI tool arguments (sorted):
+        timeout-minutes: 20
+        run: |
+          set -o pipefail
+          touch /tmp/gh-aw/agent-step-summary.md
+          GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true)
+          export GH_AW_NODE_BIN
+          (umask 177 && touch /tmp/gh-aw/threat-detection/detection.log)
+          printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","github.com","host.docker.internal","telemetry.enterprise.githubcopilot.com"]},"apiProxy":{"enabled":true},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json
+          # shellcheck disable=SC1003
+          sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \
+            -- /bin/bash -c 'export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log
+        env:
+          AWF_REFLECT_ENABLED: 1
+          COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+          COPILOT_API_KEY: dummy-byok-key-for-offline-mode
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+          COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || 'claude-sonnet-4.6' }}
+          GH_AW_PHASE: detection
+          GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+          GH_AW_VERSION: v0.72.1
+          GITHUB_API_URL: ${{ github.api_url }}
+          GITHUB_AW: true
+          GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows
+          GITHUB_HEAD_REF: ${{ github.head_ref }}
+          GITHUB_REF_NAME: ${{ github.ref_name }}
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+          GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+          GIT_AUTHOR_NAME: github-actions[bot]
+          GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+          GIT_COMMITTER_NAME: github-actions[bot]
+          XDG_CONFIG_HOME: /home/runner
+      - name: Upload threat detection log
+        if: always() && steps.detection_guard.outputs.run_detection == 'true'
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: detection
+          path: /tmp/gh-aw/threat-detection/detection.log
+          if-no-files-found: ignore
+      - name: Parse and conclude threat detection
+        id: detection_conclusion
+        if: always()
+        continue-on-error: true
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }}
+          GH_AW_DETECTION_CONTINUE_ON_ERROR: "true"
+        with:
+          script: |
+            try {
+              const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+              setupGlobals(core, github, context, exec, io, getOctokit);
+              const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs');
+              await main();
+            } catch (loadErr) {
+              const continueOnError = process.env.GH_AW_DETECTION_CONTINUE_ON_ERROR !== 'false';
+              const msg = 'ERR_SYSTEM: \u274C Unexpected error loading threat detection module: ' + (loadErr && loadErr.message ? loadErr.message : String(loadErr));
+              core.error(msg);
+              core.setOutput('reason', 'parse_error');
+              if (continueOnError) {
+                core.warning('\u26A0\uFE0F ' + msg);
+                core.setOutput('conclusion', 'warning');
+                core.setOutput('success', 'false');
+              } else {
+                core.setOutput('conclusion', 'failure');
+                core.setOutput('success', 'false');
+                core.setFailed(msg);
+              }
+            }
+
+  safe_outputs:
+    needs:
+      - activation
+      - agent
+      - detection
+    if: (!cancelled()) && needs.agent.result != 'skipped' && needs.detection.result == 'success'
+    runs-on: ubuntu-slim
+    permissions:
+      contents: read
+      discussions: write
+      issues: write
+    timeout-minutes: 15
+    env:
+      GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/tptp-benchmark"
+      GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }}
+      GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }}
+      GH_AW_EFFECTIVE_TOKENS: ${{ needs.agent.outputs.effective_tokens }}
+      GH_AW_ENGINE_ID: "copilot"
+      GH_AW_ENGINE_MODEL: ${{ needs.agent.outputs.model }}
+      GH_AW_ENGINE_VERSION: "1.0.40"
+      GH_AW_WORKFLOW_ID: "tptp-benchmark"
+      GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+    outputs:
+      code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }}
+      code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }}
+      create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }}
+      create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }}
+      process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }}
+      process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }}
+    steps:
+      - name: Setup Scripts
+        id: setup
+        uses: github/gh-aw-actions/setup@v0.72.1
+        with:
+          destination: ${{ runner.temp }}/gh-aw/actions
+          job-name: ${{ github.job }}
+          trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+        env:
+          GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+          GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+          GH_AW_INFO_VERSION: "1.0.40"
+      - name: Download agent output artifact
+        id: download-agent-output
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: agent
+          path: /tmp/gh-aw/
+      - name: Setup agent output environment variable
+        id: setup-agent-output-env
+        if: steps.download-agent-output.outcome == 'success'
+        run: |
+          mkdir -p /tmp/gh-aw/
+          find "/tmp/gh-aw/" -type f -print
+          echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+      - name: Configure GH_HOST for enterprise compatibility
+        id: ghes-host-config
+        shell: bash
+        run: |
+          # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct
+          # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op.
+          GH_HOST="${GITHUB_SERVER_URL#https://}"
+          GH_HOST="${GH_HOST#http://}"
+          echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV"
+      - name: Process Safe Outputs
+        id: process_safe_outputs
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+          GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_API_URL: ${{ github.api_url }}
+          GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"agentic workflows\",\"close_older_discussions\":true,\"expires\":336,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[TPTP Benchmark] \"},\"create_report_incomplete_issue\":{},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"false\"},\"report_incomplete\":{}}"
+        with:
+          github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+          script: |
+            const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+            setupGlobals(core, github, context, exec, io, getOctokit);
+            const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs');
+            await main();
+      - name: Upload Safe Outputs Items
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: safe-outputs-items
+          path: |
+            /tmp/gh-aw/safe-output-items.jsonl
+            /tmp/gh-aw/temporary-id-map.json
+          if-no-files-found: ignore
+
diff --git a/.github/workflows/tptp-benchmark.md b/.github/workflows/tptp-benchmark.md
new file mode 100644
index 000000000..48752372a
--- /dev/null
+++ b/.github/workflows/tptp-benchmark.md
@@ -0,0 +1,554 @@
+---
+description: >
+  Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems.
+  Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies,
+  skips large problems, runs each with a 5-second timeout, and posts a
+  discrepancy/crash report as a GitHub discussion.
+
+on:
+  schedule:
+    - cron: "0 6 * * 1"
+  workflow_dispatch:
+
+permissions: read-all
+
+network:
+  allowed:
+    - defaults
+    - tptp.org
+
+tools:
+  bash: true
+  github:
+    toolsets: [default]
+
+safe-outputs:
+  create-discussion:
+    title-prefix: "[TPTP Benchmark] "
+    category: "Agentic Workflows"
+    close-older-discussions: true
+    expires: 14d
+  missing-tool:
+    create-issue: true
+  noop:
+    report-as-issue: false
+
+timeout-minutes: 300
+
+steps:
+  - name: Checkout repository
+    uses: actions/checkout@v6.0.2
+    with:
+      persist-credentials: false
+
+---
+
+# TPTP Front-End Benchmark
+
+## Job Description
+
+Your name is ${{ github.workflow }}. You are an expert testing engineer for the Z3 theorem prover. Your task is to:
+
+1. Build Z3 from the current `master` branch
+2. Download the TPTP benchmark library from tptp.org
+3. Select 500 random small-to-medium problems (with their axiom dependencies)
+4. Run each problem through Z3's TPTP front-end with a 5-second timeout
+5. Compare Z3's output against the expected SZS status declared in each problem file
+6. Post a detailed report as a GitHub Discussion summarising discrepancies and crashes
+
+**Repository**: ${{ github.repository }}
+**Workspace**: ${{ github.workspace }}
+
+## Phase 1: Build Z3
+
+Install build tools and build Z3 in Release mode.
+
+```bash
+sudo apt-get update -y -q
+sudo apt-get install -y cmake ninja-build python3 wget curl bc 2>/dev/null || true
+```
+
+Configure and build:
+
+```bash
+mkdir -p /tmp/z3-build
+cd /tmp/z3-build
+cmake "$GITHUB_WORKSPACE" \
+  -G Ninja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DZ3_BUILD_TEST_EXECUTABLES=OFF \
+  2>&1 | tail -20
+
+# Build with limited parallelism to avoid OOM alongside LLM inference.
+# Never run ninja in the background with &.
+ninja -j2 z3 2>&1 | tail -30
+```
+
+Verify the build:
+
+```bash
+/tmp/z3-build/z3 --version
+```
+
+If the build fails, call the `noop` safe-output with a message describing the error and stop.
+
+Once the binary is confirmed, call `noop` with `"Z3 built successfully. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive.
+
+## Phase 2: Download the TPTP Problem Library
+
+Find the latest TPTP release and download the full archive.
+
+```bash
+# Find the latest TPTP distribution version by fetching the directory listing
+TPTP_DIST_URL="https://tptp.org/TPTP/Distribution/"
+LATEST_TGZ=$(curl -sL "$TPTP_DIST_URL" \
+  | grep -oP 'TPTP-v[0-9]+\.[0-9]+\.[0-9]+\.tgz' \
+  | sort -V | tail -1)
+
+if [ -z "$LATEST_TGZ" ]; then
+  # Not fatal: fall back to a known stable version
+  LATEST_TGZ="TPTP-v9.0.0.tgz"
+  echo "WARNING: Could not determine latest TPTP version from $TPTP_DIST_URL; falling back to $LATEST_TGZ"
+fi
+
+echo "Downloading $LATEST_TGZ ..."
+mkdir -p /tmp/tptp_download
+wget -q --show-progress \
+  "${TPTP_DIST_URL}${LATEST_TGZ}" \
+  -O /tmp/tptp_download/tptp.tgz || { echo "ERROR: download of ${TPTP_DIST_URL}${LATEST_TGZ} failed"; exit 1; }
+
+echo "Extracting TPTP library..."
+mkdir -p /tmp/tptp
+tar -xzf /tmp/tptp_download/tptp.tgz -C /tmp/tptp --strip-components=1 2>&1 | tail -5
+
+# Verify extraction (this also catches tar failures, whose exit status the pipe above hides)
+if [ ! -d /tmp/tptp/Problems ] || [ ! -d /tmp/tptp/Axioms ]; then
+  echo "ERROR: TPTP extraction failed — Problems/ or Axioms/ directory not found"
+  ls /tmp/tptp/
+  exit 1
+fi
+
+TPTP_ROOT=/tmp/tptp
+echo "TPTP library extracted to $TPTP_ROOT"
+echo "Problem domains available:"
+ls "$TPTP_ROOT/Problems/" | wc -l
+echo "Axiom files available:"
+ls "$TPTP_ROOT/Axioms/" | wc -l
+```
+
+If the download or extraction fails, call `noop` with the error details and stop.
+
+Call `noop` with `"TPTP library downloaded and extracted. Selecting 500 benchmark problems — filtering by size."` to keep the session alive.
+
+## Phase 3: Select 500 Benchmark Problems
+
+Filter out large problems and problems that depend on large axiom files, then take a random sample of 500.
+
+Save this script to `/tmp/select_benchmarks.py` and run it:
+
+```python
+#!/usr/bin/env python3
+"""
+Select 500 random TPTP problems that:
+  - Have a known, conclusive expected status (Theorem, Unsatisfiable,
+    CounterSatisfiable, Satisfiable) OR Unknown/Open status.
+  - Are not "large" (problem file <= 50 KB).
+  - Do not include any axiom file larger than 100 KB.
+"""
+import os
+import re
+import random
+import sys
+
+TPTP_ROOT = "/tmp/tptp"
+PROBLEMS_DIR = os.path.join(TPTP_ROOT, "Problems")
+AXIOMS_DIR = os.path.join(TPTP_ROOT, "Axioms")
+MAX_PROBLEM_SIZE = 50 * 1024      # 50 KB
+MAX_AXIOM_SIZE   = 100 * 1024     # 100 KB
+SAMPLE_SIZE = 500
+OUTPUT_FILE = "/tmp/selected_benchmarks.txt"
+
+include_re = re.compile(r"include\s*\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE)
+status_re  = re.compile(r"%\s*Status\s*:\s*(\S+)", re.IGNORECASE)
+
+def axiom_sizes_ok(problem_path):
+    """Return True if all included axiom files exist and are <= MAX_AXIOM_SIZE."""
+    try:
+        with open(problem_path, encoding="utf-8", errors="replace") as f:
+            content = f.read(4096)   # header is in first few KB
+    except OSError:
+        return False
+    for m in include_re.finditer(content):
+        axiom_rel = m.group(1)        # e.g. "Axioms/AGT001+0.ax"
+        axiom_path = os.path.join(TPTP_ROOT, axiom_rel)
+        if not os.path.exists(axiom_path):
+            return False              # axiom missing — skip
+        if os.path.getsize(axiom_path) > MAX_AXIOM_SIZE:
+            return False              # axiom too large — skip
+    return True
+
+candidates = []
+skipped_size = 0
+skipped_axiom = 0
+
+for domain in sorted(os.listdir(PROBLEMS_DIR)):
+    domain_dir = os.path.join(PROBLEMS_DIR, domain)
+    if not os.path.isdir(domain_dir):
+        continue
+    for fname in os.listdir(domain_dir):
+        if not fname.endswith(".p"):
+            continue
+        fpath = os.path.join(domain_dir, fname)
+        size = os.path.getsize(fpath)
+        if size > MAX_PROBLEM_SIZE:
+            skipped_size += 1
+            continue
+        if not axiom_sizes_ok(fpath):
+            skipped_axiom += 1
+            continue
+        candidates.append(fpath)
+
+print(f"Total candidates (after filtering): {len(candidates)}", flush=True)
+print(f"  Skipped — problem too large : {skipped_size}", flush=True)
+print(f"  Skipped — axiom too large   : {skipped_axiom}", flush=True)
+
+if len(candidates) == 0:
+    print("ERROR: No suitable benchmark problems found.", file=sys.stderr)
+    sys.exit(1)
+
+if len(candidates) > SAMPLE_SIZE:
+    random.seed(42)
+    selected = random.sample(candidates, SAMPLE_SIZE)
+else:
+    selected = candidates
+
+selected.sort()
+with open(OUTPUT_FILE, "w") as f:
+    f.write("\n".join(selected) + "\n")
+
+print(f"Selected {len(selected)} problems → {OUTPUT_FILE}", flush=True)
+```
+
+Run the script:
+
+```bash
+python3 /tmp/select_benchmarks.py
+SELECTED=$(wc -l < /tmp/selected_benchmarks.txt)
+echo "Benchmark set: $SELECTED problems"
+```
+
+If no problems are found, call `noop` with an error message and stop.
+
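+Call `noop` with `"<N> problems selected. Starting benchmark run with 5-second timeout per problem — this will take approximately <M> minutes."` (substitute the value of `$SELECTED` for `<N>` and the computed value of `N * 7 / 60` for `<M>`; do not pass the shell expressions literally) to keep the session alive.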
+
+## Phase 4: Run Benchmarks
+
+Save the following script to `/tmp/run_tptp_benchmarks.sh`, make it executable, and run it.
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Inputs, outputs and limits of the benchmark run.
+PROBLEM_LIST=/tmp/selected_benchmarks.txt   # list of problem files to run
+RESULTS=/tmp/tptp_results.tsv               # tab-separated results table
+TPTP_ROOT=/tmp/tptp                         # exported to Z3 as $TPTP
+Z3=/tmp/z3-build/z3                         # solver binary under test
+Z3_TIMEOUT=5                                # Z3 internal timeout flag -T:5
+TIMEOUT_HARD=8                              # outer OS-level guard (seconds)
+
+printf 'file\texpected\tactual\ttime_s\tnotes\n' > "$RESULTS"
+
+# Helper: extract the expected SZS status from the TPTP problem header.
+get_expected_status() {
+    local file="$1"
+    # Look for lines like: "% Status   : Theorem"
+    grep -m1 -iP '%\s*Status\s*:\s*\K\S+' "$file" 2>/dev/null || echo "Unknown"
+}
+
+# Helper: run z3 on one TPTP problem and print "<verdict> <elapsed_seconds>".
+run_benchmark() {
+    local file="$1"
+    local start end elapsed_ms elapsed output exit_code=0 szs_line verdict
+
+    start=$(date +%s%3N)      # milliseconds since the epoch (GNU date)
+    output=$(TPTP="$TPTP_ROOT" timeout "$TIMEOUT_HARD" \
+        "$Z3" -tptp -T:"$Z3_TIMEOUT" "$file" 2>&1) || exit_code=$?
+    end=$(date +%s%3N)
+    # Format with shell arithmetic instead of bc: no dependency on an extra
+    # package, and sub-second times keep their leading zero ("0.532", not ".532").
+    elapsed_ms=$((end - start))
+    elapsed=$(printf '%d.%03d' "$((elapsed_ms / 1000))" "$((elapsed_ms % 1000))")
+
+    # An SZS status line in the output takes precedence over the exit code.
+    szs_line=$(echo "$output" | grep -m1 "% SZS status" || true)
+    if [ -n "$szs_line" ]; then
+        # Parse the status keyword (e.g. "Theorem", "CounterSatisfiable", "GaveUp")
+        verdict=$(echo "$szs_line" | grep -oP '% SZS status \K\S+' || echo "Unknown")
+    elif [ "$exit_code" -eq 124 ]; then
+        verdict="Timeout"
+    elif [ "$exit_code" -ne 0 ]; then
+        verdict="Crash"
+    else
+        verdict="NoOutput"
+    fi
+    echo "$verdict $elapsed"
+}
+
+COUNTER=0
+TOTAL=$(wc -l < "$PROBLEM_LIST")
+
+while IFS= read -r problem_file; do
+    COUNTER=$((COUNTER + 1))
+
+    expected=$(get_expected_status "$problem_file")
+    result_line=$(run_benchmark "$problem_file")
+    actual=$(echo "$result_line" | cut -d' ' -f1)
+    elapsed=$(echo "$result_line" | cut -d' ' -f2)
+    fname=$(basename "$problem_file")
+
+    # Classify notes
+    notes=""
+    # Soundness discrepancy: both answers are conclusive but conflict
+    conclusive_expected=false
+    conclusive_actual=false
+    case "$expected" in
+        Theorem|Unsatisfiable)     conclusive_expected=true ;;
+        Satisfiable|CounterSatisfiable) conclusive_expected=true ;;
+    esac
+    case "$actual" in
+        Theorem|Unsatisfiable)     conclusive_actual=true ;;
+        Satisfiable|CounterSatisfiable) conclusive_actual=true ;;
+    esac
+
+    if $conclusive_expected && $conclusive_actual; then
+        # Map expected to the Z3 output equivalents for comparison
+        # Theorem (has-conjecture unsat) matches "Theorem"
+        # Unsatisfiable (no-conjecture unsat) matches "Unsatisfiable"
+        # Satisfiable (no-conjecture sat) matches "Satisfiable"
+        # CounterSatisfiable (has-conjecture sat) matches "CounterSatisfiable"
+        if [ "$expected" != "$actual" ]; then
+            # Check for sat/unsat polarity conflict
+            sat_expected=false; sat_actual=false
+            case "$expected" in Satisfiable|CounterSatisfiable) sat_expected=true ;; esac
+            case "$actual"   in Satisfiable|CounterSatisfiable) sat_actual=true   ;; esac
+            if [ "$sat_expected" != "$sat_actual" ]; then
+                notes="SOUNDNESS_ERROR"
+            else
+                notes="STATUS_MISMATCH"
+            fi
+        fi
+    fi
+
+    if [ "$actual" = "Crash" ]; then
+        notes="CRASH"
+    fi
+
+    echo -e "$fname\t$expected\t$actual\t$elapsed\t$notes" >> "$RESULTS"
+
+    if [ -n "$notes" ]; then
+        echo "[$COUNTER/$TOTAL] $fname  expected=$expected  actual=$actual  time=${elapsed}s  *** $notes ***"
+    elif [ $((COUNTER % 50)) -eq 0 ]; then
+        echo "[$COUNTER/$TOTAL] Progress checkpoint  last=$fname  actual=$actual  time=${elapsed}s"
+    fi
+
+done < "$PROBLEM_LIST"
+
+echo "Benchmark run complete: $COUNTER problems processed. Results → $RESULTS"
+```
+
+Run it:
+
+```bash
+chmod +x /tmp/run_tptp_benchmarks.sh
+/tmp/run_tptp_benchmarks.sh
+```
+
+Do not skip any file in the list.
+
+## Phase 5: Analyze Results
+
+Save the following script to `/tmp/analyze_tptp.py` and run it:
+
+```python
+#!/usr/bin/env python3
+"""Compute summary statistics from the TPTP benchmark TSV."""
+import csv
+
+RESULTS_FILE = "/tmp/tptp_results.tsv"
+
+rows = []
+with open(RESULTS_FILE, newline="") as f:
+    reader = csv.DictReader(f, delimiter="\t")
+    for row in reader:
+        rows.append(row)
+
+total = len(rows)
+
+# Verdict counts
+from collections import Counter, defaultdict
+actual_counts = Counter(r["actual"] for r in rows)
+expected_counts = Counter(r["expected"] for r in rows)
+
+# Flagged rows
+soundness_errors = [r for r in rows if r["notes"] == "SOUNDNESS_ERROR"]
+status_mismatches = [r for r in rows if r["notes"] == "STATUS_MISMATCH"]
+crashes           = [r for r in rows if r["notes"] == "CRASH"]
+timeouts          = [r for r in rows if r["actual"] == "Timeout"]
+gave_up           = [r for r in rows if r["actual"] == "GaveUp"]
+
+# Solved correctly (expected matches actual for conclusive verdicts)
+conclusive_expected = {"Theorem", "Unsatisfiable", "Satisfiable", "CounterSatisfiable"}
+correct = [r for r in rows
+           if r["expected"] in conclusive_expected
+           and r["actual"] == r["expected"]]
+
+print(f"TOTAL={total}")
+print(f"CORRECT={len(correct)}")
+print(f"TIMEOUTS={len(timeouts)}")
+print(f"GAVE_UP={len(gave_up)}")
+print(f"CRASHES={len(crashes)}")
+print(f"SOUNDNESS_ERRORS={len(soundness_errors)}")
+print(f"STATUS_MISMATCHES={len(status_mismatches)}")
+
+print("\n--- Actual verdict breakdown ---")
+for v, c in sorted(actual_counts.items()):
+    print(f"  {v}: {c}")
+
+print("\n--- Expected status breakdown ---")
+for v, c in sorted(expected_counts.items()):
+    print(f"  {v}: {c}")
+
+if soundness_errors:
+    print(f"\n--- SOUNDNESS ERRORS ({len(soundness_errors)}) ---")
+    for r in soundness_errors:
+        print(f"  {r['file']}  expected={r['expected']}  actual={r['actual']}")
+
+if crashes:
+    print(f"\n--- CRASHES ({len(crashes)}) ---")
+    for r in crashes:
+        print(f"  {r['file']}  expected={r['expected']}")
+
+if status_mismatches:
+    print(f"\n--- STATUS MISMATCHES ({len(status_mismatches)}) ---")
+    for r in status_mismatches[:20]:
+        print(f"  {r['file']}  expected={r['expected']}  actual={r['actual']}")
+```
+
+Run the analysis:
+
+```bash
+python3 /tmp/analyze_tptp.py
+```
+
+## Phase 6: Generate and Post the Discussion Report
+
+Read the TSV at `/tmp/tptp_results.tsv` and the analysis output, then compose a Markdown report and call `create_discussion`.
+
+The report should use `###` or lower for all headers (never `#` or `##`). Use collapsible `<details>` sections for large tables.
+
+Use this structure:
+
+```markdown
+**Date**: <today's date>
+**Branch**: master
+**Commit**: `<short SHA>` (run `git rev-parse --short HEAD` in ${{ github.workspace }} to get the SHA)
+**Workflow Run**: [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
+**TPTP version**: <downloaded version>
+**Problems benchmarked**: <N> (random sample, timeout 5 s per problem)
+
+---
+
+### Summary
+
+| Metric | Count |
+|--------|-------|
+| Total problems run | N |
+| Correct (expected = actual) | N |
+| Timeouts | N |
+| GaveUp (within time budget) | N |
+| Crashes / errors | N |
+| Soundness errors (sat↔unsat conflict) | N |
+| Status mismatches (Theorem vs Unsatisfiable etc.) | N |
+
+### Expected Status Distribution
+
+| Expected Status | Count |
+|----------------|-------|
+| Theorem | N |
+| Unsatisfiable | N |
+| Satisfiable | N |
+| CounterSatisfiable | N |
+| Unknown / Open | N |
+
+---
+
+### ⚠️ Critical: Soundness Errors
+
+[List ALL files where Z3 returned a conclusive answer that contradicts the expected answer
+(e.g., expected Theorem but got CounterSatisfiable). If none, write "None detected."]
+
+### 💥 Crashes
+
+[List ALL files where Z3 crashed (non-zero exit, no SZS output, not a timeout).
+Include filename and expected status. If none, write "None detected."]
+
+### Status Mismatches
+
+[Files where both answers are conclusive and agree in polarity but use a different
+status keyword (e.g., expected Theorem but actual Unsatisfiable). These may indicate
+conjecture-handling differences rather than soundness bugs. If none, write "None detected."]
+
+---
+
+<details>
+<summary>View all Timeouts (problems where Z3 exceeded the 5-second limit)</summary>
+
+| # | File | Expected Status |
+|---|------|----------------|
+[First 100 timeout rows]
+
+</details>
+
+<details>
+<summary>View full per-problem results table</summary>
+
+| # | File | Expected | Actual | Time (s) | Notes |
+|---|------|----------|--------|----------|-------|
+[All rows, or first 500 if over limit]
+
+</details>
+
+---
+
+### Recommendations
+
+[Based on the findings, list actionable items. E.g.: investigate soundness errors,
+file crash bugs, note domains where Z3 consistently times out.]
+```
+
+Post the discussion using the `create_discussion` safe output. The title should be
+`[TPTP Benchmark] master — <date>`.
+
+## Safe Output Guarantee
+
+You **MUST** call either `create_discussion` or `noop` before the workflow ends:
+
+- **Full success**: Call `create_discussion` with the complete report.
+- **Partial results** (some problems ran): Call `create_discussion` with whatever results are available and a note about incomplete execution.
+- **Build failure**: Call `noop` with a brief message describing the build error.
+- **Download failure**: Call `noop` with the download error details.
+- **No problems selected**: Call `noop` explaining why no problems were found.
+
+Failing to produce any safe output triggers an automatic workflow-failure issue that clutters the repository.
+
+## Important Notes
+
+- **Never run `ninja` in the background with `&`**: Concurrent C++ compilation and LLM inference exhausts available RAM and kills the agent process (exit 137). Always wait for build commands to finish before continuing.
+- **TPTP environment variable**: Set `TPTP=/tmp/tptp` when invoking `z3 -tptp` so that `include()` directives in problem files resolve correctly against the downloaded Axioms directory.
+- **Timeout detection**: Use `timeout 8` as the outer OS-level guard (3 seconds beyond Z3's `-T:5`) to allow Z3 to exit cleanly before the shell kills it. If the exit code from `timeout` is 124, record the verdict as `Timeout`.
+- **Crash detection**: A crash is a non-zero exit code with no `% SZS status` line in the output and no timeout. Record it separately from `GaveUp`.
+- **SZS status semantics**: Z3 outputs `Theorem` (not `Unsatisfiable`) when it proves a conjecture; `CounterSatisfiable` (not `Satisfiable`) when it finds a counterexample to a conjecture. A status mismatch between `Theorem` and `Unsatisfiable` for the same problem may be innocuous and depends on whether the problem file uses a conjecture formula.
+- **Report soundness bugs prominently**: Any case where the polarity of the answer conflicts (expected Theorem/Unsatisfiable but got CounterSatisfiable/Satisfiable, or vice versa) is a potential soundness bug and must be highlighted as critical.
+- **Keep progress log**: Print a line for every flagged result and every 50th problem so the workflow log shows progress.
+- **Close older discussions**: Configured via `close-older-discussions: true`. Only the latest weekly report remains open.