diff --git a/.github/workflows/tptp-benchmark.lock.yml b/.github/workflows/tptp-benchmark.lock.yml
new file mode 100644
index 000000000..7e682c2fb
--- /dev/null
+++ b/.github/workflows/tptp-benchmark.lock.yml
@@ -0,0 +1,1317 @@
+# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"05a6ebcfc669740744c3887e595339f183e37872d3ebbaebe34513c2a8016af8","compiler_version":"v0.72.1","strict":true,"agent_id":"copilot"}
+# gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"v0.72.1","version":"v0.72.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.41"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.6","digest":"sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c"},{"image":"ghcr.io/github/github-mcp-server:v1.0.3","digest":"sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959","pinned_image":"ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]}
+#       ___                    _   _
+#      / _ \                  | | (_)
+#     / /_\ \ __ _  ___ _ __ | |_ _  ___
+#     |  _  |/ _` |/ _ \ '_ \| __| |/ __|
+#     | | | | (_| |  __/ | | | |_| | (__
+#     \_| |_/\__, |\___|_| |_|\__|_|\___|
+#             __/ |
+#  _    _    |___/
+# | |  | |          | |   / _| |
+# | |  | | ___  _ __| | _| |_| | _____      _____
+# | |/\| |/ _ \ '__| |/ /|  _| |/ _ \ \ /\ / / __|
+# \  /\  / (_) | |  |   <| | | | (_) \ V  V /\__ \
+#  \/  \/ \___/|_|  |_|\_\|_| |_|\___/ \_/\_/ |___/
+#
+# This file was automatically generated by gh-aw (v0.72.1). DO NOT EDIT.
+#
+# To update this file, edit the corresponding .md file and run:
+# gh aw compile
+# Not all edits will cause changes to this file.
+#
+# For more information: https://github.github.com/gh-aw/introduction/overview/
+#
+# Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion.
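+#
+# Per problem, the run boils down to something like the sketch below. This is
+# illustrative only: the actual commands come from the prompt in
+# .github/workflows/tptp-benchmark.md, and the z3_tptp binary name and flags
+# are assumptions rather than values pinned in this file.
+#
+#   # TPTP problems reference axiom files via include('Axioms/....ax')
+#   # directives, resolved against the TPTP distribution root ($TPTP):
+#   TPTP=/path/to/TPTP timeout 5 z3_tptp Problems/PUZ/PUZ001+1.p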
+#
+# Secrets used:
+# - COPILOT_GITHUB_TOKEN
+# - GH_AW_GITHUB_MCP_SERVER_TOKEN
+# - GH_AW_GITHUB_TOKEN
+# - GITHUB_TOKEN
+#
+# Custom actions used:
+# - actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+# - actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+# - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+# - actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+# - actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+# - github/gh-aw-actions/setup@v0.72.1
+#
+# Container images used:
+# - ghcr.io/github/gh-aw-firewall/agent:0.25.41
+# - ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41
+# - ghcr.io/github/gh-aw-firewall/squid:0.25.41
+# - ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c
+# - ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959
+# - node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f
+
+name: "TPTP Front-End Benchmark"
+"on":
+ schedule:
+ - cron: "0 6 * * 1"
+ workflow_dispatch:
+ inputs:
+ aw_context:
+ default: ""
+ description: Agent caller context (used internally by Agentic Workflows).
+ required: false
+ type: string
+
+permissions: {}
+
+concurrency:
+ group: "gh-aw-${{ github.workflow }}"
+
+run-name: "TPTP Front-End Benchmark"
+
+jobs:
+ activation:
+ runs-on: ubuntu-slim
+ permissions:
+ actions: read
+ contents: read
+ outputs:
+ comment_id: ""
+ comment_repo: ""
+ engine_id: ${{ steps.generate_aw_info.outputs.engine_id }}
+ lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }}
+ model: ${{ steps.generate_aw_info.outputs.model }}
+ secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }}
+ setup-trace-id: ${{ steps.setup.outputs.trace-id }}
+ stale_lock_file_failed: ${{ steps.check-lock-file.outputs.stale_lock_file_failed == 'true' }}
+ steps:
+ - name: Setup Scripts
+ id: setup
+ uses: github/gh-aw-actions/setup@v0.72.1
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ job-name: ${{ github.job }}
+ env:
+ GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ - name: Generate agentic run info
+ id: generate_aw_info
+ env:
+ GH_AW_INFO_ENGINE_ID: "copilot"
+ GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI"
+ GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ GH_AW_INFO_AGENT_VERSION: "1.0.40"
+ GH_AW_INFO_CLI_VERSION: "v0.72.1"
+ GH_AW_INFO_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_INFO_EXPERIMENTAL: "false"
+ GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true"
+ GH_AW_INFO_STAGED: "false"
+ GH_AW_INFO_ALLOWED_DOMAINS: '["defaults","tptp.org"]'
+ GH_AW_INFO_FIREWALL_ENABLED: "true"
+ GH_AW_INFO_AWF_VERSION: "v0.25.41"
+ GH_AW_INFO_AWMG_VERSION: ""
+ GH_AW_INFO_FIREWALL_TYPE: "squid"
+ GH_AW_COMPILED_STRICT: "true"
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs');
+ await main(core, context);
+ - name: Validate COPILOT_GITHUB_TOKEN secret
+ id: validate-secret
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh" COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default
+ env:
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ - name: Checkout .github and .agents folders
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ sparse-checkout: |
+ .github
+ .agents
+ .claude
+ .codex
+ .crush
+ .gemini
+ .opencode
+ .pi
+ sparse-checkout-cone-mode: true
+ fetch-depth: 1
+ - name: Save agent config folders for base branch restoration
+ env:
+ GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi"
+ GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc"
+ # poutine:ignore untrusted_checkout_exec
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/save_base_github_folders.sh"
+ - name: Check workflow lock file
+ id: check-lock-file
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_WORKFLOW_FILE: "tptp-benchmark.lock.yml"
+ GH_AW_CONTEXT_WORKFLOW_REF: "${{ github.workflow_ref }}"
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs');
+ await main();
+ - name: Check compile-agentic version
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_COMPILED_VERSION: "v0.72.1"
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/check_version_updates.cjs');
+ await main();
+ - name: Create prompt with built-in context
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl
+ GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+ GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+ GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+ GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+ GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+ GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+ # poutine:ignore untrusted_checkout_exec
+ run: |
+ bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh"
+ {
+ cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+
+ GH_AW_PROMPT_dbe2d26d7191f793_EOF
+ cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
+ cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
+ cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+
+ Tools: create_discussion, missing_tool, missing_data, noop
+
+ GH_AW_PROMPT_dbe2d26d7191f793_EOF
+ cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md"
+ cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+
+ The following GitHub context information is available for this workflow:
+ {{#if __GH_AW_GITHUB_ACTOR__ }}
+ - **actor**: __GH_AW_GITHUB_ACTOR__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_REPOSITORY__ }}
+ - **repository**: __GH_AW_GITHUB_REPOSITORY__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_WORKSPACE__ }}
+ - **workspace**: __GH_AW_GITHUB_WORKSPACE__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }}
+ - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }}
+ - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }}
+ - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }}
+ - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__
+ {{/if}}
+ {{#if __GH_AW_GITHUB_RUN_ID__ }}
+ - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__
+ {{/if}}
+
+
+ GH_AW_PROMPT_dbe2d26d7191f793_EOF
+ cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md"
+ cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF'
+
+ {{#runtime-import .github/workflows/tptp-benchmark.md}}
+ GH_AW_PROMPT_dbe2d26d7191f793_EOF
+ } > "$GH_AW_PROMPT"
+ - name: Interpolate variables and render templates
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_ENGINE_ID: "copilot"
+ GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+ GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+ GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+ GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs');
+ await main();
+ - name: Substitute placeholders
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_GITHUB_ACTOR: ${{ github.actor }}
+ GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }}
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }}
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
+ GH_AW_GITHUB_REPOSITORY: ${{ github.repository }}
+ GH_AW_GITHUB_RUN_ID: ${{ github.run_id }}
+ GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }}
+ GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }}
+ GH_AW_MCP_CLI_SERVERS_LIST: '- `safeoutputs` — run `safeoutputs --help` to see available tools'
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+
+ const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs');
+
+ // Call the substitution function
+ return await substitutePlaceholders({
+ file: process.env.GH_AW_PROMPT,
+ substitutions: {
+ GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR,
+ GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID,
+ GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER,
+ GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER,
+ GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER,
+ GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY,
+ GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID,
+ GH_AW_GITHUB_WORKFLOW: process.env.GH_AW_GITHUB_WORKFLOW,
+ GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE,
+ GH_AW_MCP_CLI_SERVERS_LIST: process.env.GH_AW_MCP_CLI_SERVERS_LIST
+ }
+ });
+ - name: Validate prompt placeholders
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ # poutine:ignore untrusted_checkout_exec
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh"
+ - name: Print prompt
+ env:
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ # poutine:ignore untrusted_checkout_exec
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh"
+ - name: Upload activation artifact
+ if: success()
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: activation
+ include-hidden-files: true
+ path: |
+ /tmp/gh-aw/aw_info.json
+ /tmp/gh-aw/aw-prompts/prompt.txt
+ /tmp/gh-aw/aw-prompts/prompt-template.txt
+ /tmp/gh-aw/aw-prompts/prompt-import-tree.json
+ /tmp/gh-aw/github_rate_limits.jsonl
+ /tmp/gh-aw/base
+ /tmp/gh-aw/.github/agents
+ if-no-files-found: ignore
+ retention-days: 1
+
+ agent:
+ needs: activation
+ runs-on: ubuntu-latest
+ permissions: read-all
+ concurrency:
+ group: "gh-aw-copilot-${{ github.workflow }}"
+ env:
+ DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+ GH_AW_ASSETS_ALLOWED_EXTS: ""
+ GH_AW_ASSETS_BRANCH: ""
+ GH_AW_ASSETS_MAX_SIZE_KB: 0
+ GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+ GH_AW_WORKFLOW_ID_SANITIZED: tptpbenchmark
+ outputs:
+ agentic_engine_timeout: ${{ steps.detect-copilot-errors.outputs.agentic_engine_timeout || 'false' }}
+ checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }}
+ effective_tokens: ${{ steps.parse-mcp-gateway.outputs.effective_tokens }}
+ has_patch: ${{ steps.collect_output.outputs.has_patch }}
+ inference_access_error: ${{ steps.detect-copilot-errors.outputs.inference_access_error || 'false' }}
+ mcp_policy_error: ${{ steps.detect-copilot-errors.outputs.mcp_policy_error || 'false' }}
+ model: ${{ needs.activation.outputs.model }}
+ model_not_supported_error: ${{ steps.detect-copilot-errors.outputs.model_not_supported_error || 'false' }}
+ output: ${{ steps.collect_output.outputs.output }}
+ output_types: ${{ steps.collect_output.outputs.output_types }}
+ setup-trace-id: ${{ steps.setup.outputs.trace-id }}
+ steps:
+ - name: Setup Scripts
+ id: setup
+ uses: github/gh-aw-actions/setup@v0.72.1
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ job-name: ${{ github.job }}
+ trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+ env:
+ GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ - name: Set runtime paths
+ id: set-runtime-paths
+ run: |
+ {
+ echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl"
+ echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json"
+ echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json"
+ } >> "$GITHUB_OUTPUT"
+ - name: Create gh-aw temp directory
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh"
+ - name: Configure gh CLI for GitHub Enterprise
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh"
+ env:
+ GH_TOKEN: ${{ github.token }}
+ - name: Checkout repository
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+
+ - name: Configure Git credentials
+ env:
+ REPO_NAME: ${{ github.repository }}
+ SERVER_URL: ${{ github.server_url }}
+ GITHUB_TOKEN: ${{ github.token }}
+ run: |
+ git config --global user.email "github-actions[bot]@users.noreply.github.com"
+ git config --global user.name "github-actions[bot]"
+ git config --global am.keepcr true
+ # Re-authenticate git with GitHub token
+ SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+ git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+ echo "Git configured with standard GitHub Actions identity"
+ - name: Checkout PR branch
+ id: checkout-pr
+ if: |
+ github.event.pull_request || github.event.issue.pull_request
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs');
+ await main();
+ - name: Install GitHub Copilot CLI
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40
+ env:
+ GH_HOST: github.com
+ - name: Install AWF binary
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41
+ - name: Determine automatic lockdown mode for GitHub MCP Server
+ id: determine-automatic-lockdown
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+ GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+ with:
+ script: |
+ const determineAutomaticLockdown = require('${{ runner.temp }}/gh-aw/actions/determine_automatic_lockdown.cjs');
+ await determineAutomaticLockdown(github, context, core);
+ - name: Download activation artifact
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: activation
+ path: /tmp/gh-aw
+ - name: Restore agent config folders from base branch
+ if: steps.checkout-pr.outcome == 'success'
+ env:
+ GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi"
+ GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc"
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_base_github_folders.sh"
+ - name: Restore inline sub-agents from activation artifact
+ env:
+ GH_AW_SUB_AGENT_DIR: ".github/agents"
+ GH_AW_SUB_AGENT_EXT: ".agent.md"
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_inline_sub_agents.sh"
+ - name: Download container images
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41 ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959 node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f
+ - name: Generate Safe Outputs Config
+ run: |
+ mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs"
+ mkdir -p /tmp/gh-aw/safeoutputs
+ mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
+ cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF'
+ {"create_discussion":{"category":"agentic workflows","close_older_discussions":true,"expires":336,"fallback_to_issue":true,"max":1,"title_prefix":"[TPTP Benchmark] "},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}}
+ GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF
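+ # Per the config above: at most one discussion per run, title prefixed with
+ # "[TPTP Benchmark] ", filed under the "agentic workflows" category,
+ # expiring after 336 h (14 days), closing older discussions from previous
+ # runs, and falling back to an issue if discussions are unavailable.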
+ - name: Generate Safe Outputs Tools
+ env:
+ GH_AW_TOOLS_META_JSON: |
+ {
+ "description_suffixes": {
+ "create_discussion": " CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[TPTP Benchmark] \". Discussions will be created in category \"agentic workflows\"."
+ },
+ "repo_params": {},
+ "dynamic_tools": []
+ }
+ GH_AW_VALIDATION_JSON: |
+ {
+ "create_discussion": {
+ "defaultMax": 1,
+ "fields": {
+ "body": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ },
+ "category": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ },
+ "repo": {
+ "type": "string",
+ "maxLength": 256
+ },
+ "title": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ }
+ }
+ },
+ "missing_data": {
+ "defaultMax": 20,
+ "fields": {
+ "alternatives": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "context": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "data_type": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ },
+ "reason": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ }
+ }
+ },
+ "missing_tool": {
+ "defaultMax": 20,
+ "fields": {
+ "alternatives": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 512
+ },
+ "reason": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 256
+ },
+ "tool": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 128
+ }
+ }
+ },
+ "noop": {
+ "defaultMax": 1,
+ "fields": {
+ "message": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ }
+ }
+ },
+ "report_incomplete": {
+ "defaultMax": 5,
+ "fields": {
+ "details": {
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 65000
+ },
+ "reason": {
+ "required": true,
+ "type": "string",
+ "sanitize": true,
+ "maxLength": 1024
+ }
+ }
+ }
+ }
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_safe_outputs_tools.cjs');
+ await main();
+ - name: Generate Safe Outputs MCP Server Config
+ id: safe-outputs-config
+ run: |
+ # Generate a secure random API key (360 bits of entropy, 40+ chars)
+ # Mask immediately so the key can never appear unredacted in the logs
+ API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
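+ # (45 random bytes base64-encode to exactly 60 chars with no '=' padding;
+ # stripping '/' and '+' typically removes only one or two of them, so
+ # 40+ chars is a comfortable lower bound.)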
+ echo "::add-mask::${API_KEY}"
+
+ PORT=3001
+
+ # Set outputs for next steps
+ {
+ echo "safe_outputs_api_key=${API_KEY}"
+ echo "safe_outputs_port=${PORT}"
+ } >> "$GITHUB_OUTPUT"
+
+ echo "Safe Outputs MCP server will run on port ${PORT}"
+
+ - name: Start Safe Outputs MCP HTTP Server
+ id: safe-outputs-start
+ env:
+ DEBUG: '*'
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }}
+ GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }}
+ GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json
+ GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json
+ GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs
+ run: |
+ # Environment variables are set above to prevent template injection
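+ # (If ${{ ... }} expressions were interpolated directly into this script, a
+ # crafted value could break out of the quoting and inject shell commands;
+ # passing them through the env block keeps them out of the parsed text.)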
+ export DEBUG
+ export GH_AW_SAFE_OUTPUTS
+ export GH_AW_SAFE_OUTPUTS_PORT
+ export GH_AW_SAFE_OUTPUTS_API_KEY
+ export GH_AW_SAFE_OUTPUTS_TOOLS_PATH
+ export GH_AW_SAFE_OUTPUTS_CONFIG_PATH
+ export GH_AW_MCP_LOG_DIR
+
+ bash "${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh"
+
+ - name: Start MCP Gateway
+ id: start-mcp-gateway
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }}
+ GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }}
+ GITHUB_MCP_GUARD_MIN_INTEGRITY: ${{ steps.determine-automatic-lockdown.outputs.min_integrity }}
+ GITHUB_MCP_GUARD_REPOS: ${{ steps.determine-automatic-lockdown.outputs.repos }}
+ GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ run: |
+ set -eo pipefail
+ mkdir -p "${RUNNER_TEMP}/gh-aw/mcp-config"
+
+ # Export gateway environment variables for MCP config and gateway script
+ export MCP_GATEWAY_PORT="8080"
+ export MCP_GATEWAY_DOMAIN="host.docker.internal"
+ export MCP_GATEWAY_HOST_DOMAIN="localhost"
+ MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=')
+ echo "::add-mask::${MCP_GATEWAY_API_KEY}"
+ export MCP_GATEWAY_API_KEY
+ export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads"
+ mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}"
+ export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288"
+ export DEBUG="*"
+
+ export GH_AW_ENGINE="copilot"
+ MCP_GATEWAY_UID=$(id -u 2>/dev/null || echo '0')
+ MCP_GATEWAY_GID=$(id -g 2>/dev/null || echo '0')
+ DOCKER_SOCK_GID=$(stat -c '%g' /var/run/docker.sock 2>/dev/null || echo '0')
+ export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host --add-host host.docker.internal:127.0.0.1 --user '"${MCP_GATEWAY_UID}"':'"${MCP_GATEWAY_GID}"' --group-add '"${DOCKER_SOCK_GID}"' -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.3.6'
+
+ mkdir -p /home/runner/.copilot
+ GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node)
+ cat << GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
+ {
+ "mcpServers": {
+ "github": {
+ "type": "stdio",
+ "container": "ghcr.io/github/github-mcp-server:v1.0.3",
+ "env": {
+ "GITHUB_HOST": "\${GITHUB_SERVER_URL}",
+ "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}",
+ "GITHUB_READ_ONLY": "1",
+ "GITHUB_TOOLSETS": "context,repos,issues,pull_requests"
+ },
+ "guard-policies": {
+ "allow-only": {
+ "min-integrity": "$GITHUB_MCP_GUARD_MIN_INTEGRITY",
+ "repos": "$GITHUB_MCP_GUARD_REPOS"
+ }
+ }
+ },
+ "safeoutputs": {
+ "type": "http",
+ "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT",
+ "headers": {
+ "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}"
+ },
+ "guard-policies": {
+ "write-sink": {
+ "accept": [
+ "*"
+ ]
+ }
+ }
+ }
+ },
+ "gateway": {
+ "port": $MCP_GATEWAY_PORT,
+ "domain": "${MCP_GATEWAY_DOMAIN}",
+ "apiKey": "${MCP_GATEWAY_API_KEY}",
+ "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}"
+ }
+ }
+ GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF
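+ # The heredoc above configures the gateway: both MCP servers sit behind a
+ # single authenticated HTTP port, the github server runs read-only behind
+ # "allow-only" repo/integrity guard policies, and safeoutputs is exposed
+ # purely as a write sink.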
+ - name: Mount MCP servers as CLIs
+ id: mount-mcp-clis
+ continue-on-error: true
+ env:
+ MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }}
+ MCP_GATEWAY_DOMAIN: ${{ steps.start-mcp-gateway.outputs.gateway-domain }}
+ MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }}
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/mount_mcp_as_cli.cjs');
+ await main();
+ - name: Clean credentials
+ continue-on-error: true
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh"
+ - name: Audit pre-agent workspace
+ id: pre_agent_audit
+ continue-on-error: true
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/audit_pre_agent_workspace.sh"
+ - name: Execute GitHub Copilot CLI
+ id: agentic_execution
+ # Copilot CLI tool arguments (sorted):
+ timeout-minutes: 300
+ run: |
+ set -o pipefail
+ touch /tmp/gh-aw/agent-step-summary.md
+ GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true)
+ export GH_AW_NODE_BIN
+ (umask 177 && touch /tmp/gh-aw/agent-stdio.log)
+ printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","api.snapcraft.io","archive.ubuntu.com","azure.archive.ubuntu.com","crl.geotrust.com","crl.globalsign.com","crl.identrust.com","crl.sectigo.com","crl.thawte.com","crl.usertrust.com","crl.verisign.com","crl3.digicert.com","crl4.digicert.com","crls.ssl.com","github.com","host.docker.internal","json-schema.org","json.schemastore.org","keyserver.ubuntu.com","ocsp.digicert.com","ocsp.geotrust.com","ocsp.globalsign.com","ocsp.identrust.com","ocsp.sectigo.com","ocsp.ssl.com","ocsp.thawte.com","ocsp.usertrust.com","ocsp.verisign.com","packagecloud.io","packages.cloud.google.com","packages.microsoft.com","ppa.launchpad.net","raw.githubusercontent.com","registry.npmjs.org","s.symcb.com","s.symcd.com","security.ubuntu.com","telemetry.enterprise.githubcopilot.com","tptp.org","ts-crl.ws.symantec.com","ts-ocsp.ws.symantec.com","www.googleapis.com"]},"apiProxy":{"enabled":true,"models":{"auto":["large"],"deep-research":["copilot/deep-research*","copilot/o3-deep-research*","copilot/o4-mini-deep-research*","google/deep-research*","openai/o3-deep-research*","openai/o4-mini-deep-research*"],"gemini-flash":["copilot/gemini-*flash*","google/gemini-*flash*"],"gemini-pro":["copilot/gemini-*pro*","google/gemini-*pro*"],"gpt-4.1":["copilot/gpt-4.1*","openai/gpt-4.1*"],"gpt-5":["copilot/gpt-5*","openai/gpt-5*"],"gpt-5-codex":["copilot/gpt-5*codex*","openai/gpt-5*codex*"],"gpt-5-mini":["copilot/gpt-5*mini*","openai/gpt-5*mini*"],"gpt-5-nano":["copilot/gpt-5*nano*","openai/gpt-5*nano*"],"gpt-5-pro":["copilot/gpt-5*pro*","openai/gpt-5*pro*"],"haiku":["copilot/*haiku*","anthropic/*haiku*"],"large":["sonnet","gpt-5-pro","gpt-5","gemini-pro"],"mini":["haiku","gpt-5-mini","gpt-5-nano","gemini-flash"],"opus":["copilot/*opus*","anthropic/*opus*"],"reasoning":["copilot/o1*","copilot/o3*","copilot/o4*","openai/o1*","openai/o3*","openai/o4*"],"small":["mini"],"sonnet":["copilot/*sonnet*","anthropic/*sonnet*"]}},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json
+ # shellcheck disable=SC1003
+ sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --exclude-env GITHUB_MCP_SERVER_TOKEN --exclude-env MCP_GATEWAY_API_KEY --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \
+ -- /bin/bash -c 'export PATH="${RUNNER_TEMP}/gh-aw/mcp-cli/bin:$PATH" && export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --allow-all-paths --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log
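+ # awf runs the CLI inside the firewall sandbox: only the allow-listed
+ # domains in awf-config.json (including tptp.org for the benchmark
+ # downloads) are reachable, and the --exclude-env secrets never enter the
+ # container environment.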
+ env:
+ AWF_REFLECT_ENABLED: 1
+ COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+ COPILOT_API_KEY: dummy-byok-key-for-offline-mode
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }}
+ GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json
+ GH_AW_PHASE: agent
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_VERSION: v0.72.1
+ GITHUB_API_URL: ${{ github.api_url }}
+ GITHUB_AW: true
+ GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows
+ GITHUB_HEAD_REF: ${{ github.head_ref }}
+ GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ GITHUB_REF_NAME: ${{ github.ref_name }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md
+ GITHUB_WORKSPACE: ${{ github.workspace }}
+ GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_AUTHOR_NAME: github-actions[bot]
+ GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_COMMITTER_NAME: github-actions[bot]
+ XDG_CONFIG_HOME: /home/runner
+ - name: Detect Copilot errors
+ id: detect-copilot-errors
+ if: always()
+ continue-on-error: true
+ run: node "${RUNNER_TEMP}/gh-aw/actions/detect_copilot_errors.cjs"
+ - name: Configure Git credentials
+ env:
+ REPO_NAME: ${{ github.repository }}
+ SERVER_URL: ${{ github.server_url }}
+ GITHUB_TOKEN: ${{ github.token }}
+ run: |
+ git config --global user.email "github-actions[bot]@users.noreply.github.com"
+ git config --global user.name "github-actions[bot]"
+ git config --global am.keepcr true
+ # Re-authenticate git with GitHub token
+ SERVER_URL_STRIPPED="${SERVER_URL#https://}"
+ git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git"
+ echo "Git configured with standard GitHub Actions identity"
+ - name: Copy Copilot session state files to logs
+ if: always()
+ continue-on-error: true
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/copy_copilot_session_state.sh"
+ - name: Stop MCP Gateway
+ if: always()
+ continue-on-error: true
+ env:
+ MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }}
+ MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }}
+ GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }}
+ run: |
+ bash "${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh" "$GATEWAY_PID"
+ - name: Redact secrets in logs
+ if: always()
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs');
+ await main();
+ env:
+ GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN'
+ SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }}
+ SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }}
+ SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ - name: Append agent step summary
+ if: always()
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh"
+ - name: Copy Safe Outputs
+ if: always()
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ run: |
+ mkdir -p /tmp/gh-aw
+ cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true
+ - name: Ingest agent output
+ id: collect_output
+ if: always()
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }}
+ GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_API_URL: ${{ github.api_url }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs');
+ await main();
+ - name: Parse agent logs for step summary
+ if: always()
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs');
+ await main();
+ - name: Parse MCP Gateway logs for step summary
+ if: always()
+ id: parse-mcp-gateway
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs');
+ await main();
+ - name: Print firewall logs
+ if: always()
+ continue-on-error: true
+ env:
+ AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs
+ run: |
+ # Fix permissions on firewall logs/audit dirs so they can be uploaded as artifacts
+ # AWF runs with sudo, creating files owned by root
+ sudo chmod -R a+rX /tmp/gh-aw/sandbox/firewall 2>/dev/null || true
+ # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step)
+ if command -v awf &> /dev/null; then
+ awf logs summary | tee -a "$GITHUB_STEP_SUMMARY"
+ else
+ echo 'AWF binary not installed, skipping firewall log summary'
+ fi
+ - name: Parse token usage for step summary
+ if: always()
+ continue-on-error: true
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_token_usage.cjs');
+ await main();
+ - name: Print AWF reflect summary
+ if: always()
+ continue-on-error: true
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/awf_reflect_summary.cjs');
+ await main();
+ - name: Write agent output placeholder if missing
+ if: always()
+ run: |
+ if [ ! -f /tmp/gh-aw/agent_output.json ]; then
+ echo '{"items":[]}' > /tmp/gh-aw/agent_output.json
+ fi
+ - name: Upload agent artifacts
+ if: always()
+ continue-on-error: true
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: agent
+ path: |
+ /tmp/gh-aw/aw-prompts/prompt.txt
+ /tmp/gh-aw/sandbox/agent/logs/
+ /tmp/gh-aw/redacted-urls.log
+ /tmp/gh-aw/mcp-logs/
+ /tmp/gh-aw/agent_usage.json
+ /tmp/gh-aw/agent-stdio.log
+ /tmp/gh-aw/pre-agent-audit.txt
+ /tmp/gh-aw/agent/
+ /tmp/gh-aw/github_rate_limits.jsonl
+ /tmp/gh-aw/safeoutputs.jsonl
+ /tmp/gh-aw/agent_output.json
+ /tmp/gh-aw/aw-*.patch
+ /tmp/gh-aw/aw-*.bundle
+ /tmp/gh-aw/awf-config.json
+ /tmp/gh-aw/sandbox/firewall/logs/
+ /tmp/gh-aw/sandbox/firewall/audit/
+ /tmp/gh-aw/sandbox/firewall/awf-reflect.json
+ if-no-files-found: ignore
+
+ conclusion:
+ needs:
+ - activation
+ - agent
+ - detection
+ - safe_outputs
+ if: >
+ always() && (needs.agent.result != 'skipped' || needs.activation.outputs.lockdown_check_failed == 'true' ||
+ needs.activation.outputs.stale_lock_file_failed == 'true')
+ runs-on: ubuntu-slim
+ permissions:
+ contents: read
+ discussions: write
+ issues: write
+ concurrency:
+ group: "gh-aw-conclusion-tptp-benchmark"
+ cancel-in-progress: false
+ outputs:
+ incomplete_count: ${{ steps.report_incomplete.outputs.incomplete_count }}
+ noop_message: ${{ steps.noop.outputs.noop_message }}
+ tools_reported: ${{ steps.missing_tool.outputs.tools_reported }}
+ total_count: ${{ steps.missing_tool.outputs.total_count }}
+ steps:
+ - name: Setup Scripts
+ id: setup
+ uses: github/gh-aw-actions/setup@v0.72.1
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ job-name: ${{ github.job }}
+ trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+ env:
+ GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ - name: Process no-op messages
+ id: noop
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_NOOP_MAX: "1"
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+ GH_AW_NOOP_REPORT_AS_ISSUE: "false"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs');
+ await main();
+ - name: Log detection run
+ id: detection_runs
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }}
+ GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }}
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_detection_runs.cjs');
+ await main();
+ - name: Record missing tool
+ id: missing_tool
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_MISSING_TOOL_CREATE_ISSUE: "true"
+ GH_AW_MISSING_TOOL_TITLE_PREFIX: "[missing tool]"
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs');
+ await main();
+ - name: Record incomplete
+ id: report_incomplete
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_REPORT_INCOMPLETE_CREATE_ISSUE: "true"
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/report_incomplete_handler.cjs');
+ await main();
+ - name: Handle agent failure
+ id: handle_agent_failure
+ if: always()
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }}
+ GH_AW_WORKFLOW_ID: "tptp-benchmark"
+ GH_AW_ACTION_FAILURE_ISSUE_EXPIRES_HOURS: "168"
+ GH_AW_ENGINE_ID: "copilot"
+ GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }}
+ GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }}
+ GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }}
+ GH_AW_MCP_POLICY_ERROR: ${{ needs.agent.outputs.mcp_policy_error }}
+ GH_AW_AGENTIC_ENGINE_TIMEOUT: ${{ needs.agent.outputs.agentic_engine_timeout }}
+ GH_AW_MODEL_NOT_SUPPORTED_ERROR: ${{ needs.agent.outputs.model_not_supported_error }}
+ GH_AW_ENGINE_API_HOSTS: "api.enterprise.githubcopilot.com,api.githubcopilot.com,api.business.githubcopilot.com,api.individual.githubcopilot.com"
+ GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }}
+ GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }}
+ GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }}
+ GH_AW_STALE_LOCK_FILE_FAILED: ${{ needs.activation.outputs.stale_lock_file_failed }}
+ GH_AW_GROUP_REPORTS: "false"
+ GH_AW_FAILURE_REPORT_AS_ISSUE: "true"
+ GH_AW_MISSING_TOOL_REPORT_AS_FAILURE: "true"
+ GH_AW_MISSING_DATA_REPORT_AS_FAILURE: "true"
+ GH_AW_TIMEOUT_MINUTES: "300"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs');
+ await main();
+
+ detection:
+ needs:
+ - activation
+ - agent
+ if: >
+ always() && needs.agent.result != 'skipped' && (needs.agent.outputs.output_types != '' || needs.agent.outputs.has_patch == 'true')
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ outputs:
+ detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }}
+ detection_reason: ${{ steps.detection_conclusion.outputs.reason }}
+ detection_success: ${{ steps.detection_conclusion.outputs.success }}
+ steps:
+ - name: Setup Scripts
+ id: setup
+ uses: github/gh-aw-actions/setup@v0.72.1
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ job-name: ${{ github.job }}
+ trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+ env:
+ GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ - name: Checkout repository for patch context
+ if: needs.agent.outputs.has_patch == 'true'
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+ # --- Threat Detection ---
+ - name: Clean stale firewall files from agent artifact
+ run: |
+ rm -rf /tmp/gh-aw/sandbox/firewall/logs
+ rm -rf /tmp/gh-aw/sandbox/firewall/audit
+ - name: Download container images
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41
+ - name: Check if detection needed
+ id: detection_guard
+ if: always()
+ env:
+ OUTPUT_TYPES: ${{ needs.agent.outputs.output_types }}
+ HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+ run: |
+ if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then
+ echo "run_detection=true" >> "$GITHUB_OUTPUT"
+ echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH"
+ else
+ echo "run_detection=false" >> "$GITHUB_OUTPUT"
+ echo "Detection skipped: no agent outputs or patches to analyze"
+ fi
+ - name: Clear MCP Config for detection
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ rm -f "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json"
+ rm -f /home/runner/.copilot/mcp-config.json
+ rm -f "$GITHUB_WORKSPACE/.gemini/settings.json"
+ - name: Prepare threat detection files
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ mkdir -p /tmp/gh-aw/threat-detection/aw-prompts
+ cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true
+ cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true
+ for f in /tmp/gh-aw/aw-*.patch; do
+ [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+ done
+ for f in /tmp/gh-aw/aw-*.bundle; do
+ [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+ done
+ echo "Prepared threat detection files:"
+ ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true
+ - name: Setup threat detection
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ WORKFLOW_DESCRIPTION: "Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion."
+ HAS_PATCH: ${{ needs.agent.outputs.has_patch }}
+ with:
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs');
+ await main();
+ - name: Ensure threat-detection directory and log
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ run: |
+ mkdir -p /tmp/gh-aw/threat-detection
+ touch /tmp/gh-aw/threat-detection/detection.log
+ - name: Setup Node.js
+ uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+ with:
+ node-version: '24'
+ package-manager-cache: false
+ - name: Install GitHub Copilot CLI
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40
+ env:
+ GH_HOST: github.com
+ - name: Install AWF binary
+ run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41
+ - name: Execute GitHub Copilot CLI
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ continue-on-error: true
+ id: detection_agentic_execution
+ # Copilot CLI tool arguments (sorted):
+ timeout-minutes: 20
+ run: |
+ set -o pipefail
+ touch /tmp/gh-aw/agent-step-summary.md
+ GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true)
+ export GH_AW_NODE_BIN
+ (umask 177 && touch /tmp/gh-aw/threat-detection/detection.log)
+ printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","github.com","host.docker.internal","telemetry.enterprise.githubcopilot.com"]},"apiProxy":{"enabled":true},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json
+ # shellcheck disable=SC1003
+ sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \
+ -- /bin/bash -c 'export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log
+ env:
+ AWF_REFLECT_ENABLED: 1
+ COPILOT_AGENT_RUNNER_TYPE: STANDALONE
+ COPILOT_API_KEY: dummy-byok-key-for-offline-mode
+ COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+ COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || 'claude-sonnet-4.6' }}
+ GH_AW_PHASE: detection
+ GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt
+ GH_AW_VERSION: v0.72.1
+ GITHUB_API_URL: ${{ github.api_url }}
+ GITHUB_AW: true
+ GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows
+ GITHUB_HEAD_REF: ${{ github.head_ref }}
+ GITHUB_REF_NAME: ${{ github.ref_name }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md
+ GITHUB_WORKSPACE: ${{ github.workspace }}
+ GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_AUTHOR_NAME: github-actions[bot]
+ GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_COMMITTER_NAME: github-actions[bot]
+ XDG_CONFIG_HOME: /home/runner
+ - name: Upload threat detection log
+ if: always() && steps.detection_guard.outputs.run_detection == 'true'
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: detection
+ path: /tmp/gh-aw/threat-detection/detection.log
+ if-no-files-found: ignore
+ - name: Parse and conclude threat detection
+ id: detection_conclusion
+ if: always()
+ continue-on-error: true
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }}
+ GH_AW_DETECTION_CONTINUE_ON_ERROR: "true"
+ with:
+ script: |
+ try {
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs');
+ await main();
+ } catch (loadErr) {
+ const continueOnError = process.env.GH_AW_DETECTION_CONTINUE_ON_ERROR !== 'false';
+ const msg = 'ERR_SYSTEM: \u274C Unexpected error loading threat detection module: ' + (loadErr && loadErr.message ? loadErr.message : String(loadErr));
+ core.error(msg);
+ core.setOutput('reason', 'parse_error');
+ if (continueOnError) {
+ core.warning('\u26A0\uFE0F ' + msg);
+ core.setOutput('conclusion', 'warning');
+ core.setOutput('success', 'false');
+ } else {
+ core.setOutput('conclusion', 'failure');
+ core.setOutput('success', 'false');
+ core.setFailed(msg);
+ }
+ }
+
+ safe_outputs:
+ needs:
+ - activation
+ - agent
+ - detection
+ if: (!cancelled()) && needs.agent.result != 'skipped' && needs.detection.result == 'success'
+ runs-on: ubuntu-slim
+ permissions:
+ contents: read
+ discussions: write
+ issues: write
+ timeout-minutes: 15
+ env:
+ GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/tptp-benchmark"
+ GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }}
+ GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }}
+ GH_AW_EFFECTIVE_TOKENS: ${{ needs.agent.outputs.effective_tokens }}
+ GH_AW_ENGINE_ID: "copilot"
+ GH_AW_ENGINE_MODEL: ${{ needs.agent.outputs.model }}
+ GH_AW_ENGINE_VERSION: "1.0.40"
+ GH_AW_WORKFLOW_ID: "tptp-benchmark"
+ GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ outputs:
+ code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }}
+ code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }}
+ create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }}
+ create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }}
+ process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }}
+ process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }}
+ steps:
+ - name: Setup Scripts
+ id: setup
+ uses: github/gh-aw-actions/setup@v0.72.1
+ with:
+ destination: ${{ runner.temp }}/gh-aw/actions
+ job-name: ${{ github.job }}
+ trace-id: ${{ needs.activation.outputs.setup-trace-id }}
+ env:
+ GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark"
+ GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }}
+ GH_AW_INFO_VERSION: "1.0.40"
+ - name: Download agent output artifact
+ id: download-agent-output
+ continue-on-error: true
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: agent
+ path: /tmp/gh-aw/
+ - name: Setup agent output environment variable
+ id: setup-agent-output-env
+ if: steps.download-agent-output.outcome == 'success'
+ run: |
+ mkdir -p /tmp/gh-aw/
+ find "/tmp/gh-aw/" -type f -print
+ echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT"
+ - name: Configure GH_HOST for enterprise compatibility
+ id: ghes-host-config
+ shell: bash
+ run: |
+ # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct
+ # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op.
+ GH_HOST="${GITHUB_SERVER_URL#https://}"
+ GH_HOST="${GH_HOST#http://}"
+ echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV"
+ - name: Process Safe Outputs
+ id: process_safe_outputs
+ uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+ env:
+ GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }}
+ GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com"
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ GITHUB_API_URL: ${{ github.api_url }}
+ GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"agentic workflows\",\"close_older_discussions\":true,\"expires\":336,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[TPTP Benchmark] \"},\"create_report_incomplete_issue\":{},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"false\"},\"report_incomplete\":{}}"
+ with:
+ github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
+ script: |
+ const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs');
+ setupGlobals(core, github, context, exec, io, getOctokit);
+ const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs');
+ await main();
+ - name: Upload Safe Outputs Items
+ if: always()
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: safe-outputs-items
+ path: |
+ /tmp/gh-aw/safe-output-items.jsonl
+ /tmp/gh-aw/temporary-id-map.json
+ if-no-files-found: ignore
+
diff --git a/.github/workflows/tptp-benchmark.md b/.github/workflows/tptp-benchmark.md
new file mode 100644
index 000000000..48752372a
--- /dev/null
+++ b/.github/workflows/tptp-benchmark.md
@@ -0,0 +1,554 @@
+---
+description: >
+ Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems.
+ Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies,
+ skips large problems, runs each with a 5-second timeout, and posts a
+ discrepancy/crash report as a GitHub discussion.
+
+on:
+ schedule:
+ - cron: "0 6 * * 1"
+ workflow_dispatch:
+
+permissions: read-all
+
+network:
+ allowed:
+ - defaults
+ - tptp.org
+
+tools:
+ bash: true
+ github:
+ toolsets: [default]
+
+safe-outputs:
+ create-discussion:
+ title-prefix: "[TPTP Benchmark] "
+ category: "Agentic Workflows"
+ close-older-discussions: true
+ expires: 14d
+ missing-tool:
+ create-issue: true
+ noop:
+ report-as-issue: false
+
+timeout-minutes: 300
+
+steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6.0.2
+ with:
+ persist-credentials: false
+
+---
+
+# TPTP Front-End Benchmark
+
+## Job Description
+
+Your name is ${{ github.workflow }}. You are an expert testing engineer for the Z3 theorem prover. Your task is to:
+
+1. Build Z3 from the current `master` branch
+2. Download the TPTP benchmark library from tptp.org
+3. Select 500 random small-to-medium problems (with their axiom dependencies)
+4. Run each problem through Z3's TPTP front-end with a 5-second timeout
+5. Compare Z3's output against the expected SZS status declared in each problem file
+6. Post a detailed report as a GitHub Discussion summarising discrepancies and crashes
+
+**Repository**: ${{ github.repository }}
+**Workspace**: ${{ github.workspace }}
+
+## Phase 1: Build Z3
+
+Install build tools and build Z3 in Release mode.
+
+```bash
+sudo apt-get update -y -q
+sudo apt-get install -y cmake ninja-build python3 wget curl bc 2>/dev/null || true
+```
+
+Configure and build:
+
+```bash
+mkdir -p /tmp/z3-build
+cd /tmp/z3-build
+cmake "$GITHUB_WORKSPACE" \
+ -G Ninja \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DZ3_BUILD_TEST_EXECUTABLES=OFF \
+ 2>&1 | tail -20
+
+# Build with limited parallelism to avoid OOM alongside LLM inference.
+# Never run ninja in the background with &.
+ninja -j2 z3 2>&1 | tail -30
+```
+
+Verify the build:
+
+```bash
+/tmp/z3-build/z3 --version
+```
+
+If the build fails, call the `noop` safe-output with a message describing the error and stop.
+
+Once the binary is confirmed, call `noop` with `"Z3 built successfully. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive.
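+
+Optionally, sanity-check the TPTP front-end itself before committing to the full run. A minimal sketch (the tiny problem file and its expected output line are illustrative assumptions, not part of the benchmark set):
+
+```bash
+# Hypothetical smoke test: a trivial FOF problem whose conjecture follows
+# directly from its single axiom, so Z3 should report SZS status Theorem.
+cat > /tmp/smoke.p <<'EOF'
+fof(ax1, axiom, p).
+fof(c1, conjecture, p).
+EOF
+/tmp/z3-build/z3 -tptp -T:5 /tmp/smoke.p
+# Expected output includes a line like: % SZS status Theorem
+```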
+
+## Phase 2: Download the TPTP Problem Library
+
+Find the latest TPTP release and download the full archive.
+
+```bash
+# Find the latest TPTP distribution version by fetching the directory listing
+TPTP_DIST_URL="https://tptp.org/TPTP/Distribution/"
+LATEST_TGZ=$(curl -sL "$TPTP_DIST_URL" \
+ | grep -oP 'TPTP-v[0-9]+\.[0-9]+\.[0-9]+\.tgz' \
+ | sort -V | tail -1)
+
+if [ -z "$LATEST_TGZ" ]; then
+ echo "ERROR: Could not determine latest TPTP version from $TPTP_DIST_URL"
+ # Fall back to a known stable version
+ LATEST_TGZ="TPTP-v9.0.0.tgz"
+fi
+
+echo "Downloading $LATEST_TGZ ..."
+mkdir -p /tmp/tptp_download
+wget -q --show-progress \
+ "${TPTP_DIST_URL}${LATEST_TGZ}" \
+ -O /tmp/tptp_download/tptp.tgz
+
+echo "Extracting TPTP library..."
+mkdir -p /tmp/tptp
+tar -xzf /tmp/tptp_download/tptp.tgz -C /tmp/tptp --strip-components=1 2>&1 | tail -5
+
+# Verify extraction
+if [ ! -d /tmp/tptp/Problems ] || [ ! -d /tmp/tptp/Axioms ]; then
+ echo "ERROR: TPTP extraction failed — Problems/ or Axioms/ directory not found"
+ ls /tmp/tptp/
+ exit 1
+fi
+
+TPTP_ROOT=/tmp/tptp
+echo "TPTP library extracted to $TPTP_ROOT"
+echo "Problem domains available:"
+ls "$TPTP_ROOT/Problems/" | wc -l
+echo "Axiom files available:"
+ls "$TPTP_ROOT/Axioms/" | wc -l
+```
+
+If the download or extraction fails, call `noop` with the error details and stop.
+
+Call `noop` with `"TPTP library downloaded and extracted. Selecting 500 benchmark problems — filtering by size."` to keep the session alive.
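+
+While `$LATEST_TGZ` is still in scope, it is worth stashing the version string for the Phase 6 report header. A suggested helper (the file name `/tmp/tptp_version.txt` is a choice made here, not mandated by the workflow):
+
+```bash
+# Persist the TPTP version (e.g. "v9.0.0") for the report in Phase 6.
+grep -oP 'v[0-9]+\.[0-9]+\.[0-9]+' <<< "$LATEST_TGZ" > /tmp/tptp_version.txt
+cat /tmp/tptp_version.txt
+```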
+
+## Phase 3: Select 500 Benchmark Problems
+
+Filter out large problems and problems that depend on large axiom files, then take a random sample of 500.
+
+Save this script to `/tmp/select_benchmarks.py` and run it:
+
+```python
+#!/usr/bin/env python3
+"""
+Select 500 random TPTP problems. Every expected status is accepted
+(Theorem, Unsatisfiable, CounterSatisfiable, Satisfiable, Unknown, Open),
+so the filters are purely size-based:
+ - Are not "large" (problem file <= 50 KB).
+ - Do not include any axiom file larger than 100 KB.
+"""
+import os
+import re
+import random
+import sys
+
+TPTP_ROOT = "/tmp/tptp"
+PROBLEMS_DIR = os.path.join(TPTP_ROOT, "Problems")
+AXIOMS_DIR = os.path.join(TPTP_ROOT, "Axioms")
+MAX_PROBLEM_SIZE = 50 * 1024 # 50 KB
+MAX_AXIOM_SIZE = 100 * 1024 # 100 KB
+SAMPLE_SIZE = 500
+OUTPUT_FILE = "/tmp/selected_benchmarks.txt"
+
+include_re = re.compile(r"include\s*\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE)
+
+def axiom_sizes_ok(problem_path):
+ """Return True if all included axiom files exist and are <= MAX_AXIOM_SIZE."""
+ try:
+ with open(problem_path, encoding="utf-8", errors="replace") as f:
+            content = f.read()  # file is <= MAX_PROBLEM_SIZE here; read it all so no include() line below a long header is missed
+ except OSError:
+ return False
+ for m in include_re.finditer(content):
+ axiom_rel = m.group(1) # e.g. "Axioms/AGT001+0.ax"
+ axiom_path = os.path.join(TPTP_ROOT, axiom_rel)
+ if not os.path.exists(axiom_path):
+ return False # axiom missing — skip
+ if os.path.getsize(axiom_path) > MAX_AXIOM_SIZE:
+ return False # axiom too large — skip
+ return True
+
+candidates = []
+skipped_size = 0
+skipped_axiom = 0
+
+for domain in sorted(os.listdir(PROBLEMS_DIR)):
+ domain_dir = os.path.join(PROBLEMS_DIR, domain)
+ if not os.path.isdir(domain_dir):
+ continue
+ for fname in os.listdir(domain_dir):
+ if not fname.endswith(".p"):
+ continue
+ fpath = os.path.join(domain_dir, fname)
+ size = os.path.getsize(fpath)
+ if size > MAX_PROBLEM_SIZE:
+ skipped_size += 1
+ continue
+ if not axiom_sizes_ok(fpath):
+ skipped_axiom += 1
+ continue
+ candidates.append(fpath)
+
+print(f"Total candidates (after filtering): {len(candidates)}", flush=True)
+print(f" Skipped — problem too large : {skipped_size}", flush=True)
+print(f" Skipped — axiom too large : {skipped_axiom}", flush=True)
+
+if len(candidates) == 0:
+ print("ERROR: No suitable benchmark problems found.", file=sys.stderr)
+ sys.exit(1)
+
+if len(candidates) > SAMPLE_SIZE:
+ random.seed(42)
+ selected = random.sample(candidates, SAMPLE_SIZE)
+else:
+ selected = candidates
+
+selected.sort()
+with open(OUTPUT_FILE, "w") as f:
+ f.write("\n".join(selected) + "\n")
+
+print(f"Selected {len(selected)} problems → {OUTPUT_FILE}", flush=True)
+```
+
+Run the script:
+
+```bash
+python3 /tmp/select_benchmarks.py
+SELECTED=$(wc -l < /tmp/selected_benchmarks.txt)
+echo "Benchmark set: $SELECTED problems"
+```
+
+If no problems are found, call `noop` with an error message and stop.
+
+Call `noop` with `"$SELECTED problems selected. Starting benchmark run with 5-second timeout per problem — this will take approximately $(( SELECTED * 7 / 60 )) minutes."` to keep the session alive.
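+
+Before starting the long run, it can help to eyeball how the sample spreads across TPTP domains. A quick sketch (assuming the standard `/tmp/tptp/Problems/<DOMAIN>/<file>.p` layout used above):
+
+```bash
+# Count selected problems per TPTP domain (the parent directory of each file).
+awk -F/ '{print $(NF-1)}' /tmp/selected_benchmarks.txt | sort | uniq -c | sort -rn | head -15
+```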
+
+## Phase 4: Run Benchmarks
+
+Save the following script to `/tmp/run_tptp_benchmarks.sh`, make it executable, and run it.
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+Z3=/tmp/z3-build/z3
+TPTP_ROOT=/tmp/tptp
+TIMEOUT_HARD=8 # outer OS-level guard (seconds)
+Z3_TIMEOUT=5 # Z3 internal timeout flag -T:5
+
+RESULTS=/tmp/tptp_results.tsv
+PROBLEM_LIST=/tmp/selected_benchmarks.txt
+
+echo -e "file\texpected\tactual\ttime_s\tnotes" > "$RESULTS"
+
+# Helper: extract the expected SZS status from the TPTP problem header.
+get_expected_status() {
+ local file="$1"
+  # Look for lines like: "% Status : Theorem"; -o prints only the keyword after \K.
+  grep -m1 -ioP '%\s*Status\s*:\s*\K\S+' "$file" 2>/dev/null || echo "Unknown"
+}
+
+# Helper: run z3 on a single TPTP problem with timeout.
+run_benchmark() {
+ local file="$1"
+ local start end elapsed output exit_code verdict
+
+ start=$(date +%s%3N)
+ output=$(TPTP="$TPTP_ROOT" timeout "$TIMEOUT_HARD" \
+ "$Z3" -tptp -T:"$Z3_TIMEOUT" "$file" 2>&1) || exit_code=$?
+ exit_code=${exit_code:-0}
+ end=$(date +%s%3N)
+ elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc)
+
+ # Extract SZS status line from output
+ szs_line=$(echo "$output" | grep -m1 "% SZS status" || true)
+
+ if [ -n "$szs_line" ]; then
+ # Parse the status keyword (e.g. "Theorem", "CounterSatisfiable", "GaveUp")
+ verdict=$(echo "$szs_line" | grep -oP '% SZS status \K\S+' || echo "Unknown")
+ elif [ "$exit_code" -eq 124 ]; then
+ verdict="Timeout"
+ elif [ "$exit_code" -ne 0 ]; then
+ verdict="Crash"
+ else
+ verdict="NoOutput"
+ fi
+
+ echo "$verdict $elapsed"
+}
+
+COUNTER=0
+TOTAL=$(wc -l < "$PROBLEM_LIST")
+
+while IFS= read -r problem_file; do
+ COUNTER=$((COUNTER + 1))
+
+ expected=$(get_expected_status "$problem_file")
+ result_line=$(run_benchmark "$problem_file")
+ actual=$(echo "$result_line" | cut -d' ' -f1)
+ elapsed=$(echo "$result_line" | cut -d' ' -f2)
+ fname=$(basename "$problem_file")
+
+ # Classify notes
+ notes=""
+ # Soundness discrepancy: both answers are conclusive but conflict
+ conclusive_expected=false
+ conclusive_actual=false
+ case "$expected" in
+ Theorem|Unsatisfiable) conclusive_expected=true ;;
+ Satisfiable|CounterSatisfiable) conclusive_expected=true ;;
+ esac
+ case "$actual" in
+ Theorem|Unsatisfiable) conclusive_actual=true ;;
+ Satisfiable|CounterSatisfiable) conclusive_actual=true ;;
+ esac
+
+ if $conclusive_expected && $conclusive_actual; then
+ # Map expected to the Z3 output equivalents for comparison
+ # Theorem (has-conjecture unsat) matches "Theorem"
+ # Unsatisfiable (no-conjecture unsat) matches "Unsatisfiable"
+ # Satisfiable (no-conjecture sat) matches "Satisfiable"
+ # CounterSatisfiable (has-conjecture sat) matches "CounterSatisfiable"
+ if [ "$expected" != "$actual" ]; then
+ # Check for sat/unsat polarity conflict
+ sat_expected=false; sat_actual=false
+ case "$expected" in Satisfiable|CounterSatisfiable) sat_expected=true ;; esac
+ case "$actual" in Satisfiable|CounterSatisfiable) sat_actual=true ;; esac
+ if [ "$sat_expected" != "$sat_actual" ]; then
+ notes="SOUNDNESS_ERROR"
+ else
+ notes="STATUS_MISMATCH"
+ fi
+ fi
+ fi
+
+ if [ "$actual" = "Crash" ]; then
+ notes="CRASH"
+ fi
+
+ echo -e "$fname\t$expected\t$actual\t$elapsed\t$notes" >> "$RESULTS"
+
+ if [ -n "$notes" ]; then
+ echo "[$COUNTER/$TOTAL] $fname expected=$expected actual=$actual time=${elapsed}s *** $notes ***"
+ elif [ $((COUNTER % 50)) -eq 0 ]; then
+ echo "[$COUNTER/$TOTAL] Progress checkpoint last=$fname actual=$actual time=${elapsed}s"
+ fi
+
+done < "$PROBLEM_LIST"
+
+echo "Benchmark run complete: $COUNTER problems processed. Results → $RESULTS"
+```
+
+Run it:
+
+```bash
+chmod +x /tmp/run_tptp_benchmarks.sh
+/tmp/run_tptp_benchmarks.sh
+```
+
+Do not skip any file in the list.
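+
+A quick consistency check after the run can confirm that nothing was skipped (a suggested sanity check, not required by the phases above):
+
+```bash
+# Every selected problem should have exactly one row in the results TSV
+# (subtract 1 for the header line).
+EXPECTED_ROWS=$(wc -l < /tmp/selected_benchmarks.txt)
+ACTUAL_ROWS=$(( $(wc -l < /tmp/tptp_results.tsv) - 1 ))
+if [ "$ACTUAL_ROWS" -ne "$EXPECTED_ROWS" ]; then
+  echo "WARNING: expected $EXPECTED_ROWS result rows, found $ACTUAL_ROWS"
+fi
+```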
+
+## Phase 5: Analyze Results
+
+Save the following script to `/tmp/analyze_tptp.py` and run it:
+
+```python
+#!/usr/bin/env python3
+"""Compute summary statistics from the TPTP benchmark TSV."""
+import csv
+from collections import Counter
+
+RESULTS_FILE = "/tmp/tptp_results.tsv"
+
+rows = []
+with open(RESULTS_FILE, newline="") as f:
+ reader = csv.DictReader(f, delimiter="\t")
+ for row in reader:
+ rows.append(row)
+
+total = len(rows)
+
+# Verdict counts
+actual_counts = Counter(r["actual"] for r in rows)
+expected_counts = Counter(r["expected"] for r in rows)
+
+# Flagged rows
+soundness_errors = [r for r in rows if r["notes"] == "SOUNDNESS_ERROR"]
+status_mismatches = [r for r in rows if r["notes"] == "STATUS_MISMATCH"]
+crashes = [r for r in rows if r["notes"] == "CRASH"]
+timeouts = [r for r in rows if r["actual"] == "Timeout"]
+gave_up = [r for r in rows if r["actual"] == "GaveUp"]
+
+# Solved correctly (expected matches actual for conclusive verdicts)
+conclusive_expected = {"Theorem", "Unsatisfiable", "Satisfiable", "CounterSatisfiable"}
+correct = [r for r in rows
+ if r["expected"] in conclusive_expected
+ and r["actual"] == r["expected"]]
+
+print(f"TOTAL={total}")
+print(f"CORRECT={len(correct)}")
+print(f"TIMEOUTS={len(timeouts)}")
+print(f"GAVE_UP={len(gave_up)}")
+print(f"CRASHES={len(crashes)}")
+print(f"SOUNDNESS_ERRORS={len(soundness_errors)}")
+print(f"STATUS_MISMATCHES={len(status_mismatches)}")
+
+print("\n--- Actual verdict breakdown ---")
+for v, c in sorted(actual_counts.items()):
+ print(f" {v}: {c}")
+
+print("\n--- Expected status breakdown ---")
+for v, c in sorted(expected_counts.items()):
+ print(f" {v}: {c}")
+
+if soundness_errors:
+ print(f"\n--- SOUNDNESS ERRORS ({len(soundness_errors)}) ---")
+ for r in soundness_errors:
+ print(f" {r['file']} expected={r['expected']} actual={r['actual']}")
+
+if crashes:
+ print(f"\n--- CRASHES ({len(crashes)}) ---")
+ for r in crashes:
+ print(f" {r['file']} expected={r['expected']}")
+
+if status_mismatches:
+ print(f"\n--- STATUS MISMATCHES ({len(status_mismatches)}) ---")
+ for r in status_mismatches[:20]:
+ print(f" {r['file']} expected={r['expected']} actual={r['actual']}")
+```
+
+Run the analysis:
+
+```bash
+python3 /tmp/analyze_tptp.py
+```
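+
+If the timeout table for Phase 6 is wanted in ready-to-paste form, the rows can be pre-extracted from the TSV. An optional helper (column 3 of the TSV is the `actual` verdict):
+
+```bash
+# Emit up to 100 timeout rows as "| n | file | expected |" markdown table rows.
+awk -F'\t' '$3 == "Timeout" {n++; print "| " n " | " $1 " | " $2 " |"} n == 100 {exit}' \
+  /tmp/tptp_results.tsv
+```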
+
+## Phase 6: Generate and Post the Discussion Report
+
+Read the TSV at `/tmp/tptp_results.tsv` and the analysis output, then compose a Markdown report and call `create_discussion`.
+
+The report should use `###` or lower for all headers (never `#` or `##`). Use collapsible `<details>` sections for large tables.
+
+Use this structure:
+
+```markdown
+**Date**:
+**Branch**: master
+**Commit**: `` (run `git rev-parse --short HEAD` in ${{ github.workspace }} to get the SHA)
+**Workflow Run**: [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
+**TPTP version**:
+**Problems benchmarked**: (random sample, timeout 5 s per problem)
+
+---
+
+### Summary
+
+| Metric | Count |
+|--------|-------|
+| Total problems run | N |
+| Correct (expected = actual) | N |
+| Timeouts | N |
+| GaveUp (within time budget) | N |
+| Crashes / errors | N |
+| Soundness errors (sat↔unsat conflict) | N |
+| Status mismatches (Theorem vs Unsatisfiable etc.) | N |
+
+### Expected Status Distribution
+
+| Expected Status | Count |
+|----------------|-------|
+| Theorem | N |
+| Unsatisfiable | N |
+| Satisfiable | N |
+| CounterSatisfiable | N |
+| Unknown / Open | N |
+
+---
+
+### ⚠️ Critical: Soundness Errors
+
+[List ALL files where Z3 returned a conclusive answer that contradicts the expected answer
+(e.g., expected Theorem but got CounterSatisfiable). If none, write "None detected."]
+
+### 💥 Crashes
+
+[List ALL files where Z3 crashed (non-zero exit, no SZS output, not a timeout).
+Include filename and expected status. If none, write "None detected."]
+
+### Status Mismatches
+
+[Files where both answers are conclusive but differ in Theorem vs Unsatisfiable polarity
+(e.g., expected Theorem but actual Unsatisfiable). These may indicate conjecture-handling
+differences rather than soundness bugs. If none, write "None detected."]
+
+---
+
+<details>
+<summary>View all Timeouts (problems where Z3 exceeded the 5-second limit)</summary>
+
+| # | File | Expected Status |
+|---|------|----------------|
+[First 100 timeout rows]
+
+</details>
+
+<details>
+<summary>View full per-problem results table</summary>
+
+| # | File | Expected | Actual | Time (s) | Notes |
+|---|------|----------|--------|----------|-------|
+[All rows, or first 500 if over limit]
+
+</details>
+
+---
+
+### Recommendations
+
+[Based on the findings, list actionable items. E.g.: investigate soundness errors,
+file crash bugs, note domains where Z3 consistently times out.]
+```
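+
+The header fields of the template can be filled from values already on disk. A minimal sketch (assuming `/tmp/tptp_version.txt` was written in Phase 2; otherwise substitute the version by hand):
+
+```bash
+# Gather report header metadata.
+REPORT_DATE=$(date -u +%Y-%m-%d)
+COMMIT_SHA=$(git -C "$GITHUB_WORKSPACE" rev-parse --short HEAD)
+TPTP_VERSION=$(cat /tmp/tptp_version.txt 2>/dev/null || echo "unknown")
+PROBLEM_COUNT=$(wc -l < /tmp/selected_benchmarks.txt)
+echo "Date=$REPORT_DATE Commit=$COMMIT_SHA TPTP=$TPTP_VERSION Problems=$PROBLEM_COUNT"
+```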
+
+Post the discussion using the `create_discussion` safe output. The title should be
+`[TPTP Benchmark] master — `.
+
+## Safe Output Guarantee
+
+You **MUST** call either `create_discussion` or `noop` before the workflow ends:
+
+- **Full success**: Call `create_discussion` with the complete report.
+- **Partial results** (some problems ran): Call `create_discussion` with whatever results are available and a note about incomplete execution.
+- **Build failure**: Call `noop` with a brief message describing the build error.
+- **Download failure**: Call `noop` with the download error details.
+- **No problems selected**: Call `noop` explaining why no problems were found.
+
+Failing to produce any safe output triggers an automatic workflow-failure issue that clutters the repository.
+
+## Important Notes
+
+- **Never run `ninja` in the background with `&`**: Concurrent C++ compilation and LLM inference exhausts available RAM and kills the agent process (exit 137). Always wait for build commands to finish before continuing.
+- **TPTP environment variable**: Set `TPTP=/tmp/tptp` when invoking `z3 -tptp` so that `include()` directives in problem files resolve correctly against the downloaded Axioms directory.
+- **Timeout detection**: Use `timeout 8` as the outer OS-level guard (3 seconds beyond Z3's `-T:5`) to allow Z3 to exit cleanly before the shell kills it. If the exit code from `timeout` is 124, record the verdict as `Timeout`.
+- **Crash detection**: A crash is a non-zero exit code with no `% SZS status` line in the output and no timeout. Record it separately from `GaveUp`.
+- **SZS status semantics**: Z3 outputs `Theorem` (not `Unsatisfiable`) when it proves a conjecture; `CounterSatisfiable` (not `Satisfiable`) when it finds a counterexample to a conjecture. A status mismatch between `Theorem` and `Unsatisfiable` for the same problem may be innocuous and depends on whether the problem file uses a conjecture formula.
+- **Report soundness bugs prominently**: Any case where the polarity of the answer conflicts (expected Theorem/Unsatisfiable but got CounterSatisfiable/Satisfiable, or vice versa) is a potential soundness bug and must be highlighted as critical.
+- **Keep progress log**: Print a line for every flagged result and every 50th problem so the workflow log shows progress.
+- **Close older discussions**: Configured via `close-older-discussions: true`. Only the latest weekly report remains open.