From df0abab7884ecfff887fc0b5ed0c53d1d3871c4e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 13 May 2026 09:29:35 +0000 Subject: [PATCH] Add TPTP benchmark workflow (tptp-benchmark.md + lock file) Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/834244bd-f780-46e8-b66a-d0e8c88f8dc8 Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- .github/workflows/tptp-benchmark.lock.yml | 1317 +++++++++++++++++++++ .github/workflows/tptp-benchmark.md | 554 +++++++++ 2 files changed, 1871 insertions(+) create mode 100644 .github/workflows/tptp-benchmark.lock.yml create mode 100644 .github/workflows/tptp-benchmark.md diff --git a/.github/workflows/tptp-benchmark.lock.yml b/.github/workflows/tptp-benchmark.lock.yml new file mode 100644 index 000000000..7e682c2fb --- /dev/null +++ b/.github/workflows/tptp-benchmark.lock.yml @@ -0,0 +1,1317 @@ +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"05a6ebcfc669740744c3887e595339f183e37872d3ebbaebe34513c2a8016af8","compiler_version":"v0.72.1","strict":true,"agent_id":"copilot"} +# gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"v0.72.1","version":"v0.72.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.41"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.6","digest":"sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c"},{"image":"ghcr.io/github/github-mcp-server:v1.0.3","digest":"sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959","pinned_image":"ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]} +# ___ _ _ +# / _ \ | | (_) +# | |_| | __ _ ___ _ __ | |_ _ ___ +# | _ |/ _` |/ _ \ '_ \| __| |/ __| +# | | | | (_| | __/ | | | |_| | (__ +# \_| |_/\__, |\___|_| |_|\__|_|\___| +# __/ | +# _ _ |___/ +# | | | | / _| | +# | | | | ___ _ __ _ __| |_| | _____ ____ +# | |/\| |/ _ \ '__| |/ /| _| |/ _ \ \ /\ / / ___| +# \ /\ / (_) | | | | ( | | | | (_) \ V V /\__ \ +# \/ \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/ +# +# This file was automatically generated by gh-aw (v0.72.1). DO NOT EDIT. +# +# To update this file, edit the corresponding .md file and run: +# gh aw compile +# Not all edits will cause changes to this file. +# +# For more information: https://github.github.com/gh-aw/introduction/overview/ +# +# Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion. +# +# Secrets used: +# - COPILOT_GITHUB_TOKEN +# - GH_AW_GITHUB_MCP_SERVER_TOKEN +# - GH_AW_GITHUB_TOKEN +# - GITHUB_TOKEN +# +# Custom actions used: +# - actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 +# - actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 +# - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 +# - actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 +# - actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 +# - actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 +# - github/gh-aw-actions/setup@v0.72.1 +# +# Container images used: +# - ghcr.io/github/gh-aw-firewall/agent:0.25.41 +# - ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 +# - ghcr.io/github/gh-aw-firewall/squid:0.25.41 +# - ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c +# - ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959 +# - node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f + +name: "TPTP Front-End Benchmark" +"on": + schedule: + - cron: "0 6 * * 1" + workflow_dispatch: + inputs: + aw_context: + default: "" + description: Agent caller context (used internally by Agentic Workflows). + required: false + type: string + +permissions: {} + +concurrency: + group: "gh-aw-${{ github.workflow }}" + +run-name: "TPTP Front-End Benchmark" + +jobs: + activation: + runs-on: ubuntu-slim + permissions: + actions: read + contents: read + outputs: + comment_id: "" + comment_repo: "" + engine_id: ${{ steps.generate_aw_info.outputs.engine_id }} + lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }} + model: ${{ steps.generate_aw_info.outputs.model }} + secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }} + setup-trace-id: ${{ steps.setup.outputs.trace-id }} + stale_lock_file_failed: ${{ steps.check-lock-file.outputs.stale_lock_file_failed == 'true' }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@v0.72.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + env: + GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }} + GH_AW_INFO_VERSION: "1.0.40" + - name: Generate agentic run info + id: generate_aw_info + env: + GH_AW_INFO_ENGINE_ID: "copilot" + GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI" + GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }} + GH_AW_INFO_VERSION: "1.0.40" + GH_AW_INFO_AGENT_VERSION: "1.0.40" + GH_AW_INFO_CLI_VERSION: "v0.72.1" + GH_AW_INFO_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_INFO_EXPERIMENTAL: "false" + GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true" + GH_AW_INFO_STAGED: "false" + GH_AW_INFO_ALLOWED_DOMAINS: '["defaults","tptp.org"]' + GH_AW_INFO_FIREWALL_ENABLED: "true" + GH_AW_INFO_AWF_VERSION: "v0.25.41" + GH_AW_INFO_AWMG_VERSION: "" + GH_AW_INFO_FIREWALL_TYPE: "squid" + GH_AW_COMPILED_STRICT: "true" + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs'); + await main(core, context); + - name: Validate COPILOT_GITHUB_TOKEN secret + id: validate-secret + run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh" COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + - name: Checkout .github and .agents folders + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + sparse-checkout: | + .github + .agents + .claude + .codex + .crush + .gemini + .opencode + .pi + sparse-checkout-cone-mode: true + fetch-depth: 1 + - name: Save agent config folders for base branch restoration + env: + GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi" + GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc" + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/save_base_github_folders.sh" + - name: Check workflow lock file + id: check-lock-file + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_WORKFLOW_FILE: "tptp-benchmark.lock.yml" + GH_AW_CONTEXT_WORKFLOW_REF: "${{ github.workflow_ref }}" + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs'); + await main(); + - name: Check compile-agentic version + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_COMPILED_VERSION: "v0.72.1" + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/check_version_updates.cjs'); + await main(); + - name: Create prompt with built-in context + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + # poutine:ignore untrusted_checkout_exec + run: | + bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" + { + cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + + GH_AW_PROMPT_dbe2d26d7191f793_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" + cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + + Tools: create_discussion, missing_tool, missing_data, noop + + GH_AW_PROMPT_dbe2d26d7191f793_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md" + cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + + The following GitHub context information is available for this workflow: + {{#if __GH_AW_GITHUB_ACTOR__ }} + - **actor**: __GH_AW_GITHUB_ACTOR__ + {{/if}} + {{#if __GH_AW_GITHUB_REPOSITORY__ }} + - **repository**: __GH_AW_GITHUB_REPOSITORY__ + {{/if}} + {{#if __GH_AW_GITHUB_WORKSPACE__ }} + - **workspace**: __GH_AW_GITHUB_WORKSPACE__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }} + - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }} + - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }} + - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }} + - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__ + {{/if}} + {{#if __GH_AW_GITHUB_RUN_ID__ }} + - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__ + {{/if}} + + + GH_AW_PROMPT_dbe2d26d7191f793_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" + cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + + {{#runtime-import .github/workflows/tptp-benchmark.md}} + GH_AW_PROMPT_dbe2d26d7191f793_EOF + } > "$GH_AW_PROMPT" + - name: Interpolate variables and render templates + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_ENGINE_ID: "copilot" + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs'); + await main(); + - name: Substitute placeholders + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKFLOW: ${{ github.workflow }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + GH_AW_MCP_CLI_SERVERS_LIST: '- `safeoutputs` — run `safeoutputs --help` to see available tools' + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + + const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs'); + + // Call the substitution function + return await substitutePlaceholders({ + file: process.env.GH_AW_PROMPT, + substitutions: { + GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR, + GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID, + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER, + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER, + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER, + GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY, + GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID, + GH_AW_GITHUB_WORKFLOW: process.env.GH_AW_GITHUB_WORKFLOW, + GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE, + GH_AW_MCP_CLI_SERVERS_LIST: process.env.GH_AW_MCP_CLI_SERVERS_LIST + } + }); + - name: Validate prompt placeholders + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh" + - name: Print prompt + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + # poutine:ignore untrusted_checkout_exec + run: bash "${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh" + - name: Upload activation artifact + if: success() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: activation + include-hidden-files: true + path: | + /tmp/gh-aw/aw_info.json + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/aw-prompts/prompt-template.txt + /tmp/gh-aw/aw-prompts/prompt-import-tree.json + /tmp/gh-aw/github_rate_limits.jsonl + /tmp/gh-aw/base + /tmp/gh-aw/.github/agents + if-no-files-found: ignore + retention-days: 1 + + agent: + needs: activation + runs-on: ubuntu-latest + permissions: read-all + concurrency: + group: "gh-aw-copilot-${{ github.workflow }}" + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_AW_ASSETS_ALLOWED_EXTS: "" + GH_AW_ASSETS_BRANCH: "" + GH_AW_ASSETS_MAX_SIZE_KB: 0 + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + GH_AW_WORKFLOW_ID_SANITIZED: tptpbenchmark + outputs: + agentic_engine_timeout: ${{ steps.detect-copilot-errors.outputs.agentic_engine_timeout || 'false' }} + checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }} + effective_tokens: ${{ steps.parse-mcp-gateway.outputs.effective_tokens }} + has_patch: ${{ steps.collect_output.outputs.has_patch }} + inference_access_error: ${{ steps.detect-copilot-errors.outputs.inference_access_error || 'false' }} + mcp_policy_error: ${{ steps.detect-copilot-errors.outputs.mcp_policy_error || 'false' }} + model: ${{ needs.activation.outputs.model }} + model_not_supported_error: ${{ steps.detect-copilot-errors.outputs.model_not_supported_error || 'false' }} + output: ${{ steps.collect_output.outputs.output }} + output_types: ${{ steps.collect_output.outputs.output_types }} + setup-trace-id: ${{ steps.setup.outputs.trace-id }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@v0.72.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + env: + GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }} + GH_AW_INFO_VERSION: "1.0.40" + - name: Set runtime paths + id: set-runtime-paths + run: | + { + echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl" + echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" + echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json" + } >> "$GITHUB_OUTPUT" + - name: Create gh-aw temp directory + run: bash "${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh" + - name: Configure gh CLI for GitHub Enterprise + run: bash "${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh" + env: + GH_TOKEN: ${{ github.token }} + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Checkout PR branch + id: checkout-pr + if: | + github.event.pull_request || github.event.issue.pull_request + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs'); + await main(); + - name: Install GitHub Copilot CLI + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40 + env: + GH_HOST: github.com + - name: Install AWF binary + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41 + - name: Determine automatic lockdown mode for GitHub MCP Server + id: determine-automatic-lockdown + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + env: + GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + with: + script: | + const determineAutomaticLockdown = require('${{ runner.temp }}/gh-aw/actions/determine_automatic_lockdown.cjs'); + await determineAutomaticLockdown(github, context, core); + - name: Download activation artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: activation + path: /tmp/gh-aw + - name: Restore agent config folders from base branch + if: steps.checkout-pr.outcome == 'success' + env: + GH_AW_AGENT_FOLDERS: ".agents .claude .codex .crush .gemini .github .opencode .pi" + GH_AW_AGENT_FILES: ".crush.json AGENTS.md CLAUDE.md GEMINI.md PI.md opencode.jsonc" + run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_base_github_folders.sh" + - name: Restore inline sub-agents from activation artifact + env: + GH_AW_SUB_AGENT_DIR: ".github/agents" + GH_AW_SUB_AGENT_EXT: ".agent.md" + run: bash "${RUNNER_TEMP}/gh-aw/actions/restore_inline_sub_agents.sh" + - name: Download container images + run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41 ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959 node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f + - name: Generate Safe Outputs Config + run: | + mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" + mkdir -p /tmp/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF' + {"create_discussion":{"category":"agentic workflows","close_older_discussions":true,"expires":336,"fallback_to_issue":true,"max":1,"title_prefix":"[TPTP Benchmark] "},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}} + GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF + - name: Generate Safe Outputs Tools + env: + GH_AW_TOOLS_META_JSON: | + { + "description_suffixes": { + "create_discussion": " CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[TPTP Benchmark] \". Discussions will be created in category \"agentic workflows\"." + }, + "repo_params": {}, + "dynamic_tools": [] + } + GH_AW_VALIDATION_JSON: | + { + "create_discussion": { + "defaultMax": 1, + "fields": { + "body": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "category": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "title": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "missing_data": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "context": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "data_type": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "reason": { + "type": "string", + "sanitize": true, + "maxLength": 256 + } + } + }, + "missing_tool": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 512 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "tool": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "noop": { + "defaultMax": 1, + "fields": { + "message": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + } + } + }, + "report_incomplete": { + "defaultMax": 5, + "fields": { + "details": { + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 1024 + } + } + } + } + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_safe_outputs_tools.cjs'); + await main(); + - name: Generate Safe Outputs MCP Server Config + id: safe-outputs-config + run: | + # Generate a secure random API key (360 bits of entropy, 40+ chars) + # Mask immediately to prevent timing vulnerabilities + API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${API_KEY}" + + PORT=3001 + + # Set outputs for next steps + { + echo "safe_outputs_api_key=${API_KEY}" + echo "safe_outputs_port=${PORT}" + } >> "$GITHUB_OUTPUT" + + echo "Safe Outputs MCP server will run on port ${PORT}" + + - name: Start Safe Outputs MCP HTTP Server + id: safe-outputs-start + env: + DEBUG: '*' + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }} + GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json + GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + run: | + # Environment variables are set above to prevent template injection + export DEBUG + export GH_AW_SAFE_OUTPUTS + export GH_AW_SAFE_OUTPUTS_PORT + export GH_AW_SAFE_OUTPUTS_API_KEY + export GH_AW_SAFE_OUTPUTS_TOOLS_PATH + export GH_AW_SAFE_OUTPUTS_CONFIG_PATH + export GH_AW_MCP_LOG_DIR + + bash "${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh" + + - name: Start MCP Gateway + id: start-mcp-gateway + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }} + GITHUB_MCP_GUARD_MIN_INTEGRITY: ${{ steps.determine-automatic-lockdown.outputs.min_integrity }} + GITHUB_MCP_GUARD_REPOS: ${{ steps.determine-automatic-lockdown.outputs.repos }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + run: | + set -eo pipefail + mkdir -p "${RUNNER_TEMP}/gh-aw/mcp-config" + + # Export gateway environment variables for MCP config and gateway script + export MCP_GATEWAY_PORT="8080" + export MCP_GATEWAY_DOMAIN="host.docker.internal" + export MCP_GATEWAY_HOST_DOMAIN="localhost" + MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${MCP_GATEWAY_API_KEY}" + export MCP_GATEWAY_API_KEY + export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads" + mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}" + export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288" + export DEBUG="*" + + export GH_AW_ENGINE="copilot" + MCP_GATEWAY_UID=$(id -u 2>/dev/null || echo '0') + MCP_GATEWAY_GID=$(id -g 2>/dev/null || echo '0') + DOCKER_SOCK_GID=$(stat -c '%g' /var/run/docker.sock 2>/dev/null || echo '0') + export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host --add-host host.docker.internal:127.0.0.1 --user '"${MCP_GATEWAY_UID}"':'"${MCP_GATEWAY_GID}"' --group-add '"${DOCKER_SOCK_GID}"' -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.3.6' + + mkdir -p /home/runner/.copilot + GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) + cat << GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + { + "mcpServers": { + "github": { + "type": "stdio", + "container": "ghcr.io/github/github-mcp-server:v1.0.3", + "env": { + "GITHUB_HOST": "\${GITHUB_SERVER_URL}", + "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}", + "GITHUB_READ_ONLY": "1", + "GITHUB_TOOLSETS": "context,repos,issues,pull_requests" + }, + "guard-policies": { + "allow-only": { + "min-integrity": "$GITHUB_MCP_GUARD_MIN_INTEGRITY", + "repos": "$GITHUB_MCP_GUARD_REPOS" + } + } + }, + "safeoutputs": { + "type": "http", + "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT", + "headers": { + "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}" + }, + "guard-policies": { + "write-sink": { + "accept": [ + "*" + ] + } + } + } + }, + "gateway": { + "port": $MCP_GATEWAY_PORT, + "domain": "${MCP_GATEWAY_DOMAIN}", + "apiKey": "${MCP_GATEWAY_API_KEY}", + "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" + } + } + GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF + - name: Mount MCP servers as CLIs + id: mount-mcp-clis + continue-on-error: true + env: + MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }} + MCP_GATEWAY_DOMAIN: ${{ steps.start-mcp-gateway.outputs.gateway-domain }} + MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }} + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/mount_mcp_as_cli.cjs'); + await main(); + - name: Clean credentials + continue-on-error: true + run: bash "${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh" + - name: Audit pre-agent workspace + id: pre_agent_audit + continue-on-error: true + run: bash "${RUNNER_TEMP}/gh-aw/actions/audit_pre_agent_workspace.sh" + - name: Execute GitHub Copilot CLI + id: agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 300 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true) + export GH_AW_NODE_BIN + (umask 177 && touch /tmp/gh-aw/agent-stdio.log) + printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","api.snapcraft.io","archive.ubuntu.com","azure.archive.ubuntu.com","crl.geotrust.com","crl.globalsign.com","crl.identrust.com","crl.sectigo.com","crl.thawte.com","crl.usertrust.com","crl.verisign.com","crl3.digicert.com","crl4.digicert.com","crls.ssl.com","github.com","host.docker.internal","json-schema.org","json.schemastore.org","keyserver.ubuntu.com","ocsp.digicert.com","ocsp.geotrust.com","ocsp.globalsign.com","ocsp.identrust.com","ocsp.sectigo.com","ocsp.ssl.com","ocsp.thawte.com","ocsp.usertrust.com","ocsp.verisign.com","packagecloud.io","packages.cloud.google.com","packages.microsoft.com","ppa.launchpad.net","raw.githubusercontent.com","registry.npmjs.org","s.symcb.com","s.symcd.com","security.ubuntu.com","telemetry.enterprise.githubcopilot.com","tptp.org","ts-crl.ws.symantec.com","ts-ocsp.ws.symantec.com","www.googleapis.com"]},"apiProxy":{"enabled":true,"models":{"auto":["large"],"deep-research":["copilot/deep-research*","copilot/o3-deep-research*","copilot/o4-mini-deep-research*","google/deep-research*","openai/o3-deep-research*","openai/o4-mini-deep-research*"],"gemini-flash":["copilot/gemini-*flash*","google/gemini-*flash*"],"gemini-pro":["copilot/gemini-*pro*","google/gemini-*pro*"],"gpt-4.1":["copilot/gpt-4.1*","openai/gpt-4.1*"],"gpt-5":["copilot/gpt-5*","openai/gpt-5*"],"gpt-5-codex":["copilot/gpt-5*codex*","openai/gpt-5*codex*"],"gpt-5-mini":["copilot/gpt-5*mini*","openai/gpt-5*mini*"],"gpt-5-nano":["copilot/gpt-5*nano*","openai/gpt-5*nano*"],"gpt-5-pro":["copilot/gpt-5*pro*","openai/gpt-5*pro*"],"haiku":["copilot/*haiku*","anthropic/*haiku*"],"large":["sonnet","gpt-5-pro","gpt-5","gemini-pro"],"mini":["haiku","gpt-5-mini","gpt-5-nano","gemini-flash"],"opus":["copilot/*opus*","anthropic/*opus*"],"reasoning":["copilot/o1*","copilot/o3*","copilot/o4*","openai/o1*","openai/o3*","openai/o4*"],"small":["mini"],"sonnet":["copilot/*sonnet*","anthropic/*sonnet*"]}},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json + # shellcheck disable=SC1003 + sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --exclude-env GITHUB_MCP_SERVER_TOKEN --exclude-env MCP_GATEWAY_API_KEY --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \ + -- /bin/bash -c 'export PATH="${RUNNER_TEMP}/gh-aw/mcp-cli/bin:$PATH" && export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --allow-all-paths --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log + env: + AWF_REFLECT_ENABLED: 1 + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_API_KEY: dummy-byok-key-for-offline-mode + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || 'claude-sonnet-4.6' }} + GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json + GH_AW_PHASE: agent + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_VERSION: v0.72.1 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Detect Copilot errors + id: detect-copilot-errors + if: always() + continue-on-error: true + run: node "${RUNNER_TEMP}/gh-aw/actions/detect_copilot_errors.cjs" + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Copy Copilot session state files to logs + if: always() + continue-on-error: true + run: bash "${RUNNER_TEMP}/gh-aw/actions/copy_copilot_session_state.sh" + - name: Stop MCP Gateway + if: always() + continue-on-error: true + env: + MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }} + MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }} + GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }} + run: | + bash "${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh" "$GATEWAY_PID" + - name: Redact secrets in logs + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs'); + await main(); + env: + GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN' + SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Append agent step summary + if: always() + run: bash "${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh" + - name: Copy Safe Outputs + if: always() + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + run: | + mkdir -p /tmp/gh-aw + cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true + - name: Ingest agent output + id: collect_output + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_SAFE_OUTPUTS: ${{ steps.set-runtime-paths.outputs.GH_AW_SAFE_OUTPUTS }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs'); + await main(); + - name: Parse agent logs for step summary + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/ + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs'); + await main(); + - name: Parse MCP Gateway logs for step summary + if: always() + id: parse-mcp-gateway + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs'); + await main(); + - name: Print firewall logs + if: always() + continue-on-error: true + env: + AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs + run: | + # Fix permissions on firewall logs/audit dirs so they can be uploaded as artifacts + # AWF runs with sudo, creating files owned by root + sudo chmod -R a+rX /tmp/gh-aw/sandbox/firewall 2>/dev/null || true + # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step) + if command -v awf &> /dev/null; then + awf logs summary | tee -a "$GITHUB_STEP_SUMMARY" + else + echo 'AWF binary not installed, skipping firewall log summary' + fi + - name: Parse token usage for step summary + if: always() + continue-on-error: true + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_token_usage.cjs'); + await main(); + - name: Print AWF reflect summary + if: always() + continue-on-error: true + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/awf_reflect_summary.cjs'); + await main(); + - name: Write agent output placeholder if missing + if: always() + run: | + if [ ! -f /tmp/gh-aw/agent_output.json ]; then + echo '{"items":[]}' > /tmp/gh-aw/agent_output.json + fi + - name: Upload agent artifacts + if: always() + continue-on-error: true + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: agent + path: | + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/sandbox/agent/logs/ + /tmp/gh-aw/redacted-urls.log + /tmp/gh-aw/mcp-logs/ + /tmp/gh-aw/agent_usage.json + /tmp/gh-aw/agent-stdio.log + /tmp/gh-aw/pre-agent-audit.txt + /tmp/gh-aw/agent/ + /tmp/gh-aw/github_rate_limits.jsonl + /tmp/gh-aw/safeoutputs.jsonl + /tmp/gh-aw/agent_output.json + /tmp/gh-aw/aw-*.patch + /tmp/gh-aw/aw-*.bundle + /tmp/gh-aw/awf-config.json + /tmp/gh-aw/sandbox/firewall/logs/ + /tmp/gh-aw/sandbox/firewall/audit/ + /tmp/gh-aw/sandbox/firewall/awf-reflect.json + if-no-files-found: ignore + + conclusion: + needs: + - activation + - agent + - detection + - safe_outputs + if: > + always() && (needs.agent.result != 'skipped' || needs.activation.outputs.lockdown_check_failed == 'true' || + needs.activation.outputs.stale_lock_file_failed == 'true') + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + concurrency: + group: "gh-aw-conclusion-tptp-benchmark" + cancel-in-progress: false + outputs: + incomplete_count: ${{ steps.report_incomplete.outputs.incomplete_count }} + noop_message: ${{ steps.noop.outputs.noop_message }} + tools_reported: ${{ steps.missing_tool.outputs.tools_reported }} + total_count: ${{ steps.missing_tool.outputs.total_count }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@v0.72.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + env: + GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }} + GH_AW_INFO_VERSION: "1.0.40" + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Process no-op messages + id: noop + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_NOOP_MAX: "1" + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_NOOP_REPORT_AS_ISSUE: "false" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs'); + await main(); + - name: Log detection run + id: detection_runs + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }} + GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_detection_runs.cjs'); + await main(); + - name: Record missing tool + id: missing_tool + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_MISSING_TOOL_CREATE_ISSUE: "true" + GH_AW_MISSING_TOOL_TITLE_PREFIX: "[missing tool]" + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs'); + await main(); + - name: Record incomplete + id: report_incomplete + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_REPORT_INCOMPLETE_CREATE_ISSUE: "true" + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/report_incomplete_handler.cjs'); + await main(); + - name: Handle agent failure + id: handle_agent_failure + if: always() + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_WORKFLOW_ID: "tptp-benchmark" + GH_AW_ACTION_FAILURE_ISSUE_EXPIRES_HOURS: "168" + GH_AW_ENGINE_ID: "copilot" + GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }} + GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }} + GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }} + GH_AW_MCP_POLICY_ERROR: ${{ needs.agent.outputs.mcp_policy_error }} + GH_AW_AGENTIC_ENGINE_TIMEOUT: ${{ needs.agent.outputs.agentic_engine_timeout }} + GH_AW_MODEL_NOT_SUPPORTED_ERROR: ${{ needs.agent.outputs.model_not_supported_error }} + GH_AW_ENGINE_API_HOSTS: "api.enterprise.githubcopilot.com,api.githubcopilot.com,api.business.githubcopilot.com,api.individual.githubcopilot.com" + GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }} + GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }} + GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }} + GH_AW_STALE_LOCK_FILE_FAILED: ${{ needs.activation.outputs.stale_lock_file_failed }} + GH_AW_GROUP_REPORTS: "false" + GH_AW_FAILURE_REPORT_AS_ISSUE: "true" + GH_AW_MISSING_TOOL_REPORT_AS_FAILURE: "true" + GH_AW_MISSING_DATA_REPORT_AS_FAILURE: "true" + GH_AW_TIMEOUT_MINUTES: "300" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs'); + await main(); + + detection: + needs: + - activation + - agent + if: > + always() && needs.agent.result != 'skipped' && (needs.agent.outputs.output_types != '' || needs.agent.outputs.has_patch == 'true') + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }} + detection_reason: ${{ steps.detection_conclusion.outputs.reason }} + detection_success: ${{ steps.detection_conclusion.outputs.success }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@v0.72.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + env: + GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }} + GH_AW_INFO_VERSION: "1.0.40" + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Checkout repository for patch context + if: needs.agent.outputs.has_patch == 'true' + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + # --- Threat Detection --- + - name: Clean stale firewall files from agent artifact + run: | + rm -rf /tmp/gh-aw/sandbox/firewall/logs + rm -rf /tmp/gh-aw/sandbox/firewall/audit + - name: Download container images + run: bash "${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh" ghcr.io/github/gh-aw-firewall/agent:0.25.41 ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41 ghcr.io/github/gh-aw-firewall/squid:0.25.41 + - name: Check if detection needed + id: detection_guard + if: always() + env: + OUTPUT_TYPES: ${{ needs.agent.outputs.output_types }} + HAS_PATCH: ${{ needs.agent.outputs.has_patch }} + run: | + if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then + echo "run_detection=true" >> "$GITHUB_OUTPUT" + echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH" + else + echo "run_detection=false" >> "$GITHUB_OUTPUT" + echo "Detection skipped: no agent outputs or patches to analyze" + fi + - name: Clear MCP Config for detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + rm -f "${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json" + rm -f /home/runner/.copilot/mcp-config.json + rm -f "$GITHUB_WORKSPACE/.gemini/settings.json" + - name: Prepare threat detection files + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection/aw-prompts + cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true + cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true + for f in /tmp/gh-aw/aw-*.patch; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + for f in /tmp/gh-aw/aw-*.bundle; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + echo "Prepared threat detection files:" + ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true + - name: Setup threat detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + WORKFLOW_NAME: "TPTP Front-End Benchmark" + WORKFLOW_DESCRIPTION: "Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, skips large problems, runs each with a 5-second timeout, and posts a discrepancy/crash report as a GitHub discussion." + HAS_PATCH: ${{ needs.agent.outputs.has_patch }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs'); + await main(); + - name: Ensure threat-detection directory and log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection + touch /tmp/gh-aw/threat-detection/detection.log + - name: Setup Node.js + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: '24' + package-manager-cache: false + - name: Install GitHub Copilot CLI + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh" 1.0.40 + env: + GH_HOST: github.com + - name: Install AWF binary + run: bash "${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh" v0.25.41 + - name: Execute GitHub Copilot CLI + if: always() && steps.detection_guard.outputs.run_detection == 'true' + continue-on-error: true + id: detection_agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 20 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + GH_AW_NODE_BIN=$(command -v node 2>/dev/null || true) + export GH_AW_NODE_BIN + (umask 177 && touch /tmp/gh-aw/threat-detection/detection.log) + printf '%s\n' '{"$schema":"https://github.com/github/gh-aw-firewall/releases/download/v0.25.41/awf-config.schema.json","network":{"allowDomains":["api.business.githubcopilot.com","api.enterprise.githubcopilot.com","api.github.com","api.githubcopilot.com","api.individual.githubcopilot.com","github.com","host.docker.internal","telemetry.enterprise.githubcopilot.com"]},"apiProxy":{"enabled":true},"container":{"imageTag":"0.25.41"}}' > "${RUNNER_TEMP}/gh-aw/awf-config.json" && cp "${RUNNER_TEMP}/gh-aw/awf-config.json" /tmp/gh-aw/awf-config.json + # shellcheck disable=SC1003 + sudo -E awf --config "${RUNNER_TEMP}/gh-aw/awf-config.json" --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --env-all --exclude-env COPILOT_GITHUB_TOKEN --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --audit-dir /tmp/gh-aw/sandbox/firewall/audit --enable-host-access --allow-host-ports 80,443,8080 --skip-pull \ + -- /bin/bash -c 'export PATH="$(find /opt/hostedtoolcache /home/runner/work/_tool -maxdepth 4 -type d -name bin 2>/dev/null | tr '\''\n'\'' '\'':'\'')$PATH"; [ -n "$GOROOT" ] && export PATH="$GOROOT/bin:$PATH" || true && GH_AW_NODE_EXEC="${GH_AW_NODE_BIN:-}"; if [ -z "$GH_AW_NODE_EXEC" ] || [ ! -x "$GH_AW_NODE_EXEC" ]; then GH_AW_NODE_EXEC="$(command -v node 2>/dev/null || echo node)"; fi; "$GH_AW_NODE_EXEC" ${RUNNER_TEMP}/gh-aw/actions/copilot_harness.cjs /usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --disable-builtin-mcps --no-ask-user --allow-all-tools --add-dir "${GITHUB_WORKSPACE}" --prompt-file /tmp/gh-aw/aw-prompts/prompt.txt' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log + env: + AWF_REFLECT_ENABLED: 1 + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_API_KEY: dummy-byok-key-for-offline-mode + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || 'claude-sonnet-4.6' }} + GH_AW_PHASE: detection + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_VERSION: v0.72.1 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_COPILOT_INTEGRATION_ID: agentic-workflows + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Upload threat detection log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: detection + path: /tmp/gh-aw/threat-detection/detection.log + if-no-files-found: ignore + - name: Parse and conclude threat detection + id: detection_conclusion + if: always() + continue-on-error: true + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }} + GH_AW_DETECTION_CONTINUE_ON_ERROR: "true" + with: + script: | + try { + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs'); + await main(); + } catch (loadErr) { + const continueOnError = process.env.GH_AW_DETECTION_CONTINUE_ON_ERROR !== 'false'; + const msg = 'ERR_SYSTEM: \u274C Unexpected error loading threat detection module: ' + (loadErr && loadErr.message ? loadErr.message : String(loadErr)); + core.error(msg); + core.setOutput('reason', 'parse_error'); + if (continueOnError) { + core.warning('\u26A0\uFE0F ' + msg); + core.setOutput('conclusion', 'warning'); + core.setOutput('success', 'false'); + } else { + core.setOutput('conclusion', 'failure'); + core.setOutput('success', 'false'); + core.setFailed(msg); + } + } + + safe_outputs: + needs: + - activation + - agent + - detection + if: (!cancelled()) && needs.agent.result != 'skipped' && needs.detection.result == 'success' + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + timeout-minutes: 15 + env: + GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/tptp-benchmark" + GH_AW_DETECTION_CONCLUSION: ${{ needs.detection.outputs.detection_conclusion }} + GH_AW_DETECTION_REASON: ${{ needs.detection.outputs.detection_reason }} + GH_AW_EFFECTIVE_TOKENS: ${{ needs.agent.outputs.effective_tokens }} + GH_AW_ENGINE_ID: "copilot" + GH_AW_ENGINE_MODEL: ${{ needs.agent.outputs.model }} + GH_AW_ENGINE_VERSION: "1.0.40" + GH_AW_WORKFLOW_ID: "tptp-benchmark" + GH_AW_WORKFLOW_NAME: "TPTP Front-End Benchmark" + outputs: + code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }} + code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }} + create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }} + create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }} + process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }} + process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }} + steps: + - name: Setup Scripts + id: setup + uses: github/gh-aw-actions/setup@v0.72.1 + with: + destination: ${{ runner.temp }}/gh-aw/actions + job-name: ${{ github.job }} + trace-id: ${{ needs.activation.outputs.setup-trace-id }} + env: + GH_AW_SETUP_WORKFLOW_NAME: "TPTP Front-End Benchmark" + GH_AW_CURRENT_WORKFLOW_REF: ${{ github.repository }}/.github/workflows/tptp-benchmark.lock.yml@${{ github.ref }} + GH_AW_INFO_VERSION: "1.0.40" + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + id: setup-agent-output-env + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_OUTPUT" + - name: Configure GH_HOST for enterprise compatibility + id: ghes-host-config + shell: bash + run: | + # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct + # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op. + GH_HOST="${GITHUB_SERVER_URL#https://}" + GH_HOST="${GH_HOST#http://}" + echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV" + - name: Process Safe Outputs + id: process_safe_outputs + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + env: + GH_AW_AGENT_OUTPUT: ${{ steps.setup-agent-output-env.outputs.GH_AW_AGENT_OUTPUT }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,tptp.org,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"agentic workflows\",\"close_older_discussions\":true,\"expires\":336,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[TPTP Benchmark] \"},\"create_report_incomplete_issue\":{},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"false\"},\"report_incomplete\":{}}" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io, getOctokit); + const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs'); + await main(); + - name: Upload Safe Outputs Items + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: safe-outputs-items + path: | + /tmp/gh-aw/safe-output-items.jsonl + /tmp/gh-aw/temporary-id-map.json + if-no-files-found: ignore + diff --git a/.github/workflows/tptp-benchmark.md b/.github/workflows/tptp-benchmark.md new file mode 100644 index 000000000..48752372a --- /dev/null +++ b/.github/workflows/tptp-benchmark.md @@ -0,0 +1,554 @@ +--- +description: > + Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems. + Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies, + skips large problems, runs each with a 5-second timeout, and posts a + discrepancy/crash report as a GitHub discussion. + +on: + schedule: + - cron: "0 6 * * 1" + workflow_dispatch: + +permissions: read-all + +network: + allowed: + - defaults + - tptp.org + +tools: + bash: true + github: + toolsets: [default] + +safe-outputs: + create-discussion: + title-prefix: "[TPTP Benchmark] " + category: "Agentic Workflows" + close-older-discussions: true + expires: 14d + missing-tool: + create-issue: true + noop: + report-as-issue: false + +timeout-minutes: 300 + +steps: + - name: Checkout repository + uses: actions/checkout@v6.0.2 + with: + persist-credentials: false + +--- + +# TPTP Front-End Benchmark + +## Job Description + +Your name is ${{ github.workflow }}. You are an expert testing engineer for the Z3 theorem prover. Your task is to: + +1. Build Z3 from the current `master` branch +2. Download the TPTP benchmark library from tptp.org +3. Select 500 random small-to-medium problems (with their axiom dependencies) +4. Run each problem through Z3's TPTP front-end with a 5-second timeout +5. Compare Z3's output against the expected SZS status declared in each problem file +6. Post a detailed report as a GitHub Discussion summarising discrepancies and crashes + +**Repository**: ${{ github.repository }} +**Workspace**: ${{ github.workspace }} + +## Phase 1: Build Z3 + +Install build tools and build Z3 in Release mode. + +```bash +sudo apt-get update -y -q +sudo apt-get install -y cmake ninja-build python3 wget curl bc 2>/dev/null || true +``` + +Configure and build: + +```bash +mkdir -p /tmp/z3-build +cd /tmp/z3-build +cmake "$GITHUB_WORKSPACE" \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DZ3_BUILD_TEST_EXECUTABLES=OFF \ + 2>&1 | tail -20 + +# Build with limited parallelism to avoid OOM alongside LLM inference. +# Never run ninja in the background with &. +ninja -j2 z3 2>&1 | tail -30 +``` + +Verify the build: + +```bash +/tmp/z3-build/z3 --version +``` + +If the build fails, call the `noop` safe-output with a message describing the error and stop. + +Once the binary is confirmed, call `noop` with `"Z3 built successfully. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive. + +## Phase 2: Download the TPTP Problem Library + +Find the latest TPTP release and download the full archive. + +```bash +# Find the latest TPTP distribution version by fetching the directory listing +TPTP_DIST_URL="https://tptp.org/TPTP/Distribution/" +LATEST_TGZ=$(curl -sL "$TPTP_DIST_URL" \ + | grep -oP 'TPTP-v[0-9]+\.[0-9]+\.[0-9]+\.tgz' \ + | sort -V | tail -1) + +if [ -z "$LATEST_TGZ" ]; then + echo "ERROR: Could not determine latest TPTP version from $TPTP_DIST_URL" + # Fall back to a known stable version + LATEST_TGZ="TPTP-v9.0.0.tgz" +fi + +echo "Downloading $LATEST_TGZ ..." +mkdir -p /tmp/tptp_download +wget -q --show-progress \ + "${TPTP_DIST_URL}${LATEST_TGZ}" \ + -O /tmp/tptp_download/tptp.tgz + +echo "Extracting TPTP library..." +mkdir -p /tmp/tptp +tar -xzf /tmp/tptp_download/tptp.tgz -C /tmp/tptp --strip-components=1 2>&1 | tail -5 + +# Verify extraction +if [ ! -d /tmp/tptp/Problems ] || [ ! -d /tmp/tptp/Axioms ]; then + echo "ERROR: TPTP extraction failed — Problems/ or Axioms/ directory not found" + ls /tmp/tptp/ + exit 1 +fi + +TPTP_ROOT=/tmp/tptp +echo "TPTP library extracted to $TPTP_ROOT" +echo "Problem domains available:" +ls "$TPTP_ROOT/Problems/" | wc -l +echo "Axiom files available:" +ls "$TPTP_ROOT/Axioms/" | wc -l +``` + +If the download or extraction fails, call `noop` with the error details and stop. + +Call `noop` with `"TPTP library downloaded and extracted. Selecting 500 benchmark problems — filtering by size."` to keep the session alive. + +## Phase 3: Select 500 Benchmark Problems + +Filter out large problems and problems that depend on large axiom files, then take a random sample of 500. + +Save this script to `/tmp/select_benchmarks.py` and run it: + +```python +#!/usr/bin/env python3 +""" +Select 500 random TPTP problems that: + - Have a known, conclusive expected status (Theorem, Unsatisfiable, + CounterSatisfiable, Satisfiable) OR Unknown/Open status. + - Are not "large" (problem file <= 50 KB). + - Do not include any axiom file larger than 100 KB. +""" +import os +import re +import random +import sys + +TPTP_ROOT = "/tmp/tptp" +PROBLEMS_DIR = os.path.join(TPTP_ROOT, "Problems") +AXIOMS_DIR = os.path.join(TPTP_ROOT, "Axioms") +MAX_PROBLEM_SIZE = 50 * 1024 # 50 KB +MAX_AXIOM_SIZE = 100 * 1024 # 100 KB +SAMPLE_SIZE = 500 +OUTPUT_FILE = "/tmp/selected_benchmarks.txt" + +include_re = re.compile(r"include\s*\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE) +status_re = re.compile(r"%\s*Status\s*:\s*(\S+)", re.IGNORECASE) + +def axiom_sizes_ok(problem_path): + """Return True if all included axiom files exist and are <= MAX_AXIOM_SIZE.""" + try: + with open(problem_path, encoding="utf-8", errors="replace") as f: + content = f.read(4096) # header is in first few KB + except OSError: + return False + for m in include_re.finditer(content): + axiom_rel = m.group(1) # e.g. "Axioms/AGT001+0.ax" + axiom_path = os.path.join(TPTP_ROOT, axiom_rel) + if not os.path.exists(axiom_path): + return False # axiom missing — skip + if os.path.getsize(axiom_path) > MAX_AXIOM_SIZE: + return False # axiom too large — skip + return True + +candidates = [] +skipped_size = 0 +skipped_axiom = 0 + +for domain in sorted(os.listdir(PROBLEMS_DIR)): + domain_dir = os.path.join(PROBLEMS_DIR, domain) + if not os.path.isdir(domain_dir): + continue + for fname in os.listdir(domain_dir): + if not fname.endswith(".p"): + continue + fpath = os.path.join(domain_dir, fname) + size = os.path.getsize(fpath) + if size > MAX_PROBLEM_SIZE: + skipped_size += 1 + continue + if not axiom_sizes_ok(fpath): + skipped_axiom += 1 + continue + candidates.append(fpath) + +print(f"Total candidates (after filtering): {len(candidates)}", flush=True) +print(f" Skipped — problem too large : {skipped_size}", flush=True) +print(f" Skipped — axiom too large : {skipped_axiom}", flush=True) + +if len(candidates) == 0: + print("ERROR: No suitable benchmark problems found.", file=sys.stderr) + sys.exit(1) + +if len(candidates) > SAMPLE_SIZE: + random.seed(42) + selected = random.sample(candidates, SAMPLE_SIZE) +else: + selected = candidates + +selected.sort() +with open(OUTPUT_FILE, "w") as f: + f.write("\n".join(selected) + "\n") + +print(f"Selected {len(selected)} problems → {OUTPUT_FILE}", flush=True) +``` + +Run the script: + +```bash +python3 /tmp/select_benchmarks.py +SELECTED=$(wc -l < /tmp/selected_benchmarks.txt) +echo "Benchmark set: $SELECTED problems" +``` + +If no problems are found, call `noop` with an error message and stop. + +Call `noop` with `"$SELECTED problems selected. Starting benchmark run with 5-second timeout per problem — this will take approximately $(( SELECTED * 7 / 60 )) minutes."` to keep the session alive. + +## Phase 4: Run Benchmarks + +Save the following script to `/tmp/run_tptp_benchmarks.sh`, make it executable, and run it. + +```bash +#!/usr/bin/env bash +set -euo pipefail + +Z3=/tmp/z3-build/z3 +TPTP_ROOT=/tmp/tptp +TIMEOUT_HARD=8 # outer OS-level guard (seconds) +Z3_TIMEOUT=5 # Z3 internal timeout flag -T:5 + +RESULTS=/tmp/tptp_results.tsv +PROBLEM_LIST=/tmp/selected_benchmarks.txt + +echo -e "file\texpected\tactual\ttime_s\tnotes" > "$RESULTS" + +# Helper: extract the expected SZS status from the TPTP problem header. +get_expected_status() { + local file="$1" + # Look for lines like: "% Status : Theorem" + grep -m1 -iP '%\s*Status\s*:\s*\K\S+' "$file" 2>/dev/null || echo "Unknown" +} + +# Helper: run z3 on a single TPTP problem with timeout. +run_benchmark() { + local file="$1" + local start end elapsed output exit_code verdict + + start=$(date +%s%3N) + output=$(TPTP="$TPTP_ROOT" timeout "$TIMEOUT_HARD" \ + "$Z3" -tptp -T:"$Z3_TIMEOUT" "$file" 2>&1) || exit_code=$? + exit_code=${exit_code:-0} + end=$(date +%s%3N) + elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc) + + # Extract SZS status line from output + szs_line=$(echo "$output" | grep -m1 "% SZS status" || true) + + if [ -n "$szs_line" ]; then + # Parse the status keyword (e.g. "Theorem", "CounterSatisfiable", "GaveUp") + verdict=$(echo "$szs_line" | grep -oP '% SZS status \K\S+' || echo "Unknown") + elif [ "$exit_code" -eq 124 ]; then + verdict="Timeout" + elif [ "$exit_code" -ne 0 ]; then + verdict="Crash" + else + verdict="NoOutput" + fi + + echo "$verdict $elapsed" +} + +COUNTER=0 +TOTAL=$(wc -l < "$PROBLEM_LIST") + +while IFS= read -r problem_file; do + COUNTER=$((COUNTER + 1)) + + expected=$(get_expected_status "$problem_file") + result_line=$(run_benchmark "$problem_file") + actual=$(echo "$result_line" | cut -d' ' -f1) + elapsed=$(echo "$result_line" | cut -d' ' -f2) + fname=$(basename "$problem_file") + + # Classify notes + notes="" + # Soundness discrepancy: both answers are conclusive but conflict + conclusive_expected=false + conclusive_actual=false + case "$expected" in + Theorem|Unsatisfiable) conclusive_expected=true ;; + Satisfiable|CounterSatisfiable) conclusive_expected=true ;; + esac + case "$actual" in + Theorem|Unsatisfiable) conclusive_actual=true ;; + Satisfiable|CounterSatisfiable) conclusive_actual=true ;; + esac + + if $conclusive_expected && $conclusive_actual; then + # Map expected to the Z3 output equivalents for comparison + # Theorem (has-conjecture unsat) matches "Theorem" + # Unsatisfiable (no-conjecture unsat) matches "Unsatisfiable" + # Satisfiable (no-conjecture sat) matches "Satisfiable" + # CounterSatisfiable (has-conjecture sat) matches "CounterSatisfiable" + if [ "$expected" != "$actual" ]; then + # Check for sat/unsat polarity conflict + sat_expected=false; sat_actual=false + case "$expected" in Satisfiable|CounterSatisfiable) sat_expected=true ;; esac + case "$actual" in Satisfiable|CounterSatisfiable) sat_actual=true ;; esac + if [ "$sat_expected" != "$sat_actual" ]; then + notes="SOUNDNESS_ERROR" + else + notes="STATUS_MISMATCH" + fi + fi + fi + + if [ "$actual" = "Crash" ]; then + notes="CRASH" + fi + + echo -e "$fname\t$expected\t$actual\t$elapsed\t$notes" >> "$RESULTS" + + if [ -n "$notes" ]; then + echo "[$COUNTER/$TOTAL] $fname expected=$expected actual=$actual time=${elapsed}s *** $notes ***" + elif [ $((COUNTER % 50)) -eq 0 ]; then + echo "[$COUNTER/$TOTAL] Progress checkpoint last=$fname actual=$actual time=${elapsed}s" + fi + +done < "$PROBLEM_LIST" + +echo "Benchmark run complete: $COUNTER problems processed. Results → $RESULTS" +``` + +Run it: + +```bash +chmod +x /tmp/run_tptp_benchmarks.sh +/tmp/run_tptp_benchmarks.sh +``` + +Do not skip any file in the list. + +## Phase 5: Analyze Results + +Save the following script to `/tmp/analyze_tptp.py` and run it: + +```python +#!/usr/bin/env python3 +"""Compute summary statistics from the TPTP benchmark TSV.""" +import csv + +RESULTS_FILE = "/tmp/tptp_results.tsv" + +rows = [] +with open(RESULTS_FILE, newline="") as f: + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + rows.append(row) + +total = len(rows) + +# Verdict counts +from collections import Counter, defaultdict +actual_counts = Counter(r["actual"] for r in rows) +expected_counts = Counter(r["expected"] for r in rows) + +# Flagged rows +soundness_errors = [r for r in rows if r["notes"] == "SOUNDNESS_ERROR"] +status_mismatches = [r for r in rows if r["notes"] == "STATUS_MISMATCH"] +crashes = [r for r in rows if r["notes"] == "CRASH"] +timeouts = [r for r in rows if r["actual"] == "Timeout"] +gave_up = [r for r in rows if r["actual"] == "GaveUp"] + +# Solved correctly (expected matches actual for conclusive verdicts) +conclusive_expected = {"Theorem", "Unsatisfiable", "Satisfiable", "CounterSatisfiable"} +correct = [r for r in rows + if r["expected"] in conclusive_expected + and r["actual"] == r["expected"]] + +print(f"TOTAL={total}") +print(f"CORRECT={len(correct)}") +print(f"TIMEOUTS={len(timeouts)}") +print(f"GAVE_UP={len(gave_up)}") +print(f"CRASHES={len(crashes)}") +print(f"SOUNDNESS_ERRORS={len(soundness_errors)}") +print(f"STATUS_MISMATCHES={len(status_mismatches)}") + +print("\n--- Actual verdict breakdown ---") +for v, c in sorted(actual_counts.items()): + print(f" {v}: {c}") + +print("\n--- Expected status breakdown ---") +for v, c in sorted(expected_counts.items()): + print(f" {v}: {c}") + +if soundness_errors: + print(f"\n--- SOUNDNESS ERRORS ({len(soundness_errors)}) ---") + for r in soundness_errors: + print(f" {r['file']} expected={r['expected']} actual={r['actual']}") + +if crashes: + print(f"\n--- CRASHES ({len(crashes)}) ---") + for r in crashes: + print(f" {r['file']} expected={r['expected']}") + +if status_mismatches: + print(f"\n--- STATUS MISMATCHES ({len(status_mismatches)}) ---") + for r in status_mismatches[:20]: + print(f" {r['file']} expected={r['expected']} actual={r['actual']}") +``` + +Run the analysis: + +```bash +python3 /tmp/analyze_tptp.py +``` + +## Phase 6: Generate and Post the Discussion Report + +Read the TSV at `/tmp/tptp_results.tsv` and the analysis output, then compose a Markdown report and call `create_discussion`. + +The report should use `###` or lower for all headers (never `#` or `##`). Use collapsible `
` sections for large tables. + +Use this structure: + +```markdown +**Date**: +**Branch**: master +**Commit**: `` (run `git rev-parse --short HEAD` in ${{ github.workspace }} to get the SHA) +**Workflow Run**: [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) +**TPTP version**: +**Problems benchmarked**: (random sample, timeout 5 s per problem) + +--- + +### Summary + +| Metric | Count | +|--------|-------| +| Total problems run | N | +| Correct (expected = actual) | N | +| Timeouts | N | +| GaveUp (within time budget) | N | +| Crashes / errors | N | +| Soundness errors (sat↔unsat conflict) | N | +| Status mismatches (Theorem vs Unsatisfiable etc.) | N | + +### Expected Status Distribution + +| Expected Status | Count | +|----------------|-------| +| Theorem | N | +| Unsatisfiable | N | +| Satisfiable | N | +| CounterSatisfiable | N | +| Unknown / Open | N | + +--- + +### ⚠️ Critical: Soundness Errors + +[List ALL files where Z3 returned a conclusive answer that contradicts the expected answer +(e.g., expected Theorem but got CounterSatisfiable). If none, write "None detected."] + +### 💥 Crashes + +[List ALL files where Z3 crashed (non-zero exit, no SZS output, not a timeout). +Include filename and expected status. If none, write "None detected."] + +### Status Mismatches + +[Files where both answers are conclusive but differ in Theorem vs Unsatisfiable polarity +(e.g., expected Theorem but actual Unsatisfiable). These may indicate conjecture-handling +differences rather than soundness bugs. If none, write "None detected."] + +--- + +
+View all Timeouts (problems where Z3 exceeded the 5-second limit) + +| # | File | Expected Status | +|---|------|----------------| +[First 100 timeout rows] + +
+ +
+View full per-problem results table + +| # | File | Expected | Actual | Time (s) | Notes | +|---|------|----------|--------|----------|-------| +[All rows, or first 500 if over limit] + +
+ +--- + +### Recommendations + +[Based on the findings, list actionable items. E.g.: investigate soundness errors, +file crash bugs, note domains where Z3 consistently times out.] +``` + +Post the discussion using the `create_discussion` safe output. The title should be +`[TPTP Benchmark] master — `. + +## Safe Output Guarantee + +You **MUST** call either `create_discussion` or `noop` before the workflow ends: + +- **Full success**: Call `create_discussion` with the complete report. +- **Partial results** (some problems ran): Call `create_discussion` with whatever results are available and a note about incomplete execution. +- **Build failure**: Call `noop` with a brief message describing the build error. +- **Download failure**: Call `noop` with the download error details. +- **No problems selected**: Call `noop` explaining why no problems were found. + +Failing to produce any safe output triggers an automatic workflow-failure issue that clutters the repository. + +## Important Notes + +- **Never run `ninja` in the background with `&`**: Concurrent C++ compilation and LLM inference exhausts available RAM and kills the agent process (exit 137). Always wait for build commands to finish before continuing. +- **TPTP environment variable**: Set `TPTP=/tmp/tptp` when invoking `z3 -tptp` so that `include()` directives in problem files resolve correctly against the downloaded Axioms directory. +- **Timeout detection**: Use `timeout 8` as the outer OS-level guard (2 seconds beyond Z3's `-T:5`) to allow Z3 to exit cleanly before the shell kills it. If the exit code from `timeout` is 124, record the verdict as `Timeout`. +- **Crash detection**: A crash is a non-zero exit code with no `% SZS status` line in the output and no timeout. Record it separately from `GaveUp`. +- **SZS status semantics**: Z3 outputs `Theorem` (not `Unsatisfiable`) when it proves a conjecture; `CounterSatisfiable` (not `Satisfiable`) when it finds a counterexample to a conjecture. A status mismatch between `Theorem` and `Unsatisfiable` for the same problem may be innocuous and depends on whether the problem file uses a conjecture formula. +- **Report soundness bugs prominently**: Any case where the polarity of the answer conflicts (expected Theorem/Unsatisfiable but got CounterSatisfiable/Satisfiable, or vice versa) is a potential soundness bug and must be highlighted as critical. +- **Keep progress log**: Print a line for every flagged result and every 50th problem so the workflow log shows progress. +- **Close older discussions**: Configured via `close-older-discussions: true`. Only the latest weekly report remains open.