From a1320d031eeaec4229108e6bfad03738e56220b5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Mar 2026 23:13:32 +0000 Subject: [PATCH] Add QF_S string solver benchmark agentic workflow Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- .github/agentics/qf-s-benchmark.md | 219 +++++ .github/workflows/qf-s-benchmark.lock.yml | 1087 +++++++++++++++++++++ .github/workflows/qf-s-benchmark.md | 38 + 3 files changed, 1344 insertions(+) create mode 100644 .github/agentics/qf-s-benchmark.md create mode 100644 .github/workflows/qf-s-benchmark.lock.yml create mode 100644 .github/workflows/qf-s-benchmark.md diff --git a/.github/agentics/qf-s-benchmark.md b/.github/agentics/qf-s-benchmark.md new file mode 100644 index 000000000..5bc61cb03 --- /dev/null +++ b/.github/agentics/qf-s-benchmark.md @@ -0,0 +1,219 @@ + + + +# QF_S String Solver Benchmark + +You are an AI agent that benchmarks the Z3 string solvers (`seq` and `nseq`) on QF_S SMT-LIB2 benchmarks from the `c3` branch, and publishes a summary report as a GitHub discussion. + +## Context + +- **Repository**: ${{ github.repository }} +- **Workspace**: ${{ github.workspace }} +- **Branch**: c3 (already checked out by the workflow setup step) + +## Phase 1: Build Z3 + +Build Z3 from the checked-out `c3` branch using CMake + Ninja. + +```bash +cd ${{ github.workspace }} + +# Install build dependencies if missing +sudo apt-get install -y ninja-build cmake python3 zstd 2>/dev/null || true + +# Configure the build +mkdir -p build +cd build +cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release 2>&1 | tail -20 + +# Build z3 binary (this takes ~15-17 minutes) +ninja -j$(nproc) z3 2>&1 | tail -30 + +# Verify the build succeeded +./z3 --version +``` + +If the build fails, report the error clearly and exit without proceeding. + +## Phase 2: Extract and Select Benchmark Files + +Extract the QF_S benchmark archive and randomly select 50 files. + +```bash +cd ${{ github.workspace }} + +# Extract the archive +mkdir -p /tmp/qfs_benchmarks +tar --zstd -xf tests/QF_S.tar.zst -C /tmp/qfs_benchmarks + +# List all .smt2 files +find /tmp/qfs_benchmarks -name "*.smt2" -type f > /tmp/all_qfs_files.txt +TOTAL_FILES=$(wc -l < /tmp/all_qfs_files.txt) +echo "Total QF_S files: $TOTAL_FILES" + +# Randomly select 50 files +shuf -n 50 /tmp/all_qfs_files.txt > /tmp/selected_files.txt +echo "Selected 50 files for benchmarking" +cat /tmp/selected_files.txt +``` + +## Phase 3: Run Benchmarks + +Run each of the 50 selected files with both string solvers. Use a 10-second timeout (`-T:10`). Also wrap each run with `time` to capture wall-clock duration. + +For each file, run: +1. `z3 smt.string_solver=seq -T:10 ` +2. `z3 smt.string_solver=nseq -T:10 ` + +Capture: +- **Verdict**: `sat`, `unsat`, `unknown`, `timeout` (if exit code indicates timeout), or `bug` (if z3 crashes / produces a non-standard result, or if seq and nseq disagree on sat vs unsat) +- **Time** (seconds): wall-clock time for the run + +Use a bash script to automate this: + +```bash +#!/usr/bin/env bash +set -euo pipefail + +Z3=${{ github.workspace }}/build/z3 +RESULTS=/tmp/benchmark_results.tsv +echo -e "file\tseq_verdict\tseq_time\tnseq_verdict\tnseq_time\tnotes" > "$RESULTS" + +run_z3() { + local solver="$1" + local file="$2" + local start end elapsed verdict output exit_code + + start=$(date +%s%3N) + output=$(timeout 12 "$Z3" "smt.string_solver=$solver" -T:10 "$file" 2>&1) + exit_code=$? + end=$(date +%s%3N) + elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc) + + # Parse verdict + if echo "$output" | grep -q "^unsat"; then + verdict="unsat" + elif echo "$output" | grep -q "^sat"; then + verdict="sat" + elif echo "$output" | grep -q "^unknown"; then + verdict="unknown" + elif [ "$exit_code" -eq 124 ]; then + verdict="timeout" + elif echo "$output" | grep -qi "error\|assertion\|segfault\|SIGABRT\|exception"; then + verdict="bug" + else + verdict="unknown" + fi + + echo "$verdict $elapsed" +} + +while IFS= read -r file; do + fname=$(basename "$file") + seq_result=$(run_z3 seq "$file") + nseq_result=$(run_z3 nseq "$file") + + seq_verdict=$(echo "$seq_result" | cut -d' ' -f1) + seq_time=$(echo "$seq_result" | cut -d' ' -f2) + nseq_verdict=$(echo "$nseq_result" | cut -d' ' -f1) + nseq_time=$(echo "$nseq_result" | cut -d' ' -f2) + + # Flag as bug if the two solvers disagree on sat vs unsat + notes="" + if { [ "$seq_verdict" = "sat" ] && [ "$nseq_verdict" = "unsat" ]; } || \ + { [ "$seq_verdict" = "unsat" ] && [ "$nseq_verdict" = "sat" ]; }; then + notes="SOUNDNESS_DISAGREEMENT" + fi + + echo -e "$fname\t$seq_verdict\t$seq_time\t$nseq_verdict\t$nseq_time\t$notes" >> "$RESULTS" + echo "[$fname] seq=$seq_verdict(${seq_time}s) nseq=$nseq_verdict(${nseq_time}s) $notes" +done < /tmp/selected_files.txt + +echo "Benchmark run complete. Results saved to $RESULTS" +``` + +Save this script to `/tmp/run_benchmarks.sh`, make it executable, and run it. + +## Phase 4: Generate Summary Report + +Read `/tmp/benchmark_results.tsv` and compute statistics. Then generate a Markdown report. + +Compute: +- **Total benchmarks**: 50 +- **Per solver (seq and nseq)**: count of sat / unsat / unknown / timeout / bug verdicts +- **Total time used**: sum of all times for each solver +- **Average time per benchmark**: total_time / 50 +- **Soundness disagreements**: files where seq says sat but nseq says unsat or vice versa (these are the most critical bugs) +- **Bugs / crashes**: files with error/crash verdicts + +Format the report as a GitHub Discussion post (GitHub-flavored Markdown): + +```markdown +### QF_S Benchmark Report — Z3 c3 branch + +**Date**: +**Branch**: c3 +**Benchmark set**: QF_S (50 randomly selected files from tests/QF_S.tar.zst) +**Timeout**: 10 seconds per benchmark (`-T:10`) + +--- + +### Summary + +| Metric | seq solver | nseq solver | +|--------|-----------|-------------| +| sat | X | X | +| unsat | X | X | +| unknown | X | X | +| timeout | X | X | +| bug/crash | X | X | +| **Total time (s)** | X.XXX | X.XXX | +| **Avg time/benchmark (s)** | X.XXX | X.XXX | + +**Soundness disagreements** (seq says sat, nseq says unsat or vice versa): N + +--- + +### Per-File Results + +| # | File | seq verdict | seq time (s) | nseq verdict | nseq time (s) | Notes | +|---|------|-------------|-------------|--------------|--------------|-------| +| 1 | benchmark_0001.smt2 | sat | 0.123 | sat | 0.456 | | +| ... | ... | ... | ... | ... | ... | ... | + +--- + +### Notable Issues + +#### Soundness Disagreements (Critical) + + +#### Crashes / Bugs + + +#### Slow Benchmarks (> 8s) + + +--- + +*Generated automatically by the QF_S Benchmark workflow on the c3 branch.* +``` + +## Phase 5: Post to GitHub Discussion + +Post the Markdown report as a new GitHub Discussion using the `create-discussion` safe output. + +- **Category**: "Agentic Workflows" +- **Title**: `[QF_S Benchmark] Z3 c3 branch — ` +- Close older discussions with the same title prefix to avoid clutter. + +## Guidelines + +- **Always build from c3 branch**: The workspace is already checked out on c3; don't change branches. +- **Handle build failures gracefully**: If Z3 fails to build, report the error and create a brief discussion noting the build failure. +- **Handle missing zstd**: If `tar --zstd` fails, try `zstd -d tests/QF_S.tar.zst --stdout | tar -x -C /tmp/qfs_benchmarks`. +- **Be precise with timing**: Use millisecond-precision timestamps and report times in seconds with 3 decimal places. +- **Distinguish timeout from unknown**: A timeout (process killed after 12s) is different from `(unknown)` returned by z3. +- **Report soundness bugs prominently**: If any benchmark shows seq=sat but nseq=unsat (or vice versa), highlight it as a critical finding. +- **Don't skip any file**: Run all 50 files even if some fail. +- **Large report**: If the per-file table is very long, put it in a `
` collapsible section. diff --git a/.github/workflows/qf-s-benchmark.lock.yml b/.github/workflows/qf-s-benchmark.lock.yml new file mode 100644 index 000000000..fa2e6c1fc --- /dev/null +++ b/.github/workflows/qf-s-benchmark.lock.yml @@ -0,0 +1,1087 @@ +# +# ___ _ _ +# / _ \ | | (_) +# | |_| | __ _ ___ _ __ | |_ _ ___ +# | _ |/ _` |/ _ \ '_ \| __| |/ __| +# | | | | (_| | __/ | | | |_| | (__ +# \_| |_/\__, |\___|_| |_|\__|_|\___| +# __/ | +# _ _ |___/ +# | | | | / _| | +# | | | | ___ _ __ _ __| |_| | _____ ____ +# | |/\| |/ _ \ '__| |/ /| _| |/ _ \ \ /\ / / ___| +# \ /\ / (_) | | | | ( | | | | (_) \ V V /\__ \ +# \/ \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/ +# +# This file was automatically generated by gh-aw (v0.53.4). DO NOT EDIT. +# +# To update this file, edit the corresponding .md file and run: +# gh aw compile +# Not all edits will cause changes to this file. +# +# For more information: https://github.github.com/gh-aw/introduction/overview/ +# +# Run Z3 string solver benchmarks (seq vs nseq) on QF_S test suite from the c3 branch and post results as a GitHub discussion +# +# gh-aw-metadata: {"schema_version":"v1","frontmatter_hash":"11e7fe880a77098e320d93169917eed62c8c0c2288cd5d3e54f9251ed6edbf7e","compiler_version":"v0.53.4"} + +name: "Qf S Benchmark" +"on": + schedule: + - cron: "16 3 * * 3" + # Friendly format: weekly (scattered) + workflow_dispatch: + +permissions: {} + +concurrency: + group: "gh-aw-${{ github.workflow }}" + +run-name: "Qf S Benchmark" + +jobs: + activation: + runs-on: ubuntu-slim + permissions: + contents: read + outputs: + comment_id: "" + comment_repo: "" + model: ${{ steps.generate_aw_info.outputs.model }} + secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }} + steps: + - name: Setup Scripts + uses: github/gh-aw/actions/setup@v0.53.4 + with: + destination: /opt/gh-aw/actions + - name: Generate agentic run info + id: generate_aw_info + env: + GH_AW_INFO_ENGINE_ID: "copilot" + GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI" + GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || '' }} + GH_AW_INFO_VERSION: "" + GH_AW_INFO_AGENT_VERSION: "0.0.421" + GH_AW_INFO_CLI_VERSION: "v0.53.4" + GH_AW_INFO_WORKFLOW_NAME: "Qf S Benchmark" + GH_AW_INFO_EXPERIMENTAL: "false" + GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true" + GH_AW_INFO_STAGED: "false" + GH_AW_INFO_ALLOWED_DOMAINS: '["defaults"]' + GH_AW_INFO_FIREWALL_ENABLED: "true" + GH_AW_INFO_AWF_VERSION: "v0.23.0" + GH_AW_INFO_AWMG_VERSION: "" + GH_AW_INFO_FIREWALL_TYPE: "squid" + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { main } = require('/opt/gh-aw/actions/generate_aw_info.cjs'); + await main(core, context); + - name: Validate COPILOT_GITHUB_TOKEN secret + id: validate-secret + run: /opt/gh-aw/actions/validate_multi_secret.sh COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + - name: Checkout .github and .agents folders + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + sparse-checkout: | + .github + .agents + sparse-checkout-cone-mode: true + fetch-depth: 1 + persist-credentials: false + - name: Check workflow file timestamps + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_WORKFLOW_FILE: "qf-s-benchmark.lock.yml" + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/check_workflow_timestamp_api.cjs'); + await main(); + - name: Create prompt with built-in context + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + run: | + bash /opt/gh-aw/actions/create_prompt_first.sh + { + cat << 'GH_AW_PROMPT_EOF' + + GH_AW_PROMPT_EOF + cat "/opt/gh-aw/prompts/xpia.md" + cat "/opt/gh-aw/prompts/temp_folder_prompt.md" + cat "/opt/gh-aw/prompts/markdown.md" + cat "/opt/gh-aw/prompts/safe_outputs_prompt.md" + cat << 'GH_AW_PROMPT_EOF' + + Tools: create_discussion, missing_tool, missing_data, noop + + + The following GitHub context information is available for this workflow: + {{#if __GH_AW_GITHUB_ACTOR__ }} + - **actor**: __GH_AW_GITHUB_ACTOR__ + {{/if}} + {{#if __GH_AW_GITHUB_REPOSITORY__ }} + - **repository**: __GH_AW_GITHUB_REPOSITORY__ + {{/if}} + {{#if __GH_AW_GITHUB_WORKSPACE__ }} + - **workspace**: __GH_AW_GITHUB_WORKSPACE__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }} + - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }} + - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }} + - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }} + - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__ + {{/if}} + {{#if __GH_AW_GITHUB_RUN_ID__ }} + - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__ + {{/if}} + + + GH_AW_PROMPT_EOF + cat << 'GH_AW_PROMPT_EOF' + + GH_AW_PROMPT_EOF + cat << 'GH_AW_PROMPT_EOF' + {{#runtime-import .github/workflows/qf-s-benchmark.md}} + GH_AW_PROMPT_EOF + } > "$GH_AW_PROMPT" + - name: Interpolate variables and render templates + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/interpolate_prompt.cjs'); + await main(); + - name: Substitute placeholders + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + + const substitutePlaceholders = require('/opt/gh-aw/actions/substitute_placeholders.cjs'); + + // Call the substitution function + return await substitutePlaceholders({ + file: process.env.GH_AW_PROMPT, + substitutions: { + GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR, + GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID, + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER, + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER, + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER, + GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY, + GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID, + GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE + } + }); + - name: Validate prompt placeholders + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + run: bash /opt/gh-aw/actions/validate_prompt_placeholders.sh + - name: Print prompt + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + run: bash /opt/gh-aw/actions/print_prompt_summary.sh + - name: Upload activation artifact + if: success() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: activation + path: | + /tmp/gh-aw/aw_info.json + /tmp/gh-aw/aw-prompts/prompt.txt + retention-days: 1 + + agent: + needs: activation + runs-on: ubuntu-latest + permissions: read-all + concurrency: + group: "gh-aw-copilot-${{ github.workflow }}" + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_AW_ASSETS_ALLOWED_EXTS: "" + GH_AW_ASSETS_BRANCH: "" + GH_AW_ASSETS_MAX_SIZE_KB: 0 + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + GH_AW_SAFE_OUTPUTS: /opt/gh-aw/safeoutputs/outputs.jsonl + GH_AW_SAFE_OUTPUTS_CONFIG_PATH: /opt/gh-aw/safeoutputs/config.json + GH_AW_SAFE_OUTPUTS_TOOLS_PATH: /opt/gh-aw/safeoutputs/tools.json + GH_AW_WORKFLOW_ID_SANITIZED: qfsbenchmark + outputs: + checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }} + detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }} + detection_success: ${{ steps.detection_conclusion.outputs.success }} + has_patch: ${{ steps.collect_output.outputs.has_patch }} + inference_access_error: ${{ steps.detect-inference-error.outputs.inference_access_error || 'false' }} + model: ${{ needs.activation.outputs.model }} + output: ${{ steps.collect_output.outputs.output }} + output_types: ${{ steps.collect_output.outputs.output_types }} + steps: + - name: Setup Scripts + uses: github/gh-aw/actions/setup@v0.53.4 + with: + destination: /opt/gh-aw/actions + - name: Create gh-aw temp directory + run: bash /opt/gh-aw/actions/create_gh_aw_tmp_dir.sh + - name: Checkout c3 branch + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5 + with: + fetch-depth: 1 + persist-credentials: false + ref: c3 + + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Checkout PR branch + id: checkout-pr + if: | + (github.event.pull_request) || (github.event.issue.pull_request) + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/checkout_pr_branch.cjs'); + await main(); + - name: Install GitHub Copilot CLI + run: /opt/gh-aw/actions/install_copilot_cli.sh 0.0.421 + - name: Install awf binary + run: bash /opt/gh-aw/actions/install_awf_binary.sh v0.23.0 + - name: Determine automatic lockdown mode for GitHub MCP Server + id: determine-automatic-lockdown + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + with: + script: | + const determineAutomaticLockdown = require('/opt/gh-aw/actions/determine_automatic_lockdown.cjs'); + await determineAutomaticLockdown(github, context, core); + - name: Download container images + run: bash /opt/gh-aw/actions/download_docker_images.sh ghcr.io/github/gh-aw-firewall/agent:0.23.0 ghcr.io/github/gh-aw-firewall/api-proxy:0.23.0 ghcr.io/github/gh-aw-firewall/squid:0.23.0 ghcr.io/github/gh-aw-mcpg:v0.1.8 ghcr.io/github/github-mcp-server:v0.31.0 node:lts-alpine + - name: Write Safe Outputs Config + run: | + mkdir -p /opt/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs + cat > /opt/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_EOF' + {"create_discussion":{"expires":168,"max":1},"create_missing_tool_issue":{"max":1,"title_prefix":"[missing tool]"},"missing_data":{},"missing_tool":{},"noop":{"max":1}} + GH_AW_SAFE_OUTPUTS_CONFIG_EOF + cat > /opt/gh-aw/safeoutputs/tools.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_EOF' + [ + { + "description": "Create a GitHub discussion for announcements, Q\u0026A, reports, status updates, or community conversations. Use this for content that benefits from threaded replies, doesn't require task tracking, or serves as documentation. For actionable work items that need assignment and status tracking, use create_issue instead. CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[QF_S Benchmark] \". Discussions will be created in category \"agentic workflows\".", + "inputSchema": { + "additionalProperties": false, + "properties": { + "body": { + "description": "Discussion content in Markdown. Do NOT repeat the title as a heading since it already appears as the discussion's h1. Include all relevant context, findings, or questions.", + "type": "string" + }, + "category": { + "description": "Discussion category by name (e.g., 'General'), slug (e.g., 'general'), or ID. If omitted, uses the first available category. Category must exist in the repository.", + "type": "string" + }, + "integrity": { + "description": "Trustworthiness level of the message source (e.g., \"low\", \"medium\", \"high\").", + "type": "string" + }, + "secrecy": { + "description": "Confidentiality level of the message content (e.g., \"public\", \"internal\", \"private\").", + "type": "string" + }, + "title": { + "description": "Concise discussion title summarizing the topic. The title appears as the main heading, so keep it brief and descriptive.", + "type": "string" + } + }, + "required": [ + "title", + "body" + ], + "type": "object" + }, + "name": "create_discussion" + }, + { + "description": "Report that a tool or capability needed to complete the task is not available, or share any information you deem important about missing functionality or limitations. Use this when you cannot accomplish what was requested because the required functionality is missing or access is restricted.", + "inputSchema": { + "additionalProperties": false, + "properties": { + "alternatives": { + "description": "Any workarounds, manual steps, or alternative approaches the user could take (max 256 characters).", + "type": "string" + }, + "integrity": { + "description": "Trustworthiness level of the message source (e.g., \"low\", \"medium\", \"high\").", + "type": "string" + }, + "reason": { + "description": "Explanation of why this tool is needed or what information you want to share about the limitation (max 256 characters).", + "type": "string" + }, + "secrecy": { + "description": "Confidentiality level of the message content (e.g., \"public\", \"internal\", \"private\").", + "type": "string" + }, + "tool": { + "description": "Optional: Name or description of the missing tool or capability (max 128 characters). Be specific about what functionality is needed.", + "type": "string" + } + }, + "required": [ + "reason" + ], + "type": "object" + }, + "name": "missing_tool" + }, + { + "description": "Log a transparency message when no significant actions are needed. Use this to confirm workflow completion and provide visibility when analysis is complete but no changes or outputs are required (e.g., 'No issues found', 'All checks passed'). This ensures the workflow produces human-visible output even when no other actions are taken.", + "inputSchema": { + "additionalProperties": false, + "properties": { + "integrity": { + "description": "Trustworthiness level of the message source (e.g., \"low\", \"medium\", \"high\").", + "type": "string" + }, + "message": { + "description": "Status or completion message to log. Should explain what was analyzed and the outcome (e.g., 'Code review complete - no issues found', 'Analysis complete - all tests passing').", + "type": "string" + }, + "secrecy": { + "description": "Confidentiality level of the message content (e.g., \"public\", \"internal\", \"private\").", + "type": "string" + } + }, + "required": [ + "message" + ], + "type": "object" + }, + "name": "noop" + }, + { + "description": "Report that data or information needed to complete the task is not available. Use this when you cannot accomplish what was requested because required data, context, or information is missing.", + "inputSchema": { + "additionalProperties": false, + "properties": { + "alternatives": { + "description": "Any workarounds, manual steps, or alternative approaches the user could take (max 256 characters).", + "type": "string" + }, + "context": { + "description": "Additional context about the missing data or where it should come from (max 256 characters).", + "type": "string" + }, + "data_type": { + "description": "Type or description of the missing data or information (max 128 characters). Be specific about what data is needed.", + "type": "string" + }, + "integrity": { + "description": "Trustworthiness level of the message source (e.g., \"low\", \"medium\", \"high\").", + "type": "string" + }, + "reason": { + "description": "Explanation of why this data is needed to complete the task (max 256 characters).", + "type": "string" + }, + "secrecy": { + "description": "Confidentiality level of the message content (e.g., \"public\", \"internal\", \"private\").", + "type": "string" + } + }, + "required": [], + "type": "object" + }, + "name": "missing_data" + } + ] + GH_AW_SAFE_OUTPUTS_TOOLS_EOF + cat > /opt/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_EOF' + { + "create_discussion": { + "defaultMax": 1, + "fields": { + "body": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "category": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "title": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "missing_data": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "context": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "data_type": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "reason": { + "type": "string", + "sanitize": true, + "maxLength": 256 + } + } + }, + "missing_tool": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 512 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "tool": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "noop": { + "defaultMax": 1, + "fields": { + "message": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + } + } + } + } + GH_AW_SAFE_OUTPUTS_VALIDATION_EOF + - name: Generate Safe Outputs MCP Server Config + id: safe-outputs-config + run: | + # Generate a secure random API key (360 bits of entropy, 40+ chars) + # Mask immediately to prevent timing vulnerabilities + API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${API_KEY}" + + PORT=3001 + + # Set outputs for next steps + { + echo "safe_outputs_api_key=${API_KEY}" + echo "safe_outputs_port=${PORT}" + } >> "$GITHUB_OUTPUT" + + echo "Safe Outputs MCP server will run on port ${PORT}" + + - name: Start Safe Outputs MCP HTTP Server + id: safe-outputs-start + env: + DEBUG: '*' + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }} + GH_AW_SAFE_OUTPUTS_TOOLS_PATH: /opt/gh-aw/safeoutputs/tools.json + GH_AW_SAFE_OUTPUTS_CONFIG_PATH: /opt/gh-aw/safeoutputs/config.json + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + run: | + # Environment variables are set above to prevent template injection + export DEBUG + export GH_AW_SAFE_OUTPUTS_PORT + export GH_AW_SAFE_OUTPUTS_API_KEY + export GH_AW_SAFE_OUTPUTS_TOOLS_PATH + export GH_AW_SAFE_OUTPUTS_CONFIG_PATH + export GH_AW_MCP_LOG_DIR + + bash /opt/gh-aw/actions/start_safe_outputs_server.sh + + - name: Start MCP Gateway + id: start-mcp-gateway + env: + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }} + GITHUB_MCP_LOCKDOWN: ${{ steps.determine-automatic-lockdown.outputs.lockdown == 'true' && '1' || '0' }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + run: | + set -eo pipefail + mkdir -p /tmp/gh-aw/mcp-config + + # Export gateway environment variables for MCP config and gateway script + export MCP_GATEWAY_PORT="80" + export MCP_GATEWAY_DOMAIN="host.docker.internal" + MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${MCP_GATEWAY_API_KEY}" + export MCP_GATEWAY_API_KEY + export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads" + mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}" + export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288" + export DEBUG="*" + + export GH_AW_ENGINE="copilot" + export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_LOCKDOWN -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.1.8' + + mkdir -p /home/runner/.copilot + cat << GH_AW_MCP_CONFIG_EOF | bash /opt/gh-aw/actions/start_mcp_gateway.sh + { + "mcpServers": { + "github": { + "type": "stdio", + "container": "ghcr.io/github/github-mcp-server:v0.31.0", + "env": { + "GITHUB_LOCKDOWN_MODE": "$GITHUB_MCP_LOCKDOWN", + "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}", + "GITHUB_READ_ONLY": "1", + "GITHUB_TOOLSETS": "context,repos,issues,pull_requests" + } + }, + "safeoutputs": { + "type": "http", + "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT", + "headers": { + "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}" + } + } + }, + "gateway": { + "port": $MCP_GATEWAY_PORT, + "domain": "${MCP_GATEWAY_DOMAIN}", + "apiKey": "${MCP_GATEWAY_API_KEY}", + "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" + } + } + GH_AW_MCP_CONFIG_EOF + - name: Download activation artifact + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + name: activation + path: /tmp/gh-aw + - name: Clean git credentials + run: bash /opt/gh-aw/actions/clean_git_credentials.sh + - name: Execute GitHub Copilot CLI + id: agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 90 + run: | + set -o pipefail + # shellcheck disable=SC1003 + sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --enable-host-access --image-tag 0.23.0 --skip-pull --enable-api-proxy \ + -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-all-tools --allow-all-paths --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || '' }} + GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: ${{ env.GITHUB_STEP_SUMMARY }} + GITHUB_WORKSPACE: ${{ github.workspace }} + XDG_CONFIG_HOME: /home/runner + - name: Detect inference access error + id: detect-inference-error + if: always() + continue-on-error: true + run: bash /opt/gh-aw/actions/detect_inference_access_error.sh + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Copy Copilot session state files to logs + if: always() + continue-on-error: true + run: | + # Copy Copilot session state files to logs folder for artifact collection + # This ensures they are in /tmp/gh-aw/ where secret redaction can scan them + SESSION_STATE_DIR="$HOME/.copilot/session-state" + LOGS_DIR="/tmp/gh-aw/sandbox/agent/logs" + + if [ -d "$SESSION_STATE_DIR" ]; then + echo "Copying Copilot session state files from $SESSION_STATE_DIR to $LOGS_DIR" + mkdir -p "$LOGS_DIR" + cp -v "$SESSION_STATE_DIR"/*.jsonl "$LOGS_DIR/" 2>/dev/null || true + echo "Session state files copied successfully" + else + echo "No session-state directory found at $SESSION_STATE_DIR" + fi + - name: Stop MCP Gateway + if: always() + continue-on-error: true + env: + MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }} + MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }} + GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }} + run: | + bash /opt/gh-aw/actions/stop_mcp_gateway.sh "$GATEWAY_PID" + - name: Redact secrets in logs + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/redact_secrets.cjs'); + await main(); + env: + GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN' + SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload Safe Outputs + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: safe-output + path: ${{ env.GH_AW_SAFE_OUTPUTS }} + if-no-files-found: warn + - name: Ingest agent output + id: collect_output + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/collect_ndjson_output.cjs'); + await main(); + - name: Upload sanitized agent output + if: always() && env.GH_AW_AGENT_OUTPUT + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: agent-output + path: ${{ env.GH_AW_AGENT_OUTPUT }} + if-no-files-found: warn + - name: Upload engine output files + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: agent_outputs + path: | + /tmp/gh-aw/sandbox/agent/logs/ + /tmp/gh-aw/redacted-urls.log + if-no-files-found: ignore + - name: Parse agent logs for step summary + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/ + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/parse_copilot_log.cjs'); + await main(); + - name: Parse MCP Gateway logs for step summary + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/parse_mcp_gateway_log.cjs'); + await main(); + - name: Print firewall logs + if: always() + continue-on-error: true + env: + AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs + run: | + # Fix permissions on firewall logs so they can be uploaded as artifacts + # AWF runs with sudo, creating files owned by root + sudo chmod -R a+r /tmp/gh-aw/sandbox/firewall/logs 2>/dev/null || true + # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step) + if command -v awf &> /dev/null; then + awf logs summary | tee -a "$GITHUB_STEP_SUMMARY" + else + echo 'AWF binary not installed, skipping firewall log summary' + fi + - name: Upload agent artifacts + if: always() + continue-on-error: true + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: agent-artifacts + path: | + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/mcp-logs/ + /tmp/gh-aw/sandbox/firewall/logs/ + /tmp/gh-aw/agent-stdio.log + /tmp/gh-aw/agent/ + if-no-files-found: ignore + # --- Threat Detection (inline) --- + - name: Check if detection needed + id: detection_guard + if: always() + env: + OUTPUT_TYPES: ${{ steps.collect_output.outputs.output_types }} + HAS_PATCH: ${{ steps.collect_output.outputs.has_patch }} + run: | + if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then + echo "run_detection=true" >> "$GITHUB_OUTPUT" + echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH" + else + echo "run_detection=false" >> "$GITHUB_OUTPUT" + echo "Detection skipped: no agent outputs or patches to analyze" + fi + - name: Clear MCP configuration for detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + rm -f /tmp/gh-aw/mcp-config/mcp-servers.json + rm -f /home/runner/.copilot/mcp-config.json + rm -f "$GITHUB_WORKSPACE/.gemini/settings.json" + - name: Prepare threat detection files + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection/aw-prompts + cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true + cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true + for f in /tmp/gh-aw/aw-*.patch; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + echo "Prepared threat detection files:" + ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true + - name: Setup threat detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + WORKFLOW_NAME: "Qf S Benchmark" + WORKFLOW_DESCRIPTION: "Run Z3 string solver benchmarks (seq vs nseq) on QF_S test suite from the c3 branch and post results as a GitHub discussion" + HAS_PATCH: ${{ steps.collect_output.outputs.has_patch }} + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/setup_threat_detection.cjs'); + await main(); + - name: Ensure threat-detection directory and log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection + touch /tmp/gh-aw/threat-detection/detection.log + - name: Execute GitHub Copilot CLI + if: always() && steps.detection_guard.outputs.run_detection == 'true' + id: detection_agentic_execution + # Copilot CLI tool arguments (sorted): + # --allow-tool shell(cat) + # --allow-tool shell(grep) + # --allow-tool shell(head) + # --allow-tool shell(jq) + # --allow-tool shell(ls) + # --allow-tool shell(tail) + # --allow-tool shell(wc) + timeout-minutes: 20 + run: | + set -o pipefail + # shellcheck disable=SC1003 + sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,github.com,host.docker.internal,raw.githubusercontent.com,registry.npmjs.org,telemetry.enterprise.githubcopilot.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --enable-host-access --image-tag 0.23.0 --skip-pull --enable-api-proxy \ + -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-tool '\''shell(cat)'\'' --allow-tool '\''shell(grep)'\'' --allow-tool '\''shell(head)'\'' --allow-tool '\''shell(jq)'\'' --allow-tool '\''shell(ls)'\'' --allow-tool '\''shell(tail)'\'' --allow-tool '\''shell(wc)'\'' --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || '' }} + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: ${{ env.GITHUB_STEP_SUMMARY }} + GITHUB_WORKSPACE: ${{ github.workspace }} + XDG_CONFIG_HOME: /home/runner + - name: Parse threat detection results + id: parse_detection_results + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/parse_threat_detection_results.cjs'); + await main(); + - name: Upload threat detection log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: threat-detection.log + path: /tmp/gh-aw/threat-detection/detection.log + if-no-files-found: ignore + - name: Set detection conclusion + id: detection_conclusion + if: always() + env: + RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }} + DETECTION_SUCCESS: ${{ steps.parse_detection_results.outputs.success }} + run: | + if [[ "$RUN_DETECTION" != "true" ]]; then + echo "conclusion=skipped" >> "$GITHUB_OUTPUT" + echo "success=true" >> "$GITHUB_OUTPUT" + echo "Detection was not needed, marking as skipped" + elif [[ "$DETECTION_SUCCESS" == "true" ]]; then + echo "conclusion=success" >> "$GITHUB_OUTPUT" + echo "success=true" >> "$GITHUB_OUTPUT" + echo "Detection passed successfully" + else + echo "conclusion=failure" >> "$GITHUB_OUTPUT" + echo "success=false" >> "$GITHUB_OUTPUT" + echo "Detection found issues" + fi + + conclusion: + needs: + - activation + - agent + - safe_outputs + if: (always()) && (needs.agent.result != 'skipped') + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + concurrency: + group: "gh-aw-conclusion-qf-s-benchmark" + cancel-in-progress: false + outputs: + noop_message: ${{ steps.noop.outputs.noop_message }} + tools_reported: ${{ steps.missing_tool.outputs.tools_reported }} + total_count: ${{ steps.missing_tool.outputs.total_count }} + steps: + - name: Setup Scripts + uses: github/gh-aw/actions/setup@v0.53.4 + with: + destination: /opt/gh-aw/actions + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + name: agent-output + path: /tmp/gh-aw/safeoutputs/ + - name: Setup agent output environment variable + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/safeoutputs/ + find "/tmp/gh-aw/safeoutputs/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/safeoutputs/agent_output.json" >> "$GITHUB_ENV" + - name: Process No-Op Messages + id: noop + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_NOOP_MAX: "1" + GH_AW_WORKFLOW_NAME: "Qf S Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/noop.cjs'); + await main(); + - name: Record Missing Tool + id: missing_tool + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_MISSING_TOOL_CREATE_ISSUE: "true" + GH_AW_MISSING_TOOL_TITLE_PREFIX: "[missing tool]" + GH_AW_WORKFLOW_NAME: "Qf S Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/missing_tool.cjs'); + await main(); + - name: Handle Agent Failure + id: handle_agent_failure + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "Qf S Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_WORKFLOW_ID: "qf-s-benchmark" + GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }} + GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }} + GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }} + GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }} + GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }} + GH_AW_GROUP_REPORTS: "false" + GH_AW_TIMEOUT_MINUTES: "90" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/handle_agent_failure.cjs'); + await main(); + - name: Handle No-Op Message + id: handle_noop_message + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "Qf S Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_NOOP_MESSAGE: ${{ steps.noop.outputs.noop_message }} + GH_AW_NOOP_REPORT_AS_ISSUE: "true" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/handle_noop_message.cjs'); + await main(); + + safe_outputs: + needs: agent + if: ((!cancelled()) && (needs.agent.result != 'skipped')) && (needs.agent.outputs.detection_success == 'true') + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + timeout-minutes: 15 + env: + GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/qf-s-benchmark" + GH_AW_ENGINE_ID: "copilot" + GH_AW_WORKFLOW_ID: "qf-s-benchmark" + GH_AW_WORKFLOW_NAME: "Qf S Benchmark" + outputs: + code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }} + code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }} + create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }} + create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }} + process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }} + process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }} + steps: + - name: Setup Scripts + uses: github/gh-aw/actions/setup@v0.53.4 + with: + destination: /opt/gh-aw/actions + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + name: agent-output + path: /tmp/gh-aw/safeoutputs/ + - name: Setup agent output environment variable + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/safeoutputs/ + find "/tmp/gh-aw/safeoutputs/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/safeoutputs/agent_output.json" >> "$GITHUB_ENV" + - name: Process Safe Outputs + id: process_safe_outputs + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"agentic workflows\",\"close_older_discussions\":true,\"expires\":168,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[QF_S Benchmark] \"},\"missing_data\":{},\"missing_tool\":{}}" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('/opt/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('/opt/gh-aw/actions/safe_output_handler_manager.cjs'); + await main(); + - name: Upload safe output items manifest + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: safe-output-items + path: /tmp/safe-output-items.jsonl + if-no-files-found: warn + diff --git a/.github/workflows/qf-s-benchmark.md b/.github/workflows/qf-s-benchmark.md new file mode 100644 index 000000000..57f6dee60 --- /dev/null +++ b/.github/workflows/qf-s-benchmark.md @@ -0,0 +1,38 @@ +--- +description: Run Z3 string solver benchmarks (seq vs nseq) on QF_S test suite from the c3 branch and post results as a GitHub discussion + +on: + schedule: weekly + workflow_dispatch: + +permissions: read-all + +network: defaults + +tools: + bash: true + github: + toolsets: [default] + +safe-outputs: + create-discussion: + title-prefix: "[QF_S Benchmark] " + category: "Agentic Workflows" + close-older-discussions: true + missing-tool: + create-issue: true + +timeout-minutes: 90 + +steps: + - name: Checkout c3 branch + uses: actions/checkout@v5 + with: + ref: c3 + fetch-depth: 1 + persist-credentials: false + +--- + + +@./agentics/qf-s-benchmark.md