From 4a8c9729bf1b6ff91320fb0c59a894fea95dc71b Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:19:37 -0700 Subject: [PATCH] Add ostrich-benchmark agentic workflow for ZIPT/Z3 c3 branch benchmarking (#9064) Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/bfaec259-86d9-4b56-ab04-182835e3563b Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com> --- .github/agentics/ostrich-benchmark.md | 363 +++++++ .github/workflows/ostrich-benchmark.lock.yml | 1011 ++++++++++++++++++ .github/workflows/ostrich-benchmark.md | 41 + 3 files changed, 1415 insertions(+) create mode 100644 .github/agentics/ostrich-benchmark.md create mode 100644 .github/workflows/ostrich-benchmark.lock.yml create mode 100644 .github/workflows/ostrich-benchmark.md diff --git a/.github/agentics/ostrich-benchmark.md b/.github/agentics/ostrich-benchmark.md new file mode 100644 index 000000000..d498ee125 --- /dev/null +++ b/.github/agentics/ostrich-benchmark.md @@ -0,0 +1,363 @@ + + + +# Ostrich Benchmark: Z3 c3 branch vs ZIPT + +You are an AI agent that benchmarks Z3 string solvers (`seq` and `nseq`) and the standalone ZIPT solver on all SMT-LIB2 benchmarks from the `tests/ostrich.zip` archive on the `c3` branch, and publishes a summary report as a GitHub discussion. + +## Context + +- **Repository**: ${{ github.repository }} +- **Workspace**: ${{ github.workspace }} +- **Branch**: c3 (already checked out by the workflow setup step) + +## Phase 1: Build Z3 + +Build Z3 from the checked-out `c3` branch using CMake + Ninja, including the .NET bindings required by ZIPT. + +```bash +cd ${{ github.workspace }} + +# Install build dependencies if missing +sudo apt-get install -y ninja-build cmake python3 zstd dotnet-sdk-8.0 unzip 2>/dev/null || true + +# Configure the build in Debug mode to enable assertions and tracing +# (Debug mode is required for -tr: trace flags to produce meaningful output) +mkdir -p build +cd build +cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Debug -DZ3_BUILD_DOTNET_BINDINGS=ON 2>&1 | tail -20 + +# Build z3 binary and .NET bindings (this takes ~15-17 minutes) +ninja z3 2>&1 | tail -30 +ninja build_z3_dotnet_bindings 2>&1 | tail -20 + +# Verify the build succeeded +./z3 --version + +# Locate the Microsoft.Z3.dll produced by the build +Z3_DOTNET_DLL=$(find . -name "Microsoft.Z3.dll" -not -path "*/obj/*" | head -1) +if [ -z "$Z3_DOTNET_DLL" ]; then + echo "ERROR: Microsoft.Z3.dll not found after build" + exit 1 +fi +echo "Found Microsoft.Z3.dll at: $Z3_DOTNET_DLL" +``` + +If the build fails, report the error clearly and exit without proceeding. + +## Phase 2a: Clone and Build ZIPT + +Clone the ZIPT solver from the `parikh` branch and compile it against the Z3 .NET bindings built in Phase 1. + +```bash +cd ${{ github.workspace }} + +# Re-locate the Microsoft.Z3.dll if needed +Z3_DOTNET_DLL=$(find build -name "Microsoft.Z3.dll" -not -path "*/obj/*" | head -1) +Z3_LIB_DIR=${{ github.workspace }}/build + +# Clone ZIPT (parikh branch) +git clone --depth=1 --branch parikh https://github.com/CEisenhofer/ZIPT.git /tmp/zipt + +# Patch ZIPT.csproj to point at the freshly built Microsoft.Z3.dll +# (the repo has a Windows-relative hardcoded path that won't exist here) +sed -i "s|.*|$Z3_DOTNET_DLL|" /tmp/zipt/ZIPT/ZIPT.csproj + +# Build ZIPT in Release mode +cd /tmp/zipt/ZIPT +dotnet build --configuration Release 2>&1 | tail -20 + +# Locate the built ZIPT.dll +ZIPT_DLL=$(find /tmp/zipt/ZIPT/bin/Release -name "ZIPT.dll" | head -1) +if [ -z "$ZIPT_DLL" ]; then + echo "ERROR: ZIPT.dll not found after build" + exit 1 +fi +echo "ZIPT binary: $ZIPT_DLL" + +# Make libz3.so visible to the .NET runtime at ZIPT startup +ZIPT_OUT_DIR=$(dirname "$ZIPT_DLL") +if cp "$Z3_LIB_DIR/libz3.so" "$ZIPT_OUT_DIR/" 2>/dev/null; then + echo "Copied libz3.so to $ZIPT_OUT_DIR" +else + echo "WARNING: could not copy libz3.so to $ZIPT_OUT_DIR — setting LD_LIBRARY_PATH fallback" +fi +export LD_LIBRARY_PATH="$Z3_LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" +echo "ZIPT build complete." +``` + +If the ZIPT build fails, note the error in the report but continue with the Z3-only benchmark columns. + +## Phase 2b: Extract Benchmark Files + +Extract all SMT-LIB2 files from the `tests/ostrich.zip` archive. + +```bash +cd ${{ github.workspace }} + +# Extract the zip archive +mkdir -p /tmp/ostrich_benchmarks +unzip -q tests/ostrich.zip -d /tmp/ostrich_benchmarks + +# List all .smt2 files +find /tmp/ostrich_benchmarks -name "*.smt2" -type f | sort > /tmp/all_ostrich_files.txt +TOTAL_FILES=$(wc -l < /tmp/all_ostrich_files.txt) +echo "Total Ostrich .smt2 files: $TOTAL_FILES" + +if [ "$TOTAL_FILES" -eq 0 ]; then + echo "ERROR: No .smt2 files found in tests/ostrich.zip" + exit 1 +fi +``` + +## Phase 3: Run Benchmarks + +Run every file from `/tmp/all_ostrich_files.txt` with both Z3 string solvers and ZIPT. Use a **5-second timeout** per run. + +For each file, run: +1. `z3 smt.string_solver=seq -T:5 ` — seq solver +2. `z3 smt.string_solver=nseq -T:5 ` — nseq (ZIPT) solver +3. `dotnet -t:5000 ` — standalone ZIPT solver (milliseconds) + +Capture: +- **Verdict**: `sat`, `unsat`, `unknown`, `timeout` (if exit code indicates timeout or process is killed), or `bug` (if a solver crashes / produces a non-standard result) +- **Time** (seconds): wall-clock time for the run +- A row is flagged `SOUNDNESS_DISAGREEMENT` when any two solvers that both produced a definitive answer (sat/unsat) disagree + +Use a bash script to automate this: + +```bash +#!/usr/bin/env bash +set -euo pipefail + +Z3=${{ github.workspace }}/build/z3 +ZIPT_DLL=$(find /tmp/zipt/ZIPT/bin/Release -name "ZIPT.dll" 2>/dev/null | head -1) +ZIPT_AVAILABLE=false +[ -n "$ZIPT_DLL" ] && ZIPT_AVAILABLE=true + +# Ensure libz3.so is on the dynamic-linker path for the .NET runtime +export LD_LIBRARY_PATH=${{ github.workspace }}/build${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + +RESULTS=/tmp/benchmark_results.tsv +mkdir -p /tmp/ostrich_run + +echo -e "file\tseq_verdict\tseq_time\tnseq_verdict\tnseq_time\tzipt_verdict\tzipt_time\tnotes" > "$RESULTS" + +run_z3_seq() { + local file="$1" + local start end elapsed verdict output exit_code + + start=$(date +%s%3N) + output=$(timeout 7 "$Z3" "smt.string_solver=seq" -T:5 "$file" 2>&1) + exit_code=$? + end=$(date +%s%3N) + elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc) + + if echo "$output" | grep -q "^unsat"; then + verdict="unsat" + elif echo "$output" | grep -q "^sat"; then + verdict="sat" + elif echo "$output" | grep -q "^unknown"; then + verdict="unknown" + elif [ "$exit_code" -eq 124 ]; then + verdict="timeout" + elif echo "$output" | grep -qi "error\|assertion\|segfault\|SIGABRT\|exception"; then + verdict="bug" + else + verdict="unknown" + fi + + echo "$verdict $elapsed" +} + +run_z3_nseq() { + local file="$1" + local start end elapsed verdict output exit_code + + start=$(date +%s%3N) + output=$(timeout 7 "$Z3" "smt.string_solver=nseq" -T:5 "$file" 2>&1) + exit_code=$? + end=$(date +%s%3N) + elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc) + + if echo "$output" | grep -q "^unsat"; then + verdict="unsat" + elif echo "$output" | grep -q "^sat"; then + verdict="sat" + elif echo "$output" | grep -q "^unknown"; then + verdict="unknown" + elif [ "$exit_code" -eq 124 ]; then + verdict="timeout" + elif echo "$output" | grep -qi "error\|assertion\|segfault\|SIGABRT\|exception"; then + verdict="bug" + else + verdict="unknown" + fi + + echo "$verdict $elapsed" +} + +run_zipt() { + local file="$1" + local start end elapsed verdict output exit_code + + if [ "$ZIPT_AVAILABLE" != "true" ]; then + echo "n/a 0.000" + return + fi + + start=$(date +%s%3N) + # ZIPT prints the filename on the first line, then SAT/UNSAT/UNKNOWN on subsequent lines + output=$(timeout 7 dotnet "$ZIPT_DLL" -t:5000 "$file" 2>&1) + exit_code=$? + end=$(date +%s%3N) + elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc) + + if echo "$output" | grep -qi "^UNSAT$"; then + verdict="unsat" + elif echo "$output" | grep -qi "^SAT$"; then + verdict="sat" + elif echo "$output" | grep -qi "^UNKNOWN$"; then + verdict="unknown" + elif [ "$exit_code" -eq 124 ]; then + verdict="timeout" + elif echo "$output" | grep -qi "error\|crash\|exception\|Unsupported"; then + verdict="bug" + else + verdict="unknown" + fi + + echo "$verdict $elapsed" +} + +COUNTER=0 +while IFS= read -r file; do + COUNTER=$((COUNTER + 1)) + fname=$(basename "$file") + + seq_result=$(run_z3_seq "$file") + nseq_result=$(run_z3_nseq "$file") + zipt_result=$(run_zipt "$file") + + seq_verdict=$(echo "$seq_result" | cut -d' ' -f1) + seq_time=$(echo "$seq_result" | cut -d' ' -f2) + nseq_verdict=$(echo "$nseq_result" | cut -d' ' -f1) + nseq_time=$(echo "$nseq_result" | cut -d' ' -f2) + zipt_verdict=$(echo "$zipt_result" | cut -d' ' -f1) + zipt_time=$(echo "$zipt_result" | cut -d' ' -f2) + + # Flag soundness disagreement when any two definitive verdicts disagree + notes="" + declare -A definitive_map + [ "$seq_verdict" = "sat" ] || [ "$seq_verdict" = "unsat" ] && definitive_map[seq]="$seq_verdict" + [ "$nseq_verdict" = "sat" ] || [ "$nseq_verdict" = "unsat" ] && definitive_map[nseq]="$nseq_verdict" + [ "$zipt_verdict" = "sat" ] || [ "$zipt_verdict" = "unsat" ] && definitive_map[zipt]="$zipt_verdict" + has_sat=false; has_unsat=false + for v in "${definitive_map[@]}"; do + [ "$v" = "sat" ] && has_sat=true + [ "$v" = "unsat" ] && has_unsat=true + done + if $has_sat && $has_unsat; then + notes="SOUNDNESS_DISAGREEMENT" + fi + + echo -e "$fname\t$seq_verdict\t$seq_time\t$nseq_verdict\t$nseq_time\t$zipt_verdict\t$zipt_time\t$notes" >> "$RESULTS" + echo "[$COUNTER] [$fname] seq=$seq_verdict(${seq_time}s) nseq=$nseq_verdict(${nseq_time}s) zipt=$zipt_verdict(${zipt_time}s) $notes" +done < /tmp/all_ostrich_files.txt + +echo "Benchmark run complete. Results saved to $RESULTS" +``` + +Save this script to `/tmp/run_ostrich_benchmarks.sh`, make it executable, and run it. Do not skip any file. + +## Phase 4: Generate Summary Report + +Read `/tmp/benchmark_results.tsv` and compute statistics. Then generate a Markdown report. + +Compute: +- **Total benchmarks**: total number of files run +- **Per solver (seq, nseq, and ZIPT)**: count of sat / unsat / unknown / timeout / bug verdicts +- **Total time used**: sum of all times for each solver +- **Average time per benchmark**: total_time / total_files +- **Soundness disagreements**: files where any two solvers that both returned a definitive answer disagree +- **Bugs / crashes**: files with error/crash verdicts + +Format the report as a GitHub Discussion post (GitHub-flavored Markdown): + +```markdown +### Ostrich Benchmark Report — Z3 c3 branch + +**Date**: +**Branch**: c3 +**Benchmark set**: Ostrich (all files from tests/ostrich.zip) +**Timeout**: 5 seconds per benchmark (`-T:5` for Z3; `-t:5000` for ZIPT) + +--- + +### Summary + +| Metric | seq solver | nseq solver | ZIPT solver | +|--------|-----------|-------------|-------------| +| sat | X | X | X | +| unsat | X | X | X | +| unknown | X | X | X | +| timeout | X | X | X | +| bug/crash | X | X | X | +| **Total time (s)** | X.XXX | X.XXX | X.XXX | +| **Avg time/benchmark (s)** | X.XXX | X.XXX | X.XXX | + +**Soundness disagreements** (any two solvers return conflicting sat/unsat): N + +--- + +### Per-File Results + +
+Click to expand full per-file table + +| # | File | seq verdict | seq time (s) | nseq verdict | nseq time (s) | ZIPT verdict | ZIPT time (s) | Notes | +|---|------|-------------|-------------|--------------|--------------|--------------|--------------|-------| +| 1 | benchmark_0001.smt2 | sat | 0.123 | sat | 0.456 | sat | 0.789 | | +| ... | ... | ... | ... | ... | ... | ... | ... | ... | + +
+ +--- + +### Notable Issues + +#### Soundness Disagreements (Critical) + + +#### Crashes / Bugs + + +#### Slow Benchmarks (> 4s) + + +--- + +*Generated automatically by the Ostrich Benchmark workflow on the c3 branch.* +``` + +## Phase 5: Post to GitHub Discussion + +Post the Markdown report as a new GitHub Discussion using the `create-discussion` safe output. + +- **Category**: "Agentic Workflows" +- **Title**: `[Ostrich Benchmark] Z3 c3 branch — ` +- Close older discussions with the same title prefix to avoid clutter. + +## Guidelines + +- **Always build from c3 branch**: The workspace is already checked out on c3; don't change branches. +- **Debug build required**: The build must use `CMAKE_BUILD_TYPE=Debug` so that Z3's internal assertions are active. +- **Run all benchmarks**: Unlike the QF_S workflow, run every file in the archive — do not randomly sample. +- **5-second timeout**: Pass `-T:5` to Z3 (both seq and nseq) and `-t:5000` to ZIPT (milliseconds). Use `timeout 7` as the outer OS-level guard to allow the solver to exit cleanly before being killed. +- **Be precise with timing**: Use millisecond-precision timestamps and report times in seconds with 3 decimal places. +- **Distinguish timeout from unknown**: A timeout is different from `(unknown)` returned by a solver within its time budget. +- **ZIPT output format**: ZIPT prints the input filename on the first line, then `SAT`, `UNSAT`, or `UNKNOWN` on subsequent lines. Parse accordingly. +- **Report soundness bugs prominently**: If any benchmark shows a conflict between any two solvers that both returned a definitive sat/unsat answer, highlight it as a critical finding and name which pair disagrees. +- **Handle build failures gracefully**: If Z3 fails to build, report the error and create a brief discussion noting the build failure. If ZIPT fails to build, continue with only the seq/nseq columns and note `n/a` for ZIPT results. +- **Large report**: Always put the per-file table in a `
` collapsible section since there may be many files. +- **Progress logging**: Print a line per file as you run it (e.g., `[N] [filename] seq=...`) so the workflow log shows progress even for large benchmark sets. diff --git a/.github/workflows/ostrich-benchmark.lock.yml b/.github/workflows/ostrich-benchmark.lock.yml new file mode 100644 index 000000000..adebe18e4 --- /dev/null +++ b/.github/workflows/ostrich-benchmark.lock.yml @@ -0,0 +1,1011 @@ +# ___ _ _ +# / _ \ | | (_) +# | |_| | __ _ ___ _ __ | |_ _ ___ +# | _ |/ _` |/ _ \ '_ \| __| |/ __| +# | | | | (_| | __/ | | | |_| | (__ +# \_| |_/\__, |\___|_| |_|\__|_|\___| +# __/ | +# _ _ |___/ +# | | | | / _| | +# | | | | ___ _ __ _ __| |_| | _____ ____ +# | |/\| |/ _ \ '__| |/ /| _| |/ _ \ \ /\ / / ___| +# \ /\ / (_) | | | | ( | | | | (_) \ V V /\__ \ +# \/ \/ \___/|_| |_|\_\|_| |_|\___/ \_/\_/ |___/ +# +# This file was automatically generated by gh-aw (v0.62.4). DO NOT EDIT. +# +# To update this file, edit the corresponding .md file and run: +# gh aw compile +# Not all edits will cause changes to this file. +# +# For more information: https://github.github.com/gh-aw/introduction/overview/ +# +# Run Z3 string solver benchmarks (seq vs nseq) and ZIPT on all Ostrich benchmarks from tests/ostrich.zip on the c3 branch and post results as a GitHub discussion +# +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"3ac70e9acd74c08c55c4c8e60b61e24db0f1e0dbd5bc8e25c62af0279aea4d6b","compiler_version":"v0.62.4","strict":true,"agent_id":"copilot"} + +name: "Ostrich Benchmark" +"on": + schedule: + - cron: "0 6 * * *" + workflow_dispatch: + +permissions: {} + +concurrency: + group: "gh-aw-${{ github.workflow }}" + +run-name: "Ostrich Benchmark" + +jobs: + activation: + runs-on: ubuntu-slim + permissions: + contents: read + outputs: + comment_id: "" + comment_repo: "" + lockdown_check_failed: ${{ steps.generate_aw_info.outputs.lockdown_check_failed == 'true' }} + model: ${{ steps.generate_aw_info.outputs.model }} + secret_verification_result: ${{ steps.validate-secret.outputs.verification_result }} + steps: + - name: Setup Scripts + uses: github/gh-aw-actions/setup@v0.62.4 + with: + destination: ${{ runner.temp }}/gh-aw/actions + - name: Generate agentic run info + id: generate_aw_info + env: + GH_AW_INFO_ENGINE_ID: "copilot" + GH_AW_INFO_ENGINE_NAME: "GitHub Copilot CLI" + GH_AW_INFO_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || '' }} + GH_AW_INFO_VERSION: "" + GH_AW_INFO_AGENT_VERSION: "latest" + GH_AW_INFO_CLI_VERSION: "v0.62.4" + GH_AW_INFO_WORKFLOW_NAME: "Ostrich Benchmark" + GH_AW_INFO_EXPERIMENTAL: "false" + GH_AW_INFO_SUPPORTS_TOOLS_ALLOWLIST: "true" + GH_AW_INFO_STAGED: "false" + GH_AW_INFO_ALLOWED_DOMAINS: '["defaults"]' + GH_AW_INFO_FIREWALL_ENABLED: "true" + GH_AW_INFO_AWF_VERSION: "v0.24.5" + GH_AW_INFO_AWMG_VERSION: "" + GH_AW_INFO_FIREWALL_TYPE: "squid" + GH_AW_COMPILED_STRICT: "true" + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/generate_aw_info.cjs'); + await main(core, context); + - name: Validate COPILOT_GITHUB_TOKEN secret + id: validate-secret + run: ${RUNNER_TEMP}/gh-aw/actions/validate_multi_secret.sh COPILOT_GITHUB_TOKEN 'GitHub Copilot CLI' https://github.github.com/gh-aw/reference/engines/#github-copilot-default + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + - name: Checkout .github and .agents folders + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + sparse-checkout: | + .github + .agents + sparse-checkout-cone-mode: true + fetch-depth: 1 + - name: Check workflow file timestamps + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_WORKFLOW_FILE: "ostrich-benchmark.lock.yml" + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/check_workflow_timestamp_api.cjs'); + await main(); + - name: Create prompt with built-in context + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + run: | + bash ${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh + { + cat << 'GH_AW_PROMPT_EOF' + + GH_AW_PROMPT_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" + cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" + cat << 'GH_AW_PROMPT_EOF' + + Tools: create_discussion, missing_tool, missing_data, noop + + + The following GitHub context information is available for this workflow: + {{#if __GH_AW_GITHUB_ACTOR__ }} + - **actor**: __GH_AW_GITHUB_ACTOR__ + {{/if}} + {{#if __GH_AW_GITHUB_REPOSITORY__ }} + - **repository**: __GH_AW_GITHUB_REPOSITORY__ + {{/if}} + {{#if __GH_AW_GITHUB_WORKSPACE__ }} + - **workspace**: __GH_AW_GITHUB_WORKSPACE__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ }} + - **issue-number**: #__GH_AW_GITHUB_EVENT_ISSUE_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ }} + - **discussion-number**: #__GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ }} + - **pull-request-number**: #__GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER__ + {{/if}} + {{#if __GH_AW_GITHUB_EVENT_COMMENT_ID__ }} + - **comment-id**: __GH_AW_GITHUB_EVENT_COMMENT_ID__ + {{/if}} + {{#if __GH_AW_GITHUB_RUN_ID__ }} + - **workflow-run-id**: __GH_AW_GITHUB_RUN_ID__ + {{/if}} + + + GH_AW_PROMPT_EOF + cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" + cat << 'GH_AW_PROMPT_EOF' + + GH_AW_PROMPT_EOF + cat << 'GH_AW_PROMPT_EOF' + {{#runtime-import .github/workflows/ostrich-benchmark.md}} + GH_AW_PROMPT_EOF + } > "$GH_AW_PROMPT" + - name: Interpolate variables and render templates + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/interpolate_prompt.cjs'); + await main(); + - name: Substitute placeholders + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_GITHUB_ACTOR: ${{ github.actor }} + GH_AW_GITHUB_EVENT_COMMENT_ID: ${{ github.event.comment.id }} + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: ${{ github.event.discussion.number }} + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} + GH_AW_GITHUB_REPOSITORY: ${{ github.repository }} + GH_AW_GITHUB_RUN_ID: ${{ github.run_id }} + GH_AW_GITHUB_WORKSPACE: ${{ github.workspace }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + + const substitutePlaceholders = require('${{ runner.temp }}/gh-aw/actions/substitute_placeholders.cjs'); + + // Call the substitution function + return await substitutePlaceholders({ + file: process.env.GH_AW_PROMPT, + substitutions: { + GH_AW_GITHUB_ACTOR: process.env.GH_AW_GITHUB_ACTOR, + GH_AW_GITHUB_EVENT_COMMENT_ID: process.env.GH_AW_GITHUB_EVENT_COMMENT_ID, + GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER: process.env.GH_AW_GITHUB_EVENT_DISCUSSION_NUMBER, + GH_AW_GITHUB_EVENT_ISSUE_NUMBER: process.env.GH_AW_GITHUB_EVENT_ISSUE_NUMBER, + GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER: process.env.GH_AW_GITHUB_EVENT_PULL_REQUEST_NUMBER, + GH_AW_GITHUB_REPOSITORY: process.env.GH_AW_GITHUB_REPOSITORY, + GH_AW_GITHUB_RUN_ID: process.env.GH_AW_GITHUB_RUN_ID, + GH_AW_GITHUB_WORKSPACE: process.env.GH_AW_GITHUB_WORKSPACE + } + }); + - name: Validate prompt placeholders + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + run: bash ${RUNNER_TEMP}/gh-aw/actions/validate_prompt_placeholders.sh + - name: Print prompt + env: + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + run: bash ${RUNNER_TEMP}/gh-aw/actions/print_prompt_summary.sh + - name: Upload activation artifact + if: success() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: activation + path: | + /tmp/gh-aw/aw_info.json + /tmp/gh-aw/aw-prompts/prompt.txt + retention-days: 1 + + agent: + needs: activation + runs-on: ubuntu-latest + permissions: read-all + concurrency: + group: "gh-aw-copilot-${{ github.workflow }}" + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_AW_ASSETS_ALLOWED_EXTS: "" + GH_AW_ASSETS_BRANCH: "" + GH_AW_ASSETS_MAX_SIZE_KB: 0 + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + GH_AW_WORKFLOW_ID_SANITIZED: ostrichbenchmark + outputs: + checkout_pr_success: ${{ steps.checkout-pr.outputs.checkout_pr_success || 'true' }} + detection_conclusion: ${{ steps.detection_conclusion.outputs.conclusion }} + detection_success: ${{ steps.detection_conclusion.outputs.success }} + has_patch: ${{ steps.collect_output.outputs.has_patch }} + inference_access_error: ${{ steps.detect-inference-error.outputs.inference_access_error || 'false' }} + model: ${{ needs.activation.outputs.model }} + output: ${{ steps.collect_output.outputs.output }} + output_types: ${{ steps.collect_output.outputs.output_types }} + steps: + - name: Setup Scripts + uses: github/gh-aw-actions/setup@v0.62.4 + with: + destination: ${{ runner.temp }}/gh-aw/actions + - name: Set runtime paths + run: | + echo "GH_AW_SAFE_OUTPUTS=${RUNNER_TEMP}/gh-aw/safeoutputs/outputs.jsonl" >> "$GITHUB_ENV" + echo "GH_AW_SAFE_OUTPUTS_CONFIG_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" >> "$GITHUB_ENV" + echo "GH_AW_SAFE_OUTPUTS_TOOLS_PATH=${RUNNER_TEMP}/gh-aw/safeoutputs/tools.json" >> "$GITHUB_ENV" + - name: Create gh-aw temp directory + run: bash ${RUNNER_TEMP}/gh-aw/actions/create_gh_aw_tmp_dir.sh + - name: Configure gh CLI for GitHub Enterprise + run: bash ${RUNNER_TEMP}/gh-aw/actions/configure_gh_for_ghe.sh + env: + GH_TOKEN: ${{ github.token }} + - name: Checkout c3 branch + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 1 + persist-credentials: false + ref: c3 + + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Checkout PR branch + id: checkout-pr + if: | + (github.event.pull_request) || (github.event.issue.pull_request) + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + with: + github-token: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/checkout_pr_branch.cjs'); + await main(); + - name: Install GitHub Copilot CLI + run: ${RUNNER_TEMP}/gh-aw/actions/install_copilot_cli.sh latest + env: + GH_HOST: github.com + - name: Install AWF binary + run: bash ${RUNNER_TEMP}/gh-aw/actions/install_awf_binary.sh v0.24.5 + - name: Determine automatic lockdown mode for GitHub MCP Server + id: determine-automatic-lockdown + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + with: + script: | + const determineAutomaticLockdown = require('${{ runner.temp }}/gh-aw/actions/determine_automatic_lockdown.cjs'); + await determineAutomaticLockdown(github, context, core); + - name: Download container images + run: bash ${RUNNER_TEMP}/gh-aw/actions/download_docker_images.sh ghcr.io/github/gh-aw-firewall/agent:0.24.5 ghcr.io/github/gh-aw-firewall/api-proxy:0.24.5 ghcr.io/github/gh-aw-firewall/squid:0.24.5 ghcr.io/github/gh-aw-mcpg:v0.1.19 ghcr.io/github/github-mcp-server:v0.32.0 node:lts-alpine + - name: Write Safe Outputs Config + run: | + mkdir -p ${RUNNER_TEMP}/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/safeoutputs + mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_EOF' + {"create_discussion":{"expires":168,"max":1},"create_missing_tool_issue":{"max":1,"title_prefix":"[missing tool]"},"missing_data":{},"missing_tool":{},"noop":{"max":1}} + GH_AW_SAFE_OUTPUTS_CONFIG_EOF + - name: Write Safe Outputs Tools + run: | + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_EOF' + { + "description_suffixes": { + "create_discussion": " CONSTRAINTS: Maximum 1 discussion(s) can be created. Title will be prefixed with \"[Ostrich Benchmark] \". Discussions will be created in category \"agentic workflows\"." + }, + "repo_params": {}, + "dynamic_tools": [] + } + GH_AW_SAFE_OUTPUTS_TOOLS_META_EOF + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_EOF' + { + "create_discussion": { + "defaultMax": 1, + "fields": { + "body": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "category": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "title": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "missing_data": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "context": { + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "data_type": { + "type": "string", + "sanitize": true, + "maxLength": 128 + }, + "reason": { + "type": "string", + "sanitize": true, + "maxLength": 256 + } + } + }, + "missing_tool": { + "defaultMax": 20, + "fields": { + "alternatives": { + "type": "string", + "sanitize": true, + "maxLength": 512 + }, + "reason": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 256 + }, + "tool": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + } + }, + "noop": { + "defaultMax": 1, + "fields": { + "message": { + "required": true, + "type": "string", + "sanitize": true, + "maxLength": 65000 + } + } + } + } + GH_AW_SAFE_OUTPUTS_VALIDATION_EOF + node ${RUNNER_TEMP}/gh-aw/actions/generate_safe_outputs_tools.cjs + - name: Generate Safe Outputs MCP Server Config + id: safe-outputs-config + run: | + # Generate a secure random API key (360 bits of entropy, 40+ chars) + # Mask immediately to prevent timing vulnerabilities + API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${API_KEY}" + + PORT=3001 + + # Set outputs for next steps + { + echo "safe_outputs_api_key=${API_KEY}" + echo "safe_outputs_port=${PORT}" + } >> "$GITHUB_OUTPUT" + + echo "Safe Outputs MCP server will run on port ${PORT}" + + - name: Start Safe Outputs MCP HTTP Server + id: safe-outputs-start + env: + DEBUG: '*' + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-config.outputs.safe_outputs_port }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-config.outputs.safe_outputs_api_key }} + GH_AW_SAFE_OUTPUTS_TOOLS_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/tools.json + GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json + GH_AW_MCP_LOG_DIR: /tmp/gh-aw/mcp-logs/safeoutputs + run: | + # Environment variables are set above to prevent template injection + export DEBUG + export GH_AW_SAFE_OUTPUTS_PORT + export GH_AW_SAFE_OUTPUTS_API_KEY + export GH_AW_SAFE_OUTPUTS_TOOLS_PATH + export GH_AW_SAFE_OUTPUTS_CONFIG_PATH + export GH_AW_MCP_LOG_DIR + + bash ${RUNNER_TEMP}/gh-aw/actions/start_safe_outputs_server.sh + + - name: Start MCP Gateway + id: start-mcp-gateway + env: + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_SAFE_OUTPUTS_API_KEY: ${{ steps.safe-outputs-start.outputs.api_key }} + GH_AW_SAFE_OUTPUTS_PORT: ${{ steps.safe-outputs-start.outputs.port }} + GITHUB_MCP_GUARD_MIN_INTEGRITY: ${{ steps.determine-automatic-lockdown.outputs.min_integrity }} + GITHUB_MCP_GUARD_REPOS: ${{ steps.determine-automatic-lockdown.outputs.repos }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + run: | + set -eo pipefail + mkdir -p /tmp/gh-aw/mcp-config + + # Export gateway environment variables for MCP config and gateway script + export MCP_GATEWAY_PORT="80" + export MCP_GATEWAY_DOMAIN="host.docker.internal" + MCP_GATEWAY_API_KEY=$(openssl rand -base64 45 | tr -d '/+=') + echo "::add-mask::${MCP_GATEWAY_API_KEY}" + export MCP_GATEWAY_API_KEY + export MCP_GATEWAY_PAYLOAD_DIR="/tmp/gh-aw/mcp-payloads" + mkdir -p "${MCP_GATEWAY_PAYLOAD_DIR}" + export MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD="524288" + export DEBUG="*" + + export GH_AW_ENGINE="copilot" + export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.1.19' + + mkdir -p /home/runner/.copilot + cat << GH_AW_MCP_CONFIG_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh + { + "mcpServers": { + "github": { + "type": "stdio", + "container": "ghcr.io/github/github-mcp-server:v0.32.0", + "env": { + "GITHUB_HOST": "\${GITHUB_SERVER_URL}", + "GITHUB_PERSONAL_ACCESS_TOKEN": "\${GITHUB_MCP_SERVER_TOKEN}", + "GITHUB_READ_ONLY": "1", + "GITHUB_TOOLSETS": "context,repos,issues,pull_requests" + }, + "guard-policies": { + "allow-only": { + "min-integrity": "$GITHUB_MCP_GUARD_MIN_INTEGRITY", + "repos": "$GITHUB_MCP_GUARD_REPOS" + } + } + }, + "safeoutputs": { + "type": "http", + "url": "http://host.docker.internal:$GH_AW_SAFE_OUTPUTS_PORT", + "headers": { + "Authorization": "\${GH_AW_SAFE_OUTPUTS_API_KEY}" + }, + "guard-policies": { + "write-sink": { + "accept": [ + "*" + ] + } + } + } + }, + "gateway": { + "port": $MCP_GATEWAY_PORT, + "domain": "${MCP_GATEWAY_DOMAIN}", + "apiKey": "${MCP_GATEWAY_API_KEY}", + "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" + } + } + GH_AW_MCP_CONFIG_EOF + - name: Download activation artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: activation + path: /tmp/gh-aw + - name: Clean git credentials + continue-on-error: true + run: bash ${RUNNER_TEMP}/gh-aw/actions/clean_git_credentials.sh + - name: Execute GitHub Copilot CLI + id: agentic_execution + # Copilot CLI tool arguments (sorted): + timeout-minutes: 180 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + # shellcheck disable=SC1003 + sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --enable-host-access --image-tag 0.24.5 --skip-pull --enable-api-proxy \ + -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-all-tools --allow-all-paths --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/agent-stdio.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_AGENT_COPILOT || '' }} + GH_AW_MCP_CONFIG: /home/runner/.copilot/mcp-config.json + GH_AW_PHASE: agent + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_VERSION: v0.62.4 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN || secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Detect inference access error + id: detect-inference-error + if: always() + continue-on-error: true + run: bash ${RUNNER_TEMP}/gh-aw/actions/detect_inference_access_error.sh + - name: Configure Git credentials + env: + REPO_NAME: ${{ github.repository }} + SERVER_URL: ${{ github.server_url }} + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + git config --global am.keepcr true + # Re-authenticate git with GitHub token + SERVER_URL_STRIPPED="${SERVER_URL#https://}" + git remote set-url origin "https://x-access-token:${{ github.token }}@${SERVER_URL_STRIPPED}/${REPO_NAME}.git" + echo "Git configured with standard GitHub Actions identity" + - name: Copy Copilot session state files to logs + if: always() + continue-on-error: true + run: | + # Copy Copilot session state files to logs folder for artifact collection + # This ensures they are in /tmp/gh-aw/ where secret redaction can scan them + SESSION_STATE_DIR="$HOME/.copilot/session-state" + LOGS_DIR="/tmp/gh-aw/sandbox/agent/logs" + + if [ -d "$SESSION_STATE_DIR" ]; then + echo "Copying Copilot session state files from $SESSION_STATE_DIR to $LOGS_DIR" + mkdir -p "$LOGS_DIR" + cp -v "$SESSION_STATE_DIR"/*.jsonl "$LOGS_DIR/" 2>/dev/null || true + echo "Session state files copied successfully" + else + echo "No session-state directory found at $SESSION_STATE_DIR" + fi + - name: Stop MCP Gateway + if: always() + continue-on-error: true + env: + MCP_GATEWAY_PORT: ${{ steps.start-mcp-gateway.outputs.gateway-port }} + MCP_GATEWAY_API_KEY: ${{ steps.start-mcp-gateway.outputs.gateway-api-key }} + GATEWAY_PID: ${{ steps.start-mcp-gateway.outputs.gateway-pid }} + run: | + bash ${RUNNER_TEMP}/gh-aw/actions/stop_mcp_gateway.sh "$GATEWAY_PID" + - name: Redact secrets in logs + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/redact_secrets.cjs'); + await main(); + env: + GH_AW_SECRET_NAMES: 'COPILOT_GITHUB_TOKEN,GH_AW_GITHUB_MCP_SERVER_TOKEN,GH_AW_GITHUB_TOKEN,GITHUB_TOKEN' + SECRET_COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + SECRET_GH_AW_GITHUB_MCP_SERVER_TOKEN: ${{ secrets.GH_AW_GITHUB_MCP_SERVER_TOKEN }} + SECRET_GH_AW_GITHUB_TOKEN: ${{ secrets.GH_AW_GITHUB_TOKEN }} + SECRET_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Append agent step summary + if: always() + run: bash ${RUNNER_TEMP}/gh-aw/actions/append_agent_step_summary.sh + - name: Copy Safe Outputs + if: always() + run: | + mkdir -p /tmp/gh-aw + cp "$GH_AW_SAFE_OUTPUTS" /tmp/gh-aw/safeoutputs.jsonl 2>/dev/null || true + - name: Ingest agent output + id: collect_output + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_SAFE_OUTPUTS: ${{ env.GH_AW_SAFE_OUTPUTS }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/collect_ndjson_output.cjs'); + await main(); + - name: Parse agent logs for step summary + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: /tmp/gh-aw/sandbox/agent/logs/ + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_copilot_log.cjs'); + await main(); + - name: Parse MCP Gateway logs for step summary + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_mcp_gateway_log.cjs'); + await main(); + - name: Print firewall logs + if: always() + continue-on-error: true + env: + AWF_LOGS_DIR: /tmp/gh-aw/sandbox/firewall/logs + run: | + # Fix permissions on firewall logs so they can be uploaded as artifacts + # AWF runs with sudo, creating files owned by root + sudo chmod -R a+r /tmp/gh-aw/sandbox/firewall/logs 2>/dev/null || true + # Only run awf logs summary if awf command exists (it may not be installed if workflow failed before install step) + if command -v awf &> /dev/null; then + awf logs summary | tee -a "$GITHUB_STEP_SUMMARY" + else + echo 'AWF binary not installed, skipping firewall log summary' + fi + - name: Upload agent artifacts + if: always() + continue-on-error: true + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: agent + path: | + /tmp/gh-aw/aw-prompts/prompt.txt + /tmp/gh-aw/sandbox/agent/logs/ + /tmp/gh-aw/redacted-urls.log + /tmp/gh-aw/mcp-logs/ + /tmp/gh-aw/sandbox/firewall/logs/ + /tmp/gh-aw/agent-stdio.log + /tmp/gh-aw/agent/ + /tmp/gh-aw/safeoutputs.jsonl + /tmp/gh-aw/agent_output.json + if-no-files-found: ignore + # --- Threat Detection (inline) --- + - name: Check if detection needed + id: detection_guard + if: always() + env: + OUTPUT_TYPES: ${{ steps.collect_output.outputs.output_types }} + HAS_PATCH: ${{ steps.collect_output.outputs.has_patch }} + run: | + if [[ -n "$OUTPUT_TYPES" || "$HAS_PATCH" == "true" ]]; then + echo "run_detection=true" >> "$GITHUB_OUTPUT" + echo "Detection will run: output_types=$OUTPUT_TYPES, has_patch=$HAS_PATCH" + else + echo "run_detection=false" >> "$GITHUB_OUTPUT" + echo "Detection skipped: no agent outputs or patches to analyze" + fi + - name: Clear MCP configuration for detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + rm -f /tmp/gh-aw/mcp-config/mcp-servers.json + rm -f /home/runner/.copilot/mcp-config.json + rm -f "$GITHUB_WORKSPACE/.gemini/settings.json" + - name: Prepare threat detection files + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection/aw-prompts + cp /tmp/gh-aw/aw-prompts/prompt.txt /tmp/gh-aw/threat-detection/aw-prompts/prompt.txt 2>/dev/null || true + cp /tmp/gh-aw/agent_output.json /tmp/gh-aw/threat-detection/agent_output.json 2>/dev/null || true + for f in /tmp/gh-aw/aw-*.patch; do + [ -f "$f" ] && cp "$f" /tmp/gh-aw/threat-detection/ 2>/dev/null || true + done + echo "Prepared threat detection files:" + ls -la /tmp/gh-aw/threat-detection/ 2>/dev/null || true + - name: Setup threat detection + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + WORKFLOW_NAME: "Ostrich Benchmark" + WORKFLOW_DESCRIPTION: "Run Z3 string solver benchmarks (seq vs nseq) and ZIPT on all Ostrich benchmarks from tests/ostrich.zip on the c3 branch and post results as a GitHub discussion" + HAS_PATCH: ${{ steps.collect_output.outputs.has_patch }} + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/setup_threat_detection.cjs'); + await main(); + - name: Ensure threat-detection directory and log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + run: | + mkdir -p /tmp/gh-aw/threat-detection + touch /tmp/gh-aw/threat-detection/detection.log + - name: Execute GitHub Copilot CLI + if: always() && steps.detection_guard.outputs.run_detection == 'true' + id: detection_agentic_execution + # Copilot CLI tool arguments (sorted): + # --allow-tool shell(cat) + # --allow-tool shell(grep) + # --allow-tool shell(head) + # --allow-tool shell(jq) + # --allow-tool shell(ls) + # --allow-tool shell(tail) + # --allow-tool shell(wc) + timeout-minutes: 20 + run: | + set -o pipefail + touch /tmp/gh-aw/agent-step-summary.md + # shellcheck disable=SC1003 + sudo -E awf --env-all --container-workdir "${GITHUB_WORKSPACE}" --mount "${RUNNER_TEMP}/gh-aw:${RUNNER_TEMP}/gh-aw:ro" --mount "${RUNNER_TEMP}/gh-aw:/host${RUNNER_TEMP}/gh-aw:ro" --allow-domains "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,github.com,host.docker.internal,raw.githubusercontent.com,registry.npmjs.org,telemetry.enterprise.githubcopilot.com" --log-level info --proxy-logs-dir /tmp/gh-aw/sandbox/firewall/logs --enable-host-access --image-tag 0.24.5 --skip-pull --enable-api-proxy \ + -- /bin/bash -c '/usr/local/bin/copilot --add-dir /tmp/gh-aw/ --log-level all --log-dir /tmp/gh-aw/sandbox/agent/logs/ --add-dir "${GITHUB_WORKSPACE}" --disable-builtin-mcps --allow-tool '\''shell(cat)'\'' --allow-tool '\''shell(grep)'\'' --allow-tool '\''shell(head)'\'' --allow-tool '\''shell(jq)'\'' --allow-tool '\''shell(ls)'\'' --allow-tool '\''shell(tail)'\'' --allow-tool '\''shell(wc)'\'' --prompt "$(cat /tmp/gh-aw/aw-prompts/prompt.txt)"' 2>&1 | tee -a /tmp/gh-aw/threat-detection/detection.log + env: + COPILOT_AGENT_RUNNER_TYPE: STANDALONE + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + COPILOT_MODEL: ${{ vars.GH_AW_MODEL_DETECTION_COPILOT || '' }} + GH_AW_PHASE: detection + GH_AW_PROMPT: /tmp/gh-aw/aw-prompts/prompt.txt + GH_AW_VERSION: v0.62.4 + GITHUB_API_URL: ${{ github.api_url }} + GITHUB_AW: true + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF_NAME: ${{ github.ref_name }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_STEP_SUMMARY: /tmp/gh-aw/agent-step-summary.md + GITHUB_WORKSPACE: ${{ github.workspace }} + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_AUTHOR_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + XDG_CONFIG_HOME: /home/runner + - name: Parse threat detection results + id: parse_detection_results + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + with: + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/parse_threat_detection_results.cjs'); + await main(); + - name: Upload threat detection log + if: always() && steps.detection_guard.outputs.run_detection == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: detection + path: /tmp/gh-aw/threat-detection/detection.log + if-no-files-found: ignore + - name: Set detection conclusion + id: detection_conclusion + if: always() + env: + RUN_DETECTION: ${{ steps.detection_guard.outputs.run_detection }} + DETECTION_SUCCESS: ${{ steps.parse_detection_results.outputs.success }} + run: | + if [[ "$RUN_DETECTION" != "true" ]]; then + echo "conclusion=skipped" >> "$GITHUB_OUTPUT" + echo "success=true" >> "$GITHUB_OUTPUT" + echo "Detection was not needed, marking as skipped" + elif [[ "$DETECTION_SUCCESS" == "true" ]]; then + echo "conclusion=success" >> "$GITHUB_OUTPUT" + echo "success=true" >> "$GITHUB_OUTPUT" + echo "Detection passed successfully" + else + echo "conclusion=failure" >> "$GITHUB_OUTPUT" + echo "success=false" >> "$GITHUB_OUTPUT" + echo "Detection found issues" + fi + + conclusion: + needs: + - activation + - agent + - safe_outputs + if: (always()) && ((needs.agent.result != 'skipped') || (needs.activation.outputs.lockdown_check_failed == 'true')) + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + concurrency: + group: "gh-aw-conclusion-ostrich-benchmark" + cancel-in-progress: false + outputs: + noop_message: ${{ steps.noop.outputs.noop_message }} + tools_reported: ${{ steps.missing_tool.outputs.tools_reported }} + total_count: ${{ steps.missing_tool.outputs.total_count }} + steps: + - name: Setup Scripts + uses: github/gh-aw-actions/setup@v0.62.4 + with: + destination: ${{ runner.temp }}/gh-aw/actions + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_ENV" + - name: Process No-Op Messages + id: noop + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_NOOP_MAX: "1" + GH_AW_WORKFLOW_NAME: "Ostrich Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/noop.cjs'); + await main(); + - name: Record Missing Tool + id: missing_tool + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_MISSING_TOOL_CREATE_ISSUE: "true" + GH_AW_MISSING_TOOL_TITLE_PREFIX: "[missing tool]" + GH_AW_WORKFLOW_NAME: "Ostrich Benchmark" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/missing_tool.cjs'); + await main(); + - name: Handle Agent Failure + id: handle_agent_failure + if: always() + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "Ostrich Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_WORKFLOW_ID: "ostrich-benchmark" + GH_AW_SECRET_VERIFICATION_RESULT: ${{ needs.activation.outputs.secret_verification_result }} + GH_AW_CHECKOUT_PR_SUCCESS: ${{ needs.agent.outputs.checkout_pr_success }} + GH_AW_INFERENCE_ACCESS_ERROR: ${{ needs.agent.outputs.inference_access_error }} + GH_AW_CREATE_DISCUSSION_ERRORS: ${{ needs.safe_outputs.outputs.create_discussion_errors }} + GH_AW_CREATE_DISCUSSION_ERROR_COUNT: ${{ needs.safe_outputs.outputs.create_discussion_error_count }} + GH_AW_LOCKDOWN_CHECK_FAILED: ${{ needs.activation.outputs.lockdown_check_failed }} + GH_AW_GROUP_REPORTS: "false" + GH_AW_FAILURE_REPORT_AS_ISSUE: "true" + GH_AW_TIMEOUT_MINUTES: "180" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_agent_failure.cjs'); + await main(); + - name: Handle No-Op Message + id: handle_noop_message + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_WORKFLOW_NAME: "Ostrich Benchmark" + GH_AW_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + GH_AW_AGENT_CONCLUSION: ${{ needs.agent.result }} + GH_AW_NOOP_MESSAGE: ${{ steps.noop.outputs.noop_message }} + GH_AW_NOOP_REPORT_AS_ISSUE: "false" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/handle_noop_message.cjs'); + await main(); + + safe_outputs: + needs: agent + if: ((!cancelled()) && (needs.agent.result != 'skipped')) && (needs.agent.outputs.detection_success == 'true') + runs-on: ubuntu-slim + permissions: + contents: read + discussions: write + issues: write + timeout-minutes: 15 + env: + GH_AW_CALLER_WORKFLOW_ID: "${{ github.repository }}/ostrich-benchmark" + GH_AW_ENGINE_ID: "copilot" + GH_AW_WORKFLOW_ID: "ostrich-benchmark" + GH_AW_WORKFLOW_NAME: "Ostrich Benchmark" + outputs: + code_push_failure_count: ${{ steps.process_safe_outputs.outputs.code_push_failure_count }} + code_push_failure_errors: ${{ steps.process_safe_outputs.outputs.code_push_failure_errors }} + create_discussion_error_count: ${{ steps.process_safe_outputs.outputs.create_discussion_error_count }} + create_discussion_errors: ${{ steps.process_safe_outputs.outputs.create_discussion_errors }} + process_safe_outputs_processed_count: ${{ steps.process_safe_outputs.outputs.processed_count }} + process_safe_outputs_temporary_id_map: ${{ steps.process_safe_outputs.outputs.temporary_id_map }} + steps: + - name: Setup Scripts + uses: github/gh-aw-actions/setup@v0.62.4 + with: + destination: ${{ runner.temp }}/gh-aw/actions + - name: Download agent output artifact + id: download-agent-output + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: agent + path: /tmp/gh-aw/ + - name: Setup agent output environment variable + if: steps.download-agent-output.outcome == 'success' + run: | + mkdir -p /tmp/gh-aw/ + find "/tmp/gh-aw/" -type f -print + echo "GH_AW_AGENT_OUTPUT=/tmp/gh-aw/agent_output.json" >> "$GITHUB_ENV" + - name: Configure GH_HOST for enterprise compatibility + shell: bash + run: | + # Derive GH_HOST from GITHUB_SERVER_URL so the gh CLI targets the correct + # GitHub instance (GHES/GHEC). On github.com this is a harmless no-op. + GH_HOST="${GITHUB_SERVER_URL#https://}" + GH_HOST="${GH_HOST#http://}" + echo "GH_HOST=${GH_HOST}" >> "$GITHUB_ENV" + - name: Process Safe Outputs + id: process_safe_outputs + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 + env: + GH_AW_AGENT_OUTPUT: ${{ env.GH_AW_AGENT_OUTPUT }} + GH_AW_ALLOWED_DOMAINS: "api.business.githubcopilot.com,api.enterprise.githubcopilot.com,api.github.com,api.githubcopilot.com,api.individual.githubcopilot.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,ppa.launchpad.net,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,telemetry.enterprise.githubcopilot.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_API_URL: ${{ github.api_url }} + GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_discussion\":{\"category\":\"agentic workflows\",\"close_older_discussions\":true,\"expires\":168,\"fallback_to_issue\":true,\"max\":1,\"title_prefix\":\"[Ostrich Benchmark] \"},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"false\"}}" + with: + github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} + script: | + const { setupGlobals } = require('${{ runner.temp }}/gh-aw/actions/setup_globals.cjs'); + setupGlobals(core, github, context, exec, io); + const { main } = require('${{ runner.temp }}/gh-aw/actions/safe_output_handler_manager.cjs'); + await main(); + - name: Upload safe output items + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: safe-output-items + path: /tmp/gh-aw/safe-output-items.jsonl + if-no-files-found: ignore + diff --git a/.github/workflows/ostrich-benchmark.md b/.github/workflows/ostrich-benchmark.md new file mode 100644 index 000000000..140e899f1 --- /dev/null +++ b/.github/workflows/ostrich-benchmark.md @@ -0,0 +1,41 @@ +--- +description: Run Z3 string solver benchmarks (seq vs nseq) and ZIPT on all Ostrich benchmarks from tests/ostrich.zip on the c3 branch and post results as a GitHub discussion + +on: + schedule: + - cron: "0 6 * * *" + workflow_dispatch: + +permissions: read-all + +network: defaults + +tools: + bash: true + github: + toolsets: [default] + +safe-outputs: + create-discussion: + title-prefix: "[Ostrich Benchmark] " + category: "Agentic Workflows" + close-older-discussions: true + missing-tool: + create-issue: true + noop: + report-as-issue: false + +timeout-minutes: 180 + +steps: + - name: Checkout c3 branch + uses: actions/checkout@v5 + with: + ref: c3 + fetch-depth: 1 + persist-credentials: false + +--- + + +@./agentics/ostrich-benchmark.md