From 853c62f58aa8f9739876f7983fe553a869b1c267 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Sat, 11 Apr 2026 11:17:05 -0700
Subject: [PATCH] Fix qf-s-benchmark: broken code fence, OOM build, and timeout
 budget (#9268)

* Initial plan

* fix qf-s-benchmark: Release mode build, fix broken code fence, reduce timeouts

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/8eaace11-bbc1-49d9-993d-67290f5b1841

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>

* recompile all workflow lock files with gh-aw v0.68.1

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/a5c25542-de48-41e0-a48b-b7128fcb49bf

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>

* revert unaffected lock files to pre-PR state, keep only qf-s-benchmark compiled

Agent-Logs-Url: https://github.com/Z3Prover/z3/sessions/8ec3816f-882f-459b-b7cc-49d0c91b25c3

Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: NikolajBjorner <3085284+NikolajBjorner@users.noreply.github.com>
Co-authored-by: Nikolaj Bjorner <nbjorner@microsoft.com>
---
 .github/workflows/qf-s-benchmark.lock.yml | 24 +++++++++++------------
 .github/workflows/qf-s-benchmark.md       | 24 +++++++++++------------
 2 files changed, 23 insertions(+), 25 deletions(-)
diff --git a/.github/workflows/qf-s-benchmark.lock.yml b/.github/workflows/qf-s-benchmark.lock.yml
index 5bf698561..d56b85235 100644
--- a/.github/workflows/qf-s-benchmark.lock.yml
+++ b/.github/workflows/qf-s-benchmark.lock.yml
@@ -156,14 +156,14 @@ jobs:
         run: |
           bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh"
           {
-          cat << 'GH_AW_PROMPT_c25676ba2ab40d85_EOF'
+          cat << 'GH_AW_PROMPT_78ca018fd0ee9e53_EOF'
           <system>
-          GH_AW_PROMPT_c25676ba2ab40d85_EOF
+          GH_AW_PROMPT_78ca018fd0ee9e53_EOF
           cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
           cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
           cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
           cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
-          cat << 'GH_AW_PROMPT_c25676ba2ab40d85_EOF'
+          cat << 'GH_AW_PROMPT_78ca018fd0ee9e53_EOF'
           <safe-output-tools>
           Tools: create_discussion, missing_tool, missing_data, noop
           </safe-output-tools>
@@ -195,12 +195,12 @@ jobs:
           {{/if}}
           </github-context>
           
-          GH_AW_PROMPT_c25676ba2ab40d85_EOF
+          GH_AW_PROMPT_78ca018fd0ee9e53_EOF
           cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md"
-          cat << 'GH_AW_PROMPT_c25676ba2ab40d85_EOF'
+          cat << 'GH_AW_PROMPT_78ca018fd0ee9e53_EOF'
           </system>
           {{#runtime-import .github/workflows/qf-s-benchmark.md}}
-          GH_AW_PROMPT_c25676ba2ab40d85_EOF
+          GH_AW_PROMPT_78ca018fd0ee9e53_EOF
           } > "$GH_AW_PROMPT"
       - name: Interpolate variables and render templates
         uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
@@ -371,9 +371,9 @@ jobs:
           mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs"
           mkdir -p /tmp/gh-aw/safeoutputs
           mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
-          cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_ef274886e4650c6c_EOF'
+          cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_7b004ca92a18e849_EOF'
           {"create_discussion":{"category":"agentic workflows","close_older_discussions":true,"expires":168,"fallback_to_issue":true,"max":1,"title_prefix":"[QF_S Benchmark] "},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}}
-          GH_AW_SAFE_OUTPUTS_CONFIG_ef274886e4650c6c_EOF
+          GH_AW_SAFE_OUTPUTS_CONFIG_7b004ca92a18e849_EOF
       - name: Write Safe Outputs Tools
         env:
           GH_AW_TOOLS_META_JSON: |
@@ -561,7 +561,7 @@ jobs:
           export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.17'
           
           mkdir -p /home/runner/.copilot
-          cat << GH_AW_MCP_CONFIG_c990404fbee0ddc5_EOF | bash "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh"
+          cat << GH_AW_MCP_CONFIG_dd4b678d464df877_EOF | bash "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh"
           {
             "mcpServers": {
               "github": {
@@ -602,7 +602,7 @@ jobs:
               "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}"
             }
           }
-          GH_AW_MCP_CONFIG_c990404fbee0ddc5_EOF
+          GH_AW_MCP_CONFIG_dd4b678d464df877_EOF
       - name: Download activation artifact
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
@@ -614,7 +614,7 @@ jobs:
       - name: Execute GitHub Copilot CLI
         id: agentic_execution
         # Copilot CLI tool arguments (sorted):
-        timeout-minutes: 90
+        timeout-minutes: 120
         run: |
           set -o pipefail
           touch /tmp/gh-aw/agent-step-summary.md
@@ -908,7 +908,7 @@ jobs:
           GH_AW_STALE_LOCK_FILE_FAILED: ${{ needs.activation.outputs.stale_lock_file_failed }}
           GH_AW_GROUP_REPORTS: "false"
           GH_AW_FAILURE_REPORT_AS_ISSUE: "true"
-          GH_AW_TIMEOUT_MINUTES: "90"
+          GH_AW_TIMEOUT_MINUTES: "120"
         with:
           github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
           script: |
diff --git a/.github/workflows/qf-s-benchmark.md b/.github/workflows/qf-s-benchmark.md
index fcd3050a6..8b6927071 100644
--- a/.github/workflows/qf-s-benchmark.md
+++ b/.github/workflows/qf-s-benchmark.md
@@ -25,7 +25,7 @@ safe-outputs:
   noop:
     report-as-issue: false
 
-timeout-minutes: 90
+timeout-minutes: 120
 
 steps:
   - name: Checkout c3 branch
@@ -62,19 +62,19 @@ ninja --version
 python3 --version
 ```
 
-## Phase 2: Build Z3 in Debug Mode with Seq Tracing
+## Phase 2: Build Z3 in Release Mode
 
-Build Z3 with debug symbols so that tracing and timing data are meaningful.
+Build Z3 in Release mode for accurate benchmark performance numbers and lower memory usage. Running `ninja` in the background with `&` is not allowed — concurrent C++ compilation and LLM inference can exhaust available RAM and kill the agent process.
 
 ```bash
 mkdir -p /tmp/z3-build
 cd /tmp/z3-build
 cmake "$GITHUB_WORKSPACE" \
   -G Ninja \
-  -DCMAKE_BUILD_TYPE=Debug \
+  -DCMAKE_BUILD_TYPE=Release \
   -DZ3_BUILD_TEST_EXECUTABLES=OFF \
   2>&1 | tee /tmp/z3-cmake.log
-ninja z3 2>&1 | tee /tmp/z3-build.log
+ninja -j2 z3 2>&1 | tee /tmp/z3-build.log
 ```
 
 Verify the binary was built:
@@ -133,12 +133,12 @@ echo "Running benchmarks on $SAMPLE files"
 
 ## Phase 4: Run Benchmarks — seq vs nseq
 
-Run each benchmark with both solvers. Use a per-file timeout of 10 seconds. Set Z3's internal timeout to 9 seconds so it exits cleanly before the shell timeout fires.
+Run each benchmark with both solvers. Use a per-file timeout of 5 seconds. Set Z3's internal timeout to 4 seconds so it exits cleanly before the shell timeout fires.
 
 ```bash
 Z3=/tmp/z3-build/z3
-TIMEOUT_SEC=10
-Z3_TIMEOUT_SEC=9
+TIMEOUT_SEC=5
+Z3_TIMEOUT_SEC=4
 RESULTS=/tmp/benchmark-results.csv
 
 echo "file,seq_result,seq_time_ms,nseq_result,nseq_time_ms" > "$RESULTS"
@@ -208,10 +208,7 @@ done
 
 ## Phase 6: Analyze Results
 
-Compute summary statistics from the CSV:
-
-```bash
-Save the analysis script to a file and run it:
+Compute summary statistics from the CSV. Save the analysis script to a file and run it:
 
 ```bash
 cat > /tmp/analyze_benchmark.py << 'PYEOF'
@@ -397,7 +394,8 @@ These are benchmarks where `nseq` shows a performance advantage.
 ## Important Notes
 
 - **DO NOT** modify any source files or create pull requests.
-- **DO NOT** run benchmarks for longer than 80 minutes total (leave buffer for posting).
+- **DO NOT** run `ninja` or any build command in the background with `&` — concurrent C++ compilation and LLM inference can exhaust available RAM and kill the agent process. Always wait for build commands to complete before proceeding.
+- **DO NOT** run benchmarks for longer than 100 minutes total (leave buffer for posting).
 - **DO** always report the commit SHA so results can be correlated with specific code versions.
 - **DO** close older QF_S Benchmark discussions automatically (configured via `close-older-discussions: true`).
 - **DO** highlight disagreements prominently — these are potential correctness bugs.