diff --git a/.github/workflows/tptp-benchmark.lock.yml b/.github/workflows/tptp-benchmark.lock.yml index 7e682c2fb..b7054fdba 100644 --- a/.github/workflows/tptp-benchmark.lock.yml +++ b/.github/workflows/tptp-benchmark.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"05a6ebcfc669740744c3887e595339f183e37872d3ebbaebe34513c2a8016af8","compiler_version":"v0.72.1","strict":true,"agent_id":"copilot"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"f76e98e3954f6892f78728b22721e87d255ca21f122ab7076306b1ed8b5679b8","compiler_version":"v0.72.1","strict":true,"agent_id":"copilot"} # gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"github/gh-aw-actions/setup","sha":"v0.72.1","version":"v0.72.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.41"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.41"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.6","digest":"sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.6@sha256:2bb8eef86006a4c5963c55616a9c51c32f27bfdecb023b8aa6f91f6718d9171c"},{"image":"ghcr.io/github/github-mcp-server:v1.0.3","digest":"sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959","pinned_image":"ghcr.io/github/github-mcp-server:v1.0.3@sha256:2ac27ef03461ef2b877031b838a7d1fd7f12b12d4ace7796d8cad91446d55959"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]} # ___ _ _ # / _ \ | | (_) @@ -183,20 +183,20 @@ jobs: run: | bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" { - cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + cat << 'GH_AW_PROMPT_0f8b33477e7f4791_EOF' - GH_AW_PROMPT_dbe2d26d7191f793_EOF + GH_AW_PROMPT_0f8b33477e7f4791_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + cat << 'GH_AW_PROMPT_0f8b33477e7f4791_EOF' Tools: create_discussion, missing_tool, missing_data, noop - GH_AW_PROMPT_dbe2d26d7191f793_EOF + GH_AW_PROMPT_0f8b33477e7f4791_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md" - cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + cat << 'GH_AW_PROMPT_0f8b33477e7f4791_EOF' The following GitHub context information is available for this workflow: {{#if __GH_AW_GITHUB_ACTOR__ }} @@ -225,12 +225,12 @@ jobs: {{/if}} - GH_AW_PROMPT_dbe2d26d7191f793_EOF + GH_AW_PROMPT_0f8b33477e7f4791_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_dbe2d26d7191f793_EOF' + cat << 'GH_AW_PROMPT_0f8b33477e7f4791_EOF' {{#runtime-import .github/workflows/tptp-benchmark.md}} - GH_AW_PROMPT_dbe2d26d7191f793_EOF + GH_AW_PROMPT_0f8b33477e7f4791_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 @@ -366,6 +366,12 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false + - name: Install build dependencies + run: | + sudo apt-get update -y -q + sudo apt-get install -y cmake ninja-build python3 wget curl bc + - name: Build Z3 + run: "mkdir -p /tmp/z3-build\ncd /tmp/z3-build\ncmake \"$GITHUB_WORKSPACE\" \\\n -G Ninja \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DZ3_BUILD_TEST_EXECUTABLES=OFF\nninja -j$(nproc) z3\n./z3 --version\n" - name: Configure Git credentials env: @@ -433,9 +439,9 @@ jobs: mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF' + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_6d18c7040963ed61_EOF' {"create_discussion":{"category":"agentic workflows","close_older_discussions":true,"expires":336,"fallback_to_issue":true,"max":1,"title_prefix":"[TPTP Benchmark] "},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"false"},"report_incomplete":{}} - GH_AW_SAFE_OUTPUTS_CONFIG_832ae11399c26134_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_6d18c7040963ed61_EOF - name: Generate Safe Outputs Tools env: GH_AW_TOOLS_META_JSON: | @@ -628,7 +634,7 @@ jobs: mkdir -p /home/runner/.copilot GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) - cat << GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + cat << GH_AW_MCP_CONFIG_9ec215538233c65c_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" { "mcpServers": { "github": { @@ -669,7 +675,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_0b5ba1ac1a376250_EOF + GH_AW_MCP_CONFIG_9ec215538233c65c_EOF - name: Mount MCP servers as CLIs id: mount-mcp-clis continue-on-error: true diff --git a/.github/workflows/tptp-benchmark.md b/.github/workflows/tptp-benchmark.md index b00a5fe82..fe3f7d5b2 100644 --- a/.github/workflows/tptp-benchmark.md +++ b/.github/workflows/tptp-benchmark.md @@ -41,6 +41,22 @@ steps: with: persist-credentials: false + - name: Install build dependencies + run: | + sudo apt-get update -y -q + sudo apt-get install -y cmake ninja-build python3 wget curl bc + + - name: Build Z3 + run: | + mkdir -p /tmp/z3-build + cd /tmp/z3-build + cmake "$GITHUB_WORKSPACE" \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DZ3_BUILD_TEST_EXECUTABLES=OFF + ninja -j$(nproc) z3 + ./z3 --version + --- # TPTP Front-End Benchmark @@ -49,7 +65,7 @@ steps: Your name is ${{ github.workflow }}. You are an expert testing engineer for the Z3 theorem prover. Your task is to: -1. Build Z3 from the current `master` branch +1. Verify the Z3 binary built by the pre-flight step is available 2. Download the TPTP benchmark library from tptp.org 3. Select 500 random small-to-medium problems (with their axiom dependencies) 4. Run each problem through Z3's TPTP front-end with a 5-second timeout @@ -59,40 +75,18 @@ Your name is ${{ github.workflow }}. You are an expert testing engineer for the **Repository**: ${{ github.repository }} **Workspace**: ${{ github.workspace }} -## Phase 1: Build Z3 +## Phase 1: Verify Z3 Binary -Install build tools and build Z3 in Release mode. - -```bash -sudo apt-get update -y -q -sudo apt-get install -y cmake ninja-build python3 wget curl bc 2>/dev/null || true -``` - -Configure and build: - -```bash -mkdir -p /tmp/z3-build -cd /tmp/z3-build -cmake "$GITHUB_WORKSPACE" \ - -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DZ3_BUILD_TEST_EXECUTABLES=OFF \ - 2>&1 | tail -20 - -# Build with limited parallelism to avoid OOM alongside LLM inference. -# Never run ninja in the background with &. -ninja -j2 z3 2>&1 | tail -30 -``` - -Verify the build: +Z3 was built by the workflow pre-flight step and is available at `/tmp/z3-build/z3`. +Confirm the binary is present and functional: ```bash /tmp/z3-build/z3 --version ``` -If the build fails, call the `noop` safe-output with a message describing the error and stop. +If the binary is missing or returns an error, call the `noop` safe-output with a message describing the problem and stop. -Once the binary is confirmed, call `noop` with `"Z3 built successfully. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive. +Once confirmed, call `noop` with `"Z3 binary verified. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive. ## Phase 2: Download the TPTP Problem Library @@ -536,15 +530,13 @@ You **MUST** call either `create_discussion` or `noop` before the workflow ends: - **Full success**: Call `create_discussion` with the complete report. - **Partial results** (some problems ran): Call `create_discussion` with whatever results are available and a note about incomplete execution. -- **Build failure**: Call `noop` with a brief message describing the build error. - **Download failure**: Call `noop` with the download error details. - **No problems selected**: Call `noop` explaining why no problems were found. - -Failing to produce any safe output triggers an automatic workflow-failure issue that clutters the repository. +- **Binary missing**: If `/tmp/z3-build/z3` is unexpectedly absent, call `noop` with that detail and stop. ## Important Notes -- **Never run `ninja` in the background with `&`**: Concurrent C++ compilation and LLM inference exhausts available RAM and kills the agent process (exit 137). Always wait for build commands to finish before continuing. +- **Build failure handling**: Z3 was built before the agent loaded. If the binary is missing or non-functional, call `noop` with the error and stop. - **TPTP environment variable**: Set `TPTP=/tmp/tptp` when invoking `z3 -tptp` so that `include()` directives in problem files resolve correctly against the downloaded Axioms directory. - **Timeout detection**: Use `timeout 8` as the outer OS-level guard (3 seconds beyond Z3's `-T:5`) to allow Z3 to exit cleanly before the shell kills it. If the exit code from `timeout` is 124, record the verdict as `Timeout`. - **Crash detection**: A crash is a non-zero exit code with no `% SZS status` line in the output and no timeout. Record it separately from `GaveUp`.