mirror of
https://github.com/Z3Prover/z3
synced 2026-06-19 15:16:29 +00:00
Merge branch 'master' into c3
This commit is contained in:
commit
043c6c0ad1
259 changed files with 18907 additions and 3725 deletions
12
.github/dependabot.yml
vendored
12
.github/dependabot.yml
vendored
|
|
@ -1,6 +1,8 @@
|
|||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- directory: /
|
||||
ignore:
|
||||
- dependency-name: "github/gh-aw-actions/**" # Managed by gh aw compile. Version-locked to the gh-aw compiler; do not bump.
|
||||
package-ecosystem: github-actions
|
||||
schedule:
|
||||
interval: weekly
|
||||
version: 2
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ safe-outputs:
|
|||
title-prefix: "[Research Trends] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
expires: 60
|
||||
expires: 60d
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
|
|
@ -295,4 +295,4 @@ Store for next run:
|
|||
- DO NOT reproduce copyrighted paper text beyond short fair-use quotes.
|
||||
- DO close older Research Trends discussions automatically (configured).
|
||||
- DO always cite sources (arXiv ID, DOI, GitHub URL) so maintainers can verify.
|
||||
- DO use cache memory to track longitudinal trends across months.
|
||||
- DO use cache memory to track longitudinal trends across months.
|
||||
2
.github/workflows/android-build.yml
vendored
2
.github/workflows/android-build.yml
vendored
|
|
@ -33,7 +33,7 @@ jobs:
|
|||
tar -cvf z3-build-${{ matrix.android-abi }}.tar *.jar *.so
|
||||
|
||||
- name: Archive production artifacts
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: android-build-${{ matrix.android-abi }}
|
||||
path: build/z3-build-${{ matrix.android-abi }}.tar
|
||||
|
|
|
|||
2
.github/workflows/api-coherence-checker.md
vendored
2
.github/workflows/api-coherence-checker.md
vendored
|
|
@ -13,12 +13,10 @@ network: defaults
|
|||
|
||||
tools:
|
||||
cache-memory: true
|
||||
serena: ["java", "python", "typescript", "csharp"]
|
||||
github:
|
||||
toolsets: [default]
|
||||
bash: [":*"]
|
||||
edit: {}
|
||||
glob: {}
|
||||
web-search: {}
|
||||
|
||||
safe-outputs:
|
||||
|
|
|
|||
2
.github/workflows/build-warning-fixer.md
vendored
2
.github/workflows/build-warning-fixer.md
vendored
|
|
@ -5,8 +5,6 @@ on:
|
|||
workflow_dispatch:
|
||||
permissions: read-all
|
||||
tools:
|
||||
view: {}
|
||||
glob: {}
|
||||
edit:
|
||||
bash: true
|
||||
safe-outputs:
|
||||
|
|
|
|||
2
.github/workflows/build-z3-cache.yml
vendored
2
.github/workflows/build-z3-cache.yml
vendored
|
|
@ -45,7 +45,7 @@ jobs:
|
|||
|
||||
- name: Restore or create cache
|
||||
id: cache-z3
|
||||
uses: actions/cache@v5.0.4
|
||||
uses: actions/cache@v5.0.5
|
||||
with:
|
||||
path: |
|
||||
build/z3
|
||||
|
|
|
|||
22
.github/workflows/ci.yml
vendored
22
.github/workflows/ci.yml
vendored
|
|
@ -83,8 +83,16 @@ jobs:
|
|||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Use the first available manylinux interpreter for deterministic selection.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python virtual environment
|
||||
run: "/opt/python/cp38-cp38/bin/python -m venv $PWD/env"
|
||||
run: "$PYTHON -m venv $PWD/env"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
|
|
@ -123,8 +131,16 @@ jobs:
|
|||
mkdir -p /tmp/arm-toolchain/
|
||||
tar xf /tmp/arm-toolchain.tar.xz -C /tmp/arm-toolchain/ --strip-components=1
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Use the first available manylinux interpreter for deterministic selection.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python virtual environment
|
||||
run: "/opt/python/cp38-cp38/bin/python -m venv $PWD/env"
|
||||
run: "$PYTHON -m venv $PWD/env"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
|
|
@ -315,7 +331,7 @@ jobs:
|
|||
|
||||
- name: Setup Julia (if needed)
|
||||
if: matrix.name == 'debugClang'
|
||||
uses: julia-actions/setup-julia@v2
|
||||
uses: julia-actions/setup-julia@v3
|
||||
with:
|
||||
version: '1'
|
||||
|
||||
|
|
|
|||
|
|
@ -8,8 +8,6 @@ tools:
|
|||
cache-memory: true
|
||||
github:
|
||||
toolsets: [default]
|
||||
view: {}
|
||||
glob: {}
|
||||
edit: {}
|
||||
bash:
|
||||
- "clang-format --version"
|
||||
|
|
|
|||
1393
.github/workflows/code-simplifier.lock.yml
generated
vendored
1393
.github/workflows/code-simplifier.lock.yml
generated
vendored
File diff suppressed because it is too large
Load diff
819
.github/workflows/code-simplifier.md
vendored
819
.github/workflows/code-simplifier.md
vendored
|
|
@ -1,434 +1,3 @@
|
|||
<<<<<<< current (local changes)
|
||||
---
|
||||
on:
|
||||
schedule: daily
|
||||
skip-if-match: is:pr is:open in:title "[code-simplifier]"
|
||||
permissions:
|
||||
contents: read
|
||||
issues: read
|
||||
pull-requests: read
|
||||
safe-outputs:
|
||||
create-issue:
|
||||
labels:
|
||||
- refactoring
|
||||
- code-quality
|
||||
- automation
|
||||
title-prefix: "[code-simplifier] "
|
||||
noop:
|
||||
report-as-issue: false
|
||||
description: Analyzes recently modified code and creates pull requests with simplifications that improve clarity, consistency, and maintainability while preserving functionality
|
||||
name: Code Simplifier
|
||||
source: github/gh-aw/.github/workflows/code-simplifier.md@76d37d925abd44fee97379206f105b74b91a285b
|
||||
strict: true
|
||||
timeout-minutes: 30
|
||||
tools:
|
||||
github:
|
||||
toolsets:
|
||||
- default
|
||||
tracker-id: code-simplifier
|
||||
---
|
||||
<!-- This prompt will be imported in the agentic workflow .github/workflows/code-simplifier.md at runtime. -->
|
||||
<!-- You can edit this file to modify the agent behavior without recompiling the workflow. -->
|
||||
|
||||
# Code Simplifier Agent
|
||||
|
||||
You are an expert code simplification specialist focused on enhancing code clarity, consistency, and maintainability while preserving exact functionality. Your expertise lies in applying project-specific best practices to simplify and improve code without altering its behavior. You prioritize readable, explicit code over overly compact solutions. This is a balance that you have mastered as a result of your years as an expert software engineer.
|
||||
|
||||
## Your Mission
|
||||
|
||||
Analyze recently modified code from the last 24 hours and apply refinements that improve code quality while preserving all functionality. Create a GitHub issue with a properly formatted diff if improvements are found.
|
||||
|
||||
## Current Context
|
||||
|
||||
- **Repository**: ${{ github.repository }}
|
||||
- **Workspace**: ${{ github.workspace }}
|
||||
|
||||
## Phase 1: Identify Recently Modified Code
|
||||
|
||||
### 1.1 Find Recent Changes
|
||||
|
||||
Search for merged pull requests and commits from the last 24 hours:
|
||||
|
||||
```bash
|
||||
# Get yesterday's date in ISO format
|
||||
YESTERDAY=$(date -d '1 day ago' '+%Y-%m-%d' 2>/dev/null || date -v-1d '+%Y-%m-%d')
|
||||
|
||||
# List recent commits
|
||||
git log --since="24 hours ago" --pretty=format:"%H %s" --no-merges
|
||||
```
|
||||
|
||||
Use GitHub tools to:
|
||||
- Search for pull requests merged in the last 24 hours: `repo:${{ github.repository }} is:pr is:merged merged:>=${YESTERDAY}`
|
||||
- Get details of merged PRs to understand what files were changed
|
||||
- List commits from the last 24 hours to identify modified files
|
||||
|
||||
### 1.2 Extract Changed Files
|
||||
|
||||
For each merged PR or recent commit:
|
||||
- Use `pull_request_read` with `method: get_files` to list changed files
|
||||
- Use `get_commit` to see file changes in recent commits
|
||||
- Focus on source code files (`.go`, `.js`, `.ts`, `.tsx`, `.cjs`, `.py`, etc.)
|
||||
- Exclude test files, lock files, and generated files
|
||||
|
||||
### 1.3 Determine Scope
|
||||
|
||||
If **no files were changed in the last 24 hours**, exit gracefully without creating an issue:
|
||||
|
||||
```
|
||||
✅ No code changes detected in the last 24 hours.
|
||||
Code simplifier has nothing to process today.
|
||||
```
|
||||
|
||||
If **files were changed**, proceed to Phase 2.
|
||||
|
||||
## Phase 2: Analyze and Simplify Code
|
||||
|
||||
### 2.1 Review Project Standards
|
||||
|
||||
Before simplifying, review the project's coding standards from relevant documentation:
|
||||
|
||||
- For Go projects: Check `AGENTS.md`, `DEVGUIDE.md`, or similar files
|
||||
- For JavaScript/TypeScript: Look for `CLAUDE.md`, style guides, or coding conventions
|
||||
- For Python: Check for style guides, PEP 8 adherence, or project-specific conventions
|
||||
|
||||
**Key Standards to Apply:**
|
||||
|
||||
For **JavaScript/TypeScript** projects:
|
||||
- Use ES modules with proper import sorting and extensions
|
||||
- Prefer `function` keyword over arrow functions for top-level functions
|
||||
- Use explicit return type annotations for top-level functions
|
||||
- Follow proper React component patterns with explicit Props types
|
||||
- Use proper error handling patterns (avoid try/catch when possible)
|
||||
- Maintain consistent naming conventions
|
||||
|
||||
For **Go** projects:
|
||||
- Use `any` instead of `interface{}`
|
||||
- Follow console formatting for CLI output
|
||||
- Use semantic type aliases for domain concepts
|
||||
- Prefer small, focused files (200-500 lines ideal)
|
||||
- Use table-driven tests with descriptive names
|
||||
|
||||
For **Python** projects:
|
||||
- Follow PEP 8 style guide
|
||||
- Use type hints for function signatures
|
||||
- Prefer explicit over implicit code
|
||||
- Use list/dict comprehensions where they improve clarity (not complexity)
|
||||
|
||||
### 2.2 Simplification Principles
|
||||
|
||||
Apply these refinements to the recently modified code:
|
||||
|
||||
#### 1. Preserve Functionality
|
||||
- **NEVER** change what the code does - only how it does it
|
||||
- All original features, outputs, and behaviors must remain intact
|
||||
- Run tests before and after to ensure no behavioral changes
|
||||
|
||||
#### 2. Enhance Clarity
|
||||
- Reduce unnecessary complexity and nesting
|
||||
- Eliminate redundant code and abstractions
|
||||
- Improve readability through clear variable and function names
|
||||
- Consolidate related logic
|
||||
- Remove unnecessary comments that describe obvious code
|
||||
- **IMPORTANT**: Avoid nested ternary operators - prefer switch statements or if/else chains
|
||||
- Choose clarity over brevity - explicit code is often better than compact code
|
||||
|
||||
#### 3. Apply Project Standards
|
||||
- Use project-specific conventions and patterns
|
||||
- Follow established naming conventions
|
||||
- Apply consistent formatting
|
||||
- Use appropriate language features (modern syntax where beneficial)
|
||||
|
||||
#### 4. Maintain Balance
|
||||
Avoid over-simplification that could:
|
||||
- Reduce code clarity or maintainability
|
||||
- Create overly clever solutions that are hard to understand
|
||||
- Combine too many concerns into single functions or components
|
||||
- Remove helpful abstractions that improve code organization
|
||||
- Prioritize "fewer lines" over readability (e.g., nested ternaries, dense one-liners)
|
||||
- Make the code harder to debug or extend
|
||||
|
||||
### 2.3 Perform Code Analysis
|
||||
|
||||
For each changed file:
|
||||
|
||||
1. **Read the file contents** using the edit or view tool
|
||||
2. **Identify refactoring opportunities**:
|
||||
- Long functions that could be split
|
||||
- Duplicate code patterns
|
||||
- Complex conditionals that could be simplified
|
||||
- Unclear variable names
|
||||
- Missing or excessive comments
|
||||
- Non-standard patterns
|
||||
3. **Design the simplification**:
|
||||
- What specific changes will improve clarity?
|
||||
- How can complexity be reduced?
|
||||
- What patterns should be applied?
|
||||
- Will this maintain all functionality?
|
||||
|
||||
### 2.4 Apply Simplifications
|
||||
|
||||
Use the **edit** tool to modify files:
|
||||
|
||||
```bash
|
||||
# For each file with improvements:
|
||||
# 1. Read the current content
|
||||
# 2. Apply targeted edits to simplify code
|
||||
# 3. Ensure all functionality is preserved
|
||||
```
|
||||
|
||||
**Guidelines for edits:**
|
||||
- Make surgical, targeted changes
|
||||
- One logical improvement per edit (but batch multiple edits in a single response)
|
||||
- Preserve all original behavior
|
||||
- Keep changes focused on recently modified code
|
||||
- Don't refactor unrelated code unless it improves understanding of the changes
|
||||
|
||||
## Phase 3: Validate Changes
|
||||
|
||||
### 3.1 Run Tests
|
||||
|
||||
After making simplifications, run the project's test suite to ensure no functionality was broken:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make test-unit
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm test
|
||||
|
||||
# For Python projects
|
||||
pytest
|
||||
```
|
||||
|
||||
If tests fail:
|
||||
- Review the failures carefully
|
||||
- Revert changes that broke functionality
|
||||
- Adjust simplifications to preserve behavior
|
||||
- Re-run tests until they pass
|
||||
|
||||
### 3.2 Run Linters
|
||||
|
||||
Ensure code style is consistent:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make lint
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm run lint
|
||||
|
||||
# For Python projects
|
||||
flake8 . || pylint .
|
||||
```
|
||||
|
||||
Fix any linting issues introduced by the simplifications.
|
||||
|
||||
### 3.3 Check Build
|
||||
|
||||
Verify the project still builds successfully:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make build
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm run build
|
||||
|
||||
# For Python projects
|
||||
# (typically no build step, but check imports)
|
||||
python -m py_compile changed_files.py
|
||||
```
|
||||
|
||||
## Phase 4: Create GitHub Issue with Diff
|
||||
|
||||
### 4.1 Determine If Issue Is Needed
|
||||
|
||||
Only create an issue if:
|
||||
- ✅ You made actual code simplifications
|
||||
- ✅ All tests pass
|
||||
- ✅ Linting is clean
|
||||
- ✅ Build succeeds
|
||||
- ✅ Changes improve code quality without breaking functionality
|
||||
|
||||
If no improvements were made or changes broke tests, exit gracefully:
|
||||
|
||||
```
|
||||
✅ Code analyzed from last 24 hours.
|
||||
No simplifications needed - code already meets quality standards.
|
||||
```
|
||||
|
||||
### 4.2 Generate Git Diff
|
||||
|
||||
Before creating the issue, generate a properly formatted git diff that can be used to create a pull request:
|
||||
|
||||
```bash
|
||||
# Stage all changes if not already staged
|
||||
git add .
|
||||
|
||||
# Generate a complete unified diff of all staged changes
|
||||
git diff --cached > /tmp/code-simplification.diff
|
||||
|
||||
# Read the diff to include in the issue
|
||||
cat /tmp/code-simplification.diff
|
||||
```
|
||||
|
||||
**Important**: The diff must be in standard unified diff format (git unified diff) that includes:
|
||||
- File headers with `diff --git a/path b/path`
|
||||
- Index lines with git hashes
|
||||
- `---` and `+++` lines showing old and new file paths
|
||||
- `@@` lines showing line numbers
|
||||
- Actual code changes with `-` for removed lines and `+` for added lines
|
||||
|
||||
This format is compatible with:
|
||||
- `git apply` command for direct application
|
||||
- GitHub's "Create PR from diff" functionality
|
||||
- GitHub Copilot for suggesting PR creation
|
||||
- Manual copy-paste into PR creation interface
|
||||
|
||||
### 4.3 Generate Issue Description
|
||||
|
||||
If creating an issue, use this structure:
|
||||
|
||||
```markdown
|
||||
## Code Simplification - [Date]
|
||||
|
||||
This issue presents code simplifications that improve clarity, consistency, and maintainability while preserving all functionality.
|
||||
|
||||
### Files Simplified
|
||||
|
||||
- `path/to/file1.go` - [Brief description of improvements]
|
||||
- `path/to/file2.js` - [Brief description of improvements]
|
||||
|
||||
### Improvements Made
|
||||
|
||||
1. **Reduced Complexity**
|
||||
- Simplified nested conditionals in `file1.go`
|
||||
- Extracted helper function for repeated logic
|
||||
|
||||
2. **Enhanced Clarity**
|
||||
- Renamed variables for better readability
|
||||
- Removed redundant comments
|
||||
- Applied consistent naming conventions
|
||||
|
||||
3. **Applied Project Standards**
|
||||
- Used `function` keyword instead of arrow functions
|
||||
- Added explicit type annotations
|
||||
- Followed established patterns
|
||||
|
||||
### Changes Based On
|
||||
|
||||
Recent changes from:
|
||||
- #[PR_NUMBER] - [PR title]
|
||||
- Commit [SHORT_SHA] - [Commit message]
|
||||
|
||||
### Testing
|
||||
|
||||
- ✅ All tests pass
|
||||
- ✅ Linting passes
|
||||
- ✅ Build succeeds
|
||||
- ✅ No functional changes - behavior is identical
|
||||
|
||||
### Git Diff
|
||||
|
||||
Below is the complete diff that can be used to create a pull request. You can copy this diff and:
|
||||
- Use it with GitHub Copilot to create a PR
|
||||
- Apply it directly with `git apply`
|
||||
- Create a PR manually by copying the changes
|
||||
|
||||
```diff
|
||||
[PASTE THE COMPLETE GIT DIFF HERE]
|
||||
```
|
||||
|
||||
To apply this diff:
|
||||
|
||||
```bash
|
||||
# Save the diff to a file
|
||||
cat > /tmp/code-simplification.diff << 'EOF'
|
||||
[PASTE DIFF CONTENT]
|
||||
EOF
|
||||
|
||||
# Apply the diff
|
||||
git apply /tmp/code-simplification.diff
|
||||
|
||||
# Or create a PR from the current branch
|
||||
gh pr create --title "[code-simplifier] Code Simplification" --body "See discussion #[NUMBER]"
|
||||
```
|
||||
|
||||
### Review Focus
|
||||
|
||||
Please verify:
|
||||
- Functionality is preserved
|
||||
- Simplifications improve code quality
|
||||
- Changes align with project conventions
|
||||
- No unintended side effects
|
||||
|
||||
---
|
||||
|
||||
*Automated by Code Simplifier Agent - analyzing code from the last 24 hours*
|
||||
```
|
||||
|
||||
### 4.4 Use Safe Outputs
|
||||
|
||||
Create the issue using the safe-outputs configuration:
|
||||
|
||||
- Title will be prefixed with `[code-simplifier]`
|
||||
- Labeled with `refactoring`, `code-quality`, `automation`
|
||||
- Contains complete git diff for easy PR creation
|
||||
|
||||
## Important Guidelines
|
||||
|
||||
### Scope Control
|
||||
- **Focus on recent changes**: Only refine code modified in the last 24 hours
|
||||
- **Don't over-refactor**: Avoid touching unrelated code
|
||||
- **Preserve interfaces**: Don't change public APIs or exported functions
|
||||
- **Incremental improvements**: Make targeted, surgical changes
|
||||
|
||||
### Quality Standards
|
||||
- **Test first**: Always run tests after simplifications
|
||||
- **Preserve behavior**: Functionality must remain identical
|
||||
- **Follow conventions**: Apply project-specific patterns consistently
|
||||
- **Clear over clever**: Prioritize readability and maintainability
|
||||
|
||||
### Exit Conditions
|
||||
Exit gracefully without creating an issue if:
|
||||
- No code was changed in the last 24 hours
|
||||
- No simplifications are beneficial
|
||||
- Tests fail after changes
|
||||
- Build fails after changes
|
||||
- Changes are too risky or complex
|
||||
|
||||
### Success Metrics
|
||||
A successful simplification:
|
||||
- ✅ Improves code clarity without changing behavior
|
||||
- ✅ Passes all tests and linting
|
||||
- ✅ Applies project-specific conventions
|
||||
- ✅ Makes code easier to understand and maintain
|
||||
- ✅ Focuses on recently modified code
|
||||
- ✅ Provides clear documentation of changes
|
||||
|
||||
## Output Requirements
|
||||
|
||||
Your output MUST either:
|
||||
|
||||
1. **If no changes in last 24 hours**:
|
||||
```
|
||||
✅ No code changes detected in the last 24 hours.
|
||||
Code simplifier has nothing to process today.
|
||||
```
|
||||
|
||||
2. **If no simplifications beneficial**:
|
||||
```
|
||||
✅ Code analyzed from last 24 hours.
|
||||
No simplifications needed - code already meets quality standards.
|
||||
```
|
||||
|
||||
3. **If simplifications made**: Create an issue with the changes using safe-outputs, including:
|
||||
- Clear description of improvements
|
||||
- Complete git diff in proper format
|
||||
- Instructions for applying the diff or creating a PR
|
||||
|
||||
Begin your code simplification analysis now. Find recently modified code, assess simplification opportunities, apply improvements while preserving functionality, validate changes, and create an issue with a git diff if beneficial.
|
||||
||||||| base (original)
|
||||
---
|
||||
name: Code Simplifier
|
||||
description: Analyzes recently modified code and creates pull requests with simplifications that improve clarity, consistency, and maintainability while preserving functionality
|
||||
|
|
@ -443,389 +12,6 @@ permissions:
|
|||
|
||||
tracker-id: code-simplifier
|
||||
|
||||
imports:
|
||||
- shared/reporting.md
|
||||
|
||||
safe-outputs:
|
||||
create-pull-request:
|
||||
title-prefix: "[code-simplifier] "
|
||||
labels: [refactoring, code-quality, automation]
|
||||
reviewers: [copilot]
|
||||
expires: 7d
|
||||
|
||||
tools:
|
||||
github:
|
||||
toolsets: [default]
|
||||
|
||||
timeout-minutes: 30
|
||||
strict: true
|
||||
source: github/gh-aw/.github/workflows/code-simplifier.md@76d37d925abd44fee97379206f105b74b91a285b
|
||||
---
|
||||
|
||||
<!-- This prompt will be imported in the agentic workflow .github/workflows/code-simplifier.md at runtime. -->
|
||||
<!-- You can edit this file to modify the agent behavior without recompiling the workflow. -->
|
||||
|
||||
# Code Simplifier Agent
|
||||
|
||||
You are an expert code simplification specialist focused on enhancing code clarity, consistency, and maintainability while preserving exact functionality. Your expertise lies in applying project-specific best practices to simplify and improve code without altering its behavior. You prioritize readable, explicit code over overly compact solutions. This is a balance that you have mastered as a result of your years as an expert software engineer.
|
||||
|
||||
## Your Mission
|
||||
|
||||
Analyze recently modified code from the last 24 hours and apply refinements that improve code quality while preserving all functionality. Create a pull request with the simplified code if improvements are found.
|
||||
|
||||
## Current Context
|
||||
|
||||
- **Repository**: ${{ github.repository }}
|
||||
- **Analysis Date**: $(date +%Y-%m-%d)
|
||||
- **Workspace**: ${{ github.workspace }}
|
||||
|
||||
## Phase 1: Identify Recently Modified Code
|
||||
|
||||
### 1.1 Find Recent Changes
|
||||
|
||||
Search for merged pull requests and commits from the last 24 hours:
|
||||
|
||||
```bash
|
||||
# Get yesterday's date in ISO format
|
||||
YESTERDAY=$(date -d '1 day ago' '+%Y-%m-%d' 2>/dev/null || date -v-1d '+%Y-%m-%d')
|
||||
|
||||
# List recent commits
|
||||
git log --since="24 hours ago" --pretty=format:"%H %s" --no-merges
|
||||
```
|
||||
|
||||
Use GitHub tools to:
|
||||
- Search for pull requests merged in the last 24 hours: `repo:${{ github.repository }} is:pr is:merged merged:>=${YESTERDAY}`
|
||||
- Get details of merged PRs to understand what files were changed
|
||||
- List commits from the last 24 hours to identify modified files
|
||||
|
||||
### 1.2 Extract Changed Files
|
||||
|
||||
For each merged PR or recent commit:
|
||||
- Use `pull_request_read` with `method: get_files` to list changed files
|
||||
- Use `get_commit` to see file changes in recent commits
|
||||
- Focus on source code files (`.go`, `.js`, `.ts`, `.tsx`, `.cjs`, `.py`, etc.)
|
||||
- Exclude test files, lock files, and generated files
|
||||
|
||||
### 1.3 Determine Scope
|
||||
|
||||
If **no files were changed in the last 24 hours**, exit gracefully without creating a PR:
|
||||
|
||||
```
|
||||
✅ No code changes detected in the last 24 hours.
|
||||
Code simplifier has nothing to process today.
|
||||
```
|
||||
|
||||
If **files were changed**, proceed to Phase 2.
|
||||
|
||||
## Phase 2: Analyze and Simplify Code
|
||||
|
||||
### 2.1 Review Project Standards
|
||||
|
||||
Before simplifying, review the project's coding standards from relevant documentation:
|
||||
|
||||
- For Go projects: Check `AGENTS.md`, `DEVGUIDE.md`, or similar files
|
||||
- For JavaScript/TypeScript: Look for `CLAUDE.md`, style guides, or coding conventions
|
||||
- For Python: Check for style guides, PEP 8 adherence, or project-specific conventions
|
||||
|
||||
**Key Standards to Apply:**
|
||||
|
||||
For **JavaScript/TypeScript** projects:
|
||||
- Use ES modules with proper import sorting and extensions
|
||||
- Prefer `function` keyword over arrow functions for top-level functions
|
||||
- Use explicit return type annotations for top-level functions
|
||||
- Follow proper React component patterns with explicit Props types
|
||||
- Use proper error handling patterns (avoid try/catch when possible)
|
||||
- Maintain consistent naming conventions
|
||||
|
||||
For **Go** projects:
|
||||
- Use `any` instead of `interface{}`
|
||||
- Follow console formatting for CLI output
|
||||
- Use semantic type aliases for domain concepts
|
||||
- Prefer small, focused files (200-500 lines ideal)
|
||||
- Use table-driven tests with descriptive names
|
||||
|
||||
For **Python** projects:
|
||||
- Follow PEP 8 style guide
|
||||
- Use type hints for function signatures
|
||||
- Prefer explicit over implicit code
|
||||
- Use list/dict comprehensions where they improve clarity (not complexity)
|
||||
|
||||
### 2.2 Simplification Principles
|
||||
|
||||
Apply these refinements to the recently modified code:
|
||||
|
||||
#### 1. Preserve Functionality
|
||||
- **NEVER** change what the code does - only how it does it
|
||||
- All original features, outputs, and behaviors must remain intact
|
||||
- Run tests before and after to ensure no behavioral changes
|
||||
|
||||
#### 2. Enhance Clarity
|
||||
- Reduce unnecessary complexity and nesting
|
||||
- Eliminate redundant code and abstractions
|
||||
- Improve readability through clear variable and function names
|
||||
- Consolidate related logic
|
||||
- Remove unnecessary comments that describe obvious code
|
||||
- **IMPORTANT**: Avoid nested ternary operators - prefer switch statements or if/else chains
|
||||
- Choose clarity over brevity - explicit code is often better than compact code
|
||||
|
||||
#### 3. Apply Project Standards
|
||||
- Use project-specific conventions and patterns
|
||||
- Follow established naming conventions
|
||||
- Apply consistent formatting
|
||||
- Use appropriate language features (modern syntax where beneficial)
|
||||
|
||||
#### 4. Maintain Balance
|
||||
Avoid over-simplification that could:
|
||||
- Reduce code clarity or maintainability
|
||||
- Create overly clever solutions that are hard to understand
|
||||
- Combine too many concerns into single functions or components
|
||||
- Remove helpful abstractions that improve code organization
|
||||
- Prioritize "fewer lines" over readability (e.g., nested ternaries, dense one-liners)
|
||||
- Make the code harder to debug or extend
|
||||
|
||||
### 2.3 Perform Code Analysis
|
||||
|
||||
For each changed file:
|
||||
|
||||
1. **Read the file contents** using the edit or view tool
|
||||
2. **Identify refactoring opportunities**:
|
||||
- Long functions that could be split
|
||||
- Duplicate code patterns
|
||||
- Complex conditionals that could be simplified
|
||||
- Unclear variable names
|
||||
- Missing or excessive comments
|
||||
- Non-standard patterns
|
||||
3. **Design the simplification**:
|
||||
- What specific changes will improve clarity?
|
||||
- How can complexity be reduced?
|
||||
- What patterns should be applied?
|
||||
- Will this maintain all functionality?
|
||||
|
||||
### 2.4 Apply Simplifications
|
||||
|
||||
Use the **edit** tool to modify files:
|
||||
|
||||
```bash
|
||||
# For each file with improvements:
|
||||
# 1. Read the current content
|
||||
# 2. Apply targeted edits to simplify code
|
||||
# 3. Ensure all functionality is preserved
|
||||
```
|
||||
|
||||
**Guidelines for edits:**
|
||||
- Make surgical, targeted changes
|
||||
- One logical improvement per edit (but batch multiple edits in a single response)
|
||||
- Preserve all original behavior
|
||||
- Keep changes focused on recently modified code
|
||||
- Don't refactor unrelated code unless it improves understanding of the changes
|
||||
|
||||
## Phase 3: Validate Changes
|
||||
|
||||
### 3.1 Run Tests
|
||||
|
||||
After making simplifications, run the project's test suite to ensure no functionality was broken:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make test-unit
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm test
|
||||
|
||||
# For Python projects
|
||||
pytest
|
||||
```
|
||||
|
||||
If tests fail:
|
||||
- Review the failures carefully
|
||||
- Revert changes that broke functionality
|
||||
- Adjust simplifications to preserve behavior
|
||||
- Re-run tests until they pass
|
||||
|
||||
### 3.2 Run Linters
|
||||
|
||||
Ensure code style is consistent:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make lint
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm run lint
|
||||
|
||||
# For Python projects
|
||||
flake8 . || pylint .
|
||||
```
|
||||
|
||||
Fix any linting issues introduced by the simplifications.
|
||||
|
||||
### 3.3 Check Build
|
||||
|
||||
Verify the project still builds successfully:
|
||||
|
||||
```bash
|
||||
# For Go projects
|
||||
make build
|
||||
|
||||
# For JavaScript/TypeScript projects
|
||||
npm run build
|
||||
|
||||
# For Python projects
|
||||
# (typically no build step, but check imports)
|
||||
python -m py_compile changed_files.py
|
||||
```
|
||||
|
||||
## Phase 4: Create Pull Request
|
||||
|
||||
### 4.1 Determine If PR Is Needed
|
||||
|
||||
Only create a PR if:
|
||||
- ✅ You made actual code simplifications
|
||||
- ✅ All tests pass
|
||||
- ✅ Linting is clean
|
||||
- ✅ Build succeeds
|
||||
- ✅ Changes improve code quality without breaking functionality
|
||||
|
||||
If no improvements were made or changes broke tests, exit gracefully:
|
||||
|
||||
```
|
||||
✅ Code analyzed from last 24 hours.
|
||||
No simplifications needed - code already meets quality standards.
|
||||
```
|
||||
|
||||
### 4.2 Generate PR Description
|
||||
|
||||
If creating a PR, use this structure:
|
||||
|
||||
```markdown
|
||||
## Code Simplification - [Date]
|
||||
|
||||
This PR simplifies recently modified code to improve clarity, consistency, and maintainability while preserving all functionality.
|
||||
|
||||
### Files Simplified
|
||||
|
||||
- `path/to/file1.go` - [Brief description of improvements]
|
||||
- `path/to/file2.js` - [Brief description of improvements]
|
||||
|
||||
### Improvements Made
|
||||
|
||||
1. **Reduced Complexity**
|
||||
- Simplified nested conditionals in `file1.go`
|
||||
- Extracted helper function for repeated logic
|
||||
|
||||
2. **Enhanced Clarity**
|
||||
- Renamed variables for better readability
|
||||
- Removed redundant comments
|
||||
- Applied consistent naming conventions
|
||||
|
||||
3. **Applied Project Standards**
|
||||
- Used `function` keyword instead of arrow functions
|
||||
- Added explicit type annotations
|
||||
- Followed established patterns
|
||||
|
||||
### Changes Based On
|
||||
|
||||
Recent changes from:
|
||||
- #[PR_NUMBER] - [PR title]
|
||||
- Commit [SHORT_SHA] - [Commit message]
|
||||
|
||||
### Testing
|
||||
|
||||
- ✅ All tests pass (`make test-unit`)
|
||||
- ✅ Linting passes (`make lint`)
|
||||
- ✅ Build succeeds (`make build`)
|
||||
- ✅ No functional changes - behavior is identical
|
||||
|
||||
### Review Focus
|
||||
|
||||
Please verify:
|
||||
- Functionality is preserved
|
||||
- Simplifications improve code quality
|
||||
- Changes align with project conventions
|
||||
- No unintended side effects
|
||||
|
||||
---
|
||||
|
||||
*Automated by Code Simplifier Agent - analyzing code from the last 24 hours*
|
||||
```
|
||||
|
||||
### 4.3 Use Safe Outputs
|
||||
|
||||
Create the pull request using the safe-outputs configuration:
|
||||
|
||||
- Title will be prefixed with `[code-simplifier]`
|
||||
- Labeled with `refactoring`, `code-quality`, `automation`
|
||||
- Assigned to `copilot` for review
|
||||
- Set as ready for review (not draft)
|
||||
|
||||
## Important Guidelines
|
||||
|
||||
### Scope Control
|
||||
- **Focus on recent changes**: Only refine code modified in the last 24 hours
|
||||
- **Don't over-refactor**: Avoid touching unrelated code
|
||||
- **Preserve interfaces**: Don't change public APIs or exported functions
|
||||
- **Incremental improvements**: Make targeted, surgical changes
|
||||
|
||||
### Quality Standards
|
||||
- **Test first**: Always run tests after simplifications
|
||||
- **Preserve behavior**: Functionality must remain identical
|
||||
- **Follow conventions**: Apply project-specific patterns consistently
|
||||
- **Clear over clever**: Prioritize readability and maintainability
|
||||
|
||||
### Exit Conditions
|
||||
Exit gracefully without creating a PR if:
|
||||
- No code was changed in the last 24 hours
|
||||
- No simplifications are beneficial
|
||||
- Tests fail after changes
|
||||
- Build fails after changes
|
||||
- Changes are too risky or complex
|
||||
|
||||
### Success Metrics
|
||||
A successful simplification:
|
||||
- ✅ Improves code clarity without changing behavior
|
||||
- ✅ Passes all tests and linting
|
||||
- ✅ Applies project-specific conventions
|
||||
- ✅ Makes code easier to understand and maintain
|
||||
- ✅ Focuses on recently modified code
|
||||
- ✅ Provides clear documentation of changes
|
||||
|
||||
## Output Requirements
|
||||
|
||||
Your output MUST either:
|
||||
|
||||
1. **If no changes in last 24 hours**:
|
||||
```
|
||||
✅ No code changes detected in the last 24 hours.
|
||||
Code simplifier has nothing to process today.
|
||||
```
|
||||
|
||||
2. **If no simplifications beneficial**:
|
||||
```
|
||||
✅ Code analyzed from last 24 hours.
|
||||
No simplifications needed - code already meets quality standards.
|
||||
```
|
||||
|
||||
3. **If simplifications made**: Create a PR with the changes using safe-outputs
|
||||
|
||||
Begin your code simplification analysis now. Find recently modified code, assess simplification opportunities, apply improvements while preserving functionality, validate changes, and create a PR if beneficial.
|
||||
=======
|
||||
---
|
||||
name: Code Simplifier
|
||||
description: Analyzes recently modified code and creates pull requests with simplifications that improve clarity, consistency, and maintainability while preserving functionality
|
||||
on:
|
||||
schedule: daily
|
||||
skip-if-match: 'is:pr is:open in:title "[code-simplifier]"'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
issues: read
|
||||
pull-requests: read
|
||||
|
||||
tracker-id: code-simplifier
|
||||
|
||||
imports:
|
||||
- shared/activation-app.md
|
||||
- shared/reporting.md
|
||||
|
||||
safe-outputs:
|
||||
create-pull-request:
|
||||
|
|
@ -833,6 +19,8 @@ safe-outputs:
|
|||
labels: [refactoring, code-quality, automation]
|
||||
reviewers: [copilot]
|
||||
expires: 1d
|
||||
noop:
|
||||
report-as-issue: false
|
||||
|
||||
network:
|
||||
allowed:
|
||||
|
|
@ -1216,5 +404,4 @@ Begin your code simplification analysis now. Find recently modified code, assess
|
|||
|
||||
```json
|
||||
{"noop": {"message": "No action needed: [brief explanation of what was analyzed and why]"}}
|
||||
```
|
||||
>>>>>>> new (upstream)
|
||||
```
|
||||
1373
.github/workflows/compare-stats-anomaly-reporter.lock.yml
generated
vendored
Normal file
1373
.github/workflows/compare-stats-anomaly-reporter.lock.yml
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
191
.github/workflows/compare-stats-anomaly-reporter.md
vendored
Normal file
191
.github/workflows/compare-stats-anomaly-reporter.md
vendored
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
---
|
||||
description: Analyze benchmark statistics from the latest 30 hours and publish bug/crash/anomaly summary as a GitHub Discussion
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 */12 * * *"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions: read-all
|
||||
|
||||
strict: false
|
||||
timeout-minutes: 45
|
||||
|
||||
network:
|
||||
allowed:
|
||||
- defaults
|
||||
- mtzguido.tplinkdns.com
|
||||
|
||||
tools:
|
||||
bash: [":*"]
|
||||
github:
|
||||
toolsets: [default]
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
title-prefix: "[Compare Stats] "
|
||||
category: "agentic workflows"
|
||||
close-older-discussions: true
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
report-as-issue: false
|
||||
---
|
||||
|
||||
# Compare Stats Bug/Crash/Anomaly Reporter
|
||||
|
||||
Your name is ${{ github.workflow }}. You are a Z3 benchmarking analysis agent for `${{ github.repository }}`.
|
||||
|
||||
Analyze the benchmark statistics page below, focusing on results from the last 30 hours, then create a GitHub Discussion with a concise but actionable summary of:
|
||||
|
||||
- Bugs
|
||||
- Crashes
|
||||
- Anomalies
|
||||
|
||||
Source URL:
|
||||
`http://mtzguido.tplinkdns.com:8081/z3/`
|
||||
|
||||
Note: this endpoint is currently HTTP-only. Treat fetched data as non-sensitive benchmark telemetry and do not include secrets in requests or reports.
|
||||
Note: the workflow runs every 12 hours but analyzes 30 hours intentionally to provide overlap and avoid missing transient failures between runs.
|
||||
Overlapping windows are expected; `close-older-discussions: true` keeps only the latest report thread active.
|
||||
|
||||
## Requirements
|
||||
|
||||
### 1) Fetch and save the source page
|
||||
|
||||
Use bash to fetch the page into `/tmp/gh-aw/agent/benchmark_stats.html`.
|
||||
|
||||
Try this first:
|
||||
```bash
|
||||
curl -fsSL --max-time 60 "http://mtzguido.tplinkdns.com:8081/z3/" -o /tmp/gh-aw/agent/benchmark_stats.html
|
||||
```
|
||||
|
||||
If that fails, retry once with:
|
||||
```bash
|
||||
wget -q -T 60 -O /tmp/gh-aw/agent/benchmark_stats.html "http://mtzguido.tplinkdns.com:8081/z3/"
|
||||
```
|
||||
|
||||
If both fail, still create a discussion that explains the fetch failure, includes stderr output, and marks the report as incomplete.
|
||||
After a successful fetch, perform basic integrity checks before parsing:
|
||||
- file is non-empty
|
||||
- content includes `<html` and at least one `<table`
|
||||
- if checks fail, treat as suspicious/incomplete data and report this explicitly
|
||||
|
||||
### 2) Parse tabular data
|
||||
|
||||
Use Python to parse all tables from the HTML into normalized rows.
|
||||
|
||||
Use resilient parsing:
|
||||
- Prefer `pandas.read_html` when available.
|
||||
- If pandas is unavailable or fails, fall back to parsing with Python's `html.parser` or regular expressions.
|
||||
|
||||
Persist normalized JSON to `/tmp/gh-aw/agent/compare_stats_rows.json`.
|
||||
|
||||
### 3) Detect time window (last 30 hours)
|
||||
|
||||
Find candidate timestamp columns using case-insensitive column-name matches:
|
||||
- `time`, `timestamp`, `date`, `run`, `created`, `updated`
|
||||
|
||||
Parse datetimes with timezone handling if present. Use current UTC time and filter to rows where timestamp is within the past 30 hours.
|
||||
Treat naive timestamps as UTC.
|
||||
|
||||
If no timestamp can be extracted:
|
||||
- Report this limitation explicitly.
|
||||
- Continue analysis on all rows.
|
||||
- Mark the discussion as "time-window fallback".
|
||||
|
||||
### 4) Classify bugs/crashes/anomalies
|
||||
|
||||
Infer key columns using column-name heuristics:
|
||||
- status/result/outcome
|
||||
- benchmark/instance/file/name
|
||||
- set/suite/group/track/family
|
||||
- message/error/details/reason
|
||||
|
||||
Normalize status strings to lowercase.
|
||||
|
||||
#### Bugs / Crashes
|
||||
Classify a row as **crash/bug** if status/details contain terms like:
|
||||
- `crash`, `segfault`, `assert`, `abort`, `exception`, `error`, `failed`, `bug`
|
||||
|
||||
#### Anomalies
|
||||
At minimum, detect:
|
||||
|
||||
1. **Unknown-outlier anomaly** (required):
|
||||
- Within the same benchmark set/suite/group, if most rows are in `{sat, unsat, timeout}` but a minority are `unknown`, flag the `unknown` rows as anomalies.
|
||||
- Rationale for the thresholds in the next bullet: `total_rows >= 4` requires enough samples for confidence, and the two ratios avoid flagging sets where `unknown` is common behavior. The `0.4` cap keeps `unknown` results a minority, while the `0.6` floor enforces a decisive majority of sat/unsat/timeout outcomes. Any remainder after those constraints is intentionally allowed for other statuses.
|
||||
- Use this threshold: `total_rows >= 4`, `unknown_count / total_rows <= 0.4`, and `(sat_count + unsat_count + timeout_count) / total_rows >= 0.6`.
|
||||
- If set/suite/group columns are missing, fallback grouping order is: directory prefix of benchmark path/name, then benchmark name prefix before first separator (`/`, `:`, `::`), then a single global group.
|
||||
|
||||
2. **Status divergence anomaly**:
|
||||
- Same benchmark name appears multiple times with conflicting non-timeout statuses (for example `sat` vs `unsat`).
|
||||
- Ignore timeout-only disagreements here; timeout behavior is covered under the repeated hard-failure anomaly section to reduce noise from transient runtime variance.
|
||||
|
||||
3. **Repeated hard-failure anomaly**:
|
||||
- Same benchmark appears repeatedly with a crash/error-like status (or repeatedly times out) in the time window.
|
||||
|
||||
### 5) Generate discussion report
|
||||
|
||||
Create a GitHub Discussion using `create-discussion` safe output.
|
||||
|
||||
Use this structure:
|
||||
|
||||
```markdown
|
||||
### Compare Stats Analysis Report
|
||||
|
||||
**Source**: [benchmark statistics](http://mtzguido.tplinkdns.com:8081/z3/)
|
||||
**Workflow Run**: [#${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
|
||||
**Analysis Time (UTC)**: <timestamp>
|
||||
**Window**: last 30 hours (or "time-window fallback" if no timestamps could be extracted)
|
||||
|
||||
### Executive Summary
|
||||
|
||||
- Rows analyzed: N
|
||||
- Rows in 30h window: M (or "timestamp unavailable")
|
||||
- Bugs/crashes: B
|
||||
- Anomalies: A
|
||||
|
||||
### Bugs and Crashes
|
||||
|
||||
| Benchmark Set | Benchmark | Status | Details | Timestamp |
|
||||
|---|---|---|---|---|
|
||||
| ... |
|
||||
|
||||
### Anomalies
|
||||
|
||||
#### Unknown-Outlier Cases
|
||||
| Benchmark Set | Benchmark | Status | Peer Status Distribution | Timestamp |
|
||||
|---|---|---|---|---|
|
||||
| ... |
|
||||
|
||||
#### Status Divergences
|
||||
| Benchmark | Observed Statuses | Benchmark Set(s) | Timestamp(s) |
|
||||
|---|---|---|---|
|
||||
| ... |
|
||||
|
||||
#### Repeated Hard Failures
|
||||
| Benchmark | Failure Count | Representative Status/Details | Benchmark Set(s) |
|
||||
|---|---|---|---|
|
||||
| ... |
|
||||
|
||||
### Notes and Limitations
|
||||
- Mention parsing assumptions
|
||||
- Mention missing columns/timestamps if any
|
||||
|
||||
<details>
|
||||
<summary><b>Raw Extraction Summary</b></summary>
|
||||
|
||||
- Table count
|
||||
- Candidate columns used
|
||||
- Top status distribution
|
||||
- Up to 30 representative raw rows (sanitized)
|
||||
|
||||
</details>
|
||||
```
|
||||
|
||||
## Reporting Rules
|
||||
|
||||
- Be factual and concise.
|
||||
- Do not claim certainty when column mapping is heuristic.
|
||||
- If no bugs/crashes/anomalies are found, still create the discussion and explicitly state "No issues detected in analyzed window."
|
||||
- Do not open PRs or modify repository files.
|
||||
4
.github/workflows/coverage.yml
vendored
4
.github/workflows/coverage.yml
vendored
|
|
@ -89,13 +89,13 @@ jobs:
|
|||
id: date
|
||||
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/upload-artifact@v7
|
||||
- uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: coverage-${{steps.date.outputs.date}}
|
||||
path: ${{github.workspace}}/coverage.html
|
||||
retention-days: 4
|
||||
|
||||
- uses: actions/upload-artifact@v7
|
||||
- uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: coverage-details-${{steps.date.outputs.date}}
|
||||
path: ${{env.COV_DETAILS_PATH}}
|
||||
|
|
|
|||
2
.github/workflows/csa-analysis.md
vendored
2
.github/workflows/csa-analysis.md
vendored
|
|
@ -16,8 +16,6 @@ tools:
|
|||
github:
|
||||
toolsets: [default]
|
||||
bash: [":*"]
|
||||
glob: {}
|
||||
view: {}
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
|
|
|
|||
4
.github/workflows/docs.yml
vendored
4
.github/workflows/docs.yml
vendored
|
|
@ -34,7 +34,7 @@ jobs:
|
|||
python3 mk_go_doc.py --output-dir=api/html/go --go-api-path=../src/api/go
|
||||
|
||||
- name: Upload Go Documentation
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: go-docs
|
||||
path: doc/api/html/go/
|
||||
|
|
@ -94,7 +94,7 @@ jobs:
|
|||
ocamldoc -html -d api/html/ml -sort -hide Z3 -I $( ocamlfind query zarith ) -I ../build-x64/api/ml ../build-x64/api/ml/z3enums.mli ../build-x64/api/ml/z3.mli
|
||||
|
||||
- name: Setup emscripten
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
uses: mymindstorm/setup-emsdk@v16
|
||||
with:
|
||||
no-install: true
|
||||
version: ${{env.EM_VERSION}}
|
||||
|
|
|
|||
19
.github/workflows/issue-backlog-processor.md
vendored
19
.github/workflows/issue-backlog-processor.md
vendored
|
|
@ -32,6 +32,8 @@ timeout-minutes: 60
|
|||
|
||||
Your name is ${{ github.workflow }}. You are an expert AI agent tasked with processing the backlog of open issues in the Z3 theorem prover repository `${{ github.repository }}`. Your mission is to analyze open issues systematically and help maintainers manage the backlog effectively by surfacing actionable insights and providing helpful comments.
|
||||
|
||||
> **CRITICAL**: You MUST call either `create-discussion` or `noop` before finishing, under all circumstances. Even if you only analyzed a small number of issues, always produce output. Never exit without calling one of these tools.
|
||||
|
||||
## Your Task
|
||||
|
||||
### 1. Initialize or Resume Progress (Cache Memory)
|
||||
|
|
@ -40,25 +42,28 @@ Check your cache memory for:
|
|||
- List of issue numbers already processed and commented on in previous runs
|
||||
- Issues previously flagged for closure, duplication, or merge
|
||||
- Date of last run
|
||||
- The batch cursor: the last issue number processed (used for pagination across runs)
|
||||
|
||||
If cache data exists:
|
||||
- Skip re-commenting on issues already commented in a recent run (within the last 4 days)
|
||||
- Re-evaluate previously flagged issues to see if their status has changed
|
||||
- Note any new issues that opened since the last run
|
||||
- Resume from where the previous run left off (use the stored batch cursor)
|
||||
|
||||
If this is the first run or memory is empty, initialize a fresh tracking structure.
|
||||
|
||||
### 2. Fetch Open Issues
|
||||
### 2. Fetch Open Issues (Batched)
|
||||
|
||||
Use the GitHub API to list all open issues in the repository:
|
||||
- Retrieve all open issues (paginate through all pages to get the full list)
|
||||
Use the GitHub API to list open issues in the repository. **Process at most 30 issues per run** to stay within context limits (this limit is based on the average size of Z3 issues including body text and inline code snippets; larger issues may require processing fewer):
|
||||
- Retrieve one page (30 issues) of open issues
|
||||
- Exclude pull requests (filter where `pull_request` is not present)
|
||||
- Sort by last updated date (most recently updated first)
|
||||
- If the cache has a batch cursor from the last run, fetch the next page after that cursor; otherwise, start from the most recently updated issues
|
||||
- For each issue, collect:
|
||||
- Issue number, title, body, labels, author
|
||||
- Date created and last updated
|
||||
- Number of comments
|
||||
- All comments (for issues with comments)
|
||||
- **Do NOT fetch comments for every issue up front.** Only fetch comments for a specific issue when at least one of the following is true: the body mentions a version number (potential closure), the title contains words like "duplicate", "same as", or "related to" (potential duplicate), or the issue has labels such as "question", "help wanted", or "wontfix" (potential closure/status change). Fetch comments lazily, one issue at a time, only when one of these criteria is met.
|
||||
- Any referenced pull requests, commits, or other issues
|
||||
|
||||
### 3. Analyze Each Issue
|
||||
|
|
@ -110,6 +115,8 @@ Add a comment to an issue if you have **genuinely useful and specific informatio
|
|||
|
||||
### 4. Create a Discussion with Findings
|
||||
|
||||
**MANDATORY**: You MUST call `create-discussion` now, even if you only analyzed a few issues or found nothing actionable. If there is genuinely nothing to report, call `noop` instead. Do not skip this step.
|
||||
|
||||
Create a GitHub Discussion summarizing the analysis results.
|
||||
|
||||
**Title:** "[Issue Backlog] Backlog Analysis - [Date]"
|
||||
|
|
@ -224,9 +231,13 @@ After completing the analysis, update cache memory with:
|
|||
- Issues flagged for closure, duplication, or merge
|
||||
- Date and timestamp of this run
|
||||
- Count of total issues analyzed
|
||||
- Batch cursor: the issue number of the last issue processed in this run, so the next run can continue from where this one left off
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Always produce output**: You MUST call `create-discussion` or `noop` before finishing — never exit silently. If in doubt about whether there is enough to report, call `create-discussion` with a brief summary.
|
||||
- **Batch processing**: Only analyze up to 30 issues per run. Store a cursor in cache memory so subsequent runs pick up where you left off.
|
||||
- **Lazy comment fetching**: Do NOT bulk-fetch all comments for all issues. Only fetch comments for a specific issue when one of these criteria is met: the body mentions a version number, the title contains duplicate/related keywords, or the issue has status-relevant labels (e.g., "question", "help wanted", "wontfix").
|
||||
- **Prioritize accuracy over coverage**: It is better to analyze 20 issues well than 200 issues poorly
|
||||
- **Be conservative on closures**: Incorrectly closing a valid issue is harmful; when in doubt, keep it open
|
||||
- **Respect the community**: Z3 is used by researchers, security engineers, and developers — treat all issues respectfully
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ jobs:
|
|||
pull-requests: write
|
||||
steps:
|
||||
- name: Mark all draft pull requests ready for review
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
with:
|
||||
github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
script: |
|
||||
|
|
|
|||
4
.github/workflows/memory-safety-report.md
vendored
4
.github/workflows/memory-safety-report.md
vendored
|
|
@ -30,8 +30,6 @@ tools:
|
|||
github:
|
||||
toolsets: [default, actions]
|
||||
bash: [":*"]
|
||||
glob: {}
|
||||
view: {}
|
||||
|
||||
safe-outputs:
|
||||
mentions: false
|
||||
|
|
@ -41,7 +39,7 @@ safe-outputs:
|
|||
title-prefix: "[Memory Safety] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
expires: 7
|
||||
expires: 7d
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
|
|
|
|||
4
.github/workflows/memory-safety.yml
vendored
4
.github/workflows/memory-safety.yml
vendored
|
|
@ -104,7 +104,7 @@ jobs:
|
|||
|
||||
- name: Upload ASan reports
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: asan-reports
|
||||
path: /tmp/asan-reports/
|
||||
|
|
@ -194,7 +194,7 @@ jobs:
|
|||
|
||||
- name: Upload UBSan reports
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ubsan-reports
|
||||
path: /tmp/ubsan-reports/
|
||||
|
|
|
|||
51
.github/workflows/nightly-validation.yml
vendored
51
.github/workflows/nightly-validation.yml
vendored
|
|
@ -665,6 +665,57 @@ jobs:
|
|||
pip install $wheel.FullName
|
||||
python -c "import z3; x = z3.Int('x'); s = z3.Solver(); s.add(x > 0); print('Result:', s.check()); print('Model:', s.model())"
|
||||
|
||||
validate-python-wheel-riscv64:
  # Sanity-checks the RISC-V 64 Python wheel attached to the release.
  # The wheel is only inspected (file name and archive listing); it is never
  # installed or imported in this job.
  name: "Validate Python wheel for RISC-V 64"
  runs-on: ubuntu-latest
  if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
  timeout-minutes: 30
  steps:
    - name: Checkout code
      uses: actions/checkout@v6.0.2

    - name: Setup Python
      uses: actions/setup-python@v6
      with:
        python-version: '3.x'

    - name: Download RISC-V 64 Python wheel from release
      env:
        GH_TOKEN: ${{ github.token }}
        # Hand the user-supplied tag to the script through the environment
        # instead of interpolating the expression into the script text, so a
        # crafted tag value cannot inject shell commands.
        RELEASE_TAG: ${{ github.event.inputs.release_tag }}
      run: |
        # Fall back to the "Nightly" release when no tag was supplied.
        TAG="${RELEASE_TAG:-Nightly}"
        gh release download "$TAG" --pattern "*riscv64.whl" --dir wheels

    - name: Verify wheel platform tag and contents
      run: |
        pip install wheel
        WHEEL_FILE=$(ls wheels/*.whl | head -n 1)
        # Fail with a precise message if the download produced no wheel.
        if [ -z "$WHEEL_FILE" ]; then
          echo "ERROR: no wheel found in wheels/"
          exit 1
        fi
        echo "Wheel file: $WHEEL_FILE"

        # Check that the wheel has a riscv64 platform tag
        WHEEL_NAME=$(basename "$WHEEL_FILE")
        echo "Wheel name: $WHEEL_NAME"
        if echo "$WHEEL_NAME" | grep -q "riscv64"; then
          echo "riscv64 platform tag found"
        else
          echo "ERROR: riscv64 platform tag not found in wheel name"
          exit 1
        fi

        # Inspect wheel contents
        python -m zipfile -l "$WHEEL_FILE"

        # Verify wheel contains z3 library
        if python -m zipfile -l "$WHEEL_FILE" | grep -q "libz3"; then
          echo "libz3 found in wheel"
        else
          echo "ERROR: libz3 not found in wheel"
          exit 1
        fi
|
||||
|
||||
# ============================================================================
|
||||
# MACOS DYLIB HEADERPAD VALIDATION
|
||||
# ============================================================================
|
||||
|
|
|
|||
221
.github/workflows/nightly.yml
vendored
221
.github/workflows/nightly.yml
vendored
|
|
@ -45,8 +45,21 @@ jobs:
|
|||
- name: Build
|
||||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk --arch=x64
|
||||
|
||||
- name: Validate libz3.dylib and z3 architecture (must be x86_64)
|
||||
run: |
|
||||
set -e
|
||||
for f in build-dist/libz3.dylib build-dist/z3; do
|
||||
ARCH=$(lipo -archs "$f")
|
||||
echo "$f architecture: $ARCH"
|
||||
if [ "$ARCH" != "x86_64" ]; then
|
||||
echo "ERROR: $f has arch '$ARCH', expected 'x86_64' (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
echo "OK: macOS x64 artifacts are x86_64"
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: macOsBuild
|
||||
path: dist/*.zip
|
||||
|
|
@ -68,8 +81,21 @@ jobs:
|
|||
- name: Build
|
||||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk --arch=arm64
|
||||
|
||||
- name: Validate libz3.dylib and z3 architecture (must be arm64)
|
||||
run: |
|
||||
set -e
|
||||
for f in build-dist/libz3.dylib build-dist/z3; do
|
||||
ARCH=$(lipo -archs "$f")
|
||||
echo "$f architecture: $ARCH"
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
echo "ERROR: $f has arch '$ARCH', expected 'arm64' (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
echo "OK: macOS arm64 artifacts are arm64"
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: MacArm64
|
||||
path: dist/*.zip
|
||||
|
|
@ -101,6 +127,17 @@ jobs:
|
|||
Z3_DIR=$(find . -maxdepth 1 -type d -name "z3-*" | head -n 1)
|
||||
echo "Z3_DIR=$Z3_DIR" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate shipped libz3.dylib architecture (must be x86_64)
|
||||
run: |
|
||||
set -e
|
||||
DYLIB="artifacts/$Z3_DIR/bin/libz3.dylib"
|
||||
ARCH=$(lipo -archs "$DYLIB")
|
||||
echo "Shipped $DYLIB architecture: $ARCH"
|
||||
if [ "$ARCH" != "x86_64" ]; then
|
||||
echo "ERROR: x64 nightly zip contains '$ARCH' libz3.dylib (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Test install_name_tool with headerpad
|
||||
run: |
|
||||
cd artifacts/$Z3_DIR/bin
|
||||
|
|
@ -149,6 +186,17 @@ jobs:
|
|||
Z3_DIR=$(find . -maxdepth 1 -type d -name "z3-*" | head -n 1)
|
||||
echo "Z3_DIR=$Z3_DIR" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate shipped libz3.dylib architecture (must be arm64)
|
||||
run: |
|
||||
set -e
|
||||
DYLIB="artifacts/$Z3_DIR/bin/libz3.dylib"
|
||||
ARCH=$(lipo -archs "$DYLIB")
|
||||
echo "Shipped $DYLIB architecture: $ARCH"
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
echo "ERROR: arm64 nightly zip contains '$ARCH' libz3.dylib (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Test install_name_tool with headerpad
|
||||
run: |
|
||||
cd artifacts/$Z3_DIR/bin
|
||||
|
|
@ -198,7 +246,7 @@ jobs:
|
|||
run: python z3test/scripts/test_benchmarks.py build-dist/z3 z3test/regressions/smt2
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuBuild
|
||||
path: dist/*.zip
|
||||
|
|
@ -233,7 +281,7 @@ jobs:
|
|||
python scripts/mk_unix_dist.py --nodotnet --arch=arm64
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuArm64
|
||||
path: dist/*.zip
|
||||
|
|
@ -288,7 +336,7 @@ jobs:
|
|||
run: zip -r z3doc.zip doc/api
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuDoc
|
||||
path: z3doc.zip
|
||||
|
|
@ -303,9 +351,17 @@ jobs:
|
|||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Pick the lowest-versioned manylinux interpreter (first entry after `sort -V`) so selection is deterministic.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
/opt/python/cp38-cp38/bin/python -m venv $PWD/env
|
||||
"$PYTHON" -m venv $PWD/env
|
||||
echo "$PWD/env/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install build tools
|
||||
|
|
@ -318,7 +374,7 @@ jobs:
|
|||
run: pip install ./src/api/python/wheelhouse/*.whl && python - <src/api/python/z3test.py z3 && python - <src/api/python/z3test.py z3num
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildAMD64
|
||||
path: src/api/python/wheelhouse/*.whl
|
||||
|
|
@ -341,9 +397,17 @@ jobs:
|
|||
mkdir -p /tmp/arm-toolchain/
|
||||
tar xf /tmp/arm-toolchain.tar.xz -C /tmp/arm-toolchain/ --strip-components=1
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Pick the lowest-versioned manylinux interpreter (first entry after `sort -V`) so selection is deterministic.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
/opt/python/cp38-cp38/bin/python -m venv $PWD/env
|
||||
"$PYTHON" -m venv $PWD/env
|
||||
echo "$PWD/env/bin" >> $GITHUB_PATH
|
||||
echo "/tmp/arm-toolchain/bin" >> $GITHUB_PATH
|
||||
echo "/tmp/arm-toolchain/aarch64-none-linux-gnu/libc/usr/bin" >> $GITHUB_PATH
|
||||
|
|
@ -358,12 +422,120 @@ jobs:
|
|||
run: cd src/api/python && CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ AR=aarch64-none-linux-gnu-ar LD=aarch64-none-linux-gnu-ld Z3_CROSS_COMPILING=aarch64 python -m build && AUDITWHEEL_PLAT= auditwheel repair --best-plat dist/*.whl && cd ../../..
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildArm64
|
||||
path: src/api/python/wheelhouse/*.whl
|
||||
retention-days: 2
|
||||
|
||||
manylinux-python-riscv64:
  # Cross-compiles the Python wheel for RISC-V 64 inside the x86_64 manylinux
  # container, using a prebuilt riscv64 GNU toolchain unpacked under /tmp.
  name: "Python bindings (manylinux RISC-V 64 cross)"
  runs-on: ubuntu-latest
  timeout-minutes: 90
  container: quay.io/pypa/manylinux_2_28_x86_64:latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v6.0.2

    - name: Download RISC-V toolchain
      # -f makes curl exit non-zero on an HTTP error instead of saving the
      # error page as the tarball and failing later in `tar`.
      run: curl -fL -o /tmp/riscv-toolchain.tar.gz 'https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2024.09.03/riscv64-glibc-ubuntu-20.04-gcc-nightly-2024.09.03-nightly.tar.gz'

    - name: Extract RISC-V toolchain
      run: |
        mkdir -p /tmp/riscv-toolchain/
        tar xf /tmp/riscv-toolchain.tar.gz -C /tmp/riscv-toolchain/ --strip-components=1

    - name: Install MPFR 4 (required by RISC-V toolchain host binaries)
      run: |
        dnf install -y gmp-devel
        # -f: fail on HTTP errors rather than unpacking an error page.
        curl -fL -o /tmp/mpfr.tar.xz https://ftp.gnu.org/gnu/mpfr/mpfr-4.2.1.tar.xz
        tar xf /tmp/mpfr.tar.xz -C /tmp/
        cd /tmp/mpfr-4.2.1 && ./configure --prefix=/usr/local --disable-static && make -j$(nproc) && make install
        ldconfig

    - name: Select Python
      run: |
        # Pick the lowest-versioned manylinux interpreter (first entry after
        # `sort -V`) so selection is deterministic.
        PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
        test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
        echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
        "$PYTHON" --version

    - name: Setup Python environment
      run: |
        "$PYTHON" -m venv $PWD/env
        # Put the venv and the cross toolchain on PATH for the following steps.
        echo "$PWD/env/bin" >> $GITHUB_PATH
        echo "/tmp/riscv-toolchain/bin" >> $GITHUB_PATH

    - name: Install build tools
      run: |
        # Show PATH and confirm the cross compiler is reachable before building.
        echo $PATH
        stat $(which riscv64-unknown-linux-gnu-gcc)
        pip install build git+https://github.com/rhelmot/auditwheel

    - name: Build wheels
      run: cd src/api/python && CC=riscv64-unknown-linux-gnu-gcc CXX=riscv64-unknown-linux-gnu-g++ AR=riscv64-unknown-linux-gnu-ar LD=riscv64-unknown-linux-gnu-ld Z3_CROSS_COMPILING=riscv64 python -m build && AUDITWHEEL_PLAT= auditwheel repair --best-plat dist/*.whl && cd ../../..

    - name: Upload artifact
      uses: actions/upload-artifact@v7.0.1
      with:
        name: ManyLinuxPythonBuildRiscv64
        path: src/api/python/wheelhouse/*.whl
        retention-days: 2
|
||||
|
||||
pyodide-python:
  # Builds the Python wheel with pyodide-build, tests it in a Pyodide virtual
  # environment, and uploads it as the PyodidePythonBuild artifact.
  name: "Python bindings (Pyodide)"
  runs-on: ubuntu-24.04
  timeout-minutes: 90
  steps:
    - name: Checkout code
      uses: actions/checkout@v6.0.2

    - name: Setup packages
      run: |
        sudo apt-get update && sudo apt-get install -y python3-dev python3-pip python3-venv

    - name: Create venv
      run: |
        python3 -m venv ~/env

    - name: Install pyodide
      run: |
        ~/env/bin/pip install pyodide-build pyodide-cli

    - name: Configure Emscripten
      run: |
        git clone https://github.com/emscripten-core/emsdk.git ~/emsdk
        cd ~/emsdk
        # Install the Emscripten version that pyodide reports in its config.
        PYODIDE_EMSCRIPTEN_VERSION=$(~/env/bin/pyodide config get emscripten_version)
        ./emsdk install ${PYODIDE_EMSCRIPTEN_VERSION}
        ./emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION}

    - name: Build wheel
      # Compiler and linker flags are supplied through the step environment.
      env:
        CFLAGS: "-fexceptions -s DISABLE_EXCEPTION_CATCHING=0 -g2"
        LDFLAGS: "-fexceptions -s WASM_BIGINT"
        CXXFLAGS: "-fexceptions -s DISABLE_EXCEPTION_CATCHING=0"
      run: |
        source ~/emsdk/emsdk_env.sh
        cd src/api/python
        CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" CXXFLAGS="${CXXFLAGS}" ~/env/bin/pyodide build --exports whole_archive

    - name: Setup env-pyodide
      run: |
        source ~/env/bin/activate
        source ~/emsdk/emsdk_env.sh
        pyodide venv ~/env-pyodide

    - name: Test wheel
      run: |
        ~/env-pyodide/bin/pip install src/api/python/dist/*.whl
        ~/env-pyodide/bin/python src/api/python/z3test.py z3

    - name: Upload artifact
      uses: actions/upload-artifact@v7.0.1
      with:
        name: PyodidePythonBuild
        path: src/api/python/dist/*.whl
        retention-days: 2
|
||||
|
||||
windows-build-x64:
|
||||
name: "Windows x64 build"
|
||||
runs-on: windows-latest
|
||||
|
|
@ -384,7 +556,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-x64
|
||||
path: dist/*.zip
|
||||
|
|
@ -410,7 +582,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x86-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-x86
|
||||
path: dist/*.zip
|
||||
|
|
@ -436,7 +608,7 @@ jobs:
|
|||
python scripts\mk_win_dist_cmake.py --arm64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --assembly-version=${{ env.MAJOR }}.${{ env.MINOR }}.${{ env.PATCH }} --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-arm64
|
||||
path: dist/arm64/*.zip
|
||||
|
|
@ -496,7 +668,7 @@ jobs:
|
|||
path: package
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -513,7 +685,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.sym.nuspec -Version ${{ env.MAJOR }}.${{ env.MINOR }}.${{ env.PATCH }}.${{ github.run_number }} -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: NuGet
|
||||
path: |
|
||||
|
|
@ -541,7 +713,7 @@ jobs:
|
|||
path: package
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -558,7 +730,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.x86.sym.nuspec -Version ${{ env.MAJOR }}.${{ env.MINOR }}.${{ env.PATCH }}.${{ github.run_number }} -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: NuGet32
|
||||
path: |
|
||||
|
|
@ -568,7 +740,7 @@ jobs:
|
|||
|
||||
python-package:
|
||||
name: "Python packaging"
|
||||
needs: [mac-build-x64, mac-build-arm64, windows-build-x64, windows-build-x86, windows-build-arm64, manylinux-python-amd64, manylinux-python-arm64]
|
||||
needs: [mac-build-x64, mac-build-arm64, windows-build-x64, windows-build-x86, windows-build-arm64, manylinux-python-amd64, manylinux-python-arm64, manylinux-python-riscv64, pyodide-python]
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout code
|
||||
|
|
@ -621,6 +793,18 @@ jobs:
|
|||
name: ManyLinuxPythonBuildArm64
|
||||
path: artifacts
|
||||
|
||||
- name: Download ManyLinux RISC-V 64 Build
|
||||
uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildRiscv64
|
||||
path: artifacts
|
||||
|
||||
- name: Download Pyodide Build
|
||||
uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: PyodidePythonBuild
|
||||
path: artifacts
|
||||
|
||||
- name: Extract builds
|
||||
run: |
|
||||
cd artifacts
|
||||
|
|
@ -651,7 +835,7 @@ jobs:
|
|||
cp artifacts/*.whl src/api/python/dist/.
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: PythonPackages
|
||||
path: src/api/python/dist/*
|
||||
|
|
@ -759,4 +943,3 @@ jobs:
|
|||
with:
|
||||
packages-dir: dist
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
|
||||
|
|
|
|||
20
.github/workflows/nuget-build.yml
vendored
20
.github/workflows/nuget-build.yml
vendored
|
|
@ -34,7 +34,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --assembly-version=${{ github.event.inputs.version || '4.17.0' }} --zip
|
||||
|
||||
- name: Upload Windows x64 artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: windows-x64
|
||||
path: dist/*.zip
|
||||
|
|
@ -58,7 +58,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x86-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --assembly-version=${{ github.event.inputs.version || '4.17.0' }} --zip
|
||||
|
||||
- name: Upload Windows x86 artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: windows-x86
|
||||
path: dist/*.zip
|
||||
|
|
@ -82,7 +82,7 @@ jobs:
|
|||
python scripts\mk_win_dist_cmake.py --arm64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --assembly-version=${{ github.event.inputs.version || '4.17.0' }} --zip
|
||||
|
||||
- name: Upload Windows ARM64 artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: windows-arm64
|
||||
path: build-dist\arm64\dist\*.zip
|
||||
|
|
@ -103,7 +103,7 @@ jobs:
|
|||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk
|
||||
|
||||
- name: Upload Ubuntu artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ubuntu
|
||||
path: dist/*.zip
|
||||
|
|
@ -124,7 +124,7 @@ jobs:
|
|||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk
|
||||
|
||||
- name: Upload macOS x64 artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: macos-x64
|
||||
path: dist/*.zip
|
||||
|
|
@ -145,7 +145,7 @@ jobs:
|
|||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk --arch=arm64
|
||||
|
||||
- name: Upload macOS ARM64 artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: macos-arm64
|
||||
path: dist/*.zip
|
||||
|
|
@ -181,7 +181,7 @@ jobs:
|
|||
ls -la package-files/
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -198,7 +198,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.sym.nuspec -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload NuGet package
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: nuget-x64
|
||||
path: |
|
||||
|
|
@ -230,7 +230,7 @@ jobs:
|
|||
run: find packages -type f
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -247,7 +247,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.x86.sym.nuspec -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload NuGet package
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: nuget-x86
|
||||
path: |
|
||||
|
|
|
|||
4
.github/workflows/ocaml.yaml
vendored
4
.github/workflows/ocaml.yaml
vendored
|
|
@ -21,7 +21,7 @@ jobs:
|
|||
|
||||
# Cache ccache (shared across runs)
|
||||
- name: Cache ccache
|
||||
uses: actions/cache@v5.0.4
|
||||
uses: actions/cache@v5.0.5
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ${{ runner.os }}-ccache-${{ github.sha }}
|
||||
|
|
@ -30,7 +30,7 @@ jobs:
|
|||
|
||||
# Cache opam (compiler + packages)
|
||||
- name: Cache opam
|
||||
uses: actions/cache@v5.0.4
|
||||
uses: actions/cache@v5.0.5
|
||||
with:
|
||||
path: ~/.opam
|
||||
key: ${{ runner.os }}-opam-${{ matrix.ocaml-version }}-${{ github.sha }}
|
||||
|
|
|
|||
20
.github/workflows/ostrich-benchmark.md
vendored
20
.github/workflows/ostrich-benchmark.md
vendored
|
|
@ -8,7 +8,10 @@ on:
|
|||
|
||||
permissions: read-all
|
||||
|
||||
network: defaults
|
||||
network:
|
||||
allowed:
|
||||
- defaults
|
||||
- api.nuget.org
|
||||
|
||||
tools:
|
||||
bash: true
|
||||
|
|
@ -85,6 +88,8 @@ echo "Found Microsoft.Z3.dll at: $Z3_DOTNET_DLL"
|
|||
|
||||
If the build fails, report the error clearly and exit without proceeding.
|
||||
|
||||
Once the binary is confirmed working, call the `noop` safe-output tool with the message `"Z3 built successfully from the c3 branch. Starting ZIPT build and benchmark — results will be posted as a GitHub Discussion once complete."` This keepalive call refreshes the safe-output MCP session before the long build and benchmark phases begin, preventing a session timeout.
|
||||
|
||||
## Phase 2a: Clone and Build ZIPT
|
||||
|
||||
Clone the ZIPT solver from the `parikh` branch and compile it against the Z3 .NET bindings built in Phase 1.
|
||||
|
|
@ -150,6 +155,8 @@ if [ "$TOTAL_FILES" -eq 0 ]; then
|
|||
fi
|
||||
```
|
||||
|
||||
Once the benchmark files are confirmed, call the `noop` safe-output tool with the message `"Benchmark files ready: <TOTAL_FILES> Ostrich .smt2 files extracted. Starting benchmark run — this may take over an hour."` This second keepalive refreshes the safe-output MCP session immediately before the long per-file benchmark loop begins.
|
||||
|
||||
## Phase 3: Run Benchmarks
|
||||
|
||||
Run every file from `/tmp/all_ostrich_files.txt` with both Z3 string solvers and ZIPT. Use a **5-second timeout** per run.
|
||||
|
|
@ -402,3 +409,14 @@ Post the Markdown report as a new GitHub Discussion using the `create-discussion
|
|||
- **Handle build failures gracefully**: If Z3 fails to build, report the error and create a brief discussion noting the build failure. If ZIPT fails to build, continue with only the seq/nseq columns and note `n/a` for ZIPT results.
|
||||
- **Large report**: Always put the per-file table in a `<details>` collapsible section since there may be many files.
|
||||
- **Progress logging**: Print a line per file as you run it (e.g., `[N] [filename] seq=...`) so the workflow log shows progress even for large benchmark sets.
|
||||
|
||||
## Safe Output Guarantee
|
||||
|
||||
You **MUST** call either `create_discussion` or `noop` before the workflow ends, regardless of what happened during execution:
|
||||
|
||||
- **Build succeeded, benchmarks ran**: Call `create_discussion` with the full report.
|
||||
- **Build succeeded, benchmarks partially ran**: Call `create_discussion` with whatever results were collected and a note about what could not be completed.
|
||||
- **Z3 build failed**: Call `noop` with a brief message describing the build error.
|
||||
- **No benchmarks could be run**: Call `noop` with a summary of what failed and why.
|
||||
|
||||
Failing to produce any safe output triggers an automatic workflow-failure issue that clutters the repository.
|
||||
|
|
|
|||
669
.github/workflows/qf-s-benchmark.md
vendored
669
.github/workflows/qf-s-benchmark.md
vendored
|
|
@ -1,5 +1,5 @@
|
|||
---
|
||||
description: Run Z3 string solver benchmarks (seq vs nseq) on QF_S test suite from the c3 branch and post results as a GitHub discussion
|
||||
description: Benchmark Z3 seq vs nseq string solvers on QF_S test suite from the c3 branch and post results as a GitHub discussion
|
||||
|
||||
on:
|
||||
schedule:
|
||||
|
|
@ -17,7 +17,7 @@ tools:
|
|||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
title-prefix: "[ZIPT Benchmark] "
|
||||
title-prefix: "[QF_S Benchmark] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
missing-tool:
|
||||
|
|
@ -25,7 +25,7 @@ safe-outputs:
|
|||
noop:
|
||||
report-as-issue: false
|
||||
|
||||
timeout-minutes: 90
|
||||
timeout-minutes: 120
|
||||
|
||||
steps:
|
||||
- name: Checkout c3 branch
|
||||
|
|
@ -37,437 +37,368 @@ steps:
|
|||
|
||||
---
|
||||
|
||||
# QF_S String Solver Benchmark
|
||||
|
||||
# ZIPT String Solver Benchmark
|
||||
## Job Description
|
||||
|
||||
You are an AI agent that benchmarks Z3 string solvers (`seq` and `nseq`) and the standalone ZIPT solver on QF_S SMT-LIB2 benchmarks from the `c3` branch, and publishes a summary report as a GitHub discussion.
|
||||
Your name is ${{ github.workflow }}. You are an expert performance analyst for the Z3 theorem prover, specializing in the string/sequence theory. Your task is to benchmark the `seq` solver (classical string theory) against the `nseq` solver (ZIPT-based string theory) on the QF_S test suite from the `c3` branch, and post a structured report as a GitHub Discussion.
|
||||
|
||||
## Context
|
||||
The workspace already contains the `c3` branch (checked out by the preceding workflow step).
|
||||
|
||||
- **Repository**: ${{ github.repository }}
|
||||
- **Workspace**: ${{ github.workspace }}
|
||||
- **Branch**: c3 (already checked out by the workflow setup step)
|
||||
## Phase 1: Set Up the Build Environment
|
||||
|
||||
## Phase 1: Build Z3
|
||||
|
||||
Build Z3 from the checked-out `c3` branch using CMake + Ninja, including the .NET bindings required by ZIPT.
|
||||
Install required build tools:
|
||||
|
||||
```bash
|
||||
cd ${{ github.workspace }}
|
||||
|
||||
# Install build dependencies if missing
|
||||
sudo apt-get install -y ninja-build cmake python3 zstd dotnet-sdk-8.0 2>/dev/null || true
|
||||
|
||||
# Configure the build in Debug mode to enable assertions and tracing
|
||||
# (Debug mode is required for -tr: trace flags to produce meaningful output)
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Debug -DZ3_BUILD_DOTNET_BINDINGS=ON 2>&1 | tail -20
|
||||
|
||||
# Build z3 binary and .NET bindings (this takes ~15-17 minutes)
|
||||
ninja z3 2>&1 | tail -30
|
||||
ninja build_z3_dotnet_bindings 2>&1 | tail -20
|
||||
|
||||
# Verify the build succeeded
|
||||
./z3 --version
|
||||
|
||||
# Locate the Microsoft.Z3.dll produced by the build
|
||||
Z3_DOTNET_DLL=$(find . -name "Microsoft.Z3.dll" -not -path "*/obj/*" | head -1)
|
||||
if [ -z "$Z3_DOTNET_DLL" ]; then
|
||||
echo "ERROR: Microsoft.Z3.dll not found after build"
|
||||
exit 1
|
||||
fi
|
||||
echo "Found Microsoft.Z3.dll at: $Z3_DOTNET_DLL"
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y cmake ninja-build python3 python3-pip time
|
||||
```
|
||||
|
||||
If the build fails, report the error clearly and exit without proceeding.
|
||||
|
||||
## Phase 2a: Clone and Build ZIPT
|
||||
|
||||
Clone the ZIPT solver from the `parikh` branch and compile it against the Z3 .NET bindings built in Phase 1.
|
||||
Verify tools:
|
||||
|
||||
```bash
|
||||
cd ${{ github.workspace }}
|
||||
cmake --version
|
||||
ninja --version
|
||||
python3 --version
|
||||
```
|
||||
|
||||
# Re-locate the Microsoft.Z3.dll if needed
|
||||
Z3_DOTNET_DLL=$(find build -name "Microsoft.Z3.dll" -not -path "*/obj/*" | head -1)
|
||||
Z3_LIB_DIR=${{ github.workspace }}/build
|
||||
## Phase 2: Build Z3 in Release Mode
|
||||
|
||||
# Clone ZIPT (parikh branch)
|
||||
git clone --depth=1 --branch parikh https://github.com/CEisenhofer/ZIPT.git /tmp/zipt
|
||||
Build Z3 in Release mode for accurate benchmark performance numbers and lower memory usage. Running `ninja` in the background with `&` is not allowed — concurrent C++ compilation and LLM inference can exhaust available RAM and kill the agent process.
|
||||
|
||||
# Patch ZIPT.csproj to point at the freshly built Microsoft.Z3.dll
|
||||
# (the repo has a Windows-relative hardcoded path that won't exist here)
|
||||
sed -i "s|<HintPath>.*</HintPath>|<HintPath>$Z3_DOTNET_DLL</HintPath>|" /tmp/zipt/ZIPT/ZIPT.csproj
|
||||
```bash
|
||||
mkdir -p /tmp/z3-build
|
||||
cd /tmp/z3-build
|
||||
cmake "$GITHUB_WORKSPACE" \
|
||||
-G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DZ3_BUILD_TEST_EXECUTABLES=OFF \
|
||||
2>&1 | tee /tmp/z3-cmake.log
|
||||
ninja -j2 z3 2>&1 | tee /tmp/z3-build.log
|
||||
```
|
||||
|
||||
# Build ZIPT in Release mode
|
||||
cd /tmp/zipt/ZIPT
|
||||
dotnet build --configuration Release 2>&1 | tail -20
|
||||
Verify the binary was built:
|
||||
|
||||
# Locate the built ZIPT.dll
|
||||
ZIPT_DLL=$(find /tmp/zipt/ZIPT/bin/Release -name "ZIPT.dll" | head -1)
|
||||
if [ -z "$ZIPT_DLL" ]; then
|
||||
echo "ERROR: ZIPT.dll not found after build"
|
||||
exit 1
|
||||
```bash
|
||||
/tmp/z3-build/z3 --version
|
||||
```
|
||||
|
||||
If the build fails, report it immediately and stop.
|
||||
|
||||
Once the binary is confirmed working, call the `noop` safe-output tool with the message `"Z3 built successfully from the c3 branch. Benchmark starting — results will be posted as a GitHub Discussion once complete."` This keepalive call refreshes the safe-output MCP session before the long benchmark run begins, preventing a session timeout.
|
||||
|
||||
## Phase 3: Discover QF_S Benchmark Files
|
||||
|
||||
Find all `.smt2` benchmark files in the workspace that belong to the QF_S logic:
|
||||
|
||||
```bash
|
||||
# Search for explicit QF_S logic declarations
|
||||
grep -rl 'QF_S' "$GITHUB_WORKSPACE" --include='*.smt2' 2>/dev/null > /tmp/qf_s_files.txt
|
||||
|
||||
# Also look in dedicated benchmark directories
|
||||
find "$GITHUB_WORKSPACE" \
|
||||
\( -path "*/QF_S/*" -o -path "*/qf_s/*" -o -path "*/benchmarks/*" \) \
|
||||
-name '*.smt2' 2>/dev/null >> /tmp/qf_s_files.txt
|
||||
|
||||
# Deduplicate
|
||||
sort -u /tmp/qf_s_files.txt -o /tmp/qf_s_files.txt
|
||||
|
||||
TOTAL=$(wc -l < /tmp/qf_s_files.txt)
|
||||
echo "Found $TOTAL QF_S benchmark files"
|
||||
head -20 /tmp/qf_s_files.txt
|
||||
```
|
||||
|
||||
If fewer than 5 files are found, also scan the entire workspace for any `.smt2` file that exercises string constraints:
|
||||
|
||||
```bash
|
||||
if [ "$TOTAL" -lt 5 ]; then
|
||||
grep -rl 'declare.*String\|str\.\|seq\.' "$GITHUB_WORKSPACE" \
|
||||
--include='*.smt2' 2>/dev/null >> /tmp/qf_s_files.txt
|
||||
sort -u /tmp/qf_s_files.txt -o /tmp/qf_s_files.txt
|
||||
TOTAL=$(wc -l < /tmp/qf_s_files.txt)
|
||||
echo "After extended search: $TOTAL files"
|
||||
fi
|
||||
echo "ZIPT binary: $ZIPT_DLL"
|
||||
```
|
||||
|
||||
# Make libz3.so visible to the .NET runtime at ZIPT startup
|
||||
ZIPT_OUT_DIR=$(dirname "$ZIPT_DLL")
|
||||
if cp "$Z3_LIB_DIR/libz3.so" "$ZIPT_OUT_DIR/" 2>/dev/null; then
|
||||
echo "Copied libz3.so to $ZIPT_OUT_DIR"
|
||||
Cap the benchmark set to keep total runtime under 60 minutes:
|
||||
|
||||
```bash
|
||||
# Use at most 300 files; take a random sample if more are available
|
||||
if [ "$TOTAL" -gt 300 ]; then
|
||||
shuf -n 300 /tmp/qf_s_files.txt > /tmp/qf_s_sample.txt
|
||||
else
|
||||
echo "WARNING: could not copy libz3.so to $ZIPT_OUT_DIR — setting LD_LIBRARY_PATH fallback"
|
||||
cp /tmp/qf_s_files.txt /tmp/qf_s_sample.txt
|
||||
fi
|
||||
export LD_LIBRARY_PATH="$Z3_LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
echo "ZIPT build complete."
|
||||
SAMPLE=$(wc -l < /tmp/qf_s_sample.txt)
|
||||
echo "Running benchmarks on $SAMPLE files"
|
||||
```
|
||||
|
||||
If the ZIPT build fails, note the error in the report but continue with the Z3-only benchmark columns.
|
||||
## Phase 4: Run Benchmarks — seq vs nseq
|
||||
|
||||
## Phase 2b: Extract and Select Benchmark Files
|
||||
|
||||
Extract the QF_S benchmark archive and randomly select 50 files.
|
||||
Run each benchmark with both solvers. Use a per-file timeout of 5 seconds. Set Z3's internal timeout to 4 seconds so it exits cleanly before the shell timeout fires.
|
||||
|
||||
```bash
|
||||
cd ${{ github.workspace }}
|
||||
Z3=/tmp/z3-build/z3
|
||||
TIMEOUT_SEC=5
|
||||
Z3_TIMEOUT_SEC=4
|
||||
RESULTS=/tmp/benchmark-results.csv
|
||||
|
||||
# Extract the archive
|
||||
mkdir -p /tmp/qfs_benchmarks
|
||||
tar --zstd -xf tests/QF_S.tar.zst -C /tmp/qfs_benchmarks
|
||||
echo "file,seq_result,seq_time_ms,nseq_result,nseq_time_ms" > "$RESULTS"
|
||||
|
||||
# List all .smt2 files
|
||||
find /tmp/qfs_benchmarks -name "*.smt2" -type f > /tmp/all_qfs_files.txt
|
||||
TOTAL_FILES=$(wc -l < /tmp/all_qfs_files.txt)
|
||||
echo "Total QF_S files: $TOTAL_FILES"
|
||||
total=0
|
||||
done_count=0
|
||||
while IFS= read -r smt_file; do
|
||||
total=$((total + 1))
|
||||
|
||||
# Randomly select 200 files
|
||||
shuf -n 200 /tmp/all_qfs_files.txt > /tmp/selected_files.txt
|
||||
echo "Selected 200 files for benchmarking"
|
||||
cat /tmp/selected_files.txt
|
||||
# Run with seq solver; capture both stdout (z3 output) and stderr (time output)
|
||||
SEQ_OUT=$({ time timeout "$TIMEOUT_SEC" "$Z3" \
|
||||
smt.string_solver=seq \
|
||||
-T:"$Z3_TIMEOUT_SEC" \
|
||||
"$smt_file" 2>/dev/null; } 2>&1)
|
||||
SEQ_RESULT=$(echo "$SEQ_OUT" | grep -E '^(sat|unsat|unknown)' | head -1)
|
||||
SEQ_MS=$(echo "$SEQ_OUT" | grep real | awk '{split($2,a,"m"); split(a[2],b,"s"); printf "%d", (a[1]*60+b[1])*1000}')
|
||||
[ -z "$SEQ_RESULT" ] && SEQ_RESULT="timeout"
|
||||
[ -z "$SEQ_MS" ] && SEQ_MS=$((TIMEOUT_SEC * 1000))
|
||||
|
||||
# Run with nseq solver; same structure
|
||||
NSEQ_OUT=$({ time timeout "$TIMEOUT_SEC" "$Z3" \
|
||||
smt.string_solver=nseq \
|
||||
-T:"$Z3_TIMEOUT_SEC" \
|
||||
"$smt_file" 2>/dev/null; } 2>&1)
|
||||
NSEQ_RESULT=$(echo "$NSEQ_OUT" | grep -E '^(sat|unsat|unknown)' | head -1)
|
||||
NSEQ_MS=$(echo "$NSEQ_OUT" | grep real | awk '{split($2,a,"m"); split(a[2],b,"s"); printf "%d", (a[1]*60+b[1])*1000}')
|
||||
[ -z "$NSEQ_RESULT" ] && NSEQ_RESULT="timeout"
|
||||
[ -z "$NSEQ_MS" ] && NSEQ_MS=$((TIMEOUT_SEC * 1000))
|
||||
|
||||
SHORT=$(basename "$smt_file")
|
||||
echo "$SHORT,$SEQ_RESULT,$SEQ_MS,$NSEQ_RESULT,$NSEQ_MS" >> "$RESULTS"
|
||||
|
||||
done_count=$((done_count + 1))
|
||||
if [ $((done_count % 50)) -eq 0 ]; then
|
||||
echo "Progress: $done_count / $SAMPLE files completed"
|
||||
fi
|
||||
done < /tmp/qf_s_sample.txt
|
||||
|
||||
echo "Benchmark run complete: $done_count files"
|
||||
```
|
||||
|
||||
## Phase 3: Run Benchmarks
|
||||
## Phase 5: Collect Seq Traces for Interesting Cases
|
||||
|
||||
Run each of the 200 selected files with both Z3 string solvers and ZIPT. Use a 5-second timeout for seq and a 10-second timeout for nseq and ZIPT.
|
||||
|
||||
For each file, run:
|
||||
1. `z3 smt.string_solver=seq -tr:seq -T:5 <file>` — seq solver with sequence-solver tracing enabled; rename the `.z3-trace` output after each run so it is not overwritten. Use `-T:5` when tracing to cap trace size.
|
||||
2. `z3 smt.string_solver=nseq -T:5 <file>` — nseq solver without tracing (timing only).
|
||||
3. `dotnet <ZIPT.dll> -t:5000 <file>` — ZIPT solver (milliseconds).
|
||||
|
||||
Capture:
|
||||
- **Verdict**: `sat`, `unsat`, `unknown`, `timeout` (if exit code indicates timeout or process is killed), or `bug` (if a solver crashes / produces a non-standard result)
|
||||
- **Time** (seconds): wall-clock time for the run
|
||||
- A row is flagged `SOUNDNESS_DISAGREEMENT` when any two solvers that both produced a definitive answer (sat/unsat) disagree
|
||||
|
||||
Use a bash script to automate this:
|
||||
For benchmarks where `seq` solves in under 2 s but `nseq` times out (seq-fast/nseq-slow cases), collect a brief `seq` trace to understand what algorithm is used:
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
Z3=/tmp/z3-build/z3
|
||||
mkdir -p /tmp/traces
|
||||
|
||||
Z3=${{ github.workspace }}/build/z3
|
||||
ZIPT_DLL=$(find /tmp/zipt/ZIPT/bin/Release -name "ZIPT.dll" 2>/dev/null | head -1)
|
||||
ZIPT_AVAILABLE=false
|
||||
[ -n "$ZIPT_DLL" ] && ZIPT_AVAILABLE=true
|
||||
# Find seq-fast / nseq-slow files: seq solved (sat/unsat) in <2000ms AND nseq timed out
|
||||
awk -F, 'NR>1 && ($2=="sat"||$2=="unsat") && $3<2000 && $4=="timeout" {print $1}' \
|
||||
/tmp/benchmark-results.csv > /tmp/seq_fast_nseq_slow.txt
|
||||
echo "seq-fast / nseq-slow files: $(wc -l < /tmp/seq_fast_nseq_slow.txt)"
|
||||
|
||||
# Ensure libz3.so is on the dynamic-linker path for the .NET runtime
|
||||
export LD_LIBRARY_PATH=${{ github.workspace }}/build${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
|
||||
|
||||
RESULTS=/tmp/benchmark_results.tsv
|
||||
TRACES_DIR=/tmp/seq_traces
|
||||
mkdir -p "$TRACES_DIR"
|
||||
|
||||
echo -e "file\tseq_verdict\tseq_time\tnseq_verdict\tnseq_time\tzipt_verdict\tzipt_time\tnotes" > "$RESULTS"
|
||||
|
||||
run_z3_seq_traced() {
|
||||
# Run seq solver with -tr:seq tracing. Cap at 5 s so trace files stay manageable.
|
||||
local file="$1"
|
||||
local trace_dest="$2"
|
||||
local start end elapsed verdict output exit_code
|
||||
|
||||
# Remove any leftover trace from a prior run so we can detect whether one was produced.
|
||||
rm -f .z3-trace
|
||||
|
||||
start=$(date +%s%3N)
|
||||
output=$(timeout 7 "$Z3" "smt.string_solver=seq" -tr:seq -T:5 "$file" 2>&1)
|
||||
exit_code=$?
|
||||
end=$(date +%s%3N)
|
||||
elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc)
|
||||
|
||||
# Rename the trace file immediately so the next run does not overwrite it.
|
||||
if [ -f .z3-trace ]; then
|
||||
mv .z3-trace "$trace_dest"
|
||||
else
|
||||
# Write a sentinel so Phase 4 can detect the absence of a trace.
|
||||
echo "(no trace produced)" > "$trace_dest"
|
||||
fi
|
||||
|
||||
if echo "$output" | grep -q "^unsat"; then
|
||||
verdict="unsat"
|
||||
elif echo "$output" | grep -q "^sat"; then
|
||||
verdict="sat"
|
||||
elif echo "$output" | grep -q "^unknown"; then
|
||||
verdict="unknown"
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
verdict="timeout"
|
||||
elif echo "$output" | grep -qi "error\|assertion\|segfault\|SIGABRT\|exception"; then
|
||||
verdict="bug"
|
||||
else
|
||||
verdict="unknown"
|
||||
fi
|
||||
|
||||
echo "$verdict $elapsed"
|
||||
}
|
||||
|
||||
run_z3_nseq() {
|
||||
local file="$1"
|
||||
local start end elapsed verdict output exit_code
|
||||
|
||||
start=$(date +%s%3N)
|
||||
output=$(timeout 12 "$Z3" "smt.string_solver=nseq" -T:5 "$file" 2>&1)
|
||||
exit_code=$?
|
||||
end=$(date +%s%3N)
|
||||
elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc)
|
||||
|
||||
if echo "$output" | grep -q "^unsat"; then
|
||||
verdict="unsat"
|
||||
elif echo "$output" | grep -q "^sat"; then
|
||||
verdict="sat"
|
||||
elif echo "$output" | grep -q "^unknown"; then
|
||||
verdict="unknown"
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
verdict="timeout"
|
||||
elif echo "$output" | grep -qi "error\|assertion\|segfault\|SIGABRT\|exception"; then
|
||||
verdict="bug"
|
||||
else
|
||||
verdict="unknown"
|
||||
fi
|
||||
|
||||
echo "$verdict $elapsed"
|
||||
}
|
||||
|
||||
run_zipt() {
|
||||
local file="$1"
|
||||
local start end elapsed verdict output exit_code
|
||||
|
||||
if [ "$ZIPT_AVAILABLE" != "true" ]; then
|
||||
echo "n/a 0.000"
|
||||
return
|
||||
fi
|
||||
|
||||
start=$(date +%s%3N)
|
||||
# ZIPT prints the filename on the first line, then SAT/UNSAT/UNKNOWN on subsequent lines
|
||||
output=$(timeout 12 dotnet "$ZIPT_DLL" -t:5000 "$file" 2>&1)
|
||||
exit_code=$?
|
||||
end=$(date +%s%3N)
|
||||
elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc)
|
||||
|
||||
if echo "$output" | grep -qi "^UNSAT$"; then
|
||||
verdict="unsat"
|
||||
elif echo "$output" | grep -qi "^SAT$"; then
|
||||
verdict="sat"
|
||||
elif echo "$output" | grep -qi "^UNKNOWN$"; then
|
||||
verdict="unknown"
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
verdict="timeout"
|
||||
elif echo "$output" | grep -qi "error\|crash\|exception\|Unsupported"; then
|
||||
verdict="bug"
|
||||
else
|
||||
verdict="unknown"
|
||||
fi
|
||||
|
||||
echo "$verdict $elapsed"
|
||||
}
|
||||
|
||||
while IFS= read -r file; do
|
||||
fname=$(basename "$file")
|
||||
# Use a sanitised filename (replace non-alphanumeric with _) for the trace path.
|
||||
safe_name=$(echo "$fname" | tr -cs 'A-Za-z0-9._-' '_')
|
||||
trace_path="$TRACES_DIR/${safe_name}.z3-trace"
|
||||
|
||||
seq_result=$(run_z3_seq_traced "$file" "$trace_path")
|
||||
nseq_result=$(run_z3_nseq "$file")
|
||||
zipt_result=$(run_zipt "$file")
|
||||
|
||||
seq_verdict=$(echo "$seq_result" | cut -d' ' -f1)
|
||||
seq_time=$(echo "$seq_result" | cut -d' ' -f2)
|
||||
nseq_verdict=$(echo "$nseq_result" | cut -d' ' -f1)
|
||||
nseq_time=$(echo "$nseq_result" | cut -d' ' -f2)
|
||||
zipt_verdict=$(echo "$zipt_result" | cut -d' ' -f1)
|
||||
zipt_time=$(echo "$zipt_result" | cut -d' ' -f2)
|
||||
|
||||
# Flag soundness disagreement when any two definitive verdicts disagree
|
||||
notes=""
|
||||
# Build list of (solver, verdict) pairs for definitive answers only
|
||||
declare -A definitive_map
|
||||
[ "$seq_verdict" = "sat" ] || [ "$seq_verdict" = "unsat" ] && definitive_map[seq]="$seq_verdict"
|
||||
[ "$nseq_verdict" = "sat" ] || [ "$nseq_verdict" = "unsat" ] && definitive_map[nseq]="$nseq_verdict"
|
||||
[ "$zipt_verdict" = "sat" ] || [ "$zipt_verdict" = "unsat" ] && definitive_map[zipt]="$zipt_verdict"
|
||||
# Check every pair for conflict
|
||||
has_sat=false; has_unsat=false
|
||||
for v in "${definitive_map[@]}"; do
|
||||
[ "$v" = "sat" ] && has_sat=true
|
||||
[ "$v" = "unsat" ] && has_unsat=true
|
||||
done
|
||||
if $has_sat && $has_unsat; then
|
||||
notes="SOUNDNESS_DISAGREEMENT"
|
||||
fi
|
||||
|
||||
echo -e "$fname\t$seq_verdict\t$seq_time\t$nseq_verdict\t$nseq_time\t$zipt_verdict\t$zipt_time\t$notes" >> "$RESULTS"
|
||||
echo "[$fname] seq=$seq_verdict(${seq_time}s) nseq=$nseq_verdict(${nseq_time}s) zipt=$zipt_verdict(${zipt_time}s) $notes"
|
||||
done < /tmp/selected_files.txt
|
||||
|
||||
echo "Benchmark run complete. Results saved to $RESULTS"
|
||||
echo "Trace files saved to $TRACES_DIR"
|
||||
```
|
||||
|
||||
Save this script to `/tmp/run_benchmarks.sh`, make it executable, and run it.
|
||||
|
||||
## Phase 3.5: Identify seq-fast / nseq-slow Cases and Analyse Traces
|
||||
|
||||
After the benchmark loop completes, identify files where seq solved the instance quickly but nseq was significantly slower (or timed out). For each such file, read its saved seq trace and produce a hypothesis for why nseq is slower.
|
||||
|
||||
**Definition of "seq-fast / nseq-slow"**: seq_time < 1.0 s AND nseq_time > 3 × seq_time (and nseq_time > 0.5 s).
|
||||
|
||||
For each matching file:
|
||||
1. Read the corresponding trace file from `/tmp/seq_traces/`.
|
||||
2. Look for the sequence of lemmas, reductions, or decisions that led seq to a fast conclusion.
|
||||
3. Identify patterns absent or less exploited in nseq: e.g., length-based propagation early in the trace, Parikh constraints eliminating possibilities, Nielsen graph pruning, equation splitting, or overlap resolution.
|
||||
4. Write a 3–5 sentence hypothesis explaining the likely reason for the nseq slowdown, referencing specific trace entries where possible.
|
||||
|
||||
Use a script to collect the candidates:
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
RESULTS=/tmp/benchmark_results.tsv
|
||||
TRACES_DIR=/tmp/seq_traces
|
||||
ANALYSIS=/tmp/trace_analysis.md
|
||||
|
||||
echo "# Trace Analysis: seq-fast / nseq-slow Candidates" > "$ANALYSIS"
|
||||
echo "" >> "$ANALYSIS"
|
||||
|
||||
# Skip header line; columns: file seq_verdict seq_time nseq_verdict nseq_time ...
|
||||
tail -n +2 "$RESULTS" | while IFS=$'\t' read -r fname seq_verdict seq_time nseq_verdict nseq_time _rest; do
|
||||
# Use bc for floating-point comparison; bc does not support && so split into separate tests.
|
||||
is_fast=$(echo "$seq_time < 1.0" | bc -l 2>/dev/null || echo 0)
|
||||
threshold=$(echo "$seq_time * 3" | bc -l 2>/dev/null || echo 99999)
|
||||
is_slow_threshold=$(echo "$nseq_time > $threshold" | bc -l 2>/dev/null || echo 0)
|
||||
# Extra guard: exclude trivially fast seq cases where 3× is still < 0.5 s
|
||||
is_over_half=$(echo "$nseq_time > 0.5" | bc -l 2>/dev/null || echo 0)
|
||||
|
||||
if [ "$is_fast" = "1" ] && [ "$is_slow_threshold" = "1" ] && [ "$is_over_half" = "1" ]; then
|
||||
safe_name=$(echo "$fname" | tr -cs 'A-Za-z0-9._-' '_')
|
||||
trace_path="$TRACES_DIR/${safe_name}.z3-trace"
|
||||
echo "## $fname" >> "$ANALYSIS"
|
||||
echo "" >> "$ANALYSIS"
|
||||
echo "seq: ${seq_time}s (${seq_verdict}), nseq: ${nseq_time}s (${nseq_verdict})" >> "$ANALYSIS"
|
||||
echo "" >> "$ANALYSIS"
|
||||
echo "### Trace excerpt (first 200 lines)" >> "$ANALYSIS"
|
||||
echo '```' >> "$ANALYSIS"
|
||||
head -200 "$trace_path" 2>/dev/null >> "$ANALYSIS" || echo "(trace file not found on disk)" >> "$ANALYSIS"
|
||||
echo '```' >> "$ANALYSIS"
|
||||
echo "" >> "$ANALYSIS"
|
||||
echo "---" >> "$ANALYSIS"
|
||||
echo "" >> "$ANALYSIS"
|
||||
fi
|
||||
# Collect traces for at most 5 such cases
|
||||
head -5 /tmp/seq_fast_nseq_slow.txt | while IFS= read -r short; do
|
||||
# Find the full path
|
||||
full=$(grep "/$short$" /tmp/qf_s_sample.txt | head -1)
|
||||
[ -z "$full" ] && continue
|
||||
timeout 5 "$Z3" \
|
||||
smt.string_solver=seq \
|
||||
-tr:seq \
|
||||
-T:5 \
|
||||
"$full" > "/tmp/traces/${short%.smt2}.seq.trace" 2>&1 || true
|
||||
done
|
||||
|
||||
echo "Candidate list written to $ANALYSIS"
|
||||
cat "$ANALYSIS"
|
||||
```
|
||||
|
||||
Save this to `/tmp/analyse_traces.sh`, make it executable, and run it. Then read the trace excerpts collected in `/tmp/trace_analysis.md` and — for each candidate — write your hypothesis in the Phase 4 summary report under a **"Trace Analysis"** section.
|
||||
## Phase 6: Analyze Results
|
||||
|
||||
## Phase 4: Generate Summary Report
|
||||
Compute summary statistics from the CSV. Save the analysis script to a file and run it:
|
||||
|
||||
Read `/tmp/benchmark_results.tsv` and compute statistics. Then generate a Markdown report.
|
||||
```bash
|
||||
cat > /tmp/analyze_benchmark.py << 'PYEOF'
|
||||
import csv, sys
|
||||
|
||||
Compute:
|
||||
- **Total benchmarks**: 200
|
||||
- **Per solver (seq, nseq, and ZIPT)**: count of sat / unsat / unknown / timeout / bug verdicts
|
||||
- **Total time used**: sum of all times for each solver
|
||||
- **Average time per benchmark**: total_time / 200
|
||||
- **Soundness disagreements**: files where any two solvers that both returned a definitive answer disagree (these are the most critical bugs)
|
||||
- **Bugs / crashes**: files with error/crash verdicts
|
||||
results = []
|
||||
with open('/tmp/benchmark-results.csv') as f:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
results.append(row)
|
||||
|
||||
Format the report as a GitHub Discussion post (GitHub-flavored Markdown):
|
||||
total = len(results)
|
||||
if total == 0:
|
||||
print("No results found.")
|
||||
sys.exit(0)
|
||||
|
||||
def is_correct(r, solver):
|
||||
prefix = 'seq' if solver == 'seq' else 'nseq'
|
||||
return r[f'{prefix}_result'] in ('sat', 'unsat')
|
||||
|
||||
def timed_out(r, solver):
|
||||
prefix = 'seq' if solver == 'seq' else 'nseq'
|
||||
return r[f'{prefix}_result'] == 'timeout'
|
||||
|
||||
seq_solved = sum(1 for r in results if is_correct(r, 'seq'))
|
||||
nseq_solved = sum(1 for r in results if is_correct(r, 'nseq'))
|
||||
seq_to = sum(1 for r in results if timed_out(r, 'seq'))
|
||||
nseq_to = sum(1 for r in results if timed_out(r, 'nseq'))
|
||||
|
||||
seq_times = [int(r['seq_time_ms']) for r in results if is_correct(r, 'seq')]
|
||||
nseq_times = [int(r['nseq_time_ms']) for r in results if is_correct(r, 'nseq')]
|
||||
|
||||
def median(lst):
|
||||
s = sorted(lst)
|
||||
n = len(s)
|
||||
return s[n//2] if n else 0
|
||||
|
||||
def mean(lst):
|
||||
return sum(lst)//len(lst) if lst else 0
|
||||
|
||||
# Disagreements (sat vs unsat or vice-versa)
|
||||
disagreements = [
|
||||
r for r in results
|
||||
if r['seq_result'] in ('sat','unsat')
|
||||
and r['nseq_result'] in ('sat','unsat')
|
||||
and r['seq_result'] != r['nseq_result']
|
||||
]
|
||||
|
||||
# seq-fast / nseq-slow: seq solved in <2s, nseq timed out
|
||||
seq_fast_nseq_slow = [
|
||||
r for r in results
|
||||
if is_correct(r, 'seq') and int(r['seq_time_ms']) < 2000 and timed_out(r, 'nseq')
|
||||
]
|
||||
# nseq-fast / seq-slow: nseq solved in <2s, seq timed out
|
||||
nseq_fast_seq_slow = [
|
||||
r for r in results
|
||||
if is_correct(r, 'nseq') and int(r['nseq_time_ms']) < 2000 and timed_out(r, 'seq')
|
||||
]
|
||||
|
||||
print(f"TOTAL={total}")
|
||||
print(f"SEQ_SOLVED={seq_solved}")
|
||||
print(f"NSEQ_SOLVED={nseq_solved}")
|
||||
print(f"SEQ_TIMEOUTS={seq_to}")
|
||||
print(f"NSEQ_TIMEOUTS={nseq_to}")
|
||||
print(f"SEQ_MEDIAN_MS={median(seq_times)}")
|
||||
print(f"NSEQ_MEDIAN_MS={median(nseq_times)}")
|
||||
print(f"SEQ_MEAN_MS={mean(seq_times)}")
|
||||
print(f"NSEQ_MEAN_MS={mean(nseq_times)}")
|
||||
print(f"DISAGREEMENTS={len(disagreements)}")
|
||||
print(f"SEQ_FAST_NSEQ_SLOW={len(seq_fast_nseq_slow)}")
|
||||
print(f"NSEQ_FAST_SEQ_SLOW={len(nseq_fast_seq_slow)}")
|
||||
|
||||
# Print top-10 slowest for nseq that seq handles fast
|
||||
print("\nTOP_SEQ_FAST_NSEQ_SLOW:")
|
||||
for r in sorted(seq_fast_nseq_slow, key=lambda x: -int(x['nseq_time_ms']))[:10]:
|
||||
print(f" {r['file']} seq={r['seq_time_ms']}ms nseq={r['nseq_time_ms']}ms seq_result={r['seq_result']} nseq_result={r['nseq_result']}")
|
||||
|
||||
print("\nTOP_NSEQ_FAST_SEQ_SLOW:")
|
||||
for r in sorted(nseq_fast_seq_slow, key=lambda x: -int(x['seq_time_ms']))[:10]:
|
||||
print(f" {r['file']} seq={r['seq_time_ms']}ms nseq={r['nseq_time_ms']}ms seq_result={r['seq_result']} nseq_result={r['nseq_result']}")
|
||||
|
||||
if disagreements:
|
||||
print(f"\nDISAGREEMENTS ({len(disagreements)}):")
|
||||
for r in disagreements[:10]:
|
||||
print(f" {r['file']} seq={r['seq_result']} nseq={r['nseq_result']}")
|
||||
PYEOF
|
||||
|
||||
python3 /tmp/analyze_benchmark.py
|
||||
```
|
||||
|
||||
## Phase 7: Create GitHub Discussion
|
||||
|
||||
Use the `create_discussion` safe-output tool to post a structured benchmark report.
|
||||
|
||||
The discussion body should be formatted as follows (fill in real numbers from Phase 6):
|
||||
|
||||
```markdown
|
||||
### ZIPT Benchmark Report — Z3 c3 branch
|
||||
# QF_S Benchmark: seq vs nseq
|
||||
|
||||
**Date**: <today's date>
|
||||
**Date**: YYYY-MM-DD
|
||||
**Branch**: c3
|
||||
**Benchmark set**: QF_S (200 randomly selected files from tests/QF_S.tar.zst)
|
||||
**Timeout**: 5 seconds for seq (`-T:5`); 5 seconds for nseq (`-T:5`) and ZIPT (`-t:5000`)
|
||||
**Commit**: `<short SHA>`
|
||||
**Workflow Run**: [#<run_id>](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
|
||||
**Files benchmarked**: N (capped at 300, timeout 5 s per file)
|
||||
|
||||
---
|
||||
|
||||
### Summary
|
||||
## Summary
|
||||
|
||||
| Metric | seq solver | nseq solver | ZIPT solver |
|
||||
|--------|-----------|-------------|-------------|
|
||||
| sat | X | X | X |
|
||||
| unsat | X | X | X |
|
||||
| unknown | X | X | X |
|
||||
| timeout | X | X | X |
|
||||
| bug/crash | X | X | X |
|
||||
| **Total time (s)** | X.XXX | X.XXX | X.XXX |
|
||||
| **Avg time/benchmark (s)** | X.XXX | X.XXX | X.XXX |
|
||||
|
||||
**Soundness disagreements** (any two solvers return conflicting sat/unsat): N
|
||||
| Metric | seq | nseq |
|
||||
|--------|-----|------|
|
||||
| Files solved (sat/unsat) | SEQ_SOLVED | NSEQ_SOLVED |
|
||||
| Timeouts | SEQ_TO | NSEQ_TO |
|
||||
| Median solve time (solved files) | X ms | Y ms |
|
||||
| Mean solve time (solved files) | X ms | Y ms |
|
||||
| **Disagreements (sat≠unsat)** | — | N |
|
||||
|
||||
---
|
||||
|
||||
### Per-File Results
|
||||
## Performance Comparison
|
||||
|
||||
| # | File | seq verdict | seq time (s) | nseq verdict | nseq time (s) | ZIPT verdict | ZIPT time (s) | Notes |
|
||||
|---|------|-------------|-------------|--------------|--------------|--------------|--------------|-------|
|
||||
| 1 | benchmark_0001.smt2 | sat | 0.123 | sat | 0.456 | sat | 0.789 | |
|
||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
|
||||
### seq-fast / nseq-slow (seq < 2 s, nseq timed out)
|
||||
|
||||
These are benchmarks where the classical `seq` solver is significantly faster. These represent regression risk for `nseq`.
|
||||
|
||||
| File | seq (ms) | nseq (ms) | seq result | nseq result |
|
||||
|------|----------|-----------|------------|-------------|
|
||||
[TOP 10 ENTRIES]
|
||||
|
||||
### nseq-fast / seq-slow (nseq < 2 s, seq timed out)
|
||||
|
||||
These are benchmarks where `nseq` shows a performance advantage.
|
||||
|
||||
| File | seq (ms) | nseq (ms) | seq result | nseq result |
|
||||
|------|----------|-----------|------------|-------------|
|
||||
[TOP 10 ENTRIES]
|
||||
|
||||
---
|
||||
|
||||
### Notable Issues
|
||||
## Correctness
|
||||
|
||||
#### Soundness Disagreements (Critical)
|
||||
<list files where any two solvers disagree on sat/unsat, naming which solvers disagree>
|
||||
**Disagreements** (files where seq says `sat` but nseq says `unsat` or vice versa): N
|
||||
|
||||
#### Crashes / Bugs
|
||||
<list files where any solver crashed or produced an error>
|
||||
|
||||
#### Slow Benchmarks (> 8s)
|
||||
<list files that took more than 8 seconds for any solver>
|
||||
|
||||
#### Trace Analysis: seq-fast / nseq-slow Hypotheses
|
||||
<For each file where seq finished in < 1 s and nseq took > 3× longer, write a 3–5 sentence hypothesis based on the trace excerpt, referencing specific trace entries where possible. If no such files were found, state "No seq-fast / nseq-slow cases were observed in this run.">
|
||||
[If disagreements exist, list all of them here with file paths and both results]
|
||||
|
||||
---
|
||||
|
||||
*Generated automatically by the ZIPT Benchmark workflow on the c3 branch.*
|
||||
## seq Trace Analysis (seq-fast / nseq-slow cases)
|
||||
|
||||
<details>
|
||||
<summary>Click to expand trace snippets for top seq-fast/nseq-slow cases</summary>
|
||||
|
||||
[Insert trace snippet for each traced file, or "No traces collected" if section was skipped]
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
## Raw Data
|
||||
|
||||
<details>
|
||||
<summary>Full results CSV (click to expand)</summary>
|
||||
|
||||
```csv
|
||||
[PASTE FIRST 200 LINES OF /tmp/benchmark-results.csv]
|
||||
```
|
||||
|
||||
## Phase 5: Post to GitHub Discussion
|
||||
</details>
|
||||
|
||||
Post the Markdown report as a new GitHub Discussion using the `create-discussion` safe output.
|
||||
---
|
||||
|
||||
- **Category**: "Agentic Workflows"
|
||||
- **Title**: `[ZIPT Benchmark] Z3 c3 branch — <date>`
|
||||
- Close older discussions with the same title prefix to avoid clutter.
|
||||
*Generated by the QF_S Benchmark workflow. To reproduce: build Z3 from the `c3` branch and run `z3 smt.string_solver=seq|nseq -T:10 <file.smt2>`.*
|
||||
```
|
||||
|
||||
## Guidelines
|
||||
## Edge Cases
|
||||
|
||||
- **Always build from c3 branch**: The workspace is already checked out on c3; don't change branches.
|
||||
- **Debug build required**: The build must use `CMAKE_BUILD_TYPE=Debug` so that Z3's internal assertions and trace infrastructure are active; `-tr:` trace flags have no effect in Release builds.
|
||||
- **Tracing time cap**: Always pass `-T:5` when running with `-tr:seq` to limit solver runtime and keep trace files a manageable size. The nseq and ZIPT runs use `-T:5` / `-t:5000` as before.
|
||||
- **Rename trace files immediately**: After each seq run, rename `.z3-trace` to a per-benchmark path before starting the next run, or the next invocation will overwrite it.
|
||||
- **Handle build failures gracefully**: If Z3 fails to build, report the error and create a brief discussion noting the build failure. If ZIPT fails to build, continue with only the seq/nseq columns and note `n/a` for ZIPT results.
|
||||
- **Handle missing zstd**: If `tar --zstd` fails, try `zstd -d tests/QF_S.tar.zst --stdout | tar -x -C /tmp/qfs_benchmarks`.
|
||||
- **Be precise with timing**: Use millisecond-precision timestamps and report times in seconds with 3 decimal places.
|
||||
- **Distinguish timeout from unknown**: A timeout (process killed after 7s outer / 5s Z3-internal for seq, or 12s/10s for nseq) is different from `(unknown)` returned by a solver.
|
||||
- **ZIPT timeout unit**: ZIPT's `-t` flag takes **milliseconds**, so pass `-t:5000` for a 5-second limit.
|
||||
- **ZIPT output format**: ZIPT prints the input filename on the first line, then `SAT`, `UNSAT`, or `UNKNOWN` on subsequent lines. Parse accordingly.
|
||||
- **Report soundness bugs prominently**: If any benchmark shows a conflict between any two solvers that both returned a definitive sat/unsat answer, highlight it as a critical finding and name which pair disagrees.
|
||||
- **Don't skip any file**: Run all 200 files even if some fail.
|
||||
- **Large report**: If the per-file table is very long, put it in a `<details>` collapsible section.
|
||||
- If the build fails, call `missing_data` explaining the build error and stop.
|
||||
- If no benchmark files are found at all, call `missing_data` explaining that no QF_S `.smt2` files were found in the `c3` branch.
|
||||
- If Z3 crashes (segfault) on a file with either solver, record the result as `crash` and continue.
|
||||
- If the total benchmark set is very small (< 5 files), note this prominently in the discussion and suggest adding more QF_S benchmarks to the `c3` branch.
|
||||
- If zero disagreements and both solvers time out on the same files, note that the solvers are in agreement.
|
||||
- If `create_discussion` fails (e.g., MCP session error), call `report_incomplete` with the reason and include the top-line statistics (files solved, timeouts, disagreement count) in the `details` field.
|
||||
|
||||
## Important Notes
|
||||
|
||||
- **DO NOT** modify any source files or create pull requests.
|
||||
- **DO NOT** run `ninja` or any build command in the background with `&` — concurrent C++ compilation and LLM inference can exhaust available RAM and kill the agent process. Always wait for build commands to complete before proceeding.
|
||||
- **DO NOT** run benchmarks for longer than 100 minutes total (leave buffer for posting).
|
||||
- **DO** always report the commit SHA so results can be correlated with specific code versions.
|
||||
- **DO** close older QF_S Benchmark discussions automatically (configured via `close-older-discussions: true`).
|
||||
- **DO** highlight disagreements prominently — these are potential correctness bugs.
|
||||
|
|
|
|||
2
.github/workflows/release-notes-updater.md
vendored
2
.github/workflows/release-notes-updater.md
vendored
|
|
@ -16,8 +16,6 @@ tools:
|
|||
toolsets: [default]
|
||||
bash: [":*"]
|
||||
edit: {}
|
||||
glob: {}
|
||||
view: {}
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
|
|
|
|||
222
.github/workflows/release.yml
vendored
222
.github/workflows/release.yml
vendored
|
|
@ -46,6 +46,19 @@ jobs:
|
|||
- name: Build
|
||||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk --arch=x64
|
||||
|
||||
- name: Validate libz3.dylib and z3 architecture (must be x86_64)
|
||||
run: |
|
||||
set -e
|
||||
for f in build-dist/libz3.dylib build-dist/z3; do
|
||||
ARCH=$(lipo -archs "$f")
|
||||
echo "$f architecture: $ARCH"
|
||||
if [ "$ARCH" != "x86_64" ]; then
|
||||
echo "ERROR: $f has arch '$ARCH', expected 'x86_64' (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
echo "OK: macOS x64 artifacts are x86_64"
|
||||
|
||||
- name: Clone z3test
|
||||
run: git clone https://github.com/z3prover/z3test z3test
|
||||
|
||||
|
|
@ -53,7 +66,7 @@ jobs:
|
|||
run: python z3test/scripts/test_benchmarks.py build-dist/z3 z3test/regressions/smt2
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: macOsBuild
|
||||
path: dist/*.zip
|
||||
|
|
@ -75,11 +88,24 @@ jobs:
|
|||
- name: Build
|
||||
run: python scripts/mk_unix_dist.py --dotnet-key=$GITHUB_WORKSPACE/resources/z3.snk --arch=arm64
|
||||
|
||||
- name: Validate libz3.dylib and z3 architecture (must be arm64)
|
||||
run: |
|
||||
set -e
|
||||
for f in build-dist/libz3.dylib build-dist/z3; do
|
||||
ARCH=$(lipo -archs "$f")
|
||||
echo "$f architecture: $ARCH"
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
echo "ERROR: $f has arch '$ARCH', expected 'arm64' (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
echo "OK: macOS arm64 artifacts are arm64"
|
||||
|
||||
- name: Clone z3test
|
||||
run: git clone https://github.com/z3prover/z3test z3test
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: MacArm64
|
||||
path: dist/*.zip
|
||||
|
|
@ -111,6 +137,17 @@ jobs:
|
|||
Z3_DIR=$(find . -maxdepth 1 -type d -name "z3-*" | head -n 1)
|
||||
echo "Z3_DIR=$Z3_DIR" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate shipped libz3.dylib architecture (must be x86_64)
|
||||
run: |
|
||||
set -e
|
||||
DYLIB="artifacts/$Z3_DIR/bin/libz3.dylib"
|
||||
ARCH=$(lipo -archs "$DYLIB")
|
||||
echo "Shipped $DYLIB architecture: $ARCH"
|
||||
if [ "$ARCH" != "x86_64" ]; then
|
||||
echo "ERROR: x64 release zip contains '$ARCH' libz3.dylib (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Test install_name_tool with headerpad
|
||||
run: |
|
||||
cd artifacts/$Z3_DIR/bin
|
||||
|
|
@ -159,6 +196,17 @@ jobs:
|
|||
Z3_DIR=$(find . -maxdepth 1 -type d -name "z3-*" | head -n 1)
|
||||
echo "Z3_DIR=$Z3_DIR" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate shipped libz3.dylib architecture (must be arm64)
|
||||
run: |
|
||||
set -e
|
||||
DYLIB="artifacts/$Z3_DIR/bin/libz3.dylib"
|
||||
ARCH=$(lipo -archs "$DYLIB")
|
||||
echo "Shipped $DYLIB architecture: $ARCH"
|
||||
if [ "$ARCH" != "arm64" ]; then
|
||||
echo "ERROR: arm64 release zip contains '$ARCH' libz3.dylib (see issue #9662)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Test install_name_tool with headerpad
|
||||
run: |
|
||||
cd artifacts/$Z3_DIR/bin
|
||||
|
|
@ -208,7 +256,7 @@ jobs:
|
|||
run: python z3test/scripts/test_benchmarks.py build-dist/z3 z3test/regressions/smt2
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuBuild
|
||||
path: dist/*.zip
|
||||
|
|
@ -243,7 +291,7 @@ jobs:
|
|||
python scripts/mk_unix_dist.py --nodotnet --arch=arm64
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuArm64
|
||||
path: dist/*.zip
|
||||
|
|
@ -298,7 +346,7 @@ jobs:
|
|||
run: zip -r z3doc.zip doc/api
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: UbuntuDoc
|
||||
path: z3doc.zip
|
||||
|
|
@ -313,9 +361,17 @@ jobs:
|
|||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Use the first available manylinux interpreter for deterministic selection.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
/opt/python/cp38-cp38/bin/python -m venv $PWD/env
|
||||
"$PYTHON" -m venv $PWD/env
|
||||
echo "$PWD/env/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install build tools
|
||||
|
|
@ -328,7 +384,7 @@ jobs:
|
|||
run: pip install ./src/api/python/wheelhouse/*.whl && python - <src/api/python/z3test.py z3 && python - <src/api/python/z3test.py z3num
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildAMD64
|
||||
path: src/api/python/wheelhouse/*.whl
|
||||
|
|
@ -351,9 +407,17 @@ jobs:
|
|||
mkdir -p /tmp/arm-toolchain/
|
||||
tar xf /tmp/arm-toolchain.tar.xz -C /tmp/arm-toolchain/ --strip-components=1
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Use the first available manylinux interpreter for deterministic selection.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
/opt/python/cp38-cp38/bin/python -m venv $PWD/env
|
||||
"$PYTHON" -m venv $PWD/env
|
||||
echo "$PWD/env/bin" >> $GITHUB_PATH
|
||||
echo "/tmp/arm-toolchain/bin" >> $GITHUB_PATH
|
||||
echo "/tmp/arm-toolchain/aarch64-none-linux-gnu/libc/usr/bin" >> $GITHUB_PATH
|
||||
|
|
@ -368,12 +432,120 @@ jobs:
|
|||
run: cd src/api/python && CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ AR=aarch64-none-linux-gnu-ar LD=aarch64-none-linux-gnu-ld Z3_CROSS_COMPILING=aarch64 python -m build && AUDITWHEEL_PLAT= auditwheel repair --best-plat dist/*.whl && cd ../../..
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildArm64
|
||||
path: src/api/python/wheelhouse/*.whl
|
||||
retention-days: 7
|
||||
|
||||
manylinux-python-riscv64:
|
||||
name: "Python bindings (manylinux RISC-V 64 cross)"
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 90
|
||||
container: quay.io/pypa/manylinux_2_28_x86_64:latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
|
||||
- name: Download RISC-V toolchain
  # --fail: exit non-zero on an HTTP error instead of saving the error page as the
  # tarball (which would otherwise only surface later as a confusing tar failure).
  # --retry 3: retry transient network errors when fetching the release asset.
  run: curl --fail --retry 3 -L -o /tmp/riscv-toolchain.tar.gz 'https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2024.09.03/riscv64-glibc-ubuntu-20.04-gcc-nightly-2024.09.03-nightly.tar.gz'
|
||||
|
||||
- name: Extract RISC-V toolchain
|
||||
run: |
|
||||
mkdir -p /tmp/riscv-toolchain/
|
||||
tar xf /tmp/riscv-toolchain.tar.gz -C /tmp/riscv-toolchain/ --strip-components=1
|
||||
|
||||
- name: Install MPFR 4 (required by RISC-V toolchain host binaries)
  run: |
    dnf install -y gmp-devel
    # --fail: abort on an HTTP error rather than handing an error page to tar.
    # NOTE(review): the tarball is not checksum-verified; consider pinning a SHA-256.
    curl --fail --retry 3 -L -o /tmp/mpfr.tar.xz https://ftp.gnu.org/gnu/mpfr/mpfr-4.2.1.tar.xz
    tar xf /tmp/mpfr.tar.xz -C /tmp/
    cd /tmp/mpfr-4.2.1 && ./configure --prefix=/usr/local --disable-static && make -j$(nproc) && make install
    # Refresh the dynamic linker cache so the freshly installed library is found.
    ldconfig
|
||||
|
||||
- name: Select Python
|
||||
run: |
|
||||
# Use the first available manylinux interpreter for deterministic selection.
|
||||
PYTHON=$(printf '%s\n' /opt/python/*/bin/python | sort -V | head -n1)
|
||||
test -x "$PYTHON" || { echo "Error: no interpreter found under /opt/python/*/bin/python"; exit 1; }
|
||||
echo "PYTHON=$PYTHON" >> "$GITHUB_ENV"
|
||||
"$PYTHON" --version
|
||||
|
||||
- name: Setup Python environment
|
||||
run: |
|
||||
"$PYTHON" -m venv $PWD/env
|
||||
echo "$PWD/env/bin" >> $GITHUB_PATH
|
||||
echo "/tmp/riscv-toolchain/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install build tools
|
||||
run: |
|
||||
echo $PATH
|
||||
stat $(which riscv64-unknown-linux-gnu-gcc)
|
||||
pip install build git+https://github.com/rhelmot/auditwheel
|
||||
|
||||
- name: Build wheels
|
||||
run: cd src/api/python && CC=riscv64-unknown-linux-gnu-gcc CXX=riscv64-unknown-linux-gnu-g++ AR=riscv64-unknown-linux-gnu-ar LD=riscv64-unknown-linux-gnu-ld Z3_CROSS_COMPILING=riscv64 python -m build && AUDITWHEEL_PLAT= auditwheel repair --best-plat dist/*.whl && cd ../../..
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildRiscv64
|
||||
path: src/api/python/wheelhouse/*.whl
|
||||
retention-days: 7
|
||||
|
||||
pyodide-python:
|
||||
name: "Python bindings (Pyodide)"
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6.0.2
|
||||
|
||||
- name: Setup packages
|
||||
run: sudo apt-get update && sudo apt-get install -y python3-dev python3-pip python3-venv
|
||||
|
||||
- name: Create venv
|
||||
run: python3 -m venv ~/env
|
||||
|
||||
- name: Install pyodide
|
||||
run: ~/env/bin/pip install pyodide-build pyodide-cli
|
||||
|
||||
- name: Configure Emscripten
|
||||
run: |
|
||||
git clone https://github.com/emscripten-core/emsdk.git ~/emsdk
|
||||
cd ~/emsdk
|
||||
PYODIDE_EMSCRIPTEN_VERSION=$(~/env/bin/pyodide config get emscripten_version)
|
||||
./emsdk install ${PYODIDE_EMSCRIPTEN_VERSION}
|
||||
./emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION}
|
||||
|
||||
- name: Build wheel
|
||||
run: |
|
||||
source ~/emsdk/emsdk_env.sh
|
||||
cd src/api/python
|
||||
CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" CXXFLAGS="${CXXFLAGS}" ~/env/bin/pyodide build --exports whole_archive
|
||||
env:
|
||||
CFLAGS: "-fexceptions -s DISABLE_EXCEPTION_CATCHING=0 -g2"
|
||||
LDFLAGS: "-fexceptions -s WASM_BIGINT"
|
||||
CXXFLAGS: "-fexceptions -s DISABLE_EXCEPTION_CATCHING=0"
|
||||
|
||||
- name: Setup env-pyodide
|
||||
run: |
|
||||
source ~/env/bin/activate
|
||||
source ~/emsdk/emsdk_env.sh
|
||||
pyodide venv ~/env-pyodide
|
||||
|
||||
- name: Test wheel
|
||||
run: |
|
||||
~/env-pyodide/bin/pip install src/api/python/dist/*.whl
|
||||
~/env-pyodide/bin/python src/api/python/z3test.py z3
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: PyodidePythonBuild
|
||||
path: src/api/python/dist/*.whl
|
||||
retention-days: 7
|
||||
|
||||
windows-build-x64:
|
||||
name: "Windows x64 build"
|
||||
runs-on: windows-latest
|
||||
|
|
@ -394,7 +566,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-x64
|
||||
path: dist/*.zip
|
||||
|
|
@ -420,7 +592,7 @@ jobs:
|
|||
python scripts\mk_win_dist.py --x86-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-x86
|
||||
path: dist/*.zip
|
||||
|
|
@ -446,7 +618,7 @@ jobs:
|
|||
python scripts\mk_win_dist_cmake.py --arm64-only --dotnet-key=%GITHUB_WORKSPACE%\resources\z3.snk --assembly-version=${{ env.RELEASE_VERSION }} --zip
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: WindowsBuild-arm64
|
||||
path: dist/arm64/*.zip
|
||||
|
|
@ -506,7 +678,7 @@ jobs:
|
|||
path: package
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -523,7 +695,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.sym.nuspec -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: NuGet
|
||||
path: |
|
||||
|
|
@ -551,7 +723,7 @@ jobs:
|
|||
path: package
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
@ -568,7 +740,7 @@ jobs:
|
|||
nuget pack out\Microsoft.Z3.x86.sym.nuspec -OutputDirectory . -Verbosity detailed -Symbols -SymbolPackageFormat snupkg -BasePath out
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: NuGet32
|
||||
path: |
|
||||
|
|
@ -578,7 +750,7 @@ jobs:
|
|||
|
||||
python-package:
|
||||
name: "Python packaging"
|
||||
needs: [mac-build-x64, mac-build-arm64, windows-build-x64, windows-build-x86, windows-build-arm64, manylinux-python-amd64, manylinux-python-arm64]
|
||||
needs: [mac-build-x64, mac-build-arm64, windows-build-x64, windows-build-x86, windows-build-arm64, manylinux-python-amd64, manylinux-python-arm64, manylinux-python-riscv64, pyodide-python]
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- name: Checkout code
|
||||
|
|
@ -631,6 +803,18 @@ jobs:
|
|||
name: ManyLinuxPythonBuildArm64
|
||||
path: artifacts
|
||||
|
||||
- name: Download ManyLinux RISC-V 64 Build
|
||||
uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: ManyLinuxPythonBuildRiscv64
|
||||
path: artifacts
|
||||
|
||||
- name: Download Pyodide Build
|
||||
uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: PyodidePythonBuild
|
||||
path: artifacts
|
||||
|
||||
- name: Extract builds
|
||||
run: |
|
||||
cd artifacts
|
||||
|
|
@ -658,7 +842,7 @@ jobs:
|
|||
cp artifacts/*.whl src/api/python/dist/.
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
uses: actions/upload-artifact@v7.0.1
|
||||
with:
|
||||
name: PythonPackage
|
||||
path: src/api/python/dist/*
|
||||
|
|
@ -760,7 +944,7 @@ jobs:
|
|||
path: packages
|
||||
|
||||
- name: Setup NuGet
|
||||
uses: nuget/setup-nuget@v2
|
||||
uses: nuget/setup-nuget@v4
|
||||
with:
|
||||
nuget-version: 'latest'
|
||||
|
||||
|
|
|
|||
1442
.github/workflows/smtlib-benchmark-finder.lock.yml
generated
vendored
Normal file
1442
.github/workflows/smtlib-benchmark-finder.lock.yml
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
342
.github/workflows/smtlib-benchmark-finder.md
vendored
Normal file
342
.github/workflows/smtlib-benchmark-finder.md
vendored
Normal file
|
|
@ -0,0 +1,342 @@
|
|||
---
|
||||
description: >
|
||||
Monthly SMTLIB Benchmark Finder.
|
||||
Searches GitHub for repositories containing SMT-LIB benchmarks (.smt2 files),
|
||||
excludes repositories that belong to the official SMT-LIB benchmark sets
|
||||
(linked from smtlib.org and hosted on Zenodo), and posts a curated summary
|
||||
of community-contributed benchmark links as a GitHub Discussion.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 8 1 * *"
|
||||
workflow_dispatch:
|
||||
|
||||
timeout-minutes: 60
|
||||
|
||||
permissions: read-all
|
||||
|
||||
network:
|
||||
allowed:
|
||||
- defaults
|
||||
- github
|
||||
- smtlib.cs.uiowa.edu
|
||||
- zenodo.org
|
||||
|
||||
tools:
|
||||
cache-memory: true
|
||||
web-fetch: {}
|
||||
github:
|
||||
toolsets: [default, repos]
|
||||
bash: [":*"]
|
||||
|
||||
safe-outputs:
|
||||
mentions: false
|
||||
allowed-github-references: []
|
||||
max-bot-mentions: 1
|
||||
create-discussion:
|
||||
title-prefix: "[SMT-LIB Benchmarks] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
expires: 90d
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
report-as-issue: false
|
||||
|
||||
---
|
||||
|
||||
# SMTLIB Benchmark Finder
|
||||
|
||||
## Job Description
|
||||
|
||||
Your name is ${{ github.workflow }}. You are a research analyst for the Z3 theorem
|
||||
prover repository `${{ github.repository }}`. Your mission is to discover GitHub
|
||||
repositories that host SMT-LIB benchmarks, exclude the ones that are already part
|
||||
of the official SMT-LIB benchmark distribution (linked from smtlib.org and published
|
||||
on Zenodo), and post a curated summary of community-contributed benchmark links as a
|
||||
GitHub Discussion.
|
||||
|
||||
## Step 1: Load Cache and Determine Run Mode
|
||||
|
||||
Check cache memory for:
|
||||
- `official_repos`: set of GitHub repository full names (`owner/repo`) and Zenodo
|
||||
record IDs already identified as official SMT-LIB benchmark sets
|
||||
- `known_community_repos`: set of repo full names already listed in a previous report
|
||||
- `last_run_date`: ISO-8601 date string of the previous run
|
||||
|
||||
Use the cache to skip repos already classified. On the very first run (no cache),
|
||||
perform a full discovery pass. On subsequent runs focus on repos pushed or created
|
||||
since `last_run_date`.
|
||||
|
||||
## Step 2: Collect Official SMT-LIB Benchmark Sets to Exclude
|
||||
|
||||
### 2.1 Scrape smtlib.org
|
||||
|
||||
Fetch the SMT-LIB benchmarks page and extract all linked Zenodo DOIs and GitHub URLs:
|
||||
|
||||
```bash
|
||||
# Fetch the SMT-LIB benchmarks page.
#   -f  fail on HTTP errors instead of saving the error page as if it were content
#   -sS stay quiet but still print real errors
#   -L  follow redirects
curl -fsSL "https://smtlib.cs.uiowa.edu/benchmarks.shtml" -o /tmp/smtlib-benchmarks.html \
  || echo "WARNING: could not fetch the SMT-LIB benchmarks page" >&2
# Also try the main page and any mirror
curl -fsSL "https://smtlib.cs.uiowa.edu/" -o /tmp/smtlib-home.html \
  || echo "WARNING: could not fetch the SMT-LIB home page" >&2
# Make sure both files exist so the parsing step can open them even after a failed fetch.
touch /tmp/smtlib-benchmarks.html /tmp/smtlib-home.html
|
||||
```
|
||||
|
||||
Parse both files for:
|
||||
- Zenodo DOI links (`doi.org/10.5281/zenodo.*`, `zenodo.org/record/*`, or the newer `zenodo.org/records/*`)
|
||||
- GitHub repository URLs (`github.com/...`)
|
||||
- Any other hosted benchmark archive links
|
||||
|
||||
### 2.2 Enumerate Zenodo SMT-LIB Community Records
|
||||
|
||||
Query the Zenodo API for all records in the SMT-LIB community:
|
||||
|
||||
```bash
|
||||
# Page through the Zenodo SMT-LIB community records until a page comes back empty.
# Each page is saved as /tmp/zenodo-smtlib-page<N>.json. MAX_PAGES is a safety cap
# so a misbehaving API can never cause an endless loop.
MAX_PAGES=20
page=1
while [ "$page" -le "$MAX_PAGES" ]; do
  out="/tmp/zenodo-smtlib-page${page}.json"
  # -f: treat HTTP errors as failures; stop paginating on the first failed request.
  curl -fsSL "https://zenodo.org/api/records?communities=smt-lib&size=100&page=${page}" \
    -o "$out" || break
  # Count the hits on this page; an unparsable response counts as empty.
  hits=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1])).get("hits", {}).get("hits", [])))' "$out" 2>/dev/null || echo 0)
  [ "$hits" -eq 0 ] && break
  page=$((page + 1))
done
|
||||
```
|
||||
|
||||
For each Zenodo record extract:
|
||||
- Record ID (e.g. `5827900`)
|
||||
- Title
|
||||
- Any GitHub repository URLs listed in the description or related identifiers
|
||||
|
||||
```bash
|
||||
python3 - <<'PYEOF'
|
||||
import json, re
|
||||
|
||||
def extract_github_repos(text):
|
||||
pattern = r'github\.com/([A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+)'
|
||||
return set(re.findall(pattern, text or ''))
|
||||
|
||||
official_repos = set()
|
||||
official_zenodo_ids = set()
|
||||
|
||||
for fname in ['/tmp/zenodo-smtlib-page1.json', '/tmp/zenodo-smtlib-page2.json',
|
||||
'/tmp/zenodo-smtlib-page3.json']:
|
||||
try:
|
||||
data = json.load(open(fname))
|
||||
except Exception:
|
||||
continue
|
||||
for hit in data.get('hits', {}).get('hits', []):
|
||||
rid = str(hit.get('id', ''))
|
||||
official_zenodo_ids.add(rid)
|
||||
metadata = hit.get('metadata', {})
|
||||
description = metadata.get('description', '')
|
||||
related = ' '.join(
|
||||
r.get('identifier', '')
|
||||
for r in metadata.get('related_identifiers', [])
|
||||
)
|
||||
title = metadata.get('title', '')
|
||||
for repo in extract_github_repos(description + ' ' + related + ' ' + title):
|
||||
official_repos.add(repo.lower().rstrip('.'))
|
||||
|
||||
# Scan both scraped smtlib pages (benchmarks page and home page) for GitHub
# repository links. A failed curl may leave a file missing, so skip any page
# that cannot be read instead of aborting the whole exclusion scan.
for html_path in ('/tmp/smtlib-benchmarks.html', '/tmp/smtlib-home.html'):
    try:
        with open(html_path, encoding='utf-8', errors='replace') as f:
            html = f.read()
    except OSError:
        continue
    for repo in extract_github_repos(html):
        # Normalise exactly like the Zenodo-derived entries above.
        official_repos.add(repo.lower().rstrip('.'))
|
||||
|
||||
print("OFFICIAL_ZENODO_IDS:", ','.join(sorted(official_zenodo_ids)) or '(none)')
|
||||
print("OFFICIAL_GITHUB_REPOS:", ','.join(sorted(official_repos)) or '(none)')
|
||||
PYEOF
|
||||
```
|
||||
|
||||
### 2.3 Well-Known Official Repository Patterns
|
||||
|
||||
Regardless of the above scrape, always exclude:
|
||||
- Any repo under the `SMT-LIB` GitHub organization (`SMT-LIB/*`)
|
||||
- Any repo whose name matches `smt-comp-*` that is under `SMT-Competition` org
|
||||
- The Z3 repo itself (`Z3Prover/z3`) and its forks
|
||||
|
||||
Combine all official sources into a single exclusion set stored at
|
||||
`/tmp/official_exclusions.txt` (one `owner/repo` pattern per line, lowercase).
|
||||
|
||||
## Step 3: Search GitHub for Community SMT-LIB Benchmark Repositories
|
||||
|
||||
Use multiple GitHub search strategies to find repos containing `.smt2` benchmark
|
||||
files that are NOT part of the official distribution. Search for repos updated
|
||||
since the last run date (or the last 90 days for the initial run).
|
||||
|
||||
Compute the cutoff date first:
|
||||
```bash
|
||||
CUTOFF=$(date -d "90 days ago" +%Y-%m-%d 2>/dev/null || date -v-90d +%Y-%m-%d)
|
||||
echo "Using cutoff date: $CUTOFF"
|
||||
```
|
||||
|
||||
### Search Strategies
|
||||
|
||||
Use the GitHub MCP server tools to run these searches:
|
||||
|
||||
1. **Topic search**: `topic:smtlib pushed:>$CUTOFF`
|
||||
2. **Topic search variant**: `topic:smt-lib pushed:>$CUTOFF`
|
||||
3. **Topic search variant**: `topic:smt2 pushed:>$CUTOFF`
|
||||
4. **Benchmark filename pattern**: `filename:*.smt2 pushed:>$CUTOFF` (limit to top results)
|
||||
5. **Benchmarks directory pattern**: `path:benchmarks *.smt2 in:path pushed:>$CUTOFF`
|
||||
6. **README mention**: `SMT-LIB benchmarks in:readme stars:>2 pushed:>$CUTOFF`
|
||||
7. **Organization-level search**: repos under `SMT-Competition` org, if any are not already excluded
|
||||
|
||||
For each search, collect: `full_name`, `html_url`, `description`, `stargazers_count`,
|
||||
`updated_at`, `pushed_at`, `default_branch`, `topics`.
|
||||
|
||||
Deduplicate by `full_name`. Limit to 200 total candidates before filtering.
|
||||
|
||||
## Step 4: Filter Out Official Benchmark Sets
|
||||
|
||||
For each candidate repo:
|
||||
|
||||
1. Check if `full_name.lower()` is in the exclusion set from Step 2.
|
||||
2. Check if the repo is owned by `SMT-LIB` or `SMT-Competition` org (case-insensitive).
|
||||
3. Check if the repo is a fork of a known official repo (if `fork: true` and parent is
|
||||
in the exclusion set).
|
||||
|
||||
Discard repos that match any of the above. Keep the rest as community benchmarks.
|
||||
|
||||
Also apply quality filters to reduce noise — skip repos that:
|
||||
- Have 0 stars and fewer than 3 `.smt2` files (likely a student homework or test repo
|
||||
with minimal public value); use your judgement — if the repo description clearly
|
||||
describes a research benchmark, keep it regardless of star count.
|
||||
- Were created but never pushed after creation (empty repos).
|
||||
- Have names that are clearly course assignment repositories
|
||||
(e.g. contain `homework`, `assignment`, `hw[0-9]`, `cs[0-9]{3}`).
|
||||
|
||||
## Step 5: Classify Remaining Repos
|
||||
|
||||
For each repo that survives filtering, classify it into one of these categories:
|
||||
|
||||
| Category | Description |
|
||||
|----------|-------------|
|
||||
| **Solver evaluation** | Benchmarks used to evaluate or compare SMT solvers |
|
||||
| **Verification** | Benchmarks from program verification or model checking |
|
||||
| **Security / CTF** | Benchmarks from security research or CTF challenges |
|
||||
| **Theory / logic** | Benchmarks exploring specific SMT theories or logics |
|
||||
| **Tool output** | Benchmarks generated by another tool (e.g. a compiler, fuzzer) |
|
||||
| **Education** | Course materials or tutorials with benchmark examples |
|
||||
| **Other / unknown** | Does not fit another category |
|
||||
|
||||
Base the classification on: repo description, README (fetch if web-fetch is available),
|
||||
topics, and directory structure. A single brief `web-fetch` of the repo's README is
|
||||
sufficient; do not fetch individual `.smt2` files.
|
||||
|
||||
Note the dominant SMT logic(s) present, if discernible from the description or topics
|
||||
(e.g. QF_BV, QF_LIA, QF_S, NIA, …).
|
||||
|
||||
## Step 6: Generate the Discussion Report
|
||||
|
||||
Create a GitHub Discussion. Use heading level 3 or deeper (`###`, `####`, …) for all
|
||||
section headers; never use `##` or `#` in the body.
|
||||
Wrap long tables in `<details>` tags to keep the report scannable.
|
||||
|
||||
Title: `[SMT-LIB Benchmarks] Community Benchmark Repository Survey — [Month YYYY]`
|
||||
|
||||
Structure the report as follows:
|
||||
|
||||
```markdown
|
||||
**Period covered**: [cutoff date] – [today's date]
|
||||
**Repositories found**: N community repos (after excluding M official sets)
|
||||
**New this run**: N (not listed in previous report)
|
||||
|
||||
### Overview
|
||||
|
||||
1–2 sentences summarising the breadth of community SMT-LIB benchmarks found.
|
||||
|
||||
### Community Benchmark Repositories
|
||||
|
||||
Use `####` for category headers (as in the template below). Within each category, list repos as a markdown table.
|
||||
For each repo include:
|
||||
- Repo link (`[owner/repo](html_url)`)
|
||||
- Stars
|
||||
- Last pushed date
|
||||
- Dominant logic(s) (if known)
|
||||
- Brief description (from repo description or README, max 120 chars)
|
||||
|
||||
#### Solver Evaluation
|
||||
|
||||
| Repository | ⭐ | Last pushed | Logic(s) | Description |
|
||||
|------------|-----|------------|---------|-------------|
|
||||
| [owner/repo](url) | N | YYYY-MM-DD | QF_BV, QF_LIA | … |
|
||||
|
||||
#### Verification
|
||||
|
||||
[same table structure]
|
||||
|
||||
#### Security / CTF
|
||||
|
||||
[same table structure]
|
||||
|
||||
#### Theory / Logic
|
||||
|
||||
[same table structure]
|
||||
|
||||
#### Tool Output
|
||||
|
||||
[same table structure]
|
||||
|
||||
#### Education
|
||||
|
||||
[same table structure]
|
||||
|
||||
#### Other / Unknown
|
||||
|
||||
[same table structure]
|
||||
|
||||
---
|
||||
|
||||
### Exclusions Applied
|
||||
|
||||
<details>
|
||||
<summary>Official SMT-LIB sets excluded from this report</summary>
|
||||
|
||||
List Zenodo record IDs and GitHub repos identified as official distributions.
|
||||
|
||||
| Source | Identifier | Notes |
|
||||
|--------|-----------|-------|
|
||||
| Zenodo | [10.5281/zenodo.XXXXXXX](https://zenodo.org/record/XXXXXXX) | Official QF_BV benchmark set |
|
||||
| GitHub | [SMT-LIB/benchmarks-non-incremental](https://github.com/SMT-LIB/benchmarks-non-incremental) | |
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
### Methodology
|
||||
|
||||
Brief note on search queries used, cutoff date, and any quality filters applied.
|
||||
```
|
||||
|
||||
## Step 7: Update Cache Memory
|
||||
|
||||
After posting the discussion, update cache memory with:
|
||||
- `official_repos`: updated exclusion set (union of previous + newly found)
|
||||
- `known_community_repos`: union of previous + repos listed in this report
|
||||
- `last_run_date`: today's ISO-8601 date
|
||||
- `report_url`: URL of the GitHub Discussion created
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Be conservative with exclusions**: when in doubt whether a repo is "official",
|
||||
keep it in the community list rather than silently dropping it.
|
||||
- **Be accurate**: only include repos that genuinely contain SMT-LIB `.smt2` files
|
||||
or clearly describe themselves as SMT-LIB benchmark collections.
|
||||
- **Avoid noise**: student homework repos and trivially small repos add clutter;
|
||||
apply the quality filters from Step 4 judiciously.
|
||||
- **No source code changes**: DO NOT create pull requests or modify any source files.
|
||||
- **No copyrighted content**: DO NOT reproduce benchmark file contents; only post
|
||||
links and metadata.
|
||||
- **Always cite sources**: include the full GitHub URL for every listed repository.
|
||||
- **Use cache**: skip repos already classified in a previous run to keep runtime short.
|
||||
- **Fail gracefully**: if GitHub search rate-limits the workflow, post whatever was
|
||||
collected so far with a note that the search was incomplete.
|
||||
|
||||
## Important Notes
|
||||
|
||||
- DO NOT create pull requests or modify source files.
|
||||
- DO close older SMT-LIB Benchmarks discussions automatically (configured).
|
||||
- DO always call `create_discussion` or `noop` before the workflow ends.
|
||||
Failing to produce any safe output triggers an automatic failure issue.
|
||||
- DO use cache memory to avoid re-processing repos already surveyed.
|
||||
- DO limit individual `web-fetch` calls (README fetches) to repos where the
|
||||
description alone is insufficient for classification.
|
||||
1429
.github/workflows/specbot-crash-analyzer.lock.yml
generated
vendored
Normal file
1429
.github/workflows/specbot-crash-analyzer.lock.yml
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
247
.github/workflows/specbot-crash-analyzer.md
vendored
Normal file
247
.github/workflows/specbot-crash-analyzer.md
vendored
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
---
|
||||
description: >
|
||||
Build Z3 in debug mode from the c3 branch, compile and run the specbot tests,
|
||||
identify root causes for any crashes, and post findings as a GitHub Discussion.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
timeout-minutes: 120
|
||||
|
||||
permissions: read-all
|
||||
|
||||
network: defaults
|
||||
|
||||
tools:
|
||||
cache-memory: true
|
||||
github:
|
||||
toolsets: [default, discussions]
|
||||
bash: [":*"]
|
||||
edit: {}
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
title-prefix: "[Specbot] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
report-as-issue: false
|
||||
|
||||
steps:
|
||||
- name: Checkout c3 branch
|
||||
uses: actions/checkout@v6.0.2
|
||||
with:
|
||||
ref: c3
|
||||
persist-credentials: false
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y cmake ninja-build python3 gcc g++ 2>&1 | tail -5
|
||||
|
||||
- name: Build Z3 in debug mode
|
||||
id: build-z3
|
||||
continue-on-error: true
|
||||
run: |
|
||||
mkdir -p build/debug specbot-results
|
||||
cd build/debug
|
||||
cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug ../.. 2>&1 | tee ../../specbot-results/cmake.log
|
||||
ninja 2>&1 | tee ../../specbot-results/build.log
|
||||
# `$?` would be tee's exit status here; PIPESTATUS[0] is ninja's.
BUILD_EXIT=${PIPESTATUS[0]}
|
||||
cd ../..
|
||||
echo "build_exit=${BUILD_EXIT}" >> specbot-results/build-status.txt
|
||||
ls -la build/debug/libz3* build/debug/*.so* 2>/dev/null >> specbot-results/build-status.txt || echo "Library not found" >> specbot-results/build-status.txt
|
||||
exit $BUILD_EXIT
|
||||
|
||||
- name: Compile specbot tests
|
||||
continue-on-error: true
|
||||
run: |
|
||||
mkdir -p specbot-results
|
||||
gcc -g -O0 \
|
||||
-I src/api \
|
||||
specbot/test_specbot_seq.c \
|
||||
-L build/debug \
|
||||
-lz3 \
|
||||
-Wl,-rpath,"${GITHUB_WORKSPACE}/build/debug" \
|
||||
-o specbot-results/test_specbot_seq \
|
||||
2>&1 | tee specbot-results/compile_specbot_seq.log
|
||||
# Record gcc's exit status (first pipeline stage), not tee's.
echo "compile_specbot_seq_exit=${PIPESTATUS[0]}" >> specbot-results/compile-status.txt
|
||||
|
||||
gcc -g -O0 \
|
||||
-I src/api \
|
||||
specbot/test_deeptest_seq.c \
|
||||
-L build/debug \
|
||||
-lz3 \
|
||||
-Wl,-rpath,"${GITHUB_WORKSPACE}/build/debug" \
|
||||
-o specbot-results/test_deeptest_seq \
|
||||
2>&1 | tee specbot-results/compile_deeptest_seq.log
|
||||
# Record gcc's exit status (first pipeline stage), not tee's.
echo "compile_deeptest_seq_exit=${PIPESTATUS[0]}" >> specbot-results/compile-status.txt
|
||||
|
||||
- name: Run specbot tests
|
||||
continue-on-error: true
|
||||
run: |
|
||||
mkdir -p specbot-results
|
||||
if [ -f specbot-results/test_specbot_seq ]; then
  # The default Actions step shell runs with errexit (bash -e). Capture a
  # non-zero status through `||` so a crashing or timed-out test does not
  # abort the step before its exit code is recorded and the next test runs.
  SPECBOT_EXIT=0
  LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/build/debug" timeout 120 specbot-results/test_specbot_seq > specbot-results/test_specbot_seq.log 2>&1 || SPECBOT_EXIT=$?
  echo "specbot_seq_exit=${SPECBOT_EXIT}" >> specbot-results/test-status.txt
else
  # 127 mirrors the shell's "command not found" status for a missing binary.
  echo "Binary not compiled" > specbot-results/test_specbot_seq.log
  echo "specbot_seq_exit=127" >> specbot-results/test-status.txt
fi
|
||||
|
||||
if [ -f specbot-results/test_deeptest_seq ]; then
  # The default Actions step shell runs with errexit (bash -e). Capture a
  # non-zero status through `||` so a crashing or timed-out test does not
  # abort the step before its exit code is recorded.
  DEEPTEST_EXIT=0
  LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/build/debug" timeout 120 specbot-results/test_deeptest_seq > specbot-results/test_deeptest_seq.log 2>&1 || DEEPTEST_EXIT=$?
  echo "deeptest_seq_exit=${DEEPTEST_EXIT}" >> specbot-results/test-status.txt
else
  # 127 mirrors the shell's "command not found" status for a missing binary.
  echo "Binary not compiled" > specbot-results/test_deeptest_seq.log
  echo "deeptest_seq_exit=127" >> specbot-results/test-status.txt
fi
|
||||
|
||||
---
|
||||
|
||||
# Specbot Crash Analyzer
|
||||
|
||||
## Job Description
|
||||
|
||||
Your name is ${{ github.workflow }}. You are an expert C/C++ and SMT solver analyst for the Z3 theorem prover
|
||||
repository `${{ github.repository }}`. The pre-steps above have already built Z3 in debug mode from the `c3`
|
||||
branch, compiled and run the specbot test suite, and saved all output to the `specbot-results/` directory in
|
||||
the workspace (`${{ github.workspace }}/specbot-results/`). Your task is to analyze those results, diagnose
|
||||
any crash root causes by reading the relevant source files, and publish a structured findings report as a
|
||||
GitHub Discussion.
|
||||
|
||||
**Do not try to build Z3 or run tests yourself.** All build and test output is already in `specbot-results/`.
|
||||
|
||||
## Your Task
|
||||
|
||||
### 1. Read the Pre-Generated Results
|
||||
|
||||
All build and test outputs are in `specbot-results/` (relative to the workspace root). Read each file:
|
||||
|
||||
```bash
|
||||
# Build status
|
||||
cat specbot-results/build-status.txt 2>/dev/null || echo "No build status"
|
||||
|
||||
# Compile status
|
||||
cat specbot-results/compile-status.txt 2>/dev/null || echo "No compile status"
|
||||
|
||||
# Test status
|
||||
cat specbot-results/test-status.txt 2>/dev/null || echo "No test status"
|
||||
|
||||
# Test output from test_specbot_seq
|
||||
cat specbot-results/test_specbot_seq.log 2>/dev/null || echo "No test_specbot_seq output"
|
||||
|
||||
# Test output from test_deeptest_seq
|
||||
cat specbot-results/test_deeptest_seq.log 2>/dev/null || echo "No test_deeptest_seq output"
|
||||
|
||||
# Last 30 lines of the build log
|
||||
tail -30 specbot-results/build.log 2>/dev/null || echo "No build log"
|
||||
```
|
||||
|
||||
If `specbot-results/build-status.txt` shows `build_exit=0`, the build succeeded.
|
||||
If it shows a non-zero exit, include the last 50 lines of `specbot-results/build.log` in the report
|
||||
under a "Build Failure" section.
|
||||
|
||||
If `specbot-results/compile-status.txt` shows a non-zero exit for a test, include the compile error
|
||||
from `specbot-results/compile_specbot_seq.log` or `specbot-results/compile_deeptest_seq.log`.
|
||||
|
||||
Collect every line containing `CRASH` or `ABORT` from the test log files — these are the crashes to analyze.
|
||||
|
||||
### 2. Diagnose Each Crash
|
||||
|
||||
For each crashed test function, perform the following analysis:
|
||||
|
||||
1. **Identify the test body**: read `specbot/test_specbot_seq.c` or `specbot/test_deeptest_seq.c`
|
||||
to understand what Z3 API calls the test makes and what invariants it exercises.
|
||||
|
||||
2. **Find the likely crash site**: the test exercises the Z3 Nielsen/nseq string solver. Relevant source files are:
|
||||
- `src/smt/seq_solver.h` and `src/smt/seq_solver.cpp` (or nearby files)
|
||||
- `src/smt/seq_axioms.cpp`, `src/smt/seq_eq_solver.cpp`, `src/smt/seq_regex.cpp`
|
||||
- `src/math/lp/` for length-arithmetic paths
|
||||
- `src/api/z3_api.h` for the public API entry points
|
||||
|
||||
Use `grep` and `view` to locate assertion macros, `UNREACHABLE()`, `SASSERT`, or `throw` statements
|
||||
in the code paths exercised by the failing test. Example:
|
||||
```bash
|
||||
grep -rn "SASSERT\|UNREACHABLE\|Z3_CATCH" src/smt/seq_solver.cpp 2>/dev/null | head -30
|
||||
```
|
||||
|
||||
3. **Hypothesize root cause**: based on the Z3 API calls in the test and the assertion/throw sites in
|
||||
the solver source, state the most likely root cause. Common categories include:
|
||||
- Violated invariant (SASSERT/UNREACHABLE hit due to unexpected solver state)
|
||||
- Use-after-free or dangling reference during push/pop
|
||||
- Unhandled edge case in Nielsen graph construction
|
||||
- Missing theory-combination lemma between string length and integer arithmetic
|
||||
|
||||
4. **Suggest a fix**: propose a minimal, concrete fix — e.g., a guard condition, an additional lemma,
|
||||
a missing reference-count increment, or a missing case in a switch/match.
|
||||
|
||||
### 3. Generate the Report
|
||||
|
||||
After analyzing all crashes, produce a structured GitHub Discussion in the "Agentic Workflows" category
|
||||
using `create-discussion`.
|
||||
|
||||
The discussion body must follow this structure (use `###` and lower for headers):
|
||||
|
||||
```
|
||||
### Summary
|
||||
|
||||
- Build: Debug (CMake + Ninja, c3 branch)
|
||||
- Tests compiled: N
|
||||
- Tests run: N
|
||||
- Tests passed: N
|
||||
- Tests crashed: N
|
||||
- Tests timed out: N
|
||||
|
||||
### Crash Findings
|
||||
|
||||
For each crash, one subsection:
|
||||
|
||||
#### <test function name>
|
||||
|
||||
**Test file**: `specbot/test_specbot_seq.c` or `specbot/test_deeptest_seq.c`
|
||||
|
||||
**Observed failure**: ABORT/CRASH — one-line description of what was caught
|
||||
|
||||
**Root cause hypothesis**: explanation of which assertion or code path was hit and why
|
||||
|
||||
**Suggested fix**: concrete proposed change (file, function, what to add/change)
|
||||
|
||||
---
|
||||
|
||||
### Tests Passed
|
||||
|
||||
List of test names that passed.
|
||||
|
||||
<details>
|
||||
<summary><b>Full Test Output</b></summary>
|
||||
|
||||
Raw stdout/stderr from both test binaries.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Build Log</b></summary>
|
||||
|
||||
Last 30 lines of the ninja build output.
|
||||
|
||||
</details>
|
||||
```
|
||||
|
||||
If there are no crashes at all, write a "No Crashes Found" summary celebrating that all tests passed,
|
||||
and include the full test output in a collapsible section.
|
||||
|
||||
Use `mentions: false` behavior — do not mention any GitHub usernames in the report.
|
||||
|
||||
Format workflow run references as: `[§${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})`.
|
||||
|
||||
## Usage
|
||||
|
||||
Trigger via **Actions → Specbot Crash Analyzer → Run workflow** on any branch. The pre-steps
|
||||
always check out the `c3` branch where `specbot/test_specbot_seq.c` and
|
||||
`specbot/test_deeptest_seq.c` live, build Z3, run the tests, and save results to `specbot-results/`.
|
||||
The agent then analyzes the results and posts a discussion to the "Agentic Workflows" category.
|
||||
2
.github/workflows/tactic-to-simplifier.md
vendored
2
.github/workflows/tactic-to-simplifier.md
vendored
|
|
@ -19,8 +19,6 @@ tools:
|
|||
github:
|
||||
toolsets: [default]
|
||||
bash: [":*"]
|
||||
glob: {}
|
||||
view: {}
|
||||
|
||||
safe-outputs:
|
||||
create-issue:
|
||||
|
|
|
|||
1323
.github/workflows/tptp-benchmark.lock.yml
generated
vendored
Normal file
1323
.github/workflows/tptp-benchmark.lock.yml
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
546
.github/workflows/tptp-benchmark.md
vendored
Normal file
546
.github/workflows/tptp-benchmark.md
vendored
Normal file
|
|
@ -0,0 +1,546 @@
|
|||
---
|
||||
description: >
|
||||
Weekly benchmark of Z3's TPTP front-end against 500 random TPTP problems.
|
||||
Downloads TPTP benchmarks from tptp.org, resolves axiom dependencies,
|
||||
skips large problems, runs each with a 5-second timeout, and posts a
|
||||
discrepancy/crash report as a GitHub discussion.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 6 * * 1"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions: read-all
|
||||
|
||||
network:
|
||||
allowed:
|
||||
- defaults
|
||||
- tptp.org
|
||||
|
||||
tools:
|
||||
bash: true
|
||||
github:
|
||||
toolsets: [default]
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
title-prefix: "[TPTP Benchmark] "
|
||||
category: "Agentic Workflows"
|
||||
close-older-discussions: true
|
||||
expires: 14d
|
||||
missing-tool:
|
||||
create-issue: true
|
||||
noop:
|
||||
report-as-issue: false
|
||||
|
||||
timeout-minutes: 300
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
sudo apt-get update -y -q
|
||||
sudo apt-get install -y cmake ninja-build python3 wget curl bc
|
||||
|
||||
- name: Build Z3
|
||||
run: |
|
||||
mkdir -p /tmp/z3-build
|
||||
cd /tmp/z3-build
|
||||
cmake "$GITHUB_WORKSPACE" \
|
||||
-G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DZ3_BUILD_TEST_EXECUTABLES=OFF
|
||||
ninja -j$(nproc) z3
|
||||
./z3 --version
|
||||
|
||||
---
|
||||
|
||||
# TPTP Front-End Benchmark
|
||||
|
||||
## Job Description
|
||||
|
||||
Your name is ${{ github.workflow }}. You are an expert testing engineer for the Z3 theorem prover. Your task is to:
|
||||
|
||||
1. Verify the Z3 binary built by the pre-flight step is available
|
||||
2. Download the TPTP benchmark library from tptp.org
|
||||
3. Select 500 random small-to-medium problems (with their axiom dependencies)
|
||||
4. Run each problem through Z3's TPTP front-end with a 5-second timeout
|
||||
5. Compare Z3's output against the expected SZS status declared in each problem file
|
||||
6. Post a detailed report as a GitHub Discussion summarising discrepancies and crashes
|
||||
|
||||
**Repository**: ${{ github.repository }}
|
||||
**Workspace**: ${{ github.workspace }}
|
||||
|
||||
## Phase 1: Verify Z3 Binary
|
||||
|
||||
Z3 was built by the workflow pre-flight step and is available at `/tmp/z3-build/z3`.
|
||||
Confirm the binary is present and functional:
|
||||
|
||||
```bash
|
||||
/tmp/z3-build/z3 --version
|
||||
```
|
||||
|
||||
If the binary is missing or returns an error, call the `noop` safe-output with a message describing the problem and stop.
|
||||
|
||||
Once confirmed, call `noop` with `"Z3 binary verified. Downloading TPTP benchmark library — this may take a few minutes."` to keep the safe-output session alive.
|
||||
|
||||
## Phase 2: Download the TPTP Problem Library
|
||||
|
||||
Find the latest TPTP release and download the full archive.
|
||||
|
||||
```bash
|
||||
# Find the latest TPTP distribution version by fetching the directory listing
|
||||
TPTP_DIST_URL="https://tptp.org/TPTP/Distribution/"
|
||||
LATEST_TGZ=$(curl -sL "$TPTP_DIST_URL" \
|
||||
| grep -oP 'TPTP-v[0-9]+\.[0-9]+\.[0-9]+\.tgz' \
|
||||
| sort -V | tail -1)
|
||||
|
||||
if [ -z "$LATEST_TGZ" ]; then
|
||||
echo "ERROR: Could not determine latest TPTP version from $TPTP_DIST_URL"
|
||||
# Fall back to a known stable version
|
||||
LATEST_TGZ="TPTP-v9.0.0.tgz"
|
||||
fi
|
||||
|
||||
echo "Downloading $LATEST_TGZ ..."
|
||||
mkdir -p /tmp/tptp_download
|
||||
wget -q --show-progress \
|
||||
"${TPTP_DIST_URL}${LATEST_TGZ}" \
|
||||
-O /tmp/tptp_download/tptp.tgz
|
||||
|
||||
echo "Extracting TPTP library..."
|
||||
mkdir -p /tmp/tptp
|
||||
tar -xzf /tmp/tptp_download/tptp.tgz -C /tmp/tptp --strip-components=1 2>&1 | tail -5
|
||||
|
||||
# Verify extraction
|
||||
if [ ! -d /tmp/tptp/Problems ] || [ ! -d /tmp/tptp/Axioms ]; then
|
||||
echo "ERROR: TPTP extraction failed — Problems/ or Axioms/ directory not found"
|
||||
ls /tmp/tptp/
|
||||
exit 1
|
||||
fi
|
||||
|
||||
TPTP_ROOT=/tmp/tptp
|
||||
echo "TPTP library extracted to $TPTP_ROOT"
|
||||
echo "Problem domains available:"
|
||||
ls "$TPTP_ROOT/Problems/" | wc -l
|
||||
echo "Axiom files available:"
|
||||
ls "$TPTP_ROOT/Axioms/" | wc -l
|
||||
```
|
||||
|
||||
If the download or extraction fails, call `noop` with the error details and stop.
|
||||
|
||||
Call `noop` with `"TPTP library downloaded and extracted. Selecting 500 benchmark problems — filtering by size."` to keep the session alive.
|
||||
|
||||
## Phase 3: Select 500 Benchmark Problems
|
||||
|
||||
Filter out large problems and problems that depend on large axiom files, then take a random sample of 500.
|
||||
|
||||
Save this script to `/tmp/select_benchmarks.py` and run it:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Select 500 random TPTP problems that:
|
||||
- Have a known, conclusive expected status (Theorem, Unsatisfiable,
|
||||
CounterSatisfiable, Satisfiable) OR Unknown/Open status.
|
||||
- Are not "large" (problem file <= 50 KB).
|
||||
- Do not include any axiom file larger than 100 KB.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import random
|
||||
import sys
|
||||
|
||||
TPTP_ROOT = "/tmp/tptp"
|
||||
PROBLEMS_DIR = os.path.join(TPTP_ROOT, "Problems")
|
||||
AXIOMS_DIR = os.path.join(TPTP_ROOT, "Axioms")
|
||||
MAX_PROBLEM_SIZE = 50 * 1024 # 50 KB
|
||||
MAX_AXIOM_SIZE = 100 * 1024 # 100 KB
|
||||
SAMPLE_SIZE = 500
|
||||
OUTPUT_FILE = "/tmp/selected_benchmarks.txt"
|
||||
|
||||
include_re = re.compile(r"include\s*\(\s*['\"]([^'\"]+)['\"]", re.IGNORECASE)
|
||||
status_re = re.compile(r"%\s*Status\s*:\s*(\S+)", re.IGNORECASE)
|
||||
|
||||
def axiom_sizes_ok(problem_path):
    """Return True if all included axiom files exist and are <= MAX_AXIOM_SIZE.

    Scans the problem file for ``include('...')`` directives, resolves each
    path relative to TPTP_ROOT, and rejects the problem if any referenced
    file is missing or larger than MAX_AXIOM_SIZE. An unreadable problem
    file is also rejected.
    """
    try:
        with open(problem_path, encoding="utf-8", errors="replace") as f:
            # Read the whole file: include directives are not guaranteed to
            # sit in the first few KB, and the caller has already rejected
            # problem files larger than MAX_PROBLEM_SIZE, so this is cheap.
            content = f.read()
    except OSError:
        return False
    for m in include_re.finditer(content):
        axiom_rel = m.group(1)  # e.g. "Axioms/AGT001+0.ax"
        axiom_path = os.path.join(TPTP_ROOT, axiom_rel)
        if not os.path.exists(axiom_path):
            return False  # axiom missing — skip
        if os.path.getsize(axiom_path) > MAX_AXIOM_SIZE:
            return False  # axiom too large — skip
    return True
|
||||
|
||||
candidates = []
|
||||
skipped_size = 0
|
||||
skipped_axiom = 0
|
||||
|
||||
for domain in sorted(os.listdir(PROBLEMS_DIR)):
|
||||
domain_dir = os.path.join(PROBLEMS_DIR, domain)
|
||||
if not os.path.isdir(domain_dir):
|
||||
continue
|
||||
for fname in os.listdir(domain_dir):
|
||||
if not fname.endswith(".p"):
|
||||
continue
|
||||
fpath = os.path.join(domain_dir, fname)
|
||||
size = os.path.getsize(fpath)
|
||||
if size > MAX_PROBLEM_SIZE:
|
||||
skipped_size += 1
|
||||
continue
|
||||
if not axiom_sizes_ok(fpath):
|
||||
skipped_axiom += 1
|
||||
continue
|
||||
candidates.append(fpath)
|
||||
|
||||
print(f"Total candidates (after filtering): {len(candidates)}", flush=True)
|
||||
print(f" Skipped — problem too large : {skipped_size}", flush=True)
|
||||
print(f" Skipped — axiom too large : {skipped_axiom}", flush=True)
|
||||
|
||||
if len(candidates) == 0:
|
||||
print("ERROR: No suitable benchmark problems found.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if len(candidates) > SAMPLE_SIZE:
|
||||
random.seed(42)
|
||||
selected = random.sample(candidates, SAMPLE_SIZE)
|
||||
else:
|
||||
selected = candidates
|
||||
|
||||
selected.sort()
|
||||
with open(OUTPUT_FILE, "w") as f:
|
||||
f.write("\n".join(selected) + "\n")
|
||||
|
||||
print(f"Selected {len(selected)} problems → {OUTPUT_FILE}", flush=True)
|
||||
```
|
||||
|
||||
Run the script:
|
||||
|
||||
```bash
|
||||
python3 /tmp/select_benchmarks.py
|
||||
SELECTED=$(wc -l < /tmp/selected_benchmarks.txt)
|
||||
echo "Benchmark set: $SELECTED problems"
|
||||
```
|
||||
|
||||
If no problems are found, call `noop` with an error message and stop.
|
||||
|
||||
Call `noop` with `"$SELECTED problems selected. Starting benchmark run with 5-second timeout per problem — this will take approximately $(( SELECTED * 7 / 60 )) minutes."` to keep the session alive.
|
||||
|
||||
## Phase 4: Run Benchmarks
|
||||
|
||||
Save the following script to `/tmp/run_tptp_benchmarks.sh`, make it executable, and run it.
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
Z3=/tmp/z3-build/z3
|
||||
TPTP_ROOT=/tmp/tptp
|
||||
TIMEOUT_HARD=8 # outer OS-level guard (seconds; 3 s beyond Z3's -T:5)
|
||||
Z3_TIMEOUT=5 # Z3 internal timeout: -T:N sets N-second limit (uppercase -T is seconds)
|
||||
|
||||
RESULTS=/tmp/tptp_results.tsv
|
||||
PROBLEM_LIST=/tmp/selected_benchmarks.txt
|
||||
|
||||
echo -e "file\texpected\tactual\ttime_s\tnotes" > "$RESULTS"
|
||||
|
||||
# Helper: extract the expected SZS status from the TPTP problem header.
|
||||
get_expected_status() {
  # Print the expected SZS status keyword declared in a TPTP problem header,
  # or "Unknown" when the file has no status line or cannot be read.
  local file="$1"
  # Header lines look like: "% Status   : Theorem".
  # -o is required: without it grep prints the whole matching line and \K has
  # no visible effect, so callers would never see a bare keyword to compare.
  grep -m1 -ioP '%\s*Status\s*:\s*\K\S+' "$file" 2>/dev/null || echo "Unknown"
}
|
||||
|
||||
# Helper: run z3 on a single TPTP problem with timeout.
|
||||
run_benchmark() {
|
||||
local file="$1"
|
||||
local start end elapsed output exit_code verdict
|
||||
|
||||
start=$(date +%s%3N) # milliseconds since epoch
|
||||
output=$(TPTP="$TPTP_ROOT" timeout "$TIMEOUT_HARD" \
|
||||
"$Z3" -tptp -T:"$Z3_TIMEOUT" "$file" 2>&1) || exit_code=$?
|
||||
exit_code=${exit_code:-0}
|
||||
end=$(date +%s%3N)
|
||||
elapsed=$(echo "scale=3; ($end - $start) / 1000" | bc)
|
||||
|
||||
# Extract SZS status line from output
|
||||
szs_line=$(echo "$output" | grep -m1 "% SZS status" || true)
|
||||
|
||||
if [ -n "$szs_line" ]; then
|
||||
# Parse the status keyword (e.g. "Theorem", "CounterSatisfiable", "GaveUp")
|
||||
verdict=$(echo "$szs_line" | grep -oP '% SZS status \K\S+' || echo "Unknown")
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
verdict="Timeout"
|
||||
elif [ "$exit_code" -ne 0 ]; then
|
||||
verdict="Crash"
|
||||
else
|
||||
verdict="NoOutput"
|
||||
fi
|
||||
|
||||
echo "$verdict $elapsed"
|
||||
}
|
||||
|
||||
COUNTER=0
|
||||
TOTAL=$(wc -l < "$PROBLEM_LIST")
|
||||
|
||||
while IFS= read -r problem_file; do
|
||||
COUNTER=$((COUNTER + 1))
|
||||
|
||||
expected=$(get_expected_status "$problem_file")
|
||||
result_line=$(run_benchmark "$problem_file")
|
||||
actual=$(echo "$result_line" | cut -d' ' -f1)
|
||||
elapsed=$(echo "$result_line" | cut -d' ' -f2)
|
||||
fname=$(basename "$problem_file")
|
||||
|
||||
# Classify notes
|
||||
notes=""
|
||||
# Soundness discrepancy: both answers are conclusive but conflict
|
||||
conclusive_expected=false
|
||||
conclusive_actual=false
|
||||
case "$expected" in
|
||||
Theorem|Unsatisfiable) conclusive_expected=true ;;
|
||||
Satisfiable|CounterSatisfiable) conclusive_expected=true ;;
|
||||
esac
|
||||
case "$actual" in
|
||||
Theorem|Unsatisfiable) conclusive_actual=true ;;
|
||||
Satisfiable|CounterSatisfiable) conclusive_actual=true ;;
|
||||
esac
|
||||
|
||||
if $conclusive_expected && $conclusive_actual; then
|
||||
# Map expected to the Z3 output equivalents for comparison
|
||||
# Theorem (has-conjecture unsat) matches "Theorem"
|
||||
# Unsatisfiable (no-conjecture unsat) matches "Unsatisfiable"
|
||||
# Satisfiable (no-conjecture sat) matches "Satisfiable"
|
||||
# CounterSatisfiable (has-conjecture sat) matches "CounterSatisfiable"
|
||||
if [ "$expected" != "$actual" ]; then
|
||||
# Check for sat/unsat polarity conflict
|
||||
sat_expected=false; sat_actual=false
|
||||
case "$expected" in Satisfiable|CounterSatisfiable) sat_expected=true ;; esac
|
||||
case "$actual" in Satisfiable|CounterSatisfiable) sat_actual=true ;; esac
|
||||
if [ "$sat_expected" != "$sat_actual" ]; then
|
||||
notes="SOUNDNESS_ERROR"
|
||||
else
|
||||
notes="STATUS_MISMATCH"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$actual" = "Crash" ]; then
|
||||
notes="CRASH"
|
||||
fi
|
||||
|
||||
echo -e "$fname\t$expected\t$actual\t$elapsed\t$notes" >> "$RESULTS"
|
||||
|
||||
if [ -n "$notes" ]; then
|
||||
echo "[$COUNTER/$TOTAL] $fname expected=$expected actual=$actual time=${elapsed}s *** $notes ***"
|
||||
elif [ $((COUNTER % 50)) -eq 0 ]; then
|
||||
echo "[$COUNTER/$TOTAL] Progress checkpoint last=$fname actual=$actual time=${elapsed}s"
|
||||
fi
|
||||
|
||||
done < "$PROBLEM_LIST"
|
||||
|
||||
echo "Benchmark run complete: $COUNTER problems processed. Results → $RESULTS"
|
||||
```
|
||||
|
||||
Run it:
|
||||
|
||||
```bash
|
||||
chmod +x /tmp/run_tptp_benchmarks.sh
|
||||
/tmp/run_tptp_benchmarks.sh
|
||||
```
|
||||
|
||||
Do not skip any file in the list.
|
||||
|
||||
## Phase 5: Analyze Results
|
||||
|
||||
Save the following script to `/tmp/analyze_tptp.py` and run it:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
"""Compute summary statistics from the TPTP benchmark TSV.

Reads a tab-separated results file whose first line is a header and uses the
columns ``file``, ``expected``, ``actual`` and ``notes``.  Prints KEY=VALUE
summary counters, the actual/expected verdict breakdowns, and the lists of
flagged problems (soundness errors, crashes, status mismatches).
"""
import csv
import sys
from collections import Counter

RESULTS_FILE = "/tmp/tptp_results.tsv"

# Expected statuses that count as a definite answer.
CONCLUSIVE_EXPECTED = {"Theorem", "Unsatisfiable", "Satisfiable", "CounterSatisfiable"}

# Upper bound on the number of status-mismatch rows printed in the listing.
MAX_MISMATCHES_SHOWN = 20


def load_rows(path):
    """Return every data row of the TSV at ``path`` as a dict keyed by header.

    ``restval=""`` fills fields missing from a short row (for example a
    truncated last line left by an interrupted run) with an empty string, so
    later equality checks and ``sorted()`` never have to compare ``None``
    with ``str``.
    """
    with open(path, newline="") as f:
        return list(csv.DictReader(f, delimiter="\t", restval=""))


def main(results_file=RESULTS_FILE):
    """Print the benchmark summary for ``results_file``.

    Returns 0 on success, or 1 (after an error message on stderr) when the
    results file does not exist.
    """
    try:
        rows = load_rows(results_file)
    except FileNotFoundError:
        print(f"ERROR: results file not found: {results_file}", file=sys.stderr)
        return 1

    total = len(rows)

    # Verdict counts
    actual_counts = Counter(r["actual"] for r in rows)
    expected_counts = Counter(r["expected"] for r in rows)

    # Flagged rows: the first three are selected by the notes column,
    # timeouts and give-ups by the actual verdict.
    soundness_errors = [r for r in rows if r["notes"] == "SOUNDNESS_ERROR"]
    status_mismatches = [r for r in rows if r["notes"] == "STATUS_MISMATCH"]
    crashes = [r for r in rows if r["notes"] == "CRASH"]
    timeouts = [r for r in rows if r["actual"] == "Timeout"]
    gave_up = [r for r in rows if r["actual"] == "GaveUp"]

    # Solved correctly (expected matches actual for conclusive verdicts)
    correct = [r for r in rows
               if r["expected"] in CONCLUSIVE_EXPECTED
               and r["actual"] == r["expected"]]

    print(f"TOTAL={total}")
    print(f"CORRECT={len(correct)}")
    print(f"TIMEOUTS={len(timeouts)}")
    print(f"GAVE_UP={len(gave_up)}")
    print(f"CRASHES={len(crashes)}")
    print(f"SOUNDNESS_ERRORS={len(soundness_errors)}")
    print(f"STATUS_MISMATCHES={len(status_mismatches)}")

    print("\n--- Actual verdict breakdown ---")
    for v, c in sorted(actual_counts.items()):
        print(f" {v}: {c}")

    print("\n--- Expected status breakdown ---")
    for v, c in sorted(expected_counts.items()):
        print(f" {v}: {c}")

    if soundness_errors:
        print(f"\n--- SOUNDNESS ERRORS ({len(soundness_errors)}) ---")
        for r in soundness_errors:
            print(f" {r['file']} expected={r['expected']} actual={r['actual']}")

    if crashes:
        print(f"\n--- CRASHES ({len(crashes)}) ---")
        for r in crashes:
            print(f" {r['file']} expected={r['expected']}")

    if status_mismatches:
        print(f"\n--- STATUS MISMATCHES ({len(status_mismatches)}) ---")
        for r in status_mismatches[:MAX_MISMATCHES_SHOWN]:
            print(f" {r['file']} expected={r['expected']} actual={r['actual']}")
        # Make the truncation visible instead of silently dropping rows.
        hidden = len(status_mismatches) - MAX_MISMATCHES_SHOWN
        if hidden > 0:
            print(f" ... and {hidden} more")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
```
|
||||
|
||||
Run the analysis:
|
||||
|
||||
```bash
|
||||
python3 /tmp/analyze_tptp.py
|
||||
```
|
||||
|
||||
## Phase 6: Generate and Post the Discussion Report
|
||||
|
||||
Read the TSV at `/tmp/tptp_results.tsv` and the analysis output, then compose a Markdown report and call `create_discussion`.
|
||||
|
||||
The report should use `###` or lower for all headers (never `#` or `##`). Use collapsible `<details>` sections for large tables.
|
||||
|
||||
Use this structure:
|
||||
|
||||
```markdown
|
||||
**Date**: <today's date>
|
||||
**Branch**: master
|
||||
**Commit**: `<short SHA>` (run `git rev-parse --short HEAD` in ${{ github.workspace }} to get the SHA)
|
||||
**Workflow Run**: [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
|
||||
**TPTP version**: <downloaded version>
|
||||
**Problems benchmarked**: <N> (random sample, timeout 5 s per problem)
|
||||
|
||||
---
|
||||
|
||||
### Summary
|
||||
|
||||
| Metric | Count |
|
||||
|--------|-------|
|
||||
| Total problems run | N |
|
||||
| Correct (expected = actual) | N |
|
||||
| Timeouts | N |
|
||||
| GaveUp (within time budget) | N |
|
||||
| Crashes / errors | N |
|
||||
| Soundness errors (sat↔unsat conflict) | N |
|
||||
| Status mismatches (Theorem vs Unsatisfiable etc.) | N |
|
||||
|
||||
### Expected Status Distribution
|
||||
|
||||
| Expected Status | Count |
|
||||
|----------------|-------|
|
||||
| Theorem | N |
|
||||
| Unsatisfiable | N |
|
||||
| Satisfiable | N |
|
||||
| CounterSatisfiable | N |
|
||||
| Unknown / Open | N |
|
||||
|
||||
---
|
||||
|
||||
### ⚠️ Critical: Soundness Errors
|
||||
|
||||
[List ALL files where Z3 returned a conclusive answer that contradicts the expected answer
|
||||
(e.g., expected Theorem but got CounterSatisfiable). If none, write "None detected."]
|
||||
|
||||
### 💥 Crashes
|
||||
|
||||
[List ALL files where Z3 crashed (non-zero exit, no SZS output, not a timeout).
|
||||
Include filename and expected status. If none, write "None detected."]
|
||||
|
||||
### Status Mismatches
|
||||
|
||||
[Files where both answers are conclusive and agree on sat/unsat polarity but differ in SZS status name
|
||||
(e.g., expected Theorem but actual Unsatisfiable). These may indicate conjecture-handling
|
||||
differences rather than soundness bugs. If none, write "None detected."]
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<summary>View all Timeouts (problems where Z3 exceeded the 5-second limit)</summary>
|
||||
|
||||
| # | File | Expected Status |
|
||||
|---|------|----------------|
|
||||
[All timeout rows, or first 100 if over limit]
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>View full per-problem results table</summary>
|
||||
|
||||
| # | File | Expected | Actual | Time (s) | Notes |
|
||||
|---|------|----------|--------|----------|-------|
|
||||
[All rows, or first 500 if over limit]
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
### Recommendations
|
||||
|
||||
[Based on the findings, list actionable items. E.g.: investigate soundness errors,
|
||||
file crash bugs, note domains where Z3 consistently times out.]
|
||||
```
|
||||
|
||||
Post the discussion using the `create_discussion` safe output. The title should be
|
||||
`[TPTP Benchmark] master — <date>`.
|
||||
|
||||
## Safe Output Guarantee
|
||||
|
||||
You **MUST** call either `create_discussion` or `noop` before the workflow ends:
|
||||
|
||||
- **Full success**: Call `create_discussion` with the complete report.
|
||||
- **Partial results** (some problems ran): Call `create_discussion` with whatever results are available and a note about incomplete execution.
|
||||
- **Download failure**: Call `noop` with the download error details.
|
||||
- **No problems selected**: Call `noop` explaining why no problems were found.
|
||||
- **Binary missing**: If `/tmp/z3-build/z3` is unexpectedly absent, call `noop` with that detail and stop.
|
||||
|
||||
## Important Notes
|
||||
|
||||
- **Build failure handling**: Z3 was built before the agent loaded. If the binary is missing or non-functional, call `noop` with the error and stop.
|
||||
- **TPTP environment variable**: Set `TPTP=/tmp/tptp` when invoking `z3 -tptp` so that `include()` directives in problem files resolve correctly against the downloaded Axioms directory.
|
||||
- **Timeout detection**: Use `timeout 8` as the outer OS-level guard (3 seconds beyond Z3's `-T:5`) to allow Z3 to exit cleanly before the outer `timeout` command kills it. If the exit code from `timeout` is 124, record the verdict as `Timeout`.
|
||||
- **Crash detection**: A crash is a non-zero exit code with no `% SZS status` line in the output and no timeout. Record it separately from `GaveUp`.
|
||||
- **SZS status semantics**: Z3 outputs `Theorem` (not `Unsatisfiable`) when it proves a conjecture; `CounterSatisfiable` (not `Satisfiable`) when it finds a counterexample to a conjecture. A status mismatch between `Theorem` and `Unsatisfiable` for the same problem may be innocuous and depends on whether the problem file uses a conjecture formula.
|
||||
- **Report soundness bugs prominently**: Any case where the polarity of the answer conflicts (expected Theorem/Unsatisfiable but got CounterSatisfiable/Satisfiable, or vice versa) is a potential soundness bug and must be highlighted as critical.
|
||||
- **Keep progress log**: Print a line for every flagged result and every 50th problem so the workflow log shows progress.
|
||||
- **Close older discussions**: Configured via `close-older-discussions: true`. Only the latest weekly report remains open.
|
||||
2
.github/workflows/wasm-release.yml
vendored
2
.github/workflows/wasm-release.yml
vendored
|
|
@ -36,7 +36,7 @@ jobs:
|
|||
cp ../../../LICENSE.txt .
|
||||
|
||||
- name: Setup emscripten
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
uses: mymindstorm/setup-emsdk@v16
|
||||
with:
|
||||
no-install: true
|
||||
version: ${{env.EM_VERSION}}
|
||||
|
|
|
|||
2
.github/workflows/wasm.yml
vendored
2
.github/workflows/wasm.yml
vendored
|
|
@ -29,7 +29,7 @@ jobs:
|
|||
node-version: "lts/*"
|
||||
|
||||
- name: Setup emscripten
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
uses: mymindstorm/setup-emsdk@v16
|
||||
with:
|
||||
no-install: true
|
||||
version: ${{env.EM_VERSION}}
|
||||
|
|
|
|||
|
|
@ -12,11 +12,9 @@ network: defaults
|
|||
|
||||
tools:
|
||||
cache-memory: true
|
||||
serena: ["python", "java", "csharp"]
|
||||
github:
|
||||
toolsets: [default]
|
||||
bash: [":*"]
|
||||
glob: {}
|
||||
|
||||
safe-outputs:
|
||||
create-discussion:
|
||||
|
|
|
|||
22
.github/workflows/zipt-code-reviewer.md
vendored
22
.github/workflows/zipt-code-reviewer.md
vendored
|
|
@ -17,8 +17,6 @@ tools:
|
|||
cache-memory: true
|
||||
github:
|
||||
toolsets: [default]
|
||||
view: {}
|
||||
glob: {}
|
||||
edit: {}
|
||||
web-fetch: {}
|
||||
bash:
|
||||
|
|
@ -169,12 +167,10 @@ git diff > /tmp/zipt-improvements.diff
|
|||
cat /tmp/zipt-improvements.diff
|
||||
```
|
||||
|
||||
If no changes were made because no improvements were found or all were too risky, exit gracefully:
|
||||
If no changes were made because no improvements were found or all were too risky, call the `noop` safe-output tool:
|
||||
|
||||
```
|
||||
✅ ZIPT code review complete. No concrete improvements found in this run.
|
||||
Files examined: [list files]
|
||||
ZIPT files compared: [list files]
|
||||
noop: "ZIPT code review complete. No concrete improvements found in this run. Files examined: [list files]. ZIPT files compared: [list files]."
|
||||
```
|
||||
|
||||
## Phase 6: Create GitHub Issue
|
||||
|
|
@ -235,7 +231,12 @@ make test-z3
|
|||
*Generated by ZIPT Code Reviewer agent — comparing Z3 implementation with CEisenhofer/ZIPT@parikh*
|
||||
```
|
||||
|
||||
## Important Guidelines
|
||||
## Important: Always Call a Safe Output Tool
|
||||
|
||||
**You MUST always call at least one safe-output tool before finishing.** Failing to do so is reported as a workflow failure.
|
||||
|
||||
- If you found and applied improvements → call `create_issue`
|
||||
- If ZIPT is unreachable, no improvements were found, or all improvements are out of scope → call `noop` with a brief explanation
|
||||
|
||||
### Scope
|
||||
- **Only** examine the files listed in Phase 1
|
||||
|
|
@ -249,7 +250,12 @@ make test-z3
|
|||
- Prefer small, surgical changes over large refactors
|
||||
|
||||
### Exit Conditions
|
||||
Exit without creating an issue if:
|
||||
Call `noop` (instead of creating an issue) if:
|
||||
- ZIPT repository is unreachable
|
||||
- No concrete, safe improvements can be identified
|
||||
- All identified improvements require architectural changes beyond the scope of a single diff
|
||||
|
||||
Example noop call:
|
||||
```
|
||||
noop: "ZIPT code review complete. No improvements applied: [brief reason, e.g. ZIPT unreachable / no safe changes identified]. Files reviewed: [list]."
|
||||
```
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue