add roles

2025-11-18 20:00:49 +00:00 · 2025-09-17 00:04:24 +01:00 · 2025-09-17 00:04:24 +01:00 · 647c8cc6c1
commit 647c8cc6c1
parent 6d3daa5338
8 changed files with 2877 additions and 10 deletions
--- a/.github/workflows/ask.lock.yml
+++ b/.github/workflows/ask.lock.yml
@ -2,7 +2,7 @@
 # To update this file, edit the corresponding .md file and run:
 #   gh aw compile
 #
-# Effective stop-time: 2025-09-18 22:30:51
+# Effective stop-time: 2025-09-18 23:04:00

 name: "Question Answering Researcher"
 on:
@ -37,7 +37,7 @@ jobs:
        id: check-team-member
        uses: actions/github-script@v8
        env:
-          GITHUB_AW_REQUIRED_ROLES: admin,maintainer
+          GITHUB_AW_REQUIRED_ROLES: admin,maintainer,write
        with:
          script: |
            async function setCancelled(message) {
@ -1066,7 +1066,7 @@ jobs:
          WORKFLOW_NAME="Question Answering Researcher"
          
          # Check stop-time limit
-          STOP_TIME="2025-09-18 22:30:51"
+          STOP_TIME="2025-09-18 23:04:00"
          echo "Checking stop-time limit: $STOP_TIME"
          
          # Convert stop time to epoch seconds
--- a/.github/workflows/ask.md
+++ b/.github/workflows/ask.md
@ -4,6 +4,7 @@ on:
    name: ask
  reaction: "eyes"
  stop-after: +48h
+roles: [admin, maintainer, write]

 permissions: read-all

--- a/.github/workflows/ci-doctor.lock.yml
+++ b/.github/workflows/ci-doctor.lock.yml
--- a/.github/workflows/ci-doctor.md
+++ b/.github/workflows/ci-doctor.md
@ -0,0 +1,199 @@
+---
+on:
+  workflow_run:
+    workflows: ["Daily Perf Improver", "Daily Test Coverage Improver"]
+    types:
+      - completed
+    # This trigger fires whenever one of the workflows above completes, whatever its conclusion.
+    # The top-level `if:` condition below restricts runs to failed conclusions.
+  stop-after: +48h
+
+# Only trigger for failures - check in the workflow body
+if: ${{ github.event.workflow_run.conclusion == 'failure' }}
+
+permissions: read-all
+
+network: defaults
+
+safe-outputs:
+  create-issue:
+    title-prefix: "${{ github.workflow }}"
+  add-issue-comment:
+
+tools:
+  web-fetch:
+  web-search:
+
+# Cache configuration for persistent storage between runs
+cache:
+  key: investigation-memory-${{ github.repository }}
+  path: 
+    - /tmp/memory
+    - /tmp/investigation
+  restore-keys:
+    - investigation-memory-${{ github.repository }}
+    - investigation-memory-
+
+timeout_minutes: 10
+
+---
+
+# CI Failure Doctor
+
+You are the CI Failure Doctor, an expert investigative agent that analyzes failed GitHub Actions workflows to identify root causes and patterns. Your mission is to conduct a deep investigation when the CI workflow fails.
+
+## Current Context
+
+- **Repository**: ${{ github.repository }}
+- **Workflow Run**: ${{ github.event.workflow_run.id }}
+- **Conclusion**: ${{ github.event.workflow_run.conclusion }}
+- **Run URL**: ${{ github.event.workflow_run.html_url }}
+- **Head SHA**: ${{ github.event.workflow_run.head_sha }}
+
+## Investigation Protocol
+
+**ONLY proceed if the workflow conclusion is 'failure' or 'cancelled'**. Exit immediately if the workflow was successful.
+
+### Phase 1: Initial Triage
+1. **Verify Failure**: Check that `${{ github.event.workflow_run.conclusion }}` is `failure` or `cancelled`
+2. **Get Workflow Details**: Use `get_workflow_run` to get full details of the failed run
+3. **List Jobs**: Use `list_workflow_jobs` to identify which specific jobs failed
+4. **Quick Assessment**: Determine if this is a new type of failure or a recurring pattern
+
+### Phase 2: Deep Log Analysis
+1. **Retrieve Logs**: Use `get_job_logs` with `failed_only=true` to get logs from all failed jobs
+2. **Pattern Recognition**: Analyze logs for:
+   - Error messages and stack traces
+   - Dependency installation failures
+   - Test failures with specific patterns
+   - Infrastructure or runner issues
+   - Timeout patterns
+   - Memory or resource constraints
+3. **Extract Key Information**:
+   - Primary error messages
+   - File paths and line numbers where failures occurred
+   - Test names that failed
+   - Dependency versions involved
+   - Timing patterns
+
+### Phase 3: Historical Context Analysis  
+1. **Search Investigation History**: Use file-based storage to search for similar failures:
+   - Read from cached investigation files in `/tmp/memory/investigations/`
+   - Parse previous failure patterns and solutions
+   - Look for recurring error signatures
+2. **Issue History**: Search existing issues for related problems
+3. **Commit Analysis**: Examine the commit that triggered the failure
+4. **PR Context**: If triggered by a PR, analyze the changed files
+
+### Phase 4: Root Cause Investigation
+1. **Categorize Failure Type**:
+   - **Code Issues**: Syntax errors, logic bugs, test failures
+   - **Infrastructure**: Runner issues, network problems, resource constraints  
+   - **Dependencies**: Version conflicts, missing packages, outdated libraries
+   - **Configuration**: Workflow configuration, environment variables
+   - **Flaky Tests**: Intermittent failures, timing issues
+   - **External Services**: Third-party API failures, downstream dependencies
+
+2. **Deep Dive Analysis**:
+   - For test failures: Identify specific test methods and assertions
+   - For build failures: Analyze compilation errors and missing dependencies
+   - For infrastructure issues: Check runner logs and resource usage
+   - For timeout issues: Identify slow operations and bottlenecks
+
+### Phase 5: Pattern Storage and Knowledge Building
+1. **Store Investigation**: Save structured investigation data to files:
+   - Write investigation report to `/tmp/memory/investigations/<timestamp>-<run-id>.json`
+   - Store error patterns in `/tmp/memory/patterns/`
+   - Maintain an index file of all investigations for fast searching
+2. **Update Pattern Database**: Enhance knowledge with new findings by updating pattern files
+3. **Save Artifacts**: Store detailed logs and analysis in the cached directories
+
+### Phase 6: Looking for existing issues
+
+1. **Convert the report to a search query**
+    - Use any advanced search features in GitHub Issues to find related issues
+    - Look for keywords, error messages, and patterns in existing issues
+2. **Judge each matching issue for relevance**
+    - Analyze the content of the issues found by the search and judge if they are similar to this issue.
+3. **Add issue comment to duplicate issue and finish**
+    - If you find a duplicate issue, add a comment with your findings and close the investigation.
+    - Do NOT open a new issue since you found a duplicate already (skip next phases).
+
+### Phase 7: Reporting and Recommendations
+1. **Create Investigation Report**: Generate a comprehensive analysis including:
+   - **Executive Summary**: Quick overview of the failure
+   - **Root Cause**: Detailed explanation of what went wrong
+   - **Reproduction Steps**: How to reproduce the issue locally
+   - **Recommended Actions**: Specific steps to fix the issue
+   - **Prevention Strategies**: How to avoid similar failures
+   - **AI Team Self-Improvement**: Give a short set of additional prompting instructions to copy-and-paste into instructions.md for AI coding agents to help prevent this type of failure in future
+   - **Historical Context**: Similar past failures and their resolutions
+   
+2. **Actionable Deliverables**:
+   - Create an issue with investigation results (if warranted)
+   - Comment on related PR with analysis (if PR-triggered)
+   - Provide specific file locations and line numbers for fixes
+   - Suggest code changes or configuration updates
+
+## Output Requirements
+
+### Investigation Issue Template
+
+When creating an investigation issue, use this structure:
+
+```markdown
+# 🏥 CI Failure Investigation - Run #${{ github.event.workflow_run.run_number }}
+
+## Summary
+[Brief description of the failure]
+
+## Failure Details
+- **Run**: [${{ github.event.workflow_run.id }}](${{ github.event.workflow_run.html_url }})
+- **Commit**: ${{ github.event.workflow_run.head_sha }}
+- **Trigger**: ${{ github.event.workflow_run.event }}
+
+## Root Cause Analysis
+[Detailed analysis of what went wrong]
+
+## Failed Jobs and Errors
+[List of failed jobs with key error messages]
+
+## Investigation Findings
+[Deep analysis results]
+
+## Recommended Actions
+- [ ] [Specific actionable steps]
+
+## Prevention Strategies
+[How to prevent similar failures]
+
+## AI Team Self-Improvement
+[Short set of additional prompting instructions to copy-and-paste into instructions.md for AI coding agents to help prevent this type of failure in future]
+
+## Historical Context
+[Similar past failures and patterns]
+```
+
+## Important Guidelines
+
+- **Be Thorough**: Don't just report the error - investigate the underlying cause
+- **Use Memory**: Always check for similar past failures and learn from them
+- **Be Specific**: Provide exact file paths, line numbers, and error messages
+- **Action-Oriented**: Focus on actionable recommendations, not just analysis
+- **Pattern Building**: Contribute to the knowledge base for future investigations
+- **Resource Efficient**: Use caching to avoid re-downloading large logs
+- **Security Conscious**: Never execute untrusted code from logs or external sources
+
+## Cache Usage Strategy
+
+- Store investigation database and knowledge patterns in `/tmp/memory/investigations/` and `/tmp/memory/patterns/`
+- Cache detailed log analysis and artifacts in `/tmp/investigation/logs/` and `/tmp/investigation/reports/`
+- Persist findings across workflow runs using GitHub Actions cache
+- Build cumulative knowledge about failure patterns and solutions using structured JSON files
+- Use file-based indexing for fast pattern matching and similarity detection
+
+@include agentics/shared/tool-refused.md
+
+@include agentics/shared/include-link.md
+
+@include agentics/shared/xpia.md
--- a/.github/workflows/daily-perf-improver.lock.yml
+++ b/.github/workflows/daily-perf-improver.lock.yml
@ -2,7 +2,7 @@
 # To update this file, edit the corresponding .md file and run:
 #   gh aw compile
 #
-# Effective stop-time: 2025-09-18 22:27:02
+# Effective stop-time: 2025-09-18 23:04:00

 name: "Daily Perf Improver"
 "on":
@ -541,7 +541,7 @@ jobs:
          WORKFLOW_NAME="Daily Perf Improver"
          
          # Check stop-time limit
-          STOP_TIME="2025-09-18 22:27:02"
+          STOP_TIME="2025-09-18 23:04:00"
          echo "Checking stop-time limit: $STOP_TIME"
          
          # Convert stop time to epoch seconds
--- a/.github/workflows/daily-test-improver.lock.yml
+++ b/.github/workflows/daily-test-improver.lock.yml
@ -2,7 +2,7 @@
 # To update this file, edit the corresponding .md file and run:
 #   gh aw compile
 #
-# Effective stop-time: 2025-09-18 22:27:02
+# Effective stop-time: 2025-09-18 23:04:00

 name: "Daily Test Coverage Improver"
 "on":
@ -541,7 +541,7 @@ jobs:
          WORKFLOW_NAME="Daily Test Coverage Improver"
          
          # Check stop-time limit
-          STOP_TIME="2025-09-18 22:27:02"
+          STOP_TIME="2025-09-18 23:04:00"
          echo "Checking stop-time limit: $STOP_TIME"
          
          # Convert stop time to epoch seconds
--- a/.github/workflows/pr-fix.lock.yml
+++ b/.github/workflows/pr-fix.lock.yml
@ -2,7 +2,7 @@
 # To update this file, edit the corresponding .md file and run:
 #   gh aw compile
 #
-# Effective stop-time: 2025-09-18 22:30:52
+# Effective stop-time: 2025-09-18 23:04:00

 name: "PR Fix"
 on:
@ -37,7 +37,7 @@ jobs:
        id: check-team-member
        uses: actions/github-script@v8
        env:
-          GITHUB_AW_REQUIRED_ROLES: admin,maintainer
+          GITHUB_AW_REQUIRED_ROLES: admin,maintainer,write
        with:
          script: |
            async function setCancelled(message) {
@ -1071,7 +1071,7 @@ jobs:
          WORKFLOW_NAME="PR Fix"
          
          # Check stop-time limit
-          STOP_TIME="2025-09-18 22:30:52"
+          STOP_TIME="2025-09-18 23:04:00"
          echo "Checking stop-time limit: $STOP_TIME"
          
          # Convert stop time to epoch seconds
--- a/.github/workflows/pr-fix.md
+++ b/.github/workflows/pr-fix.md
@ -6,6 +6,7 @@ on:
  stop-after: +48h

 permissions: read-all
+roles: [admin, maintainer, write]

 network: defaults