# Workflow identity: display name plus a one-line summary.
name: "Advanced CPU Remediation with Escalation and Recovery"
description: >-
  Comprehensive remediation workflow featuring looping, approvals,
  compensation, and observability
# Creator recorded for this definition.
created_by: "stavily-system"
# Labels attached to the workflow.
tags:
  - "remediation"
  - "cpu"
  - "ops"
  - "monitoring"
# Environment and category values for this workflow.
environment: "production"
category: "infrastructure"
# Agent pool entry plus plugin-handling settings.
# NOTE(review): this list item follows top-level scalar keys with no parent key
# and no indentation, so the file does not parse as YAML as written. A template
# elsewhere in this file reads `agents.pools[0]`, which suggests an
# `agents:` / `pools:` parent was lost — restore the nesting from the original.
- pool_name: "prod-servers"
# Plugin policy: auto-install is enabled; the missing-plugin action is "skip_agent".
auto_install_plugins: true
on_plugin_missing: "skip_agent"
# NOTE(review): `single_agent` appears several times at this same column in the
# file; once nesting is restored, confirm each occurrence sits in its own mapping
# (duplicate keys in one mapping are invalid and usually resolve to last-wins).
single_agent: "fallback-agent-001"
# Secret references (values are pointers, not inline secrets).
# NOTE(review): the entry that owns this `ref` (its `name` line) is not visible
# here, and the named entry below shows no `ref` — presumably lines were lost;
# confirm each secret has both a name and a reference.
ref: "vault:infra/ssh_ops_key"
- name: "email_smtp_creds"
# Trigger definitions: a metric-based trigger and a manual trigger.
single_agent: "trigger-agent"
# Metric trigger: compares cpu_usage_percent against a templated threshold.
plugin: "prometheus-trigger-v1.1.0"
metric: "cpu_usage_percent"
# NOTE(review): `variables.cpu_threshold` is not defined anywhere in the visible
# lines — confirm the variables section exists in the full file.
condition: "> {{ variables.cpu_threshold }}"
# Manual trigger.
# NOTE(review): second `plugin` key at the same column as the one above; it must
# live in a separate mapping once nesting is restored. Unlike the other plugin
# ids in this file, this one carries no version suffix — confirm that is intended.
plugin: "manual-approval-trigger"
message: "Run manually for testing or escalation"
# Pool entry for action agents; `agent_regex` is the pattern "actions-.*".
# NOTE(review): parent key and indentation are missing here as well; presumably
# this belongs to the actions section's agent selection — confirm.
- pool_name: "actions-pool"
agent_regex: "actions-.*"
# Analysis step: runs analyze_cpu_processes.py through the python-script plugin.
# Later conditions in this file read `analyze-processes.output.high_cpu_count`.
- name: "analyze-processes"
plugin: "python-script-action-v2.1.0"
script: "analyze_cpu_processes.py"
# Fan-out analysis step: runs remote_cpu_check.py via the remote-analyzer plugin.
- name: "analyze-all-servers"
plugin: "remote-analyzer-v3.0.0"
# NOTE(review): `for_each` is given the first pool's `agent_regex`, which is a
# single pattern string rather than a list — presumably the engine expands it to
# matching agents; confirm. Also confirm which pool index 0 resolves to.
for_each: "{{ agents.pools[0].agent_regex }}"
script: "remote_cpu_check.py"
# Approval gate that depends on the analyze-processes step.
# NOTE(review): the `- name:` line for this step is not visible, so nothing can
# reference it by name from here; confirm the step name in the original.
plugin: "manual-approval-action-v1.0.0"
depends_on: ["analyze-processes"]
message: "CPU remediation requires approval to kill processes."
# Remediation step: runs a pkill command when the analysis reported more than
# three high-CPU processes.
# NOTE(review): no `depends_on` is visible on this step, so the approval gate
# is not shown as a prerequisite — confirm approval actually blocks this step.
- name: "kill-high-cpu-processes"
plugin: "system-command-action-v1.2.0"
condition: "analyze-processes.output.high_cpu_count > 3"
# `pkill -f` matches against the full command line; the pattern here is the
# fixed string "high_cpu_process", not a value taken from the analysis output.
# NOTE(review): confirm the literal pattern is intended.
command: "pkill -f high_cpu_process"
# NOTE(review): no step named "invoke-compensation" is visible in this file;
# presumably an engine keyword or a handler defined elsewhere — confirm.
on_failure: "invoke-compensation"
# Restart step: runs after kill-high-cpu-processes and only when that step's
# status is 'success'.
# NOTE(review): no service name or operation parameter is visible for this
# step — presumably lost; confirm against the original.
- name: "restart-service"
plugin: "service-management-action-v1.5.0"
depends_on: ["kill-high-cpu-processes"]
condition: "kill-high-cpu-processes.status == 'success'"
# Escalation step: calls workflow "incident-escalation-v2" when restart-service
# reports status 'failed'.
# NOTE(review): this step both depends on restart-service and requires it to
# have failed; confirm the engine still evaluates dependents of a failed step.
- name: "subworkflow-escalation"
plugin: "subworkflow-caller-v1.0.0"
depends_on: ["restart-service"]
condition: "restart-service.status == 'failed'"
workflow_id: "incident-escalation-v2"
# Rollback step using the same service-management plugin as restart-service.
# NOTE(review): no condition, dependency, or parameters are visible for this
# step; presumably it is the compensation handler — confirm how it is invoked.
- name: "rollback-service"
plugin: "service-management-action-v1.5.0"
# Output definitions: an email summary and a per-workflow log file path.
single_agent: "output-agent"
# Email output: sent after restart-service, using the cpu_remediation_summary
# template and a templated recipient list.
plugin: "formatted-email-output-v1.5.0"
depends_on: ["restart-service"]
# NOTE(review): `variables.alert_recipients` is not defined in the visible
# lines — confirm the variables section exists in the full file.
recipients: "{{ variables.alert_recipients }}"
template: "cpu_remediation_summary"
# Log file path templated with the workflow id.
# NOTE(review): the key that owns this list item is not visible — confirm.
- path: "logs/workflow-{{ workflow.id }}.txt"
# Observability settings. The value is quoted, so it is the string "30s".
heartbeat_interval: "30s"
# Both entries name steps that are defined elsewhere in this file.
highlight_steps: ["kill-high-cpu-processes", "subworkflow-escalation"]
# Access lists: who may run the workflow and who has read-only access.
# NOTE(review): presumably these are role or group names — confirm.
allow_run_by: ["ops", "admins"]
read_only_for: ["auditors"]