/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high
/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high
/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80
/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low
/etc/alerts.d/node_alerting_rules.yml > node_down
alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
  summary: Node {{ $labels.kubernetes_node }} is down
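A rule like this can be exercised offline with promtool's rule unit tests before it is deployed. A minimal sketch, assuming the rule file sits next to the test file (the test file name and the synthetic series below are illustrations, not part of the shipped configuration):

# node_down_test.yml (hypothetical)
rule_files:
  - node_alerting_rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Synthetic node-exporter target that stays down from t=0 onward.
      - series: 'up{component="node-exporter", job="kubernetes-service-endpoints", instance="10.0.0.1:9100", kubernetes_node="node-a"}'
        values: '0x5'
    alert_rule_test:
      - eval_time: 4m        # past the 3m "for" clause, so the alert should be firing
        alertname: NODE_DOWN
        exp_alerts:
          - exp_labels:
              severity: warning
              component: node-exporter
              job: kubernetes-service-endpoints
              instance: 10.0.0.1:9100
              kubernetes_node: node-a

Run it with promtool test rules node_down_test.yml.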
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10
Firing alerts (all three share the labels alertname="NODE_MEMORY_LESS_THAN_10%",
app="prometheus", chart="prometheus-11.16.2", component="node-exporter",
heritage="Helm", job="kubernetes-service-endpoints",
kubernetes_name="isd-prometheus-node-exporter", release="isd", severity="critical"):

| instance             | kubernetes_node                 | kubernetes_namespace | State  | Active Since            | Value             |
|----------------------|---------------------------------|----------------------|--------|-------------------------|-------------------|
| 162.209.124.159:9100 | prod-instance-17689143807031438 | isdspin251000        | firing | 2026-02-18 11:10:34 UTC | 4.215504307591262 |
| 162.209.124.92:9100  | prod-instance-17695105058316107 | isdsamlupg08to10     | firing | 2026-02-18 12:03:34 UTC | 4.241312614115816 |
| 162.209.124.165:9100 | prod-instance-17689143806181437 | isdspin251000        | firing | 2026-02-18 11:25:34 UTC | 7.181193843820422 |

Annotations on each alert: description "node <kubernetes_node> memory left is low";
summary "node memory left is less than 10 percent".
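The Value column is the percentage of memory still available on the node at evaluation time, so these nodes were down to roughly 4-7% free. The rule body itself is not expanded in this dump; a typical node-exporter expression behind this kind of alert looks like the following (the metric names are an assumption, not taken from the file):

# Hypothetical sketch; the shipped rule may differ.
(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10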
/etc/alerts.d/spin_alerting_rules.yml > Front50-cache
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors
alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
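The six rules above differ only in their component matcher, and component is already part of the by() grouping, so a single unfiltered rule would fire once per component with the same labels. A possible consolidation (a sketch, not part of the shipped file; note it would also cover any other app="oes" heap component, not just the six listed):

alert: jvm-memory-filling-up
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}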
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high
alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high
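Every expression in this group is the standard PromQL average-latency idiom: dividing the rate of a timer's _sum series by the rate of its _count series gives the mean request duration in seconds over the lookback window, so the > 0.5 threshold means average latency above 500 ms sustained for the for: period. Stripped of the grouping labels, the shape is (a standalone sketch for ad-hoc querying, with sapor picked arbitrarily):

# Mean seconds per request over the trailing 2 minutes.
  rate(http_server_requests_seconds_sum{component="sapor"}[2m])
/ rate(http_server_requests_seconds_count{component="sapor"}[2m])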
/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down
/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention
/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high
/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down
/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate
/etc/alerts.d/spin_alerting_rules.yml > latency-too-high
alert: clouddriver-caching-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__total{service="spin-clouddriver-caching"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__count_total{service="spin-clouddriver-caching"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver_ro_deck-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__total{service="spin-clouddriver-ro-deck"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__count_total{service="spin-clouddriver-ro-deck"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high
/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue
/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down
alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: Default Prometheus job is down LABELS = {{ $labels }}
  summary: The default Prometheus job is down (job {{ $labels.job }})
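A self-scrape rule like this only fires while Prometheus is still up and evaluating rules; if the server dies outright, nothing fires at all. A common complement is an always-firing heartbeat alert that an external receiver expects to keep arriving (a sketch of the usual pattern, not present in this file):

alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  summary: Always-firing heartbeat; its absence means the Prometheus alerting pipeline itself is broken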
/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down
/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)