/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high
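The rule body for this group is collapsed above and is not reproduced here. Purely as an illustrative sketch (the metric, grouping labels, and 80% threshold are assumptions, not the shipped rule), a container CPU alert of this kind is commonly written against the cAdvisor metric container_cpu_usage_seconds_total:

alert: container_cpu_usage_is_high
# rate() of the CPU counter gives cores used; * 100 expresses it as percent of one core
expr: sum by(namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!=""}[5m])) * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  description: Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) is using {{ $value }}% of one CPU core
  summary: Container CPU usage is high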
/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high
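This group's rule body is also collapsed above. A minimal sketch of a container memory alert, assuming the cAdvisor working-set and limit metrics and an 80% threshold (not the shipped rule):

alert: container_memory_usage_is_high
# containers without a memory limit are excluded by the "> 0" filter on the limit metric
expr: container_memory_working_set_bytes{container!=""} / (container_spec_memory_limit_bytes{container!=""} > 0) * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  description: Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) is at {{ $value }}% of its memory limit
  summary: Container memory usage is high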
/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80
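The rule body is collapsed above. A representative sketch based on the standard node-exporter CPU counter (label names and window are assumptions, not the shipped rule):

alert: node_cpu_greater_than_80
# 100 minus the averaged idle rate gives overall CPU utilisation per node
expr: 100 - (avg by(instance, kubernetes_node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
  severity: warning
annotations:
  description: CPU usage on node {{ $labels.kubernetes_node }} is {{ $value }}%
  summary: Node CPU usage is greater than 80%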
/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low
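The rule body is collapsed above. A sketch of a low-disk-space alert on the node-exporter filesystem metrics (the fstype filter and 10% threshold are assumptions, not the shipped rule):

alert: node_disk_space_too_low
expr: node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  description: Node {{ $labels.kubernetes_node }} has {{ $value }}% space left on {{ $labels.mountpoint }} ({{ $labels.device }})
  summary: Node disk space is too low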
/etc/alerts.d/node_alerting_rules.yml > node_down
alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down'
  summary: Node {{ $labels.kubernetes_node }} is down
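To verify that NODE_DOWN behaves as intended before it ever fires in production, promtool (bundled with Prometheus) can unit-test the rule file referenced above. The sketch below is a hypothetical test file (the file name, series labels, and values are assumptions); it would be run with: promtool test rules node_alerting_rules_test.yml

rule_files:
  - /etc/alerts.d/node_alerting_rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # node-exporter scrape target reports down for 10 minutes (hypothetical labels)
      - series: 'up{component="node-exporter", job="kubernetes-service-endpoints", instance="10.0.0.1:9100", kubernetes_node="test-node"}'
        values: '0x10'
    alert_rule_test:
      - eval_time: 5m
        alertname: NODE_DOWN
        exp_alerts:
          - exp_labels:
              severity: warning
              component: node-exporter
              job: kubernetes-service-endpoints
              instance: "10.0.0.1:9100"
              kubernetes_node: test-node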
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10
Labels:
  alertname="NODE_MEMORY_LESS_THAN_10%"
  app_kubernetes_io_component="metrics"
  app_kubernetes_io_instance="monitor"
  app_kubernetes_io_managed_by="Helm"
  app_kubernetes_io_name="prometheus-node-exporter"
  app_kubernetes_io_part_of="prometheus-node-exporter"
  app_kubernetes_io_version="1.9.1"
  helm_sh_chart="prometheus-node-exporter-4.46.1"
  instance="162.209.125.107:9100"
  job="kubernetes-service-endpoints"
  jobLabel="node-exporter"
  kubernetes_name="monitor-prometheus-node-exporter"
  kubernetes_namespace="spinhar16"
  kubernetes_node="prod-instance-1757954949207"
  release="monitor"
  severity="critical"
State: firing
Active Since: 2025-12-23 09:32:34.345417764 +0000 UTC
Value: 4.640573105869528
Annotations:
  description: node prod-instance-1757954949207 memory left is low
  summary: node memory left is lesser than 10 precent
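The Value shown above (about 4.64) is the percentage of memory still available on the node. A rule of this shape typically evaluates a node-exporter expression along the lines of the following (an assumed reconstruction for illustration, not the exact rule text):

expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10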
/etc/alerts.d/spin_alerting_rules.yml > Front50-cache
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors
alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high
alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high
/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down
Labels:
  alertname="oes-dashboard-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.19"
  component="dashboard"
  heritage="Helm"
  instance="10.20.42.228:8094"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-dashboard-748b7d9c9d-mnslj"
  pod_template_hash="748b7d9c9d"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 08:44:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component dashboard in namespace isdupg4043to2508 is down
  summary: oes-dashboard scrape target is down

Labels:
  alertname="oes-visibility-scrape-target-is-down"
  app="oes"
  chart="oes-4.0.19"
  component="visibility"
  heritage="Helm"
  instance="10.20.98.237:8096"
  job="kubernetes-pods"
  kubernetes_namespace="isdupg4043to2508"
  kubernetes_pod_name="oes-visibility-5b494b494-w97cc"
  pod_template_hash="5b494b494"
  release="isd"
  severity="critical"
State: firing
Active Since: 2025-12-23 08:43:13.633722481 +0000 UTC
Value: 0
Annotations:
  description: The scrape target endpoint of component visibility in namespace isdupg4043to2508 is down
  summary: oes-visibility scrape target is down
/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention
/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high
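The rules in this group are collapsed above. They most likely mirror the per-component pattern of the autopilot-component-jvm-errors group earlier in this file; the sketch below is a representative reconstruction only (the selector and 90% threshold are assumptions, not the shipped rule):

alert: jvm-memory-too-high
expr: (sum by(instance, kubernetes_pod_name, kubernetes_namespace) (jvm_memory_used_bytes{area="heap"}) / sum by(instance, kubernetes_pod_name, kubernetes_namespace) (jvm_memory_max_bytes{area="heap"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM heap usage for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} is above 90%
    VALUE = {{ $value }}
  summary: JVM heap usage is too high for pod {{ $labels.kubernetes_pod_name }}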
/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down
Labels:
  alertname="kube-api-server-down"
  instance="10.21.0.40:6443"
  job="kubernetes-apiservers"
  severity="critical"
State: firing
Active Since: 2025-12-23 06:34:42.524445036 +0000 UTC
Value: 0
Annotations:
  description: Kubernetes API Server service went down LABELS = map[__name__:up instance:10.21.0.40:6443 job:kubernetes-apiservers]
  summary: Kube API Server job kubernetes-apiservers is down
/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate
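The rule body is collapsed above. An error-rate alert of this kind is usually a ratio of 5xx responses to all responses on the apiserver_request_total counter; the sketch below uses the kubernetes-apiservers job seen in the firing alert above, while the window and 5% threshold are assumptions (not the shipped rule):

alert: kubernetes-api-server-experiencing-high-error-rate
expr: sum(rate(apiserver_request_total{job="kubernetes-apiservers",code=~"5.."}[5m])) / sum(rate(apiserver_request_total{job="kubernetes-apiservers"}[5m])) * 100 > 5
for: 5m
labels:
  severity: warning
annotations:
  description: '{{ $value }}% of requests to the Kubernetes API server are returning 5xx errors'
  summary: Kubernetes API server is experiencing a high error rate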
/etc/alerts.d/spin_alerting_rules.yml > latency-too-high
alert: clouddriver-caching-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__total{service="spin-clouddriver-caching"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__count_total{service="spin-clouddriver-caching"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver_ro_deck-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__total{service="spin-clouddriver-ro-deck"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__count_total{service="spin-clouddriver-ro-deck"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the Service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high
/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue
/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down
alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: Default Prometheus Job is Down LABELS = {{ $labels }}
  summary: The Default Prometheus Job is Down (job {{ $labels.job }})
/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down
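The rules in this group are collapsed above. Like node_down and prometheus-job-down elsewhere in these files, they are presumably per-service checks on the up metric; the sketch below shows one such rule for gate, with the label selector being an assumption rather than the shipped rule:

alert: spin-gate-is-down
# the labels used to select the gate scrape target are assumptions; adjust to the actual relabeling
expr: up{job="kubernetes-pods", service="spin-gate"} == 0
for: 3m
labels:
  severity: critical
annotations:
  description: Spinnaker service spin-gate has not been scraped successfully for more than 3 minutes
  summary: Spinnaker service spin-gate is down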
/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)
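The rule body is collapsed above. A sketch of a persistent-volume space alert on the kubelet volume-stats metrics, matching the "< 10% left" condition in the group name (the window and label names are assumptions, not the shipped rule):

alert: volume-is-almost-full
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  description: PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has only {{ $value }}% space left
  summary: Volume is almost full (< 10% left)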