/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high

Labels:       alertname="POD_CPU_IS_HIGH"
              container="alpha"
              namespace="ssd-demo-instance"
              pod="dgraph-0"
              severity="critical"
State:        firing
Active Since: 2026-01-14 05:23:39.87619941 +0000 UTC
Value:        314.66131673438707
Annotations:
  description: Container alpha CPU usage inside POD dgraph-0 is high in ssd-demo-instance
  summary:     POD dgraph-0 CPU Usage is high in ssd-demo-instance
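
The rule expression behind this group is not included in the dump. A minimal sketch of a rule that could produce the alert above, assuming the usual cAdvisor counter container_cpu_usage_seconds_total and a per-core percentage threshold (both assumptions; the real rule may differ):

alert: POD_CPU_IS_HIGH
# Hypothetical expression: per-container CPU usage as a percentage of one core.
# The 300% threshold is illustrative, chosen only because the firing value is ~314.
expr: sum by(container, pod, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[5m])) * 100 > 300
for: 5m
labels:
  severity: critical
annotations:
  description: Container {{ $labels.container }} CPU usage inside POD {{ $labels.pod }} is high in {{ $labels.namespace }}
  summary: POD {{ $labels.pod }} CPU Usage is high in {{ $labels.namespace }}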

/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high

Labels:       alertname="POD_MEMORY_USAGE_IS_HIGH"
              container="minio"
              namespace="ssd-demo-instance"
              pod="ssd-minio-6d9ddbcc9b-jbxph"
              severity="critical"
State:        firing
Active Since: 2026-01-15 06:54:29.308708883 +0000 UTC
Value:        84.36991373697916
Annotations:
  description: Container Memory usage is above 80%
               VALUE = 84.36991373697916
               LABELS = map[container:minio namespace:ssd-demo-instance pod:ssd-minio-6d9ddbcc9b-jbxph]
  summary:     Container minio Memory usage inside POD ssd-minio-6d9ddbcc9b-jbxph is high in ssd-demo-instance

Labels:       alertname="POD_MEMORY_USAGE_IS_HIGH"
              container="minio"
              namespace="ssd-demo"
              pod="ssd-demo-minio-6d9d775dc6-56vtb"
              severity="critical"
State:        firing
Active Since: 2026-01-14 05:23:29.308708883 +0000 UTC
Value:        89.9810791015625
Annotations:
  description: Container Memory usage is above 80%
               VALUE = 89.9810791015625
               LABELS = map[container:minio namespace:ssd-demo pod:ssd-demo-minio-6d9d775dc6-56vtb]
  summary:     Container minio Memory usage inside POD ssd-demo-minio-6d9d775dc6-56vtb is high in ssd-demo
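
The description pins the threshold at 80%, but the rule body is likewise not shown. A sketch of an expression consistent with that description, assuming working-set bytes are compared against the container's memory limit from kube-state-metrics (the metric pairing is an assumption):

alert: POD_MEMORY_USAGE_IS_HIGH
# Hypothetical expression: working-set memory as a percentage of the configured limit.
expr: (sum by(container, pod, namespace) (container_memory_working_set_bytes{image!=""}) / sum by(container, pod, namespace) (kube_pod_container_resource_limits{resource="memory"})) * 100 > 80
for: 5m
labels:
  severity: critical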

/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80

/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low

/etc/alerts.d/node_alerting_rules.yml > node_down

alert: NODE_DOWN
expr: up{component="node-exporter"} == 0
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node seems to be down.'
  summary: Node {{ $labels.kubernetes_node }} is down
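
Rule files under /etc/alerts.d/ can be validated before Prometheus reloads them; promtool, which ships with Prometheus, checks both the YAML structure and the PromQL in every expr:

promtool check rules /etc/alerts.d/node_alerting_rules.yml
promtool check rules /etc/alerts.d/spin_alerting_rules.yml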

/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10

/etc/alerts.d/spin_alerting_rules.yml > Front50-cache

/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-jvm-errors

alert: jvm-memory-filling-up-for-oes-audit-client
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="auditclient"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="auditclient"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-autopilot
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="autopilot"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="autopilot"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-dashboard
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="dashboard"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="dashboard"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-platform
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="platform"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="platform"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-sapor
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="sapor"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="sapor"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

alert: jvm-memory-filling-up-for-oes-visibility
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component="visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component="visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}
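
The six rules above differ only in the component selector. A hedged consolidation, assuming nothing else distinguishes them, would be a single rule with a regex matcher (the alert name here is illustrative):

alert: jvm-memory-filling-up-for-oes-component
expr: (sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_used_bytes{app="oes",area="heap",component=~"auditclient|autopilot|dashboard|platform|sapor|visibility"}) / sum by(instance, kubernetes_pod_name, component, kubernetes_namespace) (jvm_memory_max_bytes{app="oes",area="heap",component=~"auditclient|autopilot|dashboard|platform|sapor|visibility"})) * 100 > 90
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    JVM memory is filling up for {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }} (> 90%)
    VALUE = {{ $value }}
  summary: JVM memory filling up for {{ $labels.component }} for pod {{ $labels.kubernetes_pod_name }} in namespace {{ $labels.kubernetes_namespace }}

Because component is among the grouping labels, each component still produces its own alert instance.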

/etc/alerts.d/spin_alerting_rules.yml > autopilot-component-latency-too-high

alert: oes-audit-client-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="auditclient"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="auditclient"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-autopilot-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="autopilot"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="autopilot"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-dashboard-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="dashboard"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="dashboard"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-platform-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="platform"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="platform"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-sapor-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="sapor"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="sapor"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high

alert: oes-visibility-latency-too-high
expr: sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_sum{component="visibility"}[2m])) / sum by(kubernetes_pod_name, method, outcome, status, component, kubernetes_namespace, uri) (rate(http_server_requests_seconds_count{component="visibility"}[2m])) > 0.5
for: 2m
labels:
  severity: warning
annotations:
  description: Latency of the component {{ $labels.component }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the component {{ $labels.component }} in namespace {{ $labels.kubernetes_namespace }} is high
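
Each expr divides the rate of http_server_requests_seconds_sum by the rate of http_server_requests_seconds_count, which yields the mean request duration over the 2m window: for example, 30 extra seconds of cumulative handler time spread over 40 extra requests is 0.75 s average latency, which clears the 0.5 s threshold. Means hide tail behavior; if Micrometer percentile histograms are enabled for these services (an assumption), a tail-latency variant could alert on the 95th percentile instead:

expr: histogram_quantile(0.95, sum by(le, component, kubernetes_namespace) (rate(http_server_requests_seconds_bucket{component="sapor"}[2m]))) > 0.5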

/etc/alerts.d/spin_alerting_rules.yml > autopilot-scrape-target-is-down

/etc/alerts.d/spin_alerting_rules.yml > igor-needs-attention

/etc/alerts.d/spin_alerting_rules.yml > jvm-too-high

/etc/alerts.d/spin_alerting_rules.yml > kube-api-server-is-down

/etc/alerts.d/spin_alerting_rules.yml > kubernetes-api-server-experiencing-high-error-rate

/etc/alerts.d/spin_alerting_rules.yml > latency-too-high

alert: clouddriver-caching-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__total{service="spin-clouddriver-caching"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_caching:controller:invocations__count_total{service="spin-clouddriver-caching"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-ro-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__total{service="spin-clouddriver-ro"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro:controller:invocations__count_total{service="spin-clouddriver-ro"}[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver-rw-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__total{service="spin-clouddriver-rw"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_rw:controller:invocations__count_total{service="spin-clouddriver-rw"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: clouddriver_ro_deck-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__total{service="spin-clouddriver-ro-deck"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(clouddriver_ro_deck:controller:invocations__count_total{service="spin-clouddriver-ro-deck"}[5m])) > 5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_scheduler-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__total{service="spin-echo-scheduler"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_scheduler:controller:invocations__count_total{service="spin-echo-scheduler"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: echo_worker-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__total{service="spin-echo-worker"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(echo_worker:controller:invocations__count_total{service="spin-echo-worker"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: fiat-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__total{service="spin-fiat"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(fiat:controller:invocations__count_total{service="spin-fiat"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: front50-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__total{service="spin-front50"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(front50:controller:invocations__count_total{service="spin-front50"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: gate-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__total{service="spin-gate"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(gate:controller:invocations__count_total{service="spin-gate"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: igor-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__total{service="spin-igor"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(igor:controller:invocations__count_total{service="spin-igor"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: orca-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__total{service="spin-orca"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(orca:controller:invocations__count_total{service="spin-orca"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high

alert: rosco-latency-too-high
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__total{service="spin-rosco"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate(rosco:controller:invocations__count_total{service="spin-rosco"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high
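
All twelve rules share one shape: mean controller-invocation latency per service, computed as rate(sum)/rate(count) over 5m. Since the per-service metric names differ only in prefix, a hedged sketch of a single replacement rule using a __name__ regex selector is possible; note that it collapses the per-service thresholds (5 s, 1 s, 0.5 s) into one value, so it is a trade-off rather than a drop-in substitute:

alert: spinnaker-service-latency-too-high
# Illustrative consolidation; the regex is fully anchored, so the __total
# selector does not also match the __count_total series.
expr: sum by(controller, instance, method, success, statusCode, service, namespace) (rate({__name__=~".+:controller:invocations__total"}[5m])) / sum by(controller, instance, method, success, statusCode, service, namespace) (rate({__name__=~".+:controller:invocations__count_total"}[5m])) > 0.5
for: 15m
labels:
  severity: warning
annotations:
  description: Latency of the service {{ $labels.service }} is {{ $value }} seconds for {{ $labels }}
  summary: Latency of the service {{ $labels.service }} in namespace {{ $labels.namespace }} is high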

/etc/alerts.d/spin_alerting_rules.yml > orca-queue-issue

/etc/alerts.d/spin_alerting_rules.yml > prometheus-job-down

alert: prometheus-job-is-down
expr: up{job="prometheus"} == 0
for: 5m
labels:
  severity: warning
annotations:
  description: Default Prometheus job is down. LABELS = {{ $labels }}
  summary: The default Prometheus job is down (job {{ $labels.job }})
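
Alert conditions like this one can be unit-tested offline. A hedged sketch of a promtool test file (the file name and series values are illustrative; the schema is standard promtool):

# prometheus_rules_test.yml
rule_files:
  - spin_alerting_rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # up == 0 for ten consecutive minutes, long enough to satisfy `for: 5m`.
      - series: 'up{job="prometheus"}'
        values: '0x10'
    alert_rule_test:
      - eval_time: 6m
        alertname: prometheus-job-is-down
        exp_alerts:
          - exp_labels:
              job: prometheus
              severity: warning

Run it with: promtool test rules prometheus_rules_test.yml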

/etc/alerts.d/spin_alerting_rules.yml > spinnaker-service-is-down

/etc/alerts.d/spin_alerting_rules.yml > volume-is-almost-full (< 10% left)

Labels:       alertname="pvc-storage-full"
              beta_kubernetes_io_arch="amd64"
              beta_kubernetes_io_instance_type="e2-highmem-8"
              beta_kubernetes_io_os="linux"
              cloud_google_com_gke_boot_disk="pd-balanced"
              cloud_google_com_gke_container_runtime="containerd"
              cloud_google_com_gke_cpu_scaling_level="8"
              cloud_google_com_gke_logging_variant="DEFAULT"
              cloud_google_com_gke_max_pods_per_node="110"
              cloud_google_com_gke_memory_gb_scaling_level="65"
              cloud_google_com_gke_nodepool="dnode-pool"
              cloud_google_com_gke_os_distribution="cos"
              cloud_google_com_gke_provisioning="standard"
              cloud_google_com_gke_stack_type="IPV4"
              cloud_google_com_machine_family="e2"
              cloud_google_com_private_node="false"
              failure_domain_beta_kubernetes_io_region="us-west2"
              failure_domain_beta_kubernetes_io_zone="us-west2-a"
              instance="gke-isd312-saas-demo-dnode-pool-cf18a175-mdmw"
              job="kubernetes-nodes"
              kubernetes_io_arch="amd64"
              kubernetes_io_hostname="gke-isd312-saas-demo-dnode-pool-cf18a175-mdmw"
              kubernetes_io_os="linux"
              namespace="monitoring"
              node_kubernetes_io_instance_type="e2-highmem-8"
              persistentvolumeclaim="prometheus-server"
              severity="warning"
              topology_gke_io_zone="us-west2-a"
              topology_kubernetes_io_region="us-west2"
              topology_kubernetes_io_zone="us-west2-a"
State:        firing
Active Since: 2026-01-14 05:23:05.000257301 +0000 UTC
Value:        0
Annotations:
  description: Volume is almost full (< 10% left)
               VALUE = 0
               LABELS = map[beta_kubernetes_io_arch:amd64 beta_kubernetes_io_instance_type:e2-highmem-8 beta_kubernetes_io_os:linux cloud_google_com_gke_boot_disk:pd-balanced cloud_google_com_gke_container_runtime:containerd cloud_google_com_gke_cpu_scaling_level:8 cloud_google_com_gke_logging_variant:DEFAULT cloud_google_com_gke_max_pods_per_node:110 cloud_google_com_gke_memory_gb_scaling_level:65 cloud_google_com_gke_nodepool:dnode-pool cloud_google_com_gke_os_distribution:cos cloud_google_com_gke_provisioning:standard cloud_google_com_gke_stack_type:IPV4 cloud_google_com_machine_family:e2 cloud_google_com_private_node:false failure_domain_beta_kubernetes_io_region:us-west2 failure_domain_beta_kubernetes_io_zone:us-west2-a instance:gke-isd312-saas-demo-dnode-pool-cf18a175-mdmw job:kubernetes-nodes kubernetes_io_arch:amd64 kubernetes_io_hostname:gke-isd312-saas-demo-dnode-pool-cf18a175-mdmw kubernetes_io_os:linux namespace:monitoring node_kubernetes_io_instance_type:e2-highmem-8 persistentvolumeclaim:prometheus-server topology_gke_io_zone:us-west2-a topology_kubernetes_io_region:us-west2 topology_kubernetes_io_zone:us-west2-a]
  summary:     Kubernetes Volume running out of disk space (persistentvolumeclaim prometheus-server in namespace monitoring)
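
The rule body for pvc-storage-full is not shown. A sketch consistent with the "(< 10% left)" description and the firing Value of 0 (percent of capacity still available), assuming the standard kubelet volume-stats metrics; the for duration and severity are illustrative:

alert: pvc-storage-full
# Hypothetical expression: percentage of the PVC's capacity still available.
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Volume is almost full (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Kubernetes Volume running out of disk space (persistentvolumeclaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }})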