{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
{%- from "kubernetes/map.jinja" import monitoring with context %}

{%- set network = {} %}
{%- if pool.get('enabled', False) %}
{%- set network = pool.get('network', {}) %}
{%- elif master.get('enabled', False) %}
{%- set network = master.get('network', {}) %}
{%- endif %}
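
{#- Resolve the Kubernetes API address exported as api_ip below and, when the
    Calico Prometheus endpoint is enabled, the address of the Calico metrics
    listener, falling back to the address of the pool or master node. #}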
{%- set apiserver_endpoint = {} %}
{%- if pool.get('enabled', False) %}
{%- set apiserver_endpoint = pool.get('apiserver', {}).get('host') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', pool.address) %}
{%- endif %}
{%- elif master.get('enabled', False) %}
{%- set apiserver_endpoint = master.get('apiserver', {}).get('address') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', master.address) %}
{%- endif %}
{%- endif %}
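
{#
  Illustrative pillar snippet (the kubernetes:pool path and values are
  assumptions; only the keys read above are authoritative) that would enable
  the Calico metrics target on a pool node:

  kubernetes:
    pool:
      enabled: true
      network:
        calico:
          enabled: true
          prometheus:
            enabled: true
            port: 9091
#}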

server:
  target:
    kubernetes:
      enabled: true
      api_ip: {{ apiserver_endpoint }}
      cert_name: prometheus-server.crt
      key_name: prometheus-server.key
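{#- When the Calico Prometheus endpoint is enabled, add a static scrape target
    for it and relabel the scraped address:port back to the node host name. #}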
{%- if calico_address is defined %}
    static:
      calico:
        endpoint:
          - address: {{ calico_address }}
            port: {{ network.calico.prometheus.get('port', 9091) }}
        relabel_configs:
          - regex: {{ calico_address }}:{{ network.calico.prometheus.get('port', 9091) }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
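{#- Recording rules. Each per-container rule derives a "controller" label from
    pod_name by stripping the trailing "-<suffix>" segment (for a hypothetical
    pod nginx-66b6c48dd5-x7k2p the controller becomes nginx-66b6c48dd5) and
    sums the metric by cluster, namespace, controller, pod and container (plus
    scope and type for the page-fault and OOM rates). The cluster:*:percent
    rules express allocation and usage as a percentage of machine capacity. #}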
  recording:
    cluster_namespace_controller_pod_container:spec_memory_limit_bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_memory_limit_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:spec_cpu_shares:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_cpu_shares{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:cpu_usage:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            irate(
              container_cpu_usage_seconds_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_working_set:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_working_set_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_rss:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_rss{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_cache:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_cache{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:disk_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_fs_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_pagefaults:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failures_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_oom:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failcnt{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster:memory_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_memory_limit_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:memory_used:percent:
      query: >-
        100 * sum by (cluster) (
          container_memory_usage_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:cpu_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_cpu_shares{pod_name!=""}
        ) / sum by (cluster) (
          container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
        )
  alert:
{%- set instance_minor_threshold_percent = monitoring.instance_minor_threshold_percent|float %}
{%- set instance_major_threshold_percent = monitoring.instance_major_threshold_percent|float %}
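{#- The two thresholds above are fractions between 0 and 1 (for example, 0.3
    for 30%); the alert conditions use them directly and the annotations below
    multiply them by 100 for display. #}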
    ContainerScrapeError:
      if: "container_scrape_error != 0"
      {% raw %}
      labels:
        severity: warning
        service: kubernetes
      annotations:
        summary: "Failed to get Kubernetes container metrics"
        description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} Kubernetes instance."
      {% endraw %}
    KubernetesProcessDown:
      if: >-
        procstat_running{process_name=~"hyperkube-.*"} == 0
      {% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} process is down"
        description: "The Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node has been down for 2 minutes."
      {% endraw %}
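{#- The three alerts below escalate by coverage: DownMinor and DownMajor fire
    when the share of stopped hyperkube-* processes exceeds the minor or major
    threshold, and Outage fires when every instance of a process is down. #}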
    KubernetesProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_minor_threshold_percent }}
      {% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of the Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) have been down for 2 minutes.
    KubernetesProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: kubernetes
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of the Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) have been down for 2 minutes.
    KubernetesProcessOutage:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
      {% raw %}
      for: 2m
      labels:
        severity: critical
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} cluster outage"
        description: "All Kubernetes {{ $labels.process_name }} processes have been down for 2 minutes."
      {% endraw %}
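{#- The Calico alerts mirror the Kubernetes process alerts above for the
    calico-felix, bird, bird6 and confd processes; they are rendered only when
    Calico is enabled. #}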
{%- if network.get('calico', {}).get('enabled', False) %}
    CalicoProcessDown:
      if: >-
        procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
      {% raw %}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} process is down"
        description: "The Calico {{ $labels.process_name }} process on the {{ $labels.host }} node has been down for 2 minutes."
      {% endraw %}
    CalicoProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_minor_threshold_percent }}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of the Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) have been down for 2 minutes.
    CalicoProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: calico
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of the Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) have been down for 2 minutes.
    CalicoProcessOutage:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
      {% raw %}
      for: 2m
      labels:
        severity: critical
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} cluster outage"
        description: "All Calico {{ $labels.process_name }} processes have been down for 2 minutes."
      {% endraw %}
{%- endif %}
{#
# vim: ft=jinja
#}