{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
{%- from "kubernetes/map.jinja" import monitoring with context %}
{%- set network = {} %}
{%- if pool.get('enabled', False) %}
{%- set network = pool.get('network', {}) %}
{%- elif master.get('enabled', False) %}
{%- set network = master.get('network', {}) %}
{%- endif %}
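{#
# The network map comes from whichever role is enabled on this minion;
# pool (worker) settings take precedence over master settings. A minimal
# pillar sketch (illustrative values, not shipped defaults) that would
# enable the Calico scrape target defined below:
#
# kubernetes:
#   pool:
#     enabled: true
#     network:
#       calico:
#         enabled: true
#         prometheus:
#           enabled: true
#           address: 192.168.0.10
#           port: 9091
#}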
{%- set apiServerPoint = {} %}
{%- if pool.get('enabled', False) %}
{%- set apiServerPoint = pool.get('apiserver', {}).get('host') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', pool.address) %}
{%- endif %}
{%- elif master.get('enabled', False) %}
{%- set apiServerPoint = master.get('apiserver', {}).get('address') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', master.address) %}
{%- endif %}
{%- endif %}
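{#
# apiServerPoint resolves to the API server endpoint published by the
# active role: pool nodes expose it as apiserver:host, masters as
# apiserver:address. calico_address falls back to the node's own
# pool/master address when no explicit prometheus address is pillared.
#}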
server:
  target:
    kubernetes:
      enabled: true
      api_ip: {{ apiServerPoint }}
      cert_name: prometheus-server.crt
      key_name: prometheus-server.key
{%- if calico_address is defined %}
    static:
      calico:
        endpoint:
          - address: {{ calico_address }}
            port: {{ network.calico.prometheus.get('port', 9091) }}
        relabel_configs:
          - regex: {{ calico_address }}:{{ network.calico.prometheus.get('port', 9091) }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
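{#
# The relabel_configs entry above rewrites the target's __address__
# ("address:port") into a readable "host" label, using the host grain of
# the node that rendered this file.
#}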
  recording:
    cluster_namespace_controller_pod_container:spec_memory_limit_bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_memory_limit_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
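{#
# All recording rules in this map share the label_replace() call above: it
# derives a "controller" label by stripping the generated suffix from
# pod_name with the anchored regex ^(.*)-[a-z0-9]+. For example, a
# DaemonSet pod named calico-node-x7k9p (hypothetical name) is aggregated
# under controller="calico-node".
#}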
    cluster_namespace_controller_pod_container:spec_cpu_shares:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_cpu_shares{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:cpu_usage:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            irate(
              container_cpu_usage_seconds_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_working_set:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_working_set_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_rss:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_rss{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_cache:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_cache{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:disk_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_fs_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_pagefaults:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failures_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_oom:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failcnt{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster:memory_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_memory_limit_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:memory_used:percent:
      query: >-
        100 * sum by (cluster) (
          container_memory_usage_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:cpu_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_cpu_shares{pod_name!=""}
        ) / sum by (cluster) (
          container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
        )
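{#
# The denominator approximates total allocatable CPU shares:
# container_spec_cpu_shares{id="/"} is the root cgroup's share weight
# (conventionally 1024 under cgroups v1), multiplied by each machine's
# core count, so the ratio compares requested shares against cluster
# capacity.
#}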
  alert:
{%- set instance_minor_threshold_percent = monitoring.instance_minor_threshold_percent|float %}
{%- set instance_major_threshold_percent = monitoring.instance_major_threshold_percent|float %}
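{#
# Both thresholds are fractions between 0 and 1 read from the monitoring
# pillar, e.g. instance_minor_threshold_percent: 0.3 and
# instance_major_threshold_percent: 0.6 (illustrative values). The Minor
# and Major alerts below fire once the number of down processes exceeds
# that fraction of all processes with the same name.
#}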
    ContainerScrapeError:
      if: "container_scrape_error != 0"
{% raw %}
      labels:
        severity: warning
        service: kubernetes
      annotations:
        summary: "Failed to get Kubernetes container metrics"
        description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} Kubernetes instance."
{% endraw %}
    KubernetesProcessDown:
      if: >-
        procstat_running{process_name=~"hyperkube-.*"} == 0
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} process is down"
        description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
    KubernetesProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_minor_threshold_percent }}
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
    KubernetesProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: kubernetes
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
    KubernetesProcessOutage:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
{% raw %}
      for: 2m
      labels:
        severity: critical
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} cluster outage"
        description: "All Kubernetes {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
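{#
# The four rules above form an escalation ladder: a single down process
# raises a per-node minor alert, crossing the minor/major fractions raises
# cluster-wide alerts, and Outage fires when every process of a given name
# is down. The Calico rules below follow the same pattern.
#}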
{%- if network.get('calico', {}).get('enabled', False) %}
    CalicoProcessDown:
      if: >-
        procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} process is down"
        description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
    CalicoProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_minor_threshold_percent }}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
    CalicoProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: calico
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
    CalicoProcessOutage:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
{% raw %}
      for: 2m
      labels:
        severity: critical
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} cluster outage"
        description: "All Calico {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
{%- endif %}
{#
# vim: ft=jinja
#}