{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
{%- from "kubernetes/map.jinja" import monitoring with context %}
{%- set network = {} %}
{%- if pool.get('enabled', False) %}
{%- set network = pool.get('network', {}) %}
{%- elif master.get('enabled', False) %}
{%- set network = master.get('network', {}) %}
{%- endif %}
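{#
# The network map comes from whichever role is enabled on this minion;
# pool (worker) settings take precedence over master settings. A minimal
# pillar sketch (illustrative values, not shipped defaults) that would
# enable the Calico scrape target defined below:
#
# kubernetes:
#   pool:
#     enabled: true
#     network:
#       calico:
#         enabled: true
#         prometheus:
#           enabled: true
#           address: 192.168.0.10
#           port: 9091
#}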
{%- set apiServerPoint = {} %}
{%- if pool.get('enabled', False) %}
{%- set apiServerPoint = pool.get('apiserver', {}).get('host') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', pool.address) %}
{%- endif %}
{%- elif master.get('enabled', False) %}
{%- set apiServerPoint = master.get('apiserver', {}).get('address') %}
{%- if network.get('calico', {}).get('enabled', False) and network.calico.get('prometheus', {}).get('enabled', False) %}
{%- set calico_address = network.calico.prometheus.get('address', master.address) %}
{%- endif %}
{%- endif %}
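{#
# apiServerPoint resolves to the API server endpoint published by the
# active role: pool nodes expose it as apiserver:host, masters as
# apiserver:address. calico_address falls back to the node's own
# pool/master address when no explicit prometheus address is pillared.
#}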
server:
  target:
    kubernetes:
      enabled: true
      api_ip: {{ apiServerPoint }}
      cert_name: prometheus-server.crt
      key_name: prometheus-server.key
{%- if calico_address is defined %}
    static:
      calico:
        endpoint:
          - address: {{ calico_address }}
            port: {{ network.calico.prometheus.get('port', 9091) }}
        relabel_configs:
          - regex: {{ calico_address }}:{{ network.calico.prometheus.get('port', 9091) }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
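{#
# The relabel_configs entry above rewrites the target's __address__
# ("address:port") into a readable "host" label, using the host grain of
# the node that rendered this file.
#}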
  recording:
    cluster_namespace_controller_pod_container:spec_memory_limit_bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_memory_limit_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
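{#
# All recording rules in this map share the label_replace() call above: it
# derives a "controller" label by stripping the generated suffix from
# pod_name with the anchored regex ^(.*)-[a-z0-9]+. For example, a
# DaemonSet pod named calico-node-x7k9p (hypothetical name) is aggregated
# under controller="calico-node".
#}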
    cluster_namespace_controller_pod_container:spec_cpu_shares:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_spec_cpu_shares{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:cpu_usage:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            irate(
              container_cpu_usage_seconds_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_working_set:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_working_set_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_rss:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_rss{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_cache:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_memory_cache{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:disk_usage:bytes:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name) (
          label_replace(
            container_fs_usage_bytes{container_name!=""},
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_pagefaults:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failures_total{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster_namespace_controller_pod_container:memory_oom:rate:
      query: >-
        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
          label_replace(
            irate(
              container_memory_failcnt{container_name!=""}[5m]
            ),
            "controller", "$1",
            "pod_name", "^(.*)-[a-z0-9]+"
          )
        )
    cluster:memory_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_memory_limit_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:memory_used:percent:
      query: >-
        100 * sum by (cluster) (
          container_memory_usage_bytes{pod_name!=""}
        ) / sum by (cluster) (
          machine_memory_bytes
        )
    cluster:cpu_allocation:percent:
      query: >-
        100 * sum by (cluster) (
          container_spec_cpu_shares{pod_name!=""}
        ) / sum by (cluster) (
          container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
        )
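{#
# The denominator approximates total allocatable CPU shares:
# container_spec_cpu_shares{id="/"} is the root cgroup's share weight
# (conventionally 1024 under cgroups v1), multiplied by each machine's
# core count, so the ratio compares requested shares against cluster
# capacity.
#}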
  alert:
{%- set instance_minor_threshold_percent = monitoring.instance_minor_threshold_percent|float %}
{%- set instance_major_threshold_percent = monitoring.instance_major_threshold_percent|float %}
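{#
# Both thresholds are fractions between 0 and 1 read from the monitoring
# pillar, e.g. instance_minor_threshold_percent: 0.3 and
# instance_major_threshold_percent: 0.6 (illustrative values). The Minor
# and Major alerts below fire once the number of down processes exceeds
# that fraction of all processes with the same name.
#}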
    ContainerScrapeError:
      if: "container_scrape_error != 0"
{% raw %}
      labels:
        severity: warning
        service: kubernetes
      annotations:
        summary: "Failed to get Kubernetes container metrics"
        description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} Kubernetes instance."
{% endraw %}
    KubernetesProcessDown:
      if: >-
        procstat_running{process_name=~"hyperkube-.*"} == 0
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} process is down"
        description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
    KubernetesProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_minor_threshold_percent }}
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: kubernetes
      annotations:
        summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
    KubernetesProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: kubernetes
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
    KubernetesProcessOutage:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
{% raw %}
      for: 2m
      labels:
        severity: critical
        service: kubernetes
      annotations:
        summary: "Kubernetes {{ $labels.process_name }} cluster outage"
        description: "All Kubernetes {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
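{#
# The four rules above form an escalation ladder: a single down process
# raises a per-node minor alert, crossing the minor/major fractions raises
# cluster-wide alerts, and Outage fires when every process of a given name
# is down. The Calico rules below follow the same pattern.
#}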
{%- if network.get('calico', {}).get('enabled', False) %}
    CalicoProcessDown:
      if: >-
        procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
{% raw %}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} process is down"
        description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
    CalicoProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_minor_threshold_percent }}
      for: 2m
      labels:
        severity: minor
        service: calico
      annotations:
        summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
    CalicoProcessDownMajor:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
      for: 2m
      labels:
        severity: major
        service: calico
      annotations:
        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
        description: >-
          {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
    CalicoProcessOutage:
      if: >-
        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
{% raw %}
      for: 2m
      labels:
        severity: critical
        service: calico
      annotations:
        summary: "Calico {{ $labels.process_name }} cluster outage"
        description: "All Calico {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
{%- endif %}
{#
# vim: ft=jinja
#}