Convert Kubernetes recording rules from list entries to name-keyed mappings
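Fix Kubernetes and Calico alerts to check procstat_running instead of container_last_seen/procstat_pid, and drop the keepalived procstat input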
Change-Id: I1eb658414e8de4108d61109c69c64c02a006a105
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
index 03ab5b5..8ab9a9f 100644
--- a/kubernetes/meta/prometheus.yml
+++ b/kubernetes/meta/prometheus.yml
@@ -27,7 +27,7 @@
{%- endif %}
{% raw %}
recording:
- - name: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+ cluster_namespace_controller_pod_container:spec_memory_limit_bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -36,7 +36,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:spec_cpu_shares
+ cluster_namespace_controller_pod_container:spec_cpu_shares:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -45,7 +45,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:cpu_usage:rate
+ cluster_namespace_controller_pod_container:cpu_usage:rate:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -56,7 +56,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_usage:bytes
+ cluster_namespace_controller_pod_container:memory_usage:bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -65,7 +65,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_working_set:bytes
+ cluster_namespace_controller_pod_container:memory_working_set:bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -74,7 +74,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_rss:bytes
+ cluster_namespace_controller_pod_container:memory_rss:bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -83,7 +83,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_cache:bytes
+ cluster_namespace_controller_pod_container:memory_cache:bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -92,7 +92,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:disk_usage:bytes
+ cluster_namespace_controller_pod_container:disk_usage:bytes:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -101,7 +101,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+ cluster_namespace_controller_pod_container:memory_pagefaults:rate:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
@@ -112,7 +112,7 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster_namespace_controller_pod_container:memory_oom:rate
+ cluster_namespace_controller_pod_container:memory_oom:rate:
query: >-
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
@@ -123,21 +123,21 @@
"pod_name", "^(.*)-[a-z0-9]+"
)
)
- - name: cluster:memory_allocation:percent
+ cluster:memory_allocation:percent:
query: >-
100 * sum by (cluster) (
container_spec_memory_limit_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
- - name: cluster:memory_used:percent
+ cluster:memory_used:percent:
query: >-
100 * sum by (cluster) (
container_memory_usage_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
- - name: cluster:cpu_allocation:percent
+ cluster:cpu_allocation:percent:
query: >-
100 * sum by (cluster) (
container_spec_cpu_shares{pod_name!=""}
@@ -173,66 +173,26 @@
annotations:
summary: 'Fail to scrape container'
description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
- ContainerLastSeenKubernetes:
+ ProcstatRunningKubernetes:
if: >-
- time() - container_last_seen{id=~"/system.slice/kube-.*.service"} > 60
+ procstat_running{process_name=~"hypercube-.*"} == 0
labels:
severity: warning
service: kubernetes
annotations:
- summary: 'Kubernetes service {{ $labels.id }} is down'
- description: 'Kubernetes service {{ $labels.id }} is down on node {{ $labels.instance }}'
+ summary: 'Kubernetes service {{ $labels.process_name }} is down'
+ description: 'Kubernetes service {{ $labels.process_name }} is down on node {{ $labels.host }}'
{% endraw %}
{%- if network is defined and network.get('engine', None) == 'calico' %}
{% raw %}
- ContainerLastSeenCalicoFelix:
+ ProcstatRunningCalico:
if: >-
- time() - container_last_seen{id="/system.slice/calico-node.service"} > 60
+ procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
labels:
severity: warning
- service: calico-felix
+ service: calico
annotations:
- summary: 'Calico service {{ $labels.id }} is down'
- description: 'Calico service {{ $labels.id }} is down on node {{ $labels.instance }}'
- ProcstatPidBird:
- if: >-
- absent(procstat_pid{process_name="bird"}) OR
- count(procstat_pid{process_name="bird"}) < count(up{job="kubernetes-node"} == 1)
- labels:
- severity: warning
- service: bird
- annotations:
- summary: 'Bird process is missing'
- description: 'Bird process is not running on all k8s nodes'
- ProcstatPidBird6:
- if: >-
- absent(procstat_pid{process_name="bird6"}) OR
- count(procstat_pid{process_name="bird6"}) < count(up{job="kubernetes-node"} == 1)
- labels:
- severity: warning
- service: bird6
- annotations:
- summary: 'Bird6 process is missing'
- description: 'Bird6 process is not running on all k8s nodes'
- ProcstatPidConfd:
- if: >-
- absent(procstat_pid{process_name="confd"}) OR
- count(procstat_pid{process_name="confd"}) < count(up{job="kubernetes-node"} == 1)
- labels:
- severity: warning
- service: confd
- annotations:
- summary: 'Confd process is missing'
- description: 'Confd process is not running on all k8s nodes'
- ProcstatPidCalicoFelix:
- if: >-
- absent(procstat_pid{process_name="calico-felix"}) OR
- count(procstat_pid{process_name="calico-felix"}) < count(up{job="kubernetes-node"} == 1)
- labels:
- severity: warning
- service: calico-felix
- annotations:
- summary: 'Calico-felix process is missing'
- description: 'Calico-felix process is not running on all k8s nodes'
+ summary: 'Calico service {{ $labels.process_name }} is down'
+ description: 'Calico service {{ $labels.process_name }} is down on node {{ $labels.host }}'
{% endraw %}
{% endif %}
diff --git a/kubernetes/meta/telegraf.yml b/kubernetes/meta/telegraf.yml
index b0e5967..92c63d2 100644
--- a/kubernetes/meta/telegraf.yml
+++ b/kubernetes/meta/telegraf.yml
@@ -14,8 +14,6 @@
input:
procstat:
process:
- keepalived:
- exe: keepalived
{%- if master.get('enabled', False) %}
hypercube-apiserver:
pattern: "hyperkube.*apiserver"