Get k8s node readiness status

This patch enables the collectd_k8s_kubectl_get plugin on the Kubernetes
masters and configures alarms on the node readiness status (some, the
majority, or all of the nodes not ready).

Depends-On: I403d06f10a589370fa40ad08a0fcb6d3ec237ba4
Change-Id: Ifcef32312157e742303f8ffc9a1fe215e6cb1a37
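
The collectd_k8s_kubectl_get Python module itself is delivered by the
Depends-On change and is not part of this patch. As a rough, standalone
illustration of the data the alarms below consume (per-status node counts
published as k8s_nodes and percentages as k8s_nodes_percent, with a
not_ready status field), a minimal sketch could look like the following;
the kubectl invocation, JSON handling and output format are assumptions,
not the plugin's actual code:

#!/usr/bin/env python
# Illustrative sketch only; the real collectd_k8s_kubectl_get plugin is part
# of the Depends-On change. It shows the kind of values the heka alarms in
# this patch rely on: k8s_nodes and k8s_nodes_percent broken down by status.
import json
import subprocess


def get_node_statuses():
    # `kubectl get nodes -o json` returns a NodeList; a node counts as ready
    # when its Ready condition reports status "True".
    out = subprocess.check_output(['kubectl', 'get', 'nodes', '-o', 'json'])
    nodes = json.loads(out)['items']
    counts = {'ready': 0, 'not_ready': 0}
    for node in nodes:
        conditions = node.get('status', {}).get('conditions', [])
        ready = any(c.get('type') == 'Ready' and c.get('status') == 'True'
                    for c in conditions)
        counts['ready' if ready else 'not_ready'] += 1
    return counts


if __name__ == '__main__':
    counts = get_node_statuses()
    total = sum(counts.values()) or 1
    for status, count in counts.items():
        # The real plugin dispatches these as collectd metrics carrying a
        # `status` field, which the heka alarm rules match on.
        print('k8s_nodes{status=%s} %d' % (status, count))
        print('k8s_nodes_percent{status=%s} %.1f'
              % (status, 100.0 * count / total))
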
diff --git a/kubernetes/files/collectd_kubectl_get.conf b/kubernetes/files/collectd_kubectl_get.conf
new file mode 100644
index 0000000..1ede3ab
--- /dev/null
+++ b/kubernetes/files/collectd_kubectl_get.conf
@@ -0,0 +1,7 @@
+Import "collectd_k8s_kubectl_get"
+
+<Module "collectd_k8s_kubectl_get">
+ Polling "{{ plugin.interval }}"
+ PollingInterval "{{ plugin.polling_interval }}"
+ GetNodes "{{ plugin.get_nodes }}"
+</Module>
diff --git a/kubernetes/meta/collectd.yml b/kubernetes/meta/collectd.yml
index 577fd24..232770e 100644
--- a/kubernetes/meta/collectd.yml
+++ b/kubernetes/meta/collectd.yml
@@ -43,6 +43,14 @@
expected_content: ok
url: http://127.0.0.1:10249/healthz
{%- endif %}
+{%- if master.get('enabled', False) %}
+ collectd_kubectl_get:
+ plugin: python
+ template: kubernetes/files/collectd_kubectl_get.conf
+ polling_interval: 60
+ interval: 30
+ get_nodes: true
+{%- endif %}
collectd_processes:
process:
diff --git a/kubernetes/meta/heka.yml b/kubernetes/meta/heka.yml
index 9515cb6..65ef392 100644
--- a/kubernetes/meta/heka.yml
+++ b/kubernetes/meta/heka.yml
@@ -79,6 +79,43 @@
periods: 0
function: last
{%- endfor %}
+ k8s_node_some_not_ready:
+ description: 'Some k8s nodes are not ready'
+ severity: warning
+ logical_operator: and
+ rules:
+ - metric: k8s_nodes
+ field:
+ status: not_ready
+ relational_operator: '>'
+ threshold: 0
+ window: 120
+ periods: 0
+ function: last
+ k8s_node_majority_not_ready:
+ description: 'Majority of k8s nodes are not ready'
+ severity: critical
+ rules:
+ - metric: k8s_nodes_percent
+ field:
+ status: not_ready
+ relational_operator: '>'
+ threshold: 50
+ window: 120
+ periods: 0
+ function: last
+ k8s_node_all_not_ready:
+ description: 'All k8s nodes are not ready'
+ severity: down
+ rules:
+ - metric: k8s_nodes_percent
+ field:
+ status: not_ready
+ relational_operator: '=='
+ threshold: 100
+ window: 60
+ periods: 0
+ function: last
{%- endif %}
{%- if pool.get('enabled', False) %}
k8s-kubelet_local_endpoint:
@@ -179,6 +216,14 @@
dimension:
service: k8s-{{ kube_service }}-endpoint
{%- endfor %}
+ k8s-nodes-not-ready:
+ alerting: enabled
+ triggers:
+ - k8s_node_all_not_ready
+ - k8s_node_majority_not_ready
+ - k8s_node_some_not_ready
+ dimension:
+ service: k8s-nodes
{%- endif %}
{%- if pool.get('enabled', False) %}
k8s-kubelet_endpoint:
@@ -266,6 +311,17 @@
dimension:
service: k8s-pool
nagios_host: 01-service-clusters
+ k8s-nodes:
+ policy: highest_severity
+ alerting: enabled
+ group_by: hostname
+ match:
+ service: k8s-nodes
+ members:
+ - k8s-nodes-not-ready
+ dimension:
+ service: k8s-pool
+ nagios_host: 01-service-clusters
k8s_pool:
policy: highest_severity
alerting: enabled_with_notification
@@ -274,6 +330,7 @@
members:
- k8s-kubelet_endpoint
- k8s-proxy_endpoint
+ - k8s-nodes
dimension:
cluster_name: k8s-pool
nagios_host: 00-top-clusters
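
For reference, an illustrative check of which of the three node alarms
would fire for a given cluster state, mirroring (not reusing) the
thresholds defined in kubernetes/meta/heka.yml above; the window, periods
and last-value evaluation performed by heka are deliberately ignored:

# Assumed helper for illustration only, not part of the patch.
def fired_alarms(not_ready, total):
    percent = 100.0 * not_ready / total
    alarms = []
    if not_ready > 0:
        alarms.append('k8s_node_some_not_ready')      # warning
    if percent > 50:
        alarms.append('k8s_node_majority_not_ready')  # critical
    if percent == 100:
        alarms.append('k8s_node_all_not_ready')       # down
    return alarms


print(fired_alarms(6, 10))   # warning + critical
print(fired_alarms(10, 10))  # all three; k8s-nodes aggregates to 'down'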