Get k8s node readiness

The patch adds a collectd plugin that retrieves node statuses via kubectl and configures alarms that fire when some, a majority, or all Kubernetes nodes are not ready.

Depends-On: I403d06f10a589370fa40ad08a0fcb6d3ec237ba4

Change-Id: Ifcef32312157e742303f8ffc9a1fe215e6cb1a37
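
For context, a minimal sketch of the collectd_k8s_kubectl_get Python module that the configuration below imports (the actual module is expected to come from the Depends-On change; the metric names, field mapping and option handling here are assumptions, not part of this patch):

    # Hypothetical sketch only; the real module is provided elsewhere.
    import json
    import subprocess

    import collectd

    CONF = {'get_nodes': False}

    def config_callback(conf):
        # Read the <Module> options rendered from collectd_kubectl_get.conf.
        for node in conf.children:
            if node.key == 'GetNodes':
                CONF['get_nodes'] = str(node.values[0]).lower() == 'true'

    def read_callback():
        if not CONF['get_nodes']:
            return
        # List all nodes and count those whose Ready condition is True.
        out = subprocess.check_output(['kubectl', 'get', 'nodes', '-o', 'json'])
        nodes = json.loads(out)['items']
        ready = sum(
            1 for n in nodes
            if any(c['type'] == 'Ready' and c['status'] == 'True'
                   for c in n['status']['conditions']))
        not_ready = len(nodes) - ready
        total = len(nodes) or 1
        # Intended to map to the k8s_nodes and k8s_nodes_percent metrics with
        # a status field that the heka alarms below evaluate (the exact field
        # mapping depends on the collectd-to-heka decoder).
        for status, count in (('ready', ready), ('not_ready', not_ready)):
            collectd.Values(plugin='k8s_nodes', type='gauge',
                            type_instance=status, values=[count]).dispatch()
            collectd.Values(plugin='k8s_nodes_percent', type='gauge',
                            type_instance=status,
                            values=[100.0 * count / total]).dispatch()

    collectd.register_config(config_callback)
    collectd.register_read(read_callback)
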
diff --git a/kubernetes/files/collectd_kubectl_get.conf b/kubernetes/files/collectd_kubectl_get.conf
new file mode 100644
index 0000000..1ede3ab
--- /dev/null
+++ b/kubernetes/files/collectd_kubectl_get.conf
@@ -0,0 +1,7 @@
+Import "collectd_k8s_kubectl_get"
+
+<Module "collectd_k8s_kubectl_get">
+  Polling "{{ plugin.interval }}"
+  PollingInterval "{{ plugin.polling_interval }}"
+  GetNodes "{{ plugin.get_nodes }}"
+</Module>
diff --git a/kubernetes/meta/collectd.yml b/kubernetes/meta/collectd.yml
index 577fd24..232770e 100644
--- a/kubernetes/meta/collectd.yml
+++ b/kubernetes/meta/collectd.yml
@@ -43,6 +43,14 @@
        expected_content: ok
        url: http://127.0.0.1:10249/healthz
 {%- endif %}
+{%- if master.get('enabled', False) %}
+  collectd_kubectl_get:
+    plugin: python
+    template: kubernetes/files/collectd_kubectl_get.conf
+    polling_interval: 60
+    interval: 30
+    get_nodes: true
+{%- endif %}
 
   collectd_processes:
     process:
diff --git a/kubernetes/meta/heka.yml b/kubernetes/meta/heka.yml
index 9515cb6..65ef392 100644
--- a/kubernetes/meta/heka.yml
+++ b/kubernetes/meta/heka.yml
@@ -79,6 +79,43 @@
         periods: 0
         function: last
 {%- endfor %}
+    k8s_node_some_not_ready:
+      description: 'Some k8s nodes are not ready'
+      severity: warning
+      logical_operator: and
+      rules:
+      - metric: k8s_nodes
+        field:
+          status: not_ready
+        relational_operator: '>'
+        threshold: 0
+        window: 120
+        periods: 0
+        function: last
+    k8s_node_majority_not_ready:
+      description: 'Majority of k8s nodes are not ready'
+      severity: critical
+      rules:
+      - metric: k8s_nodes_percent
+        field:
+          status: not_ready
+        relational_operator: '>'
+        threshold: 50
+        window: 120
+        periods: 0
+        function: last
+    k8s_node_all_not_ready:
+      description: 'All k8s nodes are not ready'
+      severity: down
+      rules:
+      - metric: k8s_nodes_percent
+        field:
+          status: not_ready
+        relational_operator: '=='
+        threshold: 100
+        window: 60
+        periods: 0
+        function: last
 {%- endif %}
 {%- if pool.get('enabled', False) %}
     k8s-kubelet_local_endpoint:
@@ -179,6 +216,14 @@
       dimension:
         service: k8s-{{ kube_service }}-endpoint
     {%- endfor %}
+    k8s-nodes-not-ready:
+      alerting: enabled
+      triggers:
+        - k8s_node_all_not_ready
+        - k8s_node_majority_not_ready
+        - k8s_node_some_not_ready
+      dimension:
+        service: k8s-nodes
 {%- endif %}
 {%- if pool.get('enabled', False) %}
     k8s-kubelet_endpoint:
@@ -266,6 +311,17 @@
       dimension:
         service: k8s-pool
         nagios_host: 01-service-clusters
+    k8s-nodes:
+      policy: highest_severity
+      alerting: enabled
+      group_by: hostname
+      match:
+        service: k8s-nodes
+      members:
+      - k8s-nodes-not-ready
+      dimension:
+        service: k8s-pool
+        nagios_host: 01-service-clusters
     k8s_pool:
       policy: highest_severity
       alerting: enabled_with_notification
@@ -274,6 +330,7 @@
       members:
       - k8s-kubelet_endpoint
       - k8s-proxy_endpoint
+      - k8s-nodes
       dimension:
         cluster_name: k8s-pool
         nagios_host: 00-top-clusters