Add new prometheus alerts

AvgKubeletRunningContainerCountLow
AvgKubeletRunningPODCountLow
ContainerScrapeError
ProcstatPidBird
ProcstatPidBird6
ProcstatPidConfd
ProcstatPidCalicoFelix

Change-Id: I2ab756ecea762721c41a5fdfe12dbdb3532d6985
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
index 12cfa32..e6cea72 100644
--- a/kubernetes/meta/prometheus.yml
+++ b/kubernetes/meta/prometheus.yml
@@ -1,3 +1,14 @@
+{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import pool with context %}
+
+{%- if pool.get('enabled', False) %}{# Prefer the pool's network settings; otherwise fall back to the master's. #}
+{% set network = pool.get('network', {}) %}{# Default is a mapping because .get('engine') is called on this value further down. #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}{# Same mapping default as the pool branch. #}
+{% endif %}
+{% endif %}
+
 {% raw %}
 server:
   recording:
@@ -118,4 +129,77 @@
         ) / sum by (cluster) (
           container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
         )
+  alert:
+    AvgKubeletRunningContainerCountLow:  # 2m average scaled by 1.3 is still below the 10m average
+      if: >-
+        avg_over_time(kubelet_running_container_count[2m]) * 1.3 <
+        avg_over_time(kubelet_running_container_count[10m])
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'Container count is low'
+        description: 'Container count from last 2m is lower than average from 10m'
+    AvgKubeletRunningPODCountLow:  # 2m average scaled by 1.3 is still below the 10m average
+      if: >-
+        avg_over_time(kubelet_running_pod_count[2m]) * 1.3 <
+        avg_over_time(kubelet_running_pod_count[10m])
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'POD count is low'
+        description: 'POD count from last 2m is lower than average from 10m'
+    ContainerScrapeError:  # fires for any series where container_scrape_error is non-zero
+      if: 'container_scrape_error != 0'
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'Fail to scrape container'
+        description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
 {% endraw %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}{# The process alerts below are rendered only when the selected network engine is calico. #}
+{% raw %}
+    ProcstatPidBird:  # no procstat_pid series for "bird", or fewer of them than kubernetes-node targets with up == 1
+      if: >-
+        absent(procstat_pid{process_name="bird"}) OR
+        count(procstat_pid{process_name="bird"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: bird
+      annotations:
+        summary: 'Bird process is missing'
+        description: 'Bird process is not running on all k8s nodes'
+    ProcstatPidBird6:  # same condition as ProcstatPidBird, for process_name="bird6"
+      if: >-
+        absent(procstat_pid{process_name="bird6"}) OR
+        count(procstat_pid{process_name="bird6"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: bird6
+      annotations:
+        summary: 'Bird6 process is missing'
+        description: 'Bird6 process is not running on all k8s nodes'
+    ProcstatPidConfd:  # same condition as ProcstatPidBird, for process_name="confd"
+      if: >-
+        absent(procstat_pid{process_name="confd"}) OR
+        count(procstat_pid{process_name="confd"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: confd
+      annotations:
+        summary: 'Confd process is missing'
+        description: 'Confd process is not running on all k8s nodes'
+    ProcstatPidCalicoFelix:  # same condition as ProcstatPidBird, for process_name="calico-felix"
+      if: >-
+        absent(procstat_pid{process_name="calico-felix"}) OR
+        count(procstat_pid{process_name="calico-felix"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: calico-felix
+      annotations:
+        summary: 'Calico-felix process is missing'
+        description: 'Calico-felix process is not running on all k8s nodes'
+{% endraw %}
+{% endif %}