Rationalize alerts for Kubernetes and Calico

Change-Id: Ib752e183920390dbe73ab08431bdcadd0f3cbe49
Closes-Bug: PROD-19945
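
The minor/major instance thresholds are now taken from the kubernetes:monitoring
pillar key through kubernetes/map.jinja, with defaults of 0.3 and 0.6. A minimal
sketch of a pillar override, assuming you want to raise both thresholds (the 0.4
and 0.7 values below are illustrative only, not part of this change):

  kubernetes:
    monitoring:
      instance_minor_threshold_percent: 0.4
      instance_major_threshold_percent: 0.7

Any key left out of the pillar keeps the default defined in kubernetes/map.jinja.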
diff --git a/kubernetes/map.jinja b/kubernetes/map.jinja
index 99e33cc..74f204e 100644
--- a/kubernetes/map.jinja
+++ b/kubernetes/map.jinja
@@ -135,3 +135,10 @@
 }) %}
 {% do pool.network.opencontrail.update(opencontrail) %}
 {%- endif %}
+
+{%- set monitoring = salt['grains.filter_by']({
+  'default': {
+    'instance_minor_threshold_percent': 0.3,
+    'instance_major_threshold_percent': 0.6,
+  },
+}, grain='os_family', merge=salt['pillar.get']('kubernetes:monitoring')) %}
\ No newline at end of file
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
index 5dfad70..3ca5453 100644
--- a/kubernetes/meta/prometheus.yml
+++ b/kubernetes/meta/prometheus.yml
@@ -1,5 +1,6 @@
 {%- from "kubernetes/map.jinja" import master with context %}
 {%- from "kubernetes/map.jinja" import pool with context %}
+{%- from "kubernetes/map.jinja" import monitoring with context %}
 
 {%- set network = {} %}
 {%- if pool.get('enabled', False) %}
@@ -145,61 +146,110 @@
           container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
         )
   alert:
-    AvgKubeletRunningContainerCountLow:
-      if: >-
-        avg_over_time(kubelet_running_container_count[2m]) * 1.3 <
-        avg_over_time(kubelet_running_container_count[10m])
-      {% raw %}
-      labels:
-        severity: warning
-        service: kubernetes
-      annotations:
-        summary: 'Container count is low'
-        description: 'Container count from last 2m is lower than avarage from 10m'
-      {% endraw %}
-    AvgKubeletRunningPODCountLow:
-      if: >-
-        avg_over_time(kubelet_running_pod_count[2m]) * 1.3 <
-        avg_over_time(kubelet_running_pod_count[10m])
-      {% raw %}
-      labels:
-        severity: warning
-        service: kubernetes
-      annotations:
-        summary: 'POD count is low'
-        description: 'POD count from last 2m is lower than avarage from 10m'
-      {% endraw %}
+  {%- set instance_minor_threshold_percent = monitoring.instance_minor_threshold_percent|float %}
+  {%- set instance_major_threshold_percent = monitoring.instance_major_threshold_percent|float %}
     ContainerScrapeError:
-      if: 'container_scrape_error != 0'
-      {% raw %}
+      if: "container_scrape_error != 0"
+    {% raw %}
       labels:
         severity: warning
         service: kubernetes
       annotations:
-        summary: 'Fail to scrape container'
-        description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
-      {% endraw %}
+        summary: "Failed to get the container metrics"
+        description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} instance."
+    {% endraw %}
     KubernetesProcessDown:
       if: >-
         procstat_running{process_name=~"hyperkube-.*"} == 0
-      {% raw %}
+    {% raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: minor
         service: kubernetes
       annotations:
-        summary: 'Kubernetes service {{ $labels.process_name }} is down'
-        description: 'Kubernetes service {{ $labels.process_name }} is down on node {{ $labels.host }}'
-      {% endraw %}
+        summary: "Kubernetes {{ $labels.process_name }} process is down"
+        description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+    {% endraw %}
+    KubernetesProcessDownMinor:
+      if: >-
+        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_minor_threshold_percent }}
+    {% raw %}
+      for: 2m
+      labels:
+        severity: minor
+        service: kubernetes
+      annotations:
+        summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+        description: >-
+          {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+    KubernetesProcessDownMajor:
+      if: >-
+        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
+      for: 2m
+      labels:
+        severity: major
+        service: kubernetes
+      annotations:
+        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+        description: >-
+          {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+    KubernetesProcessOutage:
+      if: >-
+        count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
+    {% raw %}
+      for: 2m
+      labels:
+        severity: critical
+        service: kubernetes
+      annotations:
+        summary: "Kubernetes {{ $labels.process_name }} cluster outage"
+        description: "All Kubernetes {{ $labels.process_name }} process instances are down for at least 2 minutes."
+    {% endraw %}
 {%- if network.get('calico', {}).get('enabled', False) %}
     CalicoProcessDown:
       if: >-
         procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
-      {% raw %}
+    {% raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: minor
         service: calico
       annotations:
-        summary: 'Calico service {{ $labels.process_name }} is down'
-        description: 'Calico service {{ $labels.process_name }} is down on node {{ $labels.host }}'
-      {% endraw %}
+        summary: "Calico {{ $labels.process_name }} process is down"
+        description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+    {% endraw %}
+    CalicoProcessDownMinor:
+      if: >-
+        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_minor_threshold_percent }}
+      for: 2m
+      labels:
+        severity: minor
+        service: calico
+      annotations:
+        summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+        description: >-
+          {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+    CalicoProcessDownMajor:
+      if: >-
+        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
+      for: 2m
+      labels:
+        severity: major
+        service: calico
+      annotations:
+        summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+        description: >-
+          {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+    CalicoProcessOutage:
+      if: >-
+        count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
+    {% raw %}
+      for: 2m
+      labels:
+        severity: critical
+        service: calico
+      annotations:
+        summary: "Calico {{ $labels.process_name }} cluster outage"
+        description: "All Calico {{ $labels.process_name }} process instances are down for at least 2 minutes."
+    {% endraw %}
 {% endif %}