Add monitoring for Calico service processes
Change-Id: Iad448f320fdc9503507e35642d8d23998b04e1ff
diff --git a/kubernetes/meta/collectd.yml b/kubernetes/meta/collectd.yml
index c5df22d..0238668 100644
--- a/kubernetes/meta/collectd.yml
+++ b/kubernetes/meta/collectd.yml
@@ -1,6 +1,14 @@
{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
+{%- if pool.get('enabled', False) %}{#- pool settings win when both roles are enabled #}
+{% set network = pool.get('network', {}) %}{#- dict default: later code calls network.get('engine') #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}
+{% endif %}
+{% endif %}
+
{%- if master.get('enabled', False) or pool.get('enabled', False) %}
local_plugin:
collectd_http_check:
@@ -47,4 +55,14 @@
k8s-proxy:
match: hyperkube.*proxy
{%- endif %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+ calico-docker:  # entry name is reused as the "service" field of the heka calico_docker trigger
+ match: docker run .*projectcalico/calico/node
+ calico-felix:  # pattern is anchored at the end with "$"
+ match: calico-felix$
+ calico-bird6:  # bird6 invoked with a bird6.cfg located under /etc/calico
+ match: bird6 .*/etc/calico/.*/bird6.cfg
+ calico-confd:  # confd invoked with the /etc/calico/confd path
+ match: confd .*/etc/calico/confd
+{%- endif %}
{%- endif %}
diff --git a/kubernetes/meta/heka.yml b/kubernetes/meta/heka.yml
index 43d1dd6..15aa246 100644
--- a/kubernetes/meta/heka.yml
+++ b/kubernetes/meta/heka.yml
@@ -106,6 +106,56 @@
periods: 0
function: last
{%- endif %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+ calico_docker:  # NOTE(review): the guard needs "network" to be set in this template; not visible in this hunk - confirm
+ description: "There is no docker calico-node process running"
+ severity: down
+ rules:
+ - metric: lma_components_processes
+ field:
+ service: calico-docker  # same name as the calico-docker process entry in collectd.yml
+ relational_operator: '=='
+ threshold: 0  # with '==' and function "last": presumably fires when the latest sample is 0 - confirm
+ window: 60
+ periods: 0
+ function: last
+ calico_felix:  # same rule shape as calico_docker, for service calico-felix
+ description: "There is no calico-felix process running"
+ severity: down
+ rules:
+ - metric: lma_components_processes
+ field:
+ service: calico-felix
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
+ calico_bird6:  # same rule shape as calico_docker, for service calico-bird6
+ description: "There is no calico-bird6 process running"
+ severity: down
+ rules:
+ - metric: lma_components_processes
+ field:
+ service: calico-bird6
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
+ calico_confd:  # same rule shape as calico_docker, for service calico-confd
+ description: "There is no calico-confd process running"
+ severity: down
+ rules:
+ - metric: lma_components_processes
+ field:
+ service: calico-confd
+ relational_operator: '=='
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
+{%- endif %}
alarm:
{%- if master.get('enabled', False) %}
@@ -132,6 +182,32 @@
dimension:
service: k8s-proxy-endpoint
{%- endif %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+ calico_docker:  # NOTE(review): the guard needs "network" to be set in this template; not visible in this hunk - confirm
+ alerting: enabled
+ triggers:
+ - calico_docker  # name of the trigger defined for the calico-docker service
+ dimension:
+ process: calico-docker  # the calico_docker cluster entry later in this file matches on this process value
+ calico_felix:
+ alerting: enabled
+ triggers:
+ - calico_felix
+ dimension:
+ process: calico-felix  # matched by the calico_felix cluster entry
+ calico_bird6:
+ alerting: enabled
+ triggers:
+ - calico_bird6
+ dimension:
+ process: calico-bird6  # matched by the calico_bird6 cluster entry
+ calico_confd:
+ alerting: enabled
+ triggers:
+ - calico_confd
+ dimension:
+ process: calico-confd  # matched by the calico_confd cluster entry
+{%- endif %}
{%- endif %}
{%- if master.get('enabled', False) %}
@@ -195,4 +271,63 @@
dimension:
cluster_name: k8s-master
nagios_host: 00-top-clusters
+{%- if master.get('network', {}).get('engine', None) == 'calico' %}{#- dict default keeps the chained .get() safe when network is unset #}
+ calico_docker:  # per-process entry; carries service=calico so the "calico" entry below can match it
+ policy: availability_of_members
+ alerting: enabled
+ group_by: hostname
+ match:
+ process: calico-docker  # same value as the "process" dimension of the calico_docker alarm
+ members:
+ - calico_docker
+ dimension:
+ service: calico
+ nagios_host: 01-service-clusters
+ calico_felix:  # same shape as calico_docker, for process calico-felix
+ policy: availability_of_members
+ alerting: enabled
+ group_by: hostname
+ match:
+ process: calico-felix
+ members:
+ - calico_felix
+ dimension:
+ service: calico
+ nagios_host: 01-service-clusters
+ calico_bird6:  # same shape as calico_docker, for process calico-bird6
+ policy: availability_of_members
+ alerting: enabled
+ group_by: hostname
+ match:
+ process: calico-bird6
+ members:
+ - calico_bird6
+ dimension:
+ service: calico
+ nagios_host: 01-service-clusters
+ calico_confd:  # same shape as calico_docker, for process calico-confd
+ policy: availability_of_members
+ alerting: enabled
+ group_by: hostname
+ match:
+ process: calico-confd
+ members:
+ - calico_confd
+ dimension:
+ service: calico
+ nagios_host: 01-service-clusters
+ calico:  # roll-up of the four per-process entries above, selected via service: calico
+ policy: highest_severity
+ alerting: enabled_with_notification
+ match:
+ service: calico
+ members:
+ - calico_docker
+ - calico_felix
+ - calico_bird6
+ - calico_confd
+ dimension:
+ cluster_name: calico
+ nagios_host: 00-top-clusters
+{%- endif %}
{%- endif %}