Merge "Perform the kubernetes nodes monitoring from the remote_collector"
diff --git a/kubernetes/files/kubelet/default.master b/kubernetes/files/kubelet/default.master
index 3439776..644e84d 100644
--- a/kubernetes/files/kubelet/default.master
+++ b/kubernetes/files/kubelet/default.master
@@ -7,6 +7,7 @@
--cluster_dns={{ master.addons.dns.server }} \
--register-node=false \
--cluster_domain={{ master.addons.dns.domain }} \
+--cni-bin-dir={{ master.apiserver.get('cni_bin_dir', '/opt/cni/bin') }} \
--v=2 \
{%- for key, value in master.get('kubelet', {}).get('daemon_opts', {}).iteritems() %}
--{{ key }}="{{ value }}" \
diff --git a/kubernetes/files/kubelet/default.pool b/kubernetes/files/kubelet/default.pool
index 84b89d0..6962b12 100644
--- a/kubernetes/files/kubelet/default.pool
+++ b/kubernetes/files/kubelet/default.pool
@@ -8,6 +8,7 @@
--allow-privileged={{ pool.kubelet.allow_privileged }} \
--cluster_dns={{ pool.cluster_dns }} \
--cluster_domain={{ pool.cluster_domain }} \
+--cni-bin-dir={{ pool.apiserver.get('cni_bin_dir', '/opt/cni/bin') }} \
--v=2 \
{%- if pool.network.engine == 'opencontrail' %}
--network-plugin={{ pool.network.engine }} \
diff --git a/kubernetes/map.jinja b/kubernetes/map.jinja
index f36f9e3..594b50b 100644
--- a/kubernetes/map.jinja
+++ b/kubernetes/map.jinja
@@ -1,11 +1,15 @@
+{% set _version = salt['cmd.run']("hyperkube --version | sed -e 's/-.*//g' -e 's/v//g' -e 's/Kubernetes //g' | awk -F '.' '{ print $1 \".\" $2 }'") %}{# Runs `hyperkube --version` at render time and keeps only "MAJOR.MINOR". NOTE(review): the result is cast with the float filter below, so "1.10" would become 1.1, and non-numeric command output presumably becomes 0.0 - confirm consumers of common.version tolerate this. #}
+
{% set common = salt['grains.filter_by']({
'Debian': {
'pkgs': ['curl', 'git', 'apt-transport-https', 'python-apt', 'nfs-common', 'socat', 'netcat-traditional', 'openssl'],
'services': [],
+ 'version': _version|float,
},
'RedHat': {
'pkgs': ['curl', 'git', 'apt-transport-https', 'python-apt', 'nfs-common', 'socat', 'netcat-traditional', 'python'],
'services': [],
+ 'version': _version|float,
},
}, merge=salt['pillar.get']('kubernetes:common')) %}
diff --git a/kubernetes/master/controller.sls b/kubernetes/master/controller.sls
index 9795dbb..65c7452 100644
--- a/kubernetes/master/controller.sls
+++ b/kubernetes/master/controller.sls
@@ -1,4 +1,5 @@
{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import common with context %}
{%- if master.enabled %}
/srv/kubernetes/known_tokens.csv:
@@ -78,7 +79,7 @@
- mode: 644
- contents: >-
DAEMON_ARGS="
- --admission-control=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota
+ --admission-control=NamespaceLifecycle,LimitRanger,ServiceAccount,ResourceQuota
--allow-privileged=True
--basic-auth-file=/srv/kubernetes/basic_auth.csv
--bind-address={{ master.apiserver.address }}
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
new file mode 100644
index 0000000..e6cea72
--- /dev/null
+++ b/kubernetes/meta/prometheus.yml
@@ -0,0 +1,205 @@
+{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import pool with context %}
+
+{%- if pool.get('enabled', False) %}{# Pick the network settings of the enabled role, pool first; `network` stays undefined when neither role is enabled. #}
+{% set network = pool.get('network', {}) %}{# Fallback must be a mapping: network.get('engine', None) is called on it further down. #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}{# Same mapping fallback as for the pool role. #}
+{% endif %}
+{% endif %}
+
+{% raw %}
+server:
+ recording:
+ - name: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_spec_memory_limit_bytes{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:spec_cpu_shares
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_spec_cpu_shares{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:cpu_usage:rate
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ irate(
+ container_cpu_usage_seconds_total{container_name!=""}[5m]
+ ),
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_usage:bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_memory_usage_bytes{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_working_set:bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_memory_working_set_bytes{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_rss:bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_memory_rss{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_cache:bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_memory_cache{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:disk_usage:bytes
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name) (
+ label_replace(
+ container_fs_usage_bytes{container_name!=""},
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
+ label_replace(
+ irate(
+ container_memory_failures_total{container_name!=""}[5m]
+ ),
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster_namespace_controller_pod_container:memory_oom:rate
+ query: >-
+ sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
+ label_replace(
+ irate(
+ container_memory_failcnt{container_name!=""}[5m]
+ ),
+ "controller", "$1",
+ "pod_name", "^(.*)-[a-z0-9]+"
+ )
+ )
+ - name: cluster:memory_allocation:percent
+ query: >-
+ 100 * sum by (cluster) (
+ container_spec_memory_limit_bytes{pod_name!=""}
+ ) / sum by (cluster) (
+ machine_memory_bytes
+ )
+ - name: cluster:memory_used:percent
+ query: >-
+ 100 * sum by (cluster) (
+ container_memory_usage_bytes{pod_name!=""}
+ ) / sum by (cluster) (
+ machine_memory_bytes
+ )
+ - name: cluster:cpu_allocation:percent
+ query: >-
+ 100 * sum by (cluster) (
+ container_spec_cpu_shares{pod_name!=""}
+ ) / sum by (cluster) (
+ container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
+ )
+ alert:
+ AvgKubeletRunningContainerCountLow:  # fires when the 2m average scaled by 1.3 is still below the 10m average
+ if: >-
+ avg_over_time(kubelet_running_container_count[2m]) * 1.3 <
+ avg_over_time(kubelet_running_container_count[10m])
+ labels:
+ severity: warning
+ service: kubernetes
+ annotations:
+ summary: 'Container count is low'
+ description: 'Container count from last 2m is lower than average from 10m'
+ AvgKubeletRunningPODCountLow:  # fires when the 2m average scaled by 1.3 is still below the 10m average
+ if: >-
+ avg_over_time(kubelet_running_pod_count[2m]) * 1.3 <
+ avg_over_time(kubelet_running_pod_count[10m])
+ labels:
+ severity: warning
+ service: kubernetes
+ annotations:
+ summary: 'POD count is low'
+ description: 'POD count from last 2m is lower than average from 10m'
+ ContainerScrapeError:
+ if: 'container_scrape_error != 0'
+ labels:
+ severity: warning
+ service: kubernetes
+ annotations:
+ summary: 'Fail to scrape container'
+ description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
+{% endraw %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+{% raw %}
+ ProcstatPidBird:
+ if: >-
+ absent(procstat_pid{process_name="bird"}) OR
+ count(procstat_pid{process_name="bird"}) < count(up{job="kubernetes-node"} == 1)
+ labels:
+ severity: warning
+ service: bird
+ annotations:
+ summary: 'Bird process is missing'
+ description: 'Bird process is not running on all k8s nodes'
+ ProcstatPidBird6:
+ if: >-
+ absent(procstat_pid{process_name="bird6"}) OR
+ count(procstat_pid{process_name="bird6"}) < count(up{job="kubernetes-node"} == 1)
+ labels:
+ severity: warning
+ service: bird6
+ annotations:
+ summary: 'Bird6 process is missing'
+ description: 'Bird6 process is not running on all k8s nodes'
+ ProcstatPidConfd:
+ if: >-
+ absent(procstat_pid{process_name="confd"}) OR
+ count(procstat_pid{process_name="confd"}) < count(up{job="kubernetes-node"} == 1)
+ labels:
+ severity: warning
+ service: confd
+ annotations:
+ summary: 'Confd process is missing'
+ description: 'Confd process is not running on all k8s nodes'
+ ProcstatPidCalicoFelix:
+ if: >-
+ absent(procstat_pid{process_name="calico-felix"}) OR
+ count(procstat_pid{process_name="calico-felix"}) < count(up{job="kubernetes-node"} == 1)
+ labels:
+ severity: warning
+ service: calico-felix
+ annotations:
+ summary: 'Calico-felix process is missing'
+ description: 'Calico-felix process is not running on all k8s nodes'
+{% endraw %}
+{% endif %}
diff --git a/kubernetes/meta/telegraf.yml b/kubernetes/meta/telegraf.yml
new file mode 100644
index 0000000..79dbb15
--- /dev/null
+++ b/kubernetes/meta/telegraf.yml
@@ -0,0 +1,31 @@
+{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import pool with context %}
+
+{%- if pool.get('enabled', False) %}{# Pick the network settings of the enabled role, pool first; `network` stays undefined when neither role is enabled. #}
+{% set network = pool.get('network', {}) %}{# Fallback must be a mapping: network.get('engine', None) is called on it in the procstat section. #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}{# Same mapping fallback as for the pool role. #}
+{% endif %}
+{% endif %}
+
+{%- if master.get('enabled', False) or pool.get('enabled', False) %}
+agent:
+ input:
+ procstat:
+{%- if master.get('enabled', False) %}
+ - pattern: "hyperkube.*apiserver"
+ - pattern: "hyperkube.*scheduler"
+ - pattern: "hyperkube.*controller-manager"
+{%- endif %}
+{%- if pool.get('enabled', False) %}
+ - pattern: "hyperkube.*kubelet"
+ - pattern: "hyperkube.*proxy"
+{%- endif %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+ - exe: calico-felix
+ - pattern: "bird .*/etc/calico/.*/bird.cfg"
+ - pattern: "bird6 .*/etc/calico/.*/bird6.cfg"
+ - pattern: "confd .*/etc/calico/confd"
+{%- endif %}
+{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index a9a0c0e..655bb27 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -1,6 +1,10 @@
parameters:
kubernetes:
_support:
+ prometheus:
+ enabled: true
+ telegraf:
+ enabled: true
collectd:
enabled: true
heka: