Merge "Perform the kubernetes nodes monitoring from the remote_collector"
diff --git a/kubernetes/files/kubelet/default.master b/kubernetes/files/kubelet/default.master
index 3439776..644e84d 100644
--- a/kubernetes/files/kubelet/default.master
+++ b/kubernetes/files/kubelet/default.master
@@ -7,6 +7,7 @@
 --cluster_dns={{ master.addons.dns.server }} \
 --register-node=false \
 --cluster_domain={{ master.addons.dns.domain }} \
+--cni-bin-dir={{ master.apiserver.get('cni_bin_dir', '/opt/cni/bin') }} \
 --v=2 \
 {%- for key, value in master.get('kubelet', {}).get('daemon_opts', {}).iteritems() %}
 --{{ key }}="{{ value }}" \
diff --git a/kubernetes/files/kubelet/default.pool b/kubernetes/files/kubelet/default.pool
index 84b89d0..6962b12 100644
--- a/kubernetes/files/kubelet/default.pool
+++ b/kubernetes/files/kubelet/default.pool
@@ -8,6 +8,7 @@
 --allow-privileged={{ pool.kubelet.allow_privileged }} \
 --cluster_dns={{ pool.cluster_dns }} \
 --cluster_domain={{ pool.cluster_domain }} \
+--cni-bin-dir={{ pool.apiserver.get('cni_bin_dir', '/opt/cni/bin') }} \
 --v=2 \
 {%- if pool.network.engine == 'opencontrail' %}
 --network-plugin={{ pool.network.engine }} \
diff --git a/kubernetes/map.jinja b/kubernetes/map.jinja
index f36f9e3..594b50b 100644
--- a/kubernetes/map.jinja
+++ b/kubernetes/map.jinja
@@ -1,11 +1,15 @@
+{% set _version = salt['cmd.run']("hyperkube --version | sed -e 's/-.*//g' -e 's/v//g' -e 's/Kubernetes //g' | awk -F '.' '{ print $1 \".\" $2 }'") %}
+{# _version holds the "major.minor" text cut from `hyperkube --version`; the 'version' entries below cast it with |float, so "1.10" becomes 1.1 and sorts below 1.9. NOTE(review): presumably yields 0.0 when hyperkube is not installed - confirm. #}
 {% set common = salt['grains.filter_by']({
     'Debian': {
         'pkgs': ['curl', 'git', 'apt-transport-https', 'python-apt', 'nfs-common', 'socat', 'netcat-traditional', 'openssl'],
         'services': [],
+        'version': _version|float,
     },
     'RedHat': {
         'pkgs': ['curl', 'git', 'apt-transport-https', 'python-apt', 'nfs-common', 'socat', 'netcat-traditional', 'python'],
         'services': [],
+        'version': _version|float,
     },
 }, merge=salt['pillar.get']('kubernetes:common')) %}
 
diff --git a/kubernetes/master/controller.sls b/kubernetes/master/controller.sls
index 9795dbb..65c7452 100644
--- a/kubernetes/master/controller.sls
+++ b/kubernetes/master/controller.sls
@@ -1,4 +1,5 @@
 {%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import common with context %}
 {%- if master.enabled %}
 
 /srv/kubernetes/known_tokens.csv:
@@ -78,7 +79,7 @@
     - mode: 644
     - contents: >-
         DAEMON_ARGS="
-        --admission-control=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota
+        --admission-control=NamespaceLifecycle,LimitRanger,ServiceAccount,ResourceQuota
         --allow-privileged=True
         --basic-auth-file=/srv/kubernetes/basic_auth.csv
         --bind-address={{ master.apiserver.address }}
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
new file mode 100644
index 0000000..e6cea72
--- /dev/null
+++ b/kubernetes/meta/prometheus.yml
@@ -0,0 +1,205 @@
+{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import pool with context %}
+
+{%- if pool.get('enabled', False) %}
+{% set network = pool.get('network', {}) %}{# mapping default: network.get('engine') below would fail on a list #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}{# same mapping default when only the master role is enabled #}
+{% endif %}
+{% endif %}
+
+{% raw %}
+server:
+  recording:
+    - name: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_spec_memory_limit_bytes{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:spec_cpu_shares
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_spec_cpu_shares{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:cpu_usage:rate
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            irate(
+              container_cpu_usage_seconds_total{container_name!=""}[5m]
+            ),
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_usage:bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_memory_usage_bytes{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_working_set:bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_memory_working_set_bytes{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_rss:bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_memory_rss{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_cache:bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_memory_cache{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:disk_usage:bytes
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name) (
+          label_replace(
+            container_fs_usage_bytes{container_name!=""},
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
+          label_replace(
+            irate(
+              container_memory_failures_total{container_name!=""}[5m]
+            ),
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster_namespace_controller_pod_container:memory_oom:rate
+      query: >-
+        sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
+          label_replace(
+            irate(
+              container_memory_failcnt{container_name!=""}[5m]
+            ),
+            "controller", "$1",
+            "pod_name", "^(.*)-[a-z0-9]+"
+          )
+        )
+    - name: cluster:memory_allocation:percent
+      query: >-
+        100 * sum by (cluster) (
+          container_spec_memory_limit_bytes{pod_name!=""}
+        ) / sum by (cluster) (
+          machine_memory_bytes
+        )
+    - name: cluster:memory_used:percent
+      query: >-
+        100 * sum by (cluster) (
+          container_memory_usage_bytes{pod_name!=""}
+        ) / sum by (cluster) (
+          machine_memory_bytes
+        )
+    - name: cluster:cpu_allocation:percent
+      query: >-
+        100 * sum by (cluster) (
+          container_spec_cpu_shares{pod_name!=""}
+        ) / sum by (cluster) (
+          container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
+        )
+  alert:
+    AvgKubeletRunningContainerCountLow:
+      if: >-
+        avg_over_time(kubelet_running_container_count[2m]) * 1.3 <
+        avg_over_time(kubelet_running_container_count[10m])
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'Container count is low'
+        description: 'Container count from last 2m is lower than average from 10m'
+    AvgKubeletRunningPODCountLow:
+      if: >-
+        avg_over_time(kubelet_running_pod_count[2m]) * 1.3 <
+        avg_over_time(kubelet_running_pod_count[10m])
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'POD count is low'
+        description: 'POD count from last 2m is lower than average from 10m'
+    ContainerScrapeError:
+      if: 'container_scrape_error != 0'
+      labels:
+        severity: warning
+        service: kubernetes
+      annotations:
+        summary: 'Failed to scrape container'
+        description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
+{% endraw %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+{% raw %}
+    ProcstatPidBird:
+      if: >-
+        absent(procstat_pid{process_name="bird"}) OR
+        count(procstat_pid{process_name="bird"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: bird
+      annotations:
+        summary: 'Bird process is missing'
+        description: 'Bird process is not running on all k8s nodes'
+    ProcstatPidBird6:
+      if: >-
+        absent(procstat_pid{process_name="bird6"}) OR
+        count(procstat_pid{process_name="bird6"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: bird6
+      annotations:
+        summary: 'Bird6 process is missing'
+        description: 'Bird6 process is not running on all k8s nodes'
+    ProcstatPidConfd:
+      if: >-
+        absent(procstat_pid{process_name="confd"}) OR
+        count(procstat_pid{process_name="confd"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: confd
+      annotations:
+        summary: 'Confd process is missing'
+        description: 'Confd process is not running on all k8s nodes'
+    ProcstatPidCalicoFelix:
+      if: >-
+        absent(procstat_pid{process_name="calico-felix"}) OR
+        count(procstat_pid{process_name="calico-felix"}) < count(up{job="kubernetes-node"} == 1)
+      labels:
+        severity: warning
+        service: calico-felix
+      annotations:
+        summary: 'Calico-felix process is missing'
+        description: 'Calico-felix process is not running on all k8s nodes'
+{% endraw %}
+{% endif %}
diff --git a/kubernetes/meta/telegraf.yml b/kubernetes/meta/telegraf.yml
new file mode 100644
index 0000000..79dbb15
--- /dev/null
+++ b/kubernetes/meta/telegraf.yml
@@ -0,0 +1,31 @@
+{%- from "kubernetes/map.jinja" import master with context %}
+{%- from "kubernetes/map.jinja" import pool with context %}
+
+{%- if pool.get('enabled', False) %}
+{% set network = pool.get('network', {}) %}{# mapping default: network.get('engine') below would fail on a list #}
+{%- else %}
+{%- if master.get('enabled', False) %}
+{% set network = master.get('network', {}) %}{# same mapping default when only the master role is enabled #}
+{% endif %}
+{% endif %}
+
+{%- if master.get('enabled', False) or pool.get('enabled', False) %}
+agent:
+  input:
+    procstat:
+{%- if master.get('enabled', False) %}
+      - pattern: "hyperkube.*apiserver"
+      - pattern: "hyperkube.*scheduler"
+      - pattern: "hyperkube.*controller-manager"
+{%- endif %}
+{%- if pool.get('enabled', False) %}
+      - pattern: "hyperkube.*kubelet"
+      - pattern: "hyperkube.*proxy"
+{%- endif %}
+{%- if network is defined and network.get('engine', None) == 'calico' %}
+      - exe: calico-felix
+      - pattern: "bird .*/etc/calico/.*/bird.cfg"
+      - pattern: "bird6 .*/etc/calico/.*/bird6.cfg"
+      - pattern: "confd .*/etc/calico/confd"
+{%- endif %}
+{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index a9a0c0e..655bb27 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -1,6 +1,10 @@
 parameters:
   kubernetes:
     _support:
+      prometheus:
+        enabled: true
+      telegraf:
+        enabled: true
       collectd:
         enabled: true
       heka: