Merge "Add fluentd rolebinding vor view"
diff --git a/README.rst b/README.rst
index 3b50609..da82ccb 100644
--- a/README.rst
+++ b/README.rst
@@ -70,7 +70,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02
diff --git a/kubernetes/files/kube-addons/virtlet/virtlet-ds.yml b/kubernetes/files/kube-addons/virtlet/virtlet-ds.yml
index d1799bb..1e75fb1 100644
--- a/kubernetes/files/kube-addons/virtlet/virtlet-ds.yml
+++ b/kubernetes/files/kube-addons/virtlet/virtlet-ds.yml
@@ -130,8 +130,6 @@
optional: true
- name: IMAGE_TRANSLATIONS_DIR
value: /etc/virtlet/images
- - name: KUBERNETES_POD_LOGS
- value: /kubernetes-log
image: {{ common.addons.virtlet.image }}
imagePullPolicy: IfNotPresent
name: virtlet
@@ -169,7 +167,7 @@
name: vms-log
- mountPath: /etc/virtlet/images
name: image-name-translations
- - mountPath: /kubernetes-log
+ - mountPath: /var/log/pods
name: pods-log
- command:
- /vms.sh
diff --git a/kubernetes/map.jinja b/kubernetes/map.jinja
index 99e33cc..74f204e 100644
--- a/kubernetes/map.jinja
+++ b/kubernetes/map.jinja
@@ -135,3 +135,10 @@
}) %}
{% do pool.network.opencontrail.update(opencontrail) %}
{%- endif %}
+
+{%- set monitoring = salt['grains.filter_by']({
+ 'default': {
+ 'instance_minor_threshold_percent': 0.3,
+ 'instance_major_threshold_percent': 0.6,
+ },
+}, grain='os_family', merge=salt['pillar.get']('kubernetes:monitoring')) %}
\ No newline at end of file
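
The thresholds introduced above are merged from the kubernetes:monitoring pillar key, so a deployment can override the defaults without touching the formula. A minimal pillar sketch (the key names come straight from the filter_by call above; the 0.25/0.5 values are illustrative overrides, not the defaults):

    kubernetes:
      monitoring:
        instance_minor_threshold_percent: 0.25
        instance_major_threshold_percent: 0.5
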
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
index 5dfad70..3ca5453 100644
--- a/kubernetes/meta/prometheus.yml
+++ b/kubernetes/meta/prometheus.yml
@@ -1,5 +1,6 @@
{%- from "kubernetes/map.jinja" import master with context %}
{%- from "kubernetes/map.jinja" import pool with context %}
+{%- from "kubernetes/map.jinja" import monitoring with context %}

{%- set network = {} %}
{%- if pool.get('enabled', False) %}
@@ -145,61 +146,110 @@
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
)
alert:
- AvgKubeletRunningContainerCountLow:
- if: >-
- avg_over_time(kubelet_running_container_count[2m]) * 1.3 <
- avg_over_time(kubelet_running_container_count[10m])
- {% raw %}
- labels:
- severity: warning
- service: kubernetes
- annotations:
- summary: 'Container count is low'
- description: 'Container count from last 2m is lower than avarage from 10m'
- {% endraw %}
- AvgKubeletRunningPODCountLow:
- if: >-
- avg_over_time(kubelet_running_pod_count[2m]) * 1.3 <
- avg_over_time(kubelet_running_pod_count[10m])
- {% raw %}
- labels:
- severity: warning
- service: kubernetes
- annotations:
- summary: 'POD count is low'
- description: 'POD count from last 2m is lower than avarage from 10m'
- {% endraw %}
+ {%- set instance_minor_threshold_percent = monitoring.instance_minor_threshold_percent|float %}
+ {%- set instance_major_threshold_percent = monitoring.instance_major_threshold_percent|float %}
ContainerScrapeError:
- if: 'container_scrape_error != 0'
- {% raw %}
+ if: "container_scrape_error != 0"
+ {% raw %}
labels:
severity: warning
service: kubernetes
annotations:
- summary: 'Fail to scrape container'
- description: 'Prometheus was not able to scrape metrics from container on {{ $labels.instance }}'
- {% endraw %}
+ summary: "Failed to get the container metrics"
+ description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} instance."
+ {% endraw %}
KubernetesProcessDown:
if: >-
procstat_running{process_name=~"hyperkube-.*"} == 0
- {% raw %}
+ {% raw %}
+ for: 2m
labels:
- severity: warning
+ severity: minor
service: kubernetes
annotations:
- summary: 'Kubernetes service {{ $labels.process_name }} is down'
- description: 'Kubernetes service {{ $labels.process_name }} is down on node {{ $labels.host }}'
- {% endraw %}
+ summary: "Kubernetes {{ $labels.process_name }} process is down"
+ description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+ {% endraw %}
+ KubernetesProcessDownMinor:
+ if: >-
+ count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_minor_threshold_percent }}
+ {% raw %}
+ for: 2m
+ labels:
+ severity: minor
+ service: kubernetes
+ annotations:
+ summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+ description: >-
+ {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+ KubernetesProcessDownMajor:
+ if: >-
+ count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
+ for: 2m
+ labels:
+ severity: major
+ service: kubernetes
+ annotations:
+ summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+ description: >-
+ {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+ KubernetesProcessOutage:
+ if: >-
+ count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
+ {% raw %}
+ for: 2m
+ labels:
+ severity: critical
+ service: kubernetes
+ annotations:
+ summary: "Kubernetes {{ $labels.process_name }} cluster outage"
+ description: "All Kubernetes {{ $labels.process_name }} process instances are down for at least 2 minutes."
+ {% endraw %}
{%- if network.get('calico', {}).get('enabled', False) %}
CalicoProcessDown:
if: >-
procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0
- {% raw %}
+ {% raw %}
+ for: 2m
labels:
- severity: warning
+ severity: minor
service: calico
annotations:
- summary: 'Calico service {{ $labels.process_name }} is down'
- description: 'Calico service {{ $labels.process_name }} is down on node {{ $labels.host }}'
- {% endraw %}
+ summary: "Calico {{ $labels.process_name }} process is down"
+ description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+ {% endraw %}
+ CalicoProcessDownMinor:
+ if: >-
+ count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_minor_threshold_percent }}
+ for: 2m
+ labels:
+ severity: minor
+ service: calico
+ annotations:
+ summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+ description: >-
+ {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+ CalicoProcessDownMajor:
+ if: >-
+ count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
+ for: 2m
+ labels:
+ severity: major
+ service: calico
+ annotations:
+ summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+ description: >-
+ {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+ CalicoProcessOutage:
+ if: >-
+ count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
+ {% raw %}
+ for: 2m
+ labels:
+ severity: critical
+ service: calico
+ annotations:
+ summary: "Calico {{ $labels.process_name }} cluster outage"
+ description: "All Calico {{ $labels.process_name }} process instances are down for at least 2 minutes."
+ {% endraw %}
{% endif %}
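
For reference, with the default instance_minor_threshold_percent of 0.3 from kubernetes/map.jinja, the KubernetesProcessDownMinor alert above renders to roughly the following rule fragment (a sketch of the generated output, not captured from a live system):

    KubernetesProcessDownMinor:
      if: >-
        count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * 0.3
      for: 2m
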
diff --git a/metadata/service/common.yml b/metadata/service/common.yml
index 169a5b2..0fa49df 100644
--- a/metadata/service/common.yml
+++ b/metadata/service/common.yml
@@ -58,7 +58,7 @@
virtlet:
enabled: False
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
criproxy_version: v0.10.0
criproxy_source: md5=52717b1f70f15558ef4bdb0e4d4948da
cni:
diff --git a/metadata/service/master/single.yml b/metadata/service/master/single.yml
index 8c34b63..223b4f0 100644
--- a/metadata/service/master/single.yml
+++ b/metadata/service/master/single.yml
@@ -62,7 +62,7 @@
virtlet:
enabled: False
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
token:
admin: ${_param:kubernetes_admin_token}
kubelet: ${_param:kubernetes_kubelet_token}
diff --git a/tests/pillar/master_cluster.sls b/tests/pillar/master_cluster.sls
index 9e8afa2..91c1ff3 100644
--- a/tests/pillar/master_cluster.sls
+++ b/tests/pillar/master_cluster.sls
@@ -40,7 +40,7 @@
hosts:
- cmp01
- cmp02
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
monitoring:
backend: prometheus
master:
diff --git a/tests/pillar/master_contrail.sls b/tests/pillar/master_contrail.sls
index e86a293..32478f7 100644
--- a/tests/pillar/master_contrail.sls
+++ b/tests/pillar/master_contrail.sls
@@ -37,7 +37,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02
diff --git a/tests/pillar/master_contrail4_0.sls b/tests/pillar/master_contrail4_0.sls
index ec48f54..e6c6085 100644
--- a/tests/pillar/master_contrail4_0.sls
+++ b/tests/pillar/master_contrail4_0.sls
@@ -37,7 +37,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02
diff --git a/tests/pillar/pool_cluster.sls b/tests/pillar/pool_cluster.sls
index 4de3614..c75b87b 100644
--- a/tests/pillar/pool_cluster.sls
+++ b/tests/pillar/pool_cluster.sls
@@ -16,7 +16,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02
diff --git a/tests/pillar/pool_cluster_with_domain.sls b/tests/pillar/pool_cluster_with_domain.sls
index 271d762..4fea3dc 100644
--- a/tests/pillar/pool_cluster_with_domain.sls
+++ b/tests/pillar/pool_cluster_with_domain.sls
@@ -16,7 +16,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02
diff --git a/tests/pillar/pool_contrail4_0.sls b/tests/pillar/pool_contrail4_0.sls
index f396906..98c1cf7 100644
--- a/tests/pillar/pool_contrail4_0.sls
+++ b/tests/pillar/pool_contrail4_0.sls
@@ -16,7 +16,7 @@
virtlet:
enabled: true
namespace: kube-system
- image: mirantis/virtlet:v1.0.0
+ image: mirantis/virtlet:v1.0.3
hosts:
- cmp01
- cmp02