Merge "Attempt to force restart on hyperkube version change"
diff --git a/kubernetes/files/kube-addons/contrail/contrail.yaml b/kubernetes/files/kube-addons/contrail/contrail.yaml
index f023315..5d5ca58 100644
--- a/kubernetes/files/kube-addons/contrail/contrail.yaml
+++ b/kubernetes/files/kube-addons/contrail/contrail.yaml
@@ -1,3 +1,6 @@
+{%- from "kubernetes/map.jinja" import common with context -%}
+---
+
apiVersion: apps/v1beta2
kind: DaemonSet
metadata:
@@ -19,7 +22,7 @@
hostNetwork: true
containers:
- name: rabbitmq
- image: rabbitmq:3.6.6-management-alpine
+ image: rabbitmq:{{ common.addons.get('contrail',{}).get('rabbitmq_version',"3.6.6") }}-management-alpine
lifecycle:
postStart:
exec:
@@ -41,10 +44,10 @@
rabbitmqctl set_policy ha-all "." '{"ha-mode":"exactly","ha-params":3,"ha-sync-mode":"automatic"}'
env:
- name: RABBITMQ_ERLANG_COOKIE
- value: YTQMGYEHFATZPDKPOCXX
+ value: "{{ common.addons.get('contrail',{}).get('rabbitmq_erlang_cookie','YTQMGYEHFATZPDKPOCXX') }}"  # quoted so a numeric/boolean-looking or empty cookie still renders as a YAML string
- name: opencontrail-controller
- image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-controller
+ image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-controller:{{ common.addons.get('contrail',{}).get('contrail_version',"latest") }}
securityContext:
privileged: true
lifecycle:
@@ -78,11 +81,10 @@
mountPath: /etc/zookeeper/conf/zoo.cfg
- name: etc-zookeeper-conf-log4j-properties
mountPath: /etc/zookeeper/conf/log4j.properties
- - name: var-lib-rabbitmq-erlang-cookie
- mountPath: /var/lib/rabbitmq/.erlang.cookie
+
- name: opencontrail-analyticsdb
- image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-analyticsdb
+ image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-analyticsdb:{{ common.addons.get('contrail',{}).get('contrail_version',"latest") }}
securityContext:
privileged: true
volumeMounts:
@@ -114,7 +116,7 @@
mountPath: /etc/zookeeper/conf/log4j.properties
- name: opencontrail-analytics
- image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-analytics
+ image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-analytics:{{ common.addons.get('contrail',{}).get('contrail_version',"latest") }}
volumeMounts:
- name: etc-contrail
mountPath: /etc/contrail
@@ -151,11 +153,6 @@
hostPath:
path: /etc/zookeeper/conf/zoo.cfg
type: File
- - name: var-lib-rabbitmq-erlang-cookie
- hostPath:
- path: /var/lib/rabbitmq/.erlang.cookie
- type: File
-
# analyticsdb
- name: etc-cassandra-cassandra-env-analytics-sh
diff --git a/kubernetes/files/kube-addons/contrail/kube-manager.yaml b/kubernetes/files/kube-addons/contrail/kube-manager.yaml
index 7fd0e0e..3004649 100644
--- a/kubernetes/files/kube-addons/contrail/kube-manager.yaml
+++ b/kubernetes/files/kube-addons/contrail/kube-manager.yaml
@@ -1,3 +1,5 @@
+{%- from "kubernetes/map.jinja" import common with context -%}
+---
apiVersion: apps/v1beta2
kind: DaemonSet
metadata:
@@ -19,7 +21,7 @@
hostNetwork: true
containers:
- name: opencontrail-kube-manager
- image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-kube-manager
+ image: docker-prod-local.artifactory.mirantis.com/opencontrail-oc40/opencontrail-kube-manager:{{ common.addons.get('contrail',{}).get('contrail_version',"latest") }}
securityContext:
privileged: true
lifecycle:
diff --git a/kubernetes/meta/prometheus.yml b/kubernetes/meta/prometheus.yml
index 3ca5453..e873d38 100644
--- a/kubernetes/meta/prometheus.yml
+++ b/kubernetes/meta/prometheus.yml
@@ -155,8 +155,8 @@
severity: warning
service: kubernetes
annotations:
- summary: "Failed to get the container metrics"
- description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} instance."
+ summary: "Failed to get Kubernetes container metrics"
+ description: "Prometheus was not able to scrape metrics from the container on the {{ $labels.instance }} Kubernetes instance."
{% endraw %}
KubernetesProcessDown:
if: >-
@@ -168,7 +168,7 @@
service: kubernetes
annotations:
summary: "Kubernetes {{ $labels.process_name }} process is down"
- description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+ description: "Kubernetes {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
KubernetesProcessDownMinor:
if: >-
@@ -179,9 +179,9 @@
severity: minor
service: kubernetes
annotations:
- summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+ summary: "{% endraw %}{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
description: >-
- {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+ {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
KubernetesProcessDownMajor:
if: >-
count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name) > count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) * {{ instance_major_threshold_percent }}
@@ -190,9 +190,9 @@
severity: major
service: kubernetes
annotations:
- summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} process instances are down"
+ summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Kubernetes {{ $labels.process_name }} processes are down"
description: >-
- {{ $value }} of Kubernetes {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+ {{ $value }} of Kubernetes {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
KubernetesProcessOutage:
if: >-
count(procstat_running{process_name=~"hyperkube-.*"}) by (process_name) == count(procstat_running{process_name=~"hyperkube-.*"} == 0) by (process_name)
@@ -203,7 +203,7 @@
service: kubernetes
annotations:
summary: "Kubernetes {{ $labels.process_name }} cluster outage"
- description: "All Kubernetes {{ $labels.process_name }} process instances are down for at least 2 minutes."
+ description: "All Kubernetes {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
{%- if network.get('calico', {}).get('enabled', False) %}
CalicoProcessDown:
@@ -216,7 +216,7 @@
service: calico
annotations:
summary: "Calico {{ $labels.process_name }} process is down"
- description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for at least 2 minutes."
+ description: "Calico {{ $labels.process_name }} process on the {{ $labels.host }} node is down for 2 minutes."
{% endraw %}
CalicoProcessDownMinor:
if: >-
@@ -226,9 +226,9 @@
severity: minor
service: calico
annotations:
- summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+ summary: "{{ instance_minor_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
description: >-
- {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_minor_threshold_percent * 100 }}%) for at least 2 minutes.
+ {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_minor_threshold_percent * 100 }}%) are down for 2 minutes.
CalicoProcessDownMajor:
if: >-
count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name) > count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) * {{ instance_major_threshold_percent }}
@@ -237,9 +237,9 @@
severity: major
service: calico
annotations:
- summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} process instances are down"
+ summary: "{{ instance_major_threshold_percent * 100 }}%{% raw %} of Calico {{ $labels.process_name }} processes are down"
description: >-
- {{ $value }} of Calico {{ $labels.process_name }} process instances are down {% endraw %}(at least {{ instance_major_threshold_percent * 100 }}%) for at least 2 minutes.
+ {{ $value }} of Calico {{ $labels.process_name }} processes (>= {% endraw %}{{ instance_major_threshold_percent * 100 }}%) are down for 2 minutes.
CalicoProcessOutage:
if: >-
count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"}) by (process_name) == count(procstat_running{process_name=~"calico-felix|bird|bird6|confd"} == 0) by (process_name)
@@ -250,6 +250,6 @@
service: calico
annotations:
summary: "Calico {{ $labels.process_name }} cluster outage"
- description: "All Calico {{ $labels.process_name }} process instances are down for at least 2 minutes."
+ description: "All Calico {{ $labels.process_name }} processes are down for 2 minutes."
{% endraw %}
{% endif %}