Merge "Added missing log appender to pre-Queens Ironic configurations." into release/2019.2.0
diff --git a/ironic/files/grafana_dashboards/ironic_prometheus.json b/ironic/files/grafana_dashboards/ironic_prometheus.json
index e863fe7..4c116a0 100644
--- a/ironic/files/grafana_dashboards/ironic_prometheus.json
+++ b/ironic/files/grafana_dashboards/ironic_prometheus.json
@@ -95,14 +95,14 @@
"tableColumn": "",
"targets": [
{
- "expr": "max(openstack_api_check_status{name=~\"ironic.*\"})",
+ "expr": "openstack_api_check_status{interface=\"public\",service_name=\"ironic\"}",
"format": "time_series",
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "0.5,0.5",
- "title": "VIP API availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/ironic/files/pike/ironic.conf b/ironic/files/pike/ironic.conf
index d10a307..daa9e2a 100644
--- a/ironic/files/pike/ironic.conf
+++ b/ironic/files/pike/ironic.conf
@@ -3397,10 +3397,15 @@
# Deprecated group/name - [oslo_messaging_rabbit]/kombu_ssl_ca_certs
#ssl_ca_file =
+# NOTE(pas-ha) the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322):
+# recreating queues on a secondary broker immediately after the primary
+# broker has gone down leads to these queues being non-functional.
# How long to wait before reconnecting in response to an AMQP
# consumer cancel notification. (floating point value)
# Deprecated group/name - [DEFAULT]/kombu_reconnect_delay
#kombu_reconnect_delay = 1.0
+kombu_reconnect_delay = 5.0
# EXPERIMENTAL: Possible values are: gzip, bz2. If not set
# compression will not be used. This option may not be
@@ -3472,14 +3477,24 @@
# Reason: Replaced by [DEFAULT]/transport_url
#rabbit_virtual_host = /
+# NOTE(pas-ha) the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322):
+# recreating queues on a secondary broker immediately after the primary
+# broker has gone down leads to these queues being non-functional.
# How frequently to retry connecting with RabbitMQ. (integer
# value)
#rabbit_retry_interval = 1
+rabbit_retry_interval = 5
+# NOTE(pas-ha) the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322):
+# recreating queues on a secondary broker immediately after the primary
+# broker has gone down leads to these queues being non-functional.
# How long to backoff for between retries when connecting to
# RabbitMQ. (integer value)
# Deprecated group/name - [DEFAULT]/rabbit_retry_backoff
#rabbit_retry_backoff = 2
+rabbit_retry_backoff = 10
# Maximum interval of RabbitMQ connection retries. Default is
# 30 seconds. (integer value)
diff --git a/ironic/meta/prometheus.yml b/ironic/meta/prometheus.yml
index fdf6066..06f837b 100644
--- a/ironic/meta/prometheus.yml
+++ b/ironic/meta/prometheus.yml
@@ -15,6 +15,7 @@
IronicProcessDown:
if: >-
procstat_running{process_name=~"ironic-.*"} == 0
+ for: 2m
labels:
service: ironic
severity: minor
@@ -25,6 +26,7 @@
IronicProcessDownMinor:
if: >-
count(procstat_running{process_name=~"ironic-.*"} == 0) by (process_name) >= count(procstat_running{process_name=~"ironic-.*"}) by (process_name) * 0.33
+ for: 2m
labels:
service: ironic
severity: minor
@@ -35,6 +37,7 @@
IronicProcessDownMajor:
if: >-
count(procstat_running{process_name=~"ironic-.*"} == 0) by (process_name) >= count(procstat_running{process_name=~"ironic-.*"}) by (process_name) * 0.66
+ for: 2m
labels:
service: ironic
severity: major
@@ -45,6 +48,7 @@
IronicProcessOutage:
if: >-
count(procstat_running{process_name=~"ironic-.*"} == 0) by (process_name) == count(procstat_running{process_name=~"ironic-.*"}) by (process_name)
+ for: 2m
labels:
service: ironic
severity: critical
@@ -55,6 +59,7 @@
IronicDriverMissing:
if: >-
scalar(count(procstat_running{process_name=~"ironic-conductor"} == 1)) - count(openstack_ironic_driver) by (driver) > 0
+ for: 2m
labels:
severity: major
service: ironic
@@ -97,17 +102,6 @@
annotations:
summary: "{{ $labels.name }} endpoints outage"
description: All available {{ $labels.name }} endpoints are not accessible for 2 minutes.
- IronicApiOutage:
- if: >-
- max(openstack_api_check_status{name="ironic"}) == 0
- for: 2m
- labels:
- severity: critical
- service: ironic
- annotations:
- summary: Ironic API outage
- description: >-
- Ironic API is not accessible for all available Ironic endpoints in the OpenStack service catalog for 2 minutes.
{%- endraw %}
{%- endif %}
{%- endif %}