Merge "Fix confusion between `state` and `status` in overview dashboard."
diff --git a/nova/files/grafana_dashboards/nova_overview_prometheus.json b/nova/files/grafana_dashboards/nova_overview_prometheus.json
index e80866f..d326567 100644
--- a/nova/files/grafana_dashboards/nova_overview_prometheus.json
+++ b/nova/files/grafana_dashboards/nova_overview_prometheus.json
@@ -102,14 +102,14 @@
"tableColumn": "",
"targets": [
{
- "expr": "min(openstack_api_check_status{name=\"nova\"})",
+ "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
"format": "time_series",
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "0.5,0.5",
- "title": "VIP API availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/nova/files/grafana_dashboards/nova_prometheus.json b/nova/files/grafana_dashboards/nova_prometheus.json
index 8ec0841..5764563 100644
--- a/nova/files/grafana_dashboards/nova_prometheus.json
+++ b/nova/files/grafana_dashboards/nova_prometheus.json
@@ -72,16 +72,15 @@
"tableColumn": "",
"targets": [
{
- "expr": "min(openstack_api_check_status{service=\"nova\"})",
+ "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{ service }}",
"refId": "A",
"step": 60
}
],
"thresholds": "0.5,1.5",
- "title": "API Availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/nova/files/grafana_dashboards/nova_prometheus_fluentd.json b/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
index 978f951..1fd3a3a 100644
--- a/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
+++ b/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
@@ -72,16 +72,15 @@
"tableColumn": "",
"targets": [
{
- "expr": "min(openstack_api_check_status{service=\"nova\"})",
+ "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{ service }}",
"refId": "A",
"step": 60
}
],
"thresholds": "0.5,1.5",
- "title": "API Availability",
+ "title": "Public VIP API Availability",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
diff --git a/nova/files/pike/nova-compute.conf.Debian b/nova/files/pike/nova-compute.conf.Debian
index a6c4097..89467e8 100644
--- a/nova/files/pike/nova-compute.conf.Debian
+++ b/nova/files/pike/nova-compute.conf.Debian
@@ -7916,10 +7916,15 @@
# Deprecated group/name - [DEFAULT]/kombu_ssl_ca_certs
#kombu_ssl_ca_certs =
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How long to wait before reconnecting in response to an AMQP consumer cancel
# notification. (floating point value)
# Deprecated group/name - [DEFAULT]/kombu_reconnect_delay
#kombu_reconnect_delay=1.0
+kombu_reconnect_delay=5.0
# EXPERIMENTAL: Possible values are: gzip, bz2. If not set compression will not
# be used. This option may not be available in future versions. (string value)
@@ -7990,18 +7995,30 @@
# Reason: Replaced by [DEFAULT]/transport_url
#rabbit_virtual_host=/
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How frequently to retry connecting with RabbitMQ. (integer value)
#rabbit_retry_interval=1
{%- if compute.message_queue.rabbit_retry_interval is defined %}
rabbit_retry_interval = {{ compute.message_queue.rabbit_retry_interval }}
+{%- else %}
+rabbit_retry_interval = 5
{%- endif %}
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How long to backoff for between retries when connecting to RabbitMQ. (integer
# value)
# Deprecated group/name - [DEFAULT]/rabbit_retry_backoff
#rabbit_retry_backoff=2
{%- if compute.message_queue.rabbit_retry_backoff is defined %}
rabbit_retry_backoff = {{ compute.message_queue.rabbit_retry_backoff }}
+{%- else %}
+rabbit_retry_backoff = 10
{%- endif %}
# Maximum interval of RabbitMQ connection retries. Default is 30 seconds.
diff --git a/nova/files/pike/nova-controller.conf.Debian b/nova/files/pike/nova-controller.conf.Debian
index e5dbfb5..162e6e1 100644
--- a/nova/files/pike/nova-controller.conf.Debian
+++ b/nova/files/pike/nova-controller.conf.Debian
@@ -7917,10 +7917,15 @@
# Deprecated group/name - [DEFAULT]/kombu_ssl_ca_certs
#kombu_ssl_ca_certs =
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How long to wait before reconnecting in response to an AMQP consumer cancel
# notification. (floating point value)
# Deprecated group/name - [DEFAULT]/kombu_reconnect_delay
#kombu_reconnect_delay=1.0
+kombu_reconnect_delay=5.0
# EXPERIMENTAL: Possible values are: gzip, bz2. If not set compression will not
# be used. This option may not be available in future versions. (string value)
@@ -7991,19 +7996,31 @@
# Reason: Replaced by [DEFAULT]/transport_url
#rabbit_virtual_host=/
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How frequently to retry connecting with RabbitMQ. (integer value)
#rabbit_retry_interval=1
{%- if controller.message_queue.rabbit_retry_interval is defined %}
rabbit_retry_interval = {{ controller.message_queue.rabbit_retry_interval }}
+{%- else %}
+rabbit_retry_interval = 5
{%- endif %}
+# NOTE(pas-ha) The default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
# How long to backoff for between retries when connecting to RabbitMQ. (integer
# value)
# Deprecated group/name - [DEFAULT]/rabbit_retry_backoff
#rabbit_retry_backoff=2
{%- if controller.message_queue.rabbit_retry_backoff is defined %}
rabbit_retry_backoff = {{ controller.message_queue.rabbit_retry_backoff }}
+{%- else %}
+rabbit_retry_backoff = 10
{%- endif %}
diff --git a/nova/meta/grafana.yml b/nova/meta/grafana.yml
index 4561b11..0ed0b1a 100644
--- a/nova/meta/grafana.yml
+++ b/nova/meta/grafana.yml
@@ -159,4 +159,4 @@
type: dashboard
target:
cluster_status:
- expr: avg(openstack_api_check_status{service="nova"})
+ expr: openstack_api_check_status{interface="public",service_name="nova"}
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index 04d3384..7b03f29 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -274,26 +274,6 @@
alert:
{%- if is_controller %}
{% raw %}
- NovaApiOutage:
- if: >-
- max(openstack_api_check_status{name=~"nova.*|placement"}) == 0
- labels:
- severity: critical
- service: nova
- annotations:
- summary: "Nova API outage"
- description: >-
- Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog.
- NovaApiDown:
- if: >-
- openstack_api_check_status{name=~"nova.*|placement"} == 0
- labels:
- severity: major
- service: nova
- annotations:
- summary: "{{ $labels.name }} endpoint is not accessible"
- description: >-
- Nova API is not accessible for the {{ $labels.name }} endpoint.
NovaApiEndpointDown:
if: >-
http_response_status{name=~"nova-api"} == 0