Merge "Fix confusion between `state` and `status` in overview dashboard."
diff --git a/nova/files/grafana_dashboards/nova_overview_prometheus.json b/nova/files/grafana_dashboards/nova_overview_prometheus.json
index e80866f..d326567 100644
--- a/nova/files/grafana_dashboards/nova_overview_prometheus.json
+++ b/nova/files/grafana_dashboards/nova_overview_prometheus.json
@@ -102,14 +102,14 @@
       "tableColumn": "",
       "targets": [
         {
-          "expr": "min(openstack_api_check_status{name=\"nova\"})",
+          "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
           "format": "time_series",
           "intervalFactor": 2,
           "refId": "A"
         }
       ],
       "thresholds": "0.5,0.5",
-      "title": "VIP API availability",
+      "title": "Public VIP API Availability",
       "type": "singlestat",
       "valueFontSize": "80%",
       "valueMaps": [
diff --git a/nova/files/grafana_dashboards/nova_prometheus.json b/nova/files/grafana_dashboards/nova_prometheus.json
index 8ec0841..5764563 100644
--- a/nova/files/grafana_dashboards/nova_prometheus.json
+++ b/nova/files/grafana_dashboards/nova_prometheus.json
@@ -72,16 +72,15 @@
           "tableColumn": "",
           "targets": [
             {
-              "expr": "min(openstack_api_check_status{service=\"nova\"})",
+              "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
               "format": "time_series",
               "intervalFactor": 2,
-              "legendFormat": "{{ service }}",
               "refId": "A",
               "step": 60
             }
           ],
           "thresholds": "0.5,1.5",
-          "title": "API Availability",
+          "title": "Public VIP API Availability",
           "type": "singlestat",
           "valueFontSize": "80%",
           "valueMaps": [
diff --git a/nova/files/grafana_dashboards/nova_prometheus_fluentd.json b/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
index 978f951..1fd3a3a 100644
--- a/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
+++ b/nova/files/grafana_dashboards/nova_prometheus_fluentd.json
@@ -72,16 +72,15 @@
           "tableColumn": "",
           "targets": [
             {
-              "expr": "min(openstack_api_check_status{service=\"nova\"})",
+              "expr": "openstack_api_check_status{interface=\"public\",service_name=\"nova\"}",
               "format": "time_series",
               "intervalFactor": 2,
-              "legendFormat": "{{ service }}",
               "refId": "A",
               "step": 60
             }
           ],
           "thresholds": "0.5,1.5",
-          "title": "API Availability",
+          "title": "Public VIP API Availability",
           "type": "singlestat",
           "valueFontSize": "80%",
           "valueMaps": [
diff --git a/nova/files/pike/nova-compute.conf.Debian b/nova/files/pike/nova-compute.conf.Debian
index a6c4097..89467e8 100644
--- a/nova/files/pike/nova-compute.conf.Debian
+++ b/nova/files/pike/nova-compute.conf.Debian
@@ -7916,10 +7916,15 @@
 # Deprecated group/name - [DEFAULT]/kombu_ssl_ca_certs
 #kombu_ssl_ca_certs =
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How long to wait before reconnecting in response to an AMQP consumer cancel
 # notification. (floating point value)
 # Deprecated group/name - [DEFAULT]/kombu_reconnect_delay
 #kombu_reconnect_delay=1.0
+kombu_reconnect_delay=5.0
 
 # EXPERIMENTAL: Possible values are: gzip, bz2. If not set compression will not
 # be used. This option may not be available in future versions. (string value)
@@ -7990,18 +7995,30 @@
 # Reason: Replaced by [DEFAULT]/transport_url
 #rabbit_virtual_host=/
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How frequently to retry connecting with RabbitMQ. (integer value)
 #rabbit_retry_interval=1
 {%- if compute.message_queue.rabbit_retry_interval is defined %}
 rabbit_retry_interval = {{ compute.message_queue.rabbit_retry_interval }}
+{%- else %}
+rabbit_retry_interval = 5
 {%- endif %}
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How long to backoff for between retries when connecting to RabbitMQ. (integer
 # value)
 # Deprecated group/name - [DEFAULT]/rabbit_retry_backoff
 #rabbit_retry_backoff=2
 {%- if compute.message_queue.rabbit_retry_backoff is defined %}
 rabbit_retry_backoff = {{ compute.message_queue.rabbit_retry_backoff }}
+{%- else %}
+rabbit_retry_backoff = 10
 {%- endif %}
 
 # Maximum interval of RabbitMQ connection retries. Default is 30 seconds.
diff --git a/nova/files/pike/nova-controller.conf.Debian b/nova/files/pike/nova-controller.conf.Debian
index e5dbfb5..162e6e1 100644
--- a/nova/files/pike/nova-controller.conf.Debian
+++ b/nova/files/pike/nova-controller.conf.Debian
@@ -7917,10 +7917,15 @@
 # Deprecated group/name - [DEFAULT]/kombu_ssl_ca_certs
 #kombu_ssl_ca_certs =
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How long to wait before reconnecting in response to an AMQP consumer cancel
 # notification. (floating point value)
 # Deprecated group/name - [DEFAULT]/kombu_reconnect_delay
 #kombu_reconnect_delay=1.0
+kombu_reconnect_delay=5.0
 
 # EXPERIMENTAL: Possible values are: gzip, bz2. If not set compression will not
 # be used. This option may not be available in future versions. (string value)
@@ -7991,19 +7996,31 @@
 # Reason: Replaced by [DEFAULT]/transport_url
 #rabbit_virtual_host=/
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How frequently to retry connecting with RabbitMQ. (integer value)
 #rabbit_retry_interval=1
 {%- if controller.message_queue.rabbit_retry_interval is defined %}
 rabbit_retry_interval = {{ controller.message_queue.rabbit_retry_interval }}
+{%- else %}
+rabbit_retry_interval = 5
 {%- endif %}
 
 
+# NOTE(pas-ha): the default value of the option below is problematic with
+# RMQ 3.8 (see PROD-34322): recreating queues on a secondary broker
+# immediately after the primary broker has gone down leaves these queues
+# non-functional.
 # How long to backoff for between retries when connecting to RabbitMQ. (integer
 # value)
 # Deprecated group/name - [DEFAULT]/rabbit_retry_backoff
 #rabbit_retry_backoff=2
 {%- if controller.message_queue.rabbit_retry_backoff is defined %}
 rabbit_retry_backoff = {{ controller.message_queue.rabbit_retry_backoff }}
+{%- else %}
+rabbit_retry_backoff = 10
 {%- endif %}
 
 
diff --git a/nova/meta/grafana.yml b/nova/meta/grafana.yml
index 4561b11..0ed0b1a 100644
--- a/nova/meta/grafana.yml
+++ b/nova/meta/grafana.yml
@@ -159,4 +159,4 @@
               type: dashboard
             target:
               cluster_status:
-                expr: avg(openstack_api_check_status{service="nova"})
+                expr: openstack_api_check_status{interface="public",service_name="nova"}
diff --git a/nova/meta/prometheus.yml b/nova/meta/prometheus.yml
index 04d3384..7b03f29 100644
--- a/nova/meta/prometheus.yml
+++ b/nova/meta/prometheus.yml
@@ -274,26 +274,6 @@
   alert:
 {%- if is_controller %}
 {% raw %}
-    NovaApiOutage:
-      if: >-
-        max(openstack_api_check_status{name=~"nova.*|placement"}) == 0
-      labels:
-        severity: critical
-        service: nova
-      annotations:
-        summary: "Nova API outage"
-        description: >-
-          Nova API is not accessible for all available Nova endpoints in the OpenStack service catalog.
-    NovaApiDown:
-      if: >-
-        openstack_api_check_status{name=~"nova.*|placement"} == 0
-      labels:
-        severity: major
-        service: nova
-      annotations:
-        summary: "{{ $labels.name }} endpoint is not accessible"
-        description: >-
-          Nova API is not accessible for the {{ $labels.name }} endpoint.
     NovaApiEndpointDown:
       if: >-
         http_response_status{name=~"nova-api"} == 0