Commit 32be53d

Author: Diogo Kiss

[consul] add maintenance metric to Consul catalog.
Currently, if a node in Consul is set into maintenance state, it is reported to Datadog metrics as a node in critical state. This makes it hard to tell from the metric whether a node is actually failing or merely undergoing a planned intervention. A user submitted a PR to Datadog some time ago, but it was not merged due to Datadog code organization changes and was forgotten (DataDog/dd-agent#2496). I'm pushing the change forward. I've tested it with dd-agent version 5.8.0. This PR also depends on DataDog/dd-agent#3708.
1 parent c4c84cc commit 32be53d
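
For context, a minimal sketch (not part of this commit): when a node is put into maintenance, e.g. with `consul maint -enable`, the Consul agent registers a synthetic health check with CheckID `_node_maintenance` in `critical` status — which is exactly why such nodes were previously counted as critical. A hypothetical way to spot it via Consul's HTTP health API, assuming a local agent on the default port and the `requests` library:

    # Hypothetical illustration (not part of this commit): detect Consul's
    # synthetic maintenance check via the HTTP health API.
    import requests

    def node_in_maintenance(consul_url, node):
        # GET /v1/health/node/<node> lists all health checks for the node.
        checks = requests.get('{0}/v1/health/node/{1}'.format(consul_url, node)).json()
        # Maintenance mode appears as a check with CheckID `_node_maintenance`
        # in `critical` status.
        return any(c['CheckID'] == '_node_maintenance' and c['Status'] == 'critical'
                   for c in checks)

    print(node_in_maintenance('http://127.0.0.1:8500', 'node-1'))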

File tree: 6 files changed, +106 -5 lines


consul/CHANGELOG.md (5 additions, 0 deletions)

@@ -1,5 +1,10 @@
 # CHANGELOG - consul
 
+1.3.1 / Unreleased
+==================
+
+* [IMPROVEMENT] Add maintenance metrics (services_maintenance and nodes_maintenance)
+
 1.3.0 / 2018-01-10
 ==================

consul/datadog_checks/consul/__init__.py (1 addition, 1 deletion)

@@ -2,6 +2,6 @@
 
 ConsulCheck = consul.ConsulCheck
 
-__version__ = "1.3.0"
+__version__ = "1.3.1"
 
 __all__ = ['consul']

consul/datadog_checks/consul/consul.py (19 additions, 3 deletions)

@@ -63,13 +63,15 @@ class ConsulCheck(AgentCheck):
         'passing': AgentCheck.OK,
         'warning': AgentCheck.WARNING,
         'critical': AgentCheck.CRITICAL,
+        'maintenance': AgentCheck.MAINTENANCE
     }
 
     STATUS_SEVERITY = {
         AgentCheck.UNKNOWN: 0,
         AgentCheck.OK: 1,
         AgentCheck.WARNING: 2,
         AgentCheck.CRITICAL: 3,
+        AgentCheck.MAINTENANCE: 4,
     }
 
     def __init__(self, name, init_config, agentConfig, instances=None):

@@ -322,12 +324,13 @@ def check(self, instance):
             # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
             # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
             # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered
+            # `consul.catalog.nodes_maintenance` : # of Nodes set in maintenance from those registered
 
             service_tags = self._get_service_tags(service, services[service])
 
             nodes_with_service = self.get_nodes_with_service(instance, service)
 
-            # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
+            # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0, 'maintenance': 0}
             node_status = defaultdict(int)
 
             for node in nodes_with_service:

@@ -345,8 +348,18 @@ def check(self, instance):
                 found_critical = False
                 found_warning = False
                 found_serf_health = False
+                found_maint_critical = False
 
                 for check in node['Checks']:
+
+                    # A node in maintenance mode is not expected to be healthy, so we
+                    # ignore its remaining checks until the maintenance window is over
+                    # and simply move on.
+                    if check['CheckID'] == '_node_maintenance':
+                        if check['Status'] == 'critical':
+                            found_maint_critical = True
+                            break
+
                     if check['CheckID'] == 'serfHealth':
                         found_serf_health = True
 
@@ -367,8 +380,11 @@ def check(self, instance):
                     # Keep looping in case there is a critical status
 
                 # Increment the counters based on what was found in Checks
-                # `critical` checks override `warning`s, and if neither is found, register the node as `passing`
-                if found_critical:
+                # `maintenance` checks override `critical`s, which override `warning`s; if none is found, register the node as `passing`
+                if found_maint_critical:
+                    node_status['maintenance'] += 1
+                    nodes_to_service_status[node_id]["maintenance"] += 1
+                elif found_critical:
                     node_status['critical'] += 1
                     nodes_to_service_status[node_id]["critical"] += 1
                 elif found_warning:
consul/manifest.json (1 addition, 1 deletion)

@@ -11,7 +11,7 @@
     "mac_os",
     "windows"
   ],
-  "version": "1.3.0",
+  "version": "1.3.1",
   "guid": "ec1e9fac-a339-49a3-b501-60656d2a5671",
   "public_title": "Datadog-Consul Integration",
   "categories":["containers", "orchestration", "configuration & deployment", "notification"],

consul/metadata.csv (2 additions, 0 deletions)

@@ -1,9 +1,11 @@
 metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
 consul.catalog.nodes_critical,gauge,,node,,Number of nodes with service status `critical` from those registered,-1,consul,nodes crit
+consul.catalog.nodes_maintenance,gauge,,node,,Number of nodes in maintenance from those registered,-1,consul,nodes maint
 consul.catalog.nodes_passing,gauge,,node,,Number of nodes with service status `passing` from those registered,1,consul,nodes pass
 consul.catalog.nodes_up,gauge,,node,,Number of nodes,0,consul,nodes up
 consul.catalog.nodes_warning,gauge,,node,,Number of nodes with service status `warning` from those registered,-1,consul,nodes warn
 consul.catalog.services_critical,gauge,,service,,Total critical services on nodes,-1,consul,svc crit
+consul.catalog.services_maintenance,gauge,,service,,Total services in maintenance on nodes,-1,consul,svc maint
 consul.catalog.services_passing,gauge,,service,,Total passing services on nodes,1,consul,svc pass
 consul.catalog.services_up,gauge,,service,,Total services registered on nodes,0,consul,svc up
 consul.catalog.services_warning,gauge,,service,,Total warning services on nodes,-1,consul,svc warn

consul/test/test_consul.py (78 additions, 0 deletions)

Note: the mock's name is spelled consistently here; the original commit misspelled it as `mock_get_nodes_with_service_critical_in_maitenance` in both the definition and its use.

@@ -287,6 +287,68 @@ def mock_get_nodes_with_service_critical(self, instance, service):
             }
         ]
 
+    def mock_get_nodes_with_service_critical_in_maintenance(self, instance, service):
+
+        return [
+            {
+                "Checks": [
+                    {
+                        "CheckID": "_node_maintenance",
+                        "Name": "Node Maintenance Mode",
+                        "Node": "node-1",
+                        "Notes": "",
+                        "Output": "",
+                        "ServiceID": service,
+                        "ServiceName": "",
+                        "Status": "critical",
+                    },
+                    {
+                        "CheckID": "serfHealth",
+                        "Name": "Serf Health Status",
+                        "Node": "node-1",
+                        "Notes": "",
+                        "Output": "Agent alive and reachable",
+                        "ServiceID": "",
+                        "ServiceName": "",
+                        "Status": "passing"
+                    },
+                    {
+                        "CheckID": "service:{0}".format(service),
+                        "Name": "service check {0}".format(service),
+                        "Node": "node-1",
+                        "Notes": "",
+                        "Output": "Service {0} alive".format(service),
+                        "ServiceID": service,
+                        "ServiceName": "",
+                        "Status": "warning"
+                    },
+                    {
+                        "CheckID": "service:{0}".format(service),
+                        "Name": "service check {0}".format(service),
+                        "Node": "node-1",
+                        "Notes": "",
+                        "Output": "Service {0} alive".format(service),
+                        "ServiceID": service,
+                        "ServiceName": "",
+                        "Status": "critical"
+                    }
+                ],
+                "Node": {
+                    "Address": _get_random_ip(),
+                    "Node": "node-1"
+                },
+                "Service": {
+                    "Address": "",
+                    "ID": service,
+                    "Port": 80,
+                    "Service": service,
+                    "Tags": [
+                        "az-us-east-1a"
+                    ]
+                }
+            }
+        ]
+
     def mock_get_coord_datacenters(self, instance):
         return [{
             "Datacenter": "dc1",

@@ -500,6 +562,22 @@ def test_get_nodes_with_service_critical(self):
         self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
         self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
 
+    def test_get_nodes_with_service_critical_in_maintenance(self):
+        my_mocks = self._get_consul_mocks()
+        my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical_in_maintenance
+
+        self.run_check(MOCK_CONFIG, mocks=my_mocks)
+        self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
+        self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
+        self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
+        self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
+        self.assertMetric('consul.catalog.nodes_maintenance', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1', 'consul_service-1_service_tag:az-us-east-1a'])
+        self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
+        self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
+        self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
+        self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
+        self.assertMetric('consul.catalog.services_maintenance', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1'])
+
     def test_service_checks(self):
         my_mocks = self._get_consul_mocks()
         my_mocks['consul_request'] = self.mock_get_health_check