Skip to content

Commit d73fca0

Browse files
authored
Add disk timeout configuration option (#6826)
* Add disk timeout setting * Add tests for timeout - test_timeout_value which checks that every timeout call is done with the configuration value - test_timeout_warning which checks that a warning is raised when the timeout raises an exception
1 parent 13ffb4f commit d73fca0

4 files changed

Lines changed: 85 additions & 6 deletions

File tree

disk/assets/configuration/spec.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,9 @@ files:
159159
value:
160160
example: 0
161161
type: number
162+
- name: timeout
163+
description: Timeout of the disk query in seconds
164+
value:
165+
example: 5
166+
default: 5
167+
type: integer

disk/datadog_checks/disk/data/conf.yaml.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,8 @@ instances:
142142
## Exclude devices with a total disk size less than a minimum value (in MiB)
143143
#
144144
# min_disk_size: 0
145+
146+
## @param timeout - integer - optional - default: 5
147+
## Timeout of the disk query in seconds
148+
#
149+
# timeout: 5

disk/datadog_checks/disk/disk.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __init__(self, name, init_config, instances):
5858
self._service_check_rw = is_affirmative(instance.get('service_check_rw', False))
5959
self._min_disk_size = instance.get('min_disk_size', 0) * 1024 * 1024
6060
self._blkid_cache_file = instance.get('blkid_cache_file')
61-
61+
self._timeout = instance.get('timeout', 5)
6262
self._compile_pattern_filters(instance)
6363
self._compile_tag_re()
6464
self._blkid_label_re = re.compile('LABEL=\"(.*?)\"', re.I)
@@ -78,10 +78,13 @@ def check(self, instance):
7878

7979
# Get disk metrics here to be able to exclude on total usage
8080
try:
81-
disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
81+
disk_usage = timeout(self._timeout)(psutil.disk_usage)(part.mountpoint)
8282
except TimeoutException:
8383
self.log.warning(
84-
u'Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...', part.mountpoint
84+
u'Timeout after %d seconds while retrieving the disk usage of `%s` mountpoint. '
85+
u'You might want to change the timeout length in the settings.',
86+
self._timeout,
87+
part.mountpoint,
8588
)
8689
continue
8790
except Exception as e:
@@ -232,9 +235,14 @@ def _collect_inodes_metrics(self, mountpoint):
232235
metrics = {}
233236
# we need to timeout this, too.
234237
try:
235-
inodes = timeout(5)(os.statvfs)(mountpoint)
238+
inodes = timeout(self._timeout)(os.statvfs)(mountpoint)
236239
except TimeoutException:
237-
self.log.warning(u'Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...', mountpoint)
240+
self.log.warning(
241+
u'Timeout after %d seconds while retrieving the disk usage of `%s` mountpoint. '
242+
u'You might want to change the timeout length in the settings.',
243+
self._timeout,
244+
mountpoint,
245+
)
238246
return metrics
239247
except Exception as e:
240248
self.log.warning('Unable to get disk metrics for %s: %s', mountpoint, e)

disk/tests/test_unit.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99
from six import iteritems
1010

1111
from datadog_checks.base.utils.platform import Platform
12+
from datadog_checks.base.utils.timeout import TimeoutException
1213
from datadog_checks.disk import Disk
1314

1415
from .common import DEFAULT_DEVICE_BASE_NAME, DEFAULT_DEVICE_NAME, DEFAULT_FILE_SYSTEM, DEFAULT_MOUNT_POINT
15-
from .mocks import MockDiskMetrics, mock_blkid_output
16+
from .mocks import MockDiskMetrics, MockPart, mock_blkid_output
1617

1718

1819
def test_default_options():
@@ -30,6 +31,7 @@ def test_default_options():
3031
assert check._device_tag_re == []
3132
assert check._service_check_rw is False
3233
assert check._min_disk_size == 0
34+
assert check._timeout == 5
3335

3436

3537
def test_bad_config():
@@ -211,3 +213,61 @@ def test_blkid_cache_file_contains_no_labels(
211213
c.check(instance_blkid_cache_file_no_label)
212214
for metric in chain(gauge_metrics, rate_metrics):
213215
aggregator.assert_metric(metric, tags=['device:/dev/sda1', 'device_name:sda1'])
216+
217+
218+
@pytest.mark.usefixtures('psutil_mocks')
219+
def test_timeout_config(aggregator, gauge_metrics, rate_metrics):
220+
"""Test timeout configuration value is used on every timeout on the check."""
221+
222+
# Arbitrary value
223+
TIMEOUT_VALUE = 42
224+
instance = {'timeout': TIMEOUT_VALUE}
225+
c = Disk('disk', {}, [instance])
226+
227+
# Mock timeout version
228+
def no_timeout(fun):
229+
return lambda *args: fun(args)
230+
231+
with mock.patch('psutil.disk_partitions', return_value=[MockPart()]), mock.patch(
232+
'datadog_checks.disk.disk.timeout', return_value=no_timeout
233+
) as mock_timeout:
234+
c.check(instance)
235+
236+
mock_timeout.assert_called_with(TIMEOUT_VALUE)
237+
238+
239+
@pytest.mark.usefixtures('psutil_mocks')
240+
def test_timeout_warning(aggregator, gauge_metrics, rate_metrics):
241+
"""Test a warning is raised when there is a Timeout exception."""
242+
243+
# Raise exception for "/faulty" mountpoint
244+
def faulty_timeout(fun):
245+
def f(mountpoint):
246+
if mountpoint == "/faulty":
247+
raise TimeoutException
248+
else:
249+
return fun(mountpoint)
250+
251+
return f
252+
253+
c = Disk('disk', {}, [{}])
254+
c.log = mock.MagicMock()
255+
m = MockDiskMetrics()
256+
m.total = 0
257+
258+
with mock.patch('psutil.disk_partitions', return_value=[MockPart(), MockPart(mountpoint="/faulty")]), mock.patch(
259+
'psutil.disk_usage', return_value=m, __name__='disk_usage'
260+
), mock.patch('datadog_checks.disk.disk.timeout', return_value=faulty_timeout):
261+
c.check({})
262+
263+
# Check that the warning is called once for the faulty disk
264+
c.log.warning.assert_called_once()
265+
266+
for name in gauge_metrics:
267+
aggregator.assert_metric(name, count=0)
268+
269+
for name in rate_metrics:
270+
aggregator.assert_metric_has_tag(name, 'device:{}'.format(DEFAULT_DEVICE_NAME))
271+
aggregator.assert_metric_has_tag(name, 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME))
272+
273+
aggregator.assert_all_metrics_covered()

0 commit comments

Comments
 (0)