Skip to content

Commit 6eb7ad8

Browse files
authored
Add resiliency to Resource Monitoring in Linux (#6489)
1 parent 34cdd3a commit 6eb7ad8

23 files changed

+457
-142
lines changed

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/ITcpStateInfoProvider.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ internal interface ITcpStateInfoProvider
1212
/// Gets the last known TCP/IP v4 state of the system.
1313
/// </summary>
1414
/// <returns>An instance of <see cref="TcpStateInfo"/>.</returns>
15-
TcpStateInfo GetpIpV4TcpStateInfo();
15+
TcpStateInfo GetIpV4TcpStateInfo();
1616

1717
/// <summary>
1818
/// Gets the last known TCP/IP v6 state of the system.
1919
/// </summary>
2020
/// <returns>An instance of <see cref="TcpStateInfo"/>.</returns>
21-
TcpStateInfo GetpIpV6TcpStateInfo();
21+
TcpStateInfo GetIpV6TcpStateInfo();
2222
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Disk/DiskStatsReader.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Diagnostics.CodeAnalysis;
77
using System.Globalization;
88
using System.IO;
9+
using System.Linq;
910
using Microsoft.Extensions.ObjectPool;
1011
using Microsoft.Shared.Pools;
1112

@@ -23,7 +24,7 @@ internal sealed class DiskStatsReader(IFileSystem fileSystem) : IDiskStatsReader
2324
/// Reads and returns all disk statistics entries.
2425
/// </summary>
2526
/// <returns>An array of <see cref="DiskStats"/>, excluding devices whose names start with any of the given prefixes.</returns>
26-
public List<DiskStats> ReadAll()
27+
public DiskStats[] ReadAll(string[] skipDevicePrefixes)
2728
{
2829
var diskStatsList = new List<DiskStats>();
2930

@@ -41,7 +42,11 @@ public List<DiskStats> ReadAll()
4142
try
4243
{
4344
DiskStats stat = DiskStatsReader.ParseLine(line);
44-
diskStatsList.Add(stat);
45+
if (!skipDevicePrefixes.Any(prefix =>
46+
stat.DeviceName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
47+
{
48+
diskStatsList.Add(stat);
49+
}
4550
}
4651
#pragma warning disable CA1031
4752
catch (Exception)
@@ -51,7 +56,7 @@ public List<DiskStats> ReadAll()
5156
}
5257
}
5358

54-
return diskStatsList;
59+
return diskStatsList.ToArray();
5560
}
5661

5762
/// <summary>

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Disk/IDiskStatsReader.cs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4-
using System.Collections.Generic;
5-
64
namespace Microsoft.Extensions.Diagnostics.ResourceMonitoring.Linux.Disk;
75

86
/// <summary>
@@ -14,5 +12,5 @@ internal interface IDiskStatsReader
1412
/// Gets all the disk statistics from the system.
1513
/// </summary>
1614
/// <returns>An array of <see cref="DiskStats"/> instances.</returns>
17-
List<DiskStats> ReadAll();
15+
DiskStats[] ReadAll(string[] skipDevicePrefixes);
1816
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Disk/LinuxSystemDiskMetrics.cs

Lines changed: 39 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System;
5+
using System.Collections.Frozen;
56
using System.Collections.Generic;
67
using System.Diagnostics;
78
using System.Diagnostics.Metrics;
8-
using System.Linq;
9+
using System.IO;
910
using Microsoft.Extensions.Logging;
1011
using Microsoft.Extensions.Logging.Abstractions;
1112
using Microsoft.Extensions.Options;
@@ -22,14 +23,22 @@ internal sealed class LinuxSystemDiskMetrics
2223
private const string DeviceKey = "system.device";
2324
private const string DirectionKey = "disk.io.direction";
2425

26+
// Exclude devices with these prefixes because they represent virtual, loopback, or device-mapper disks
27+
// that do not correspond to real physical storage. Including them would distort system disk I/O metrics.
28+
private static readonly string[] _skipDevicePrefixes = new[] { "ram", "loop", "dm-" };
2529
private static readonly KeyValuePair<string, object?> _directionReadTag = new(DirectionKey, "read");
2630
private static readonly KeyValuePair<string, object?> _directionWriteTag = new(DirectionKey, "write");
2731
private readonly ILogger<LinuxSystemDiskMetrics> _logger;
2832
private readonly TimeProvider _timeProvider;
2933
private readonly IDiskStatsReader _diskStatsReader;
3034
private readonly object _lock = new();
31-
private readonly Dictionary<string, DiskStats> _baselineDiskStatsDict = [];
32-
private List<DiskStats> _diskStatsSnapshot = [];
35+
private readonly FrozenDictionary<string, DiskStats> _baselineDiskStatsDict = FrozenDictionary<string, DiskStats>.Empty;
36+
private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);
37+
38+
private DateTimeOffset _lastDiskStatsFailure = DateTimeOffset.MinValue;
39+
private bool _diskStatsUnavailable;
40+
41+
private DiskStats[] _diskStatsSnapshot = [];
3342
private DateTimeOffset _lastRefreshTime = DateTimeOffset.MinValue;
3443

3544
public LinuxSystemDiskMetrics(
@@ -48,7 +57,7 @@ public LinuxSystemDiskMetrics(
4857
}
4958

5059
// We need to read the disk stats once to get the baseline values
51-
_baselineDiskStatsDict = GetAllDiskStats().ToDictionary(d => d.DeviceName);
60+
_baselineDiskStatsDict = GetAllDiskStats().ToFrozenDictionary(d => d.DeviceName);
5261

5362
#pragma warning disable CA2000 // Dispose objects before losing scope
5463
// We don't dispose the meter because IMeterFactory handles that
@@ -85,7 +94,7 @@ public LinuxSystemDiskMetrics(
8594
private IEnumerable<Measurement<long>> GetDiskIoMeasurements()
8695
{
8796
List<Measurement<long>> measurements = [];
88-
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
97+
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();
8998

9099
foreach (DiskStats diskStats in diskStatsSnapshot)
91100
{
@@ -102,7 +111,7 @@ private IEnumerable<Measurement<long>> GetDiskIoMeasurements()
102111
private IEnumerable<Measurement<long>> GetDiskOperationMeasurements()
103112
{
104113
List<Measurement<long>> measurements = [];
105-
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
114+
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();
106115

107116
foreach (DiskStats diskStats in diskStatsSnapshot)
108117
{
@@ -119,7 +128,7 @@ private IEnumerable<Measurement<long>> GetDiskOperationMeasurements()
119128
private IEnumerable<Measurement<double>> GetDiskIoTimeMeasurements()
120129
{
121130
List<Measurement<double>> measurements = [];
122-
List<DiskStats> diskStatsSnapshot = GetDiskStatsSnapshot();
131+
DiskStats[] diskStatsSnapshot = GetDiskStatsSnapshot();
123132

124133
foreach (DiskStats diskStats in diskStatsSnapshot)
125134
{
@@ -131,12 +140,12 @@ private IEnumerable<Measurement<double>> GetDiskIoTimeMeasurements()
131140
return measurements;
132141
}
133142

134-
private List<DiskStats> GetDiskStatsSnapshot()
143+
private DiskStats[] GetDiskStatsSnapshot()
135144
{
136145
lock (_lock)
137146
{
138147
DateTimeOffset now = _timeProvider.GetUtcNow();
139-
if (_diskStatsSnapshot.Count == 0 || (now - _lastRefreshTime).TotalSeconds > MinimumDiskStatsRefreshIntervalInSeconds)
148+
if (_diskStatsSnapshot.Length == 0 || (now - _lastRefreshTime).TotalSeconds > MinimumDiskStatsRefreshIntervalInSeconds)
140149
{
141150
_diskStatsSnapshot = GetAllDiskStats();
142151
_lastRefreshTime = now;
@@ -146,27 +155,37 @@ private List<DiskStats> GetDiskStatsSnapshot()
146155
return _diskStatsSnapshot;
147156
}
148157

149-
private List<DiskStats> GetAllDiskStats()
158+
private DiskStats[] GetAllDiskStats()
150159
{
160+
if (_diskStatsUnavailable &&
161+
_timeProvider.GetUtcNow() - _lastDiskStatsFailure < _retryInterval)
162+
{
163+
return Array.Empty<DiskStats>();
164+
}
165+
151166
try
152167
{
153-
List<DiskStats> diskStatsList = _diskStatsReader.ReadAll();
154-
155-
// We should not include ram, loop, or dm(device-mapper) devices in the disk stats, should we?
156-
diskStatsList = diskStatsList
157-
.Where(d => !d.DeviceName.StartsWith("ram", StringComparison.OrdinalIgnoreCase)
158-
&& !d.DeviceName.StartsWith("loop", StringComparison.OrdinalIgnoreCase)
159-
&& !d.DeviceName.StartsWith("dm-", StringComparison.OrdinalIgnoreCase))
160-
.ToList();
168+
DiskStats[] diskStatsList = _diskStatsReader.ReadAll(_skipDevicePrefixes);
169+
_diskStatsUnavailable = false;
170+
161171
return diskStatsList;
162172
}
173+
catch (Exception ex) when (
174+
ex is FileNotFoundException ||
175+
ex is DirectoryNotFoundException ||
176+
ex is UnauthorizedAccessException)
177+
{
178+
_logger.HandleDiskStatsException(ex.Message);
179+
_lastDiskStatsFailure = _timeProvider.GetUtcNow();
180+
_diskStatsUnavailable = true;
181+
}
163182
#pragma warning disable CA1031
164183
catch (Exception ex)
165184
#pragma warning restore CA1031
166185
{
167-
Log.HandleDiskStatsException(_logger, ex.Message);
186+
_logger.HandleDiskStatsException(ex.Message);
168187
}
169188

170-
return [];
189+
return Array.Empty<DiskStats>();
171190
}
172191
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
112112
// _memoryLimit - Resource Memory Limit (in k8s terms)
113113
// _memoryLimit - To keep the contract, this parameter will get the Host available memory
114114
Resources = new SystemResources(cpuRequest, cpuLimit, _memoryLimit, _memoryLimit);
115-
Log.SystemResourcesInfo(_logger, cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
115+
_logger.SystemResourcesInfo(cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
116116
}
117117

118118
public double CpuUtilizationWithoutHostDelta()
@@ -144,7 +144,7 @@ public double CpuUtilizationWithoutHostDelta()
144144
{
145145
coresUsed = deltaCgroup / (double)deltaCpuPeriodInNanoseconds;
146146

147-
Log.CpuUsageDataV2(_logger, cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);
147+
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);
148148

149149
_lastCpuCoresUsed = coresUsed;
150150
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
@@ -158,7 +158,7 @@ public double CpuUtilizationWithoutHostDelta()
158158
{
159159
coresUsed = deltaCgroup / actualElapsedNanoseconds;
160160

161-
Log.CpuUsageDataV2(_logger, cpuUsageTime, _previousCgroupCpuTime, actualElapsedNanoseconds, coresUsed);
161+
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, actualElapsedNanoseconds, coresUsed);
162162

163163
_lastCpuCoresUsed = coresUsed;
164164
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
@@ -188,15 +188,15 @@ public double CpuUtilizationLimit(float cpuLimit)
188188
{
189189
_cpuUtilizationLimit100PercentExceededCounter?.Add(1);
190190
_cpuUtilizationLimit100PercentExceeded++;
191-
Log.CounterMessage100(_logger, _cpuUtilizationLimit100PercentExceeded);
191+
_logger.CounterMessage100(_cpuUtilizationLimit100PercentExceeded);
192192
}
193193

194194
// Increment counter if utilization exceeds 110%
195195
if (utilization > CpuLimitThreshold110Percent)
196196
{
197197
_cpuUtilizationLimit110PercentExceededCounter?.Add(1);
198198
_cpuUtilizationLimit110PercentExceeded++;
199-
Log.CounterMessage110(_logger, _cpuUtilizationLimit110PercentExceeded);
199+
_logger.CounterMessage110(_cpuUtilizationLimit110PercentExceeded);
200200
}
201201

202202
return utilization;
@@ -228,7 +228,7 @@ public double CpuUtilization()
228228
{
229229
double percentage = Math.Min(One, (double)deltaCgroup / deltaHost);
230230

231-
Log.CpuUsageData(_logger, cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);
231+
_logger.CpuUsageData(cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);
232232

233233
_cpuPercentage = percentage;
234234
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
@@ -266,7 +266,7 @@ public double MemoryUtilization()
266266
}
267267
}
268268

269-
Log.MemoryUsageData(_logger, memoryUsed, _memoryLimit, _memoryPercentage);
269+
_logger.MemoryUsageData(memoryUsed, _memoryLimit, _memoryPercentage);
270270

271271
return _memoryPercentage;
272272
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Log.cs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ internal static partial class Log
1515
"Computed CPU usage with CgroupCpuTime = {cgroupCpuTime}, HostCpuTime = {hostCpuTime}, PreviousCgroupCpuTime = {previousCgroupCpuTime}, PreviousHostCpuTime = {previousHostCpuTime}, CpuPercentage = {cpuPercentage}.")]
1616
#pragma warning restore S103 // Lines should not be too long
1717
public static partial void CpuUsageData(
18-
ILogger logger,
18+
this ILogger logger,
1919
long cgroupCpuTime,
2020
long hostCpuTime,
2121
long previousCgroupCpuTime,
@@ -25,21 +25,26 @@ public static partial void CpuUsageData(
2525
[LoggerMessage(2, LogLevel.Debug,
2626
"Computed memory usage with MemoryUsedInBytes = {memoryUsed}, MemoryLimit = {memoryLimit}, MemoryPercentage = {memoryPercentage}.")]
2727
public static partial void MemoryUsageData(
28-
ILogger logger,
28+
this ILogger logger,
2929
ulong memoryUsed,
3030
double memoryLimit,
3131
double memoryPercentage);
3232

3333
[LoggerMessage(3, LogLevel.Debug,
3434
"System resources information: CpuLimit = {cpuLimit}, CpuRequest = {cpuRequest}, MemoryLimit = {memoryLimit}, MemoryRequest = {memoryRequest}.")]
35-
public static partial void SystemResourcesInfo(ILogger logger, double cpuLimit, double cpuRequest, ulong memoryLimit, ulong memoryRequest);
35+
public static partial void SystemResourcesInfo(
36+
this ILogger logger,
37+
double cpuLimit,
38+
double cpuRequest,
39+
ulong memoryLimit,
40+
ulong memoryRequest);
3641

3742
[LoggerMessage(4, LogLevel.Debug,
3843
#pragma warning disable S103 // Lines should not be too long
3944
"For CgroupV2, Computed CPU usage with CgroupCpuTime = {cgroupCpuTime}, PreviousCgroupCpuTime = {previousCgroupCpuTime}, ActualElapsedNanoseconds = {actualElapsedNanoseconds}, CpuCores = {cpuCores}.")]
4045
#pragma warning restore S103 // Lines should not be too long
4146
public static partial void CpuUsageDataV2(
42-
ILogger logger,
47+
this ILogger logger,
4348
long cgroupCpuTime,
4449
long previousCgroupCpuTime,
4550
double actualElapsedNanoseconds,
@@ -48,16 +53,18 @@ public static partial void CpuUsageDataV2(
4853
[LoggerMessage(5, LogLevel.Debug,
4954
"CPU utilization exceeded 100%: Counter = {counterValue}")]
5055
public static partial void CounterMessage100(
51-
ILogger logger,
56+
this ILogger logger,
5257
long counterValue);
5358

5459
[LoggerMessage(6, LogLevel.Debug,
5560
"CPU utilization exceeded 110%: Counter = {counterValue}")]
5661
public static partial void CounterMessage110(
57-
ILogger logger,
62+
this ILogger logger,
5863
long counterValue);
5964

6065
[LoggerMessage(7, LogLevel.Warning,
6166
"Error while getting disk stats: Error={errorMessage}")]
62-
public static partial void HandleDiskStatsException(ILogger logger, string errorMessage);
67+
public static partial void HandleDiskStatsException(
68+
this ILogger logger,
69+
string errorMessage);
6370
}

0 commit comments

Comments
 (0)