@@ -56,7 +56,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
56
56
errCh <- err
57
57
return
58
58
}
59
- newOVSEvents := eventsFromOVSVswitchdLogs (nodeName , ovsVswitchdLogs )
59
+ newOVSEvents := intervalsFromOVSVswitchdLogs (nodeName , ovsVswitchdLogs )
60
60
61
61
networkManagerLogs , err := getNodeLog (ctx , kubeClient , nodeName , "NetworkManager" )
62
62
if err != nil {
@@ -66,11 +66,20 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
66
66
}
67
67
newNetworkManagerIntervals := intervalsFromNetworkManagerLogs (nodeName , networkManagerLogs )
68
68
69
+ systemdCoreDumpLogs , err := getNodeLog (ctx , kubeClient , nodeName , "systemd-coredump" )
70
+ if err != nil {
71
+ fmt .Fprintf (os .Stderr , "Error getting node systemd-coredump logs from %s: %s" , nodeName , err .Error ())
72
+ errCh <- err
73
+ return
74
+ }
75
+ newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs (nodeName , systemdCoreDumpLogs )
76
+
69
77
lock .Lock ()
70
78
defer lock .Unlock ()
71
79
ret = append (ret , newEvents ... )
72
80
ret = append (ret , newOVSEvents ... )
73
81
ret = append (ret , newNetworkManagerIntervals ... )
82
+ ret = append (ret , newSystemdCoreDumpIntervals ... )
74
83
}(ctx , node .Name )
75
84
}
76
85
wg .Wait ()
@@ -114,9 +123,9 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv
114
123
return ret
115
124
}
116
125
117
- // eventsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
126
+ // intervalsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
118
127
// not returned because this is a best effort step
119
- func eventsFromOVSVswitchdLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
128
+ func intervalsFromOVSVswitchdLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
120
129
nodeLocator := monitorapi .NewLocator ().NodeFromName (nodeName )
121
130
ret := monitorapi.Intervals {}
122
131
@@ -164,6 +173,55 @@ func unreasonablyLongPollInterval(logLine string, nodeLocator monitorapi.Locator
164
173
165
174
var unreasonablyLongPollIntervalRE = regexp .MustCompile (`Unreasonably long (\d+)ms poll interval` )
166
175
176
+ // intervalsFromSystemdCoreDumpLogs returns the produced intervals. Any errors during this creation are logged, but
177
+ // not returned because this is a best effort step
178
+ func intervalsFromSystemdCoreDumpLogs (nodeName string , coreDumpLogs []byte ) monitorapi.Intervals {
179
+ nodeLocator := monitorapi .NewLocator ().NodeFromName (nodeName )
180
+ ret := monitorapi.Intervals {}
181
+
182
+ scanner := bufio .NewScanner (bytes .NewBuffer (coreDumpLogs ))
183
+ for scanner .Scan () {
184
+ currLine := scanner .Text ()
185
+ ret = append (ret , processCoreDump (currLine , nodeLocator )... )
186
+ }
187
+
188
+ return ret
189
+ }
190
+
191
+ // processCoreDump searches for core dump events with process information
192
+ //
193
+ // Process 7798 (haproxy) of user 1000680000 dumped core.
194
+ func processCoreDump (logLine string , nodeLocator monitorapi.Locator ) monitorapi.Intervals {
195
+ if ! strings .Contains (logLine , "dumped core" ) {
196
+ return nil
197
+ }
198
+
199
+ logTime := utility .SystemdJournalLogTime (logLine , time .Now ().Year ())
200
+
201
+ // Extract the process name from within parentheses
202
+ var processName string
203
+ match := coreDumpProcessRE .FindStringSubmatch (logLine )
204
+ if match != nil && len (match ) > 1 {
205
+ processName = match [1 ]
206
+ }
207
+
208
+ message := logLine [strings .Index (logLine , "Process" ):]
209
+
210
+ // Build the message with process annotation if we extracted it
211
+ messageBuilder := monitorapi .NewMessage ().HumanMessage (message ).Reason (monitorapi .ReasonProcessDumpedCore )
212
+ if processName != "" {
213
+ messageBuilder = messageBuilder .WithAnnotation ("process" , processName )
214
+ }
215
+
216
+ interval := monitorapi .NewInterval (monitorapi .SourceSystemdCoreDumpLog , monitorapi .Warning ).Locator (
217
+ nodeLocator ).Message (messageBuilder ).
218
+ Display ().Build (logTime , logTime .Add (1 * time .Second ))
219
+
220
+ return monitorapi.Intervals {interval }
221
+ }
222
+
223
+ var coreDumpProcessRE = regexp .MustCompile (`Process \d+ \(([^)]+)\) of user \d+ dumped core` )
224
+
167
225
// intervalsFromNetworkManagerLogs returns the produced intervals. Any errors during this creation are logged, but
168
226
// not returned because this is a best effort step
169
227
func intervalsFromNetworkManagerLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
0 commit comments