Skip to content

Commit fe11f5f

Browse files
committed
Add intervals and a test for systemd-coredumps
1 parent 755c0d2 commit fe11f5f

File tree

4 files changed

+375
-4
lines changed

4 files changed

+375
-4
lines changed

pkg/monitor/monitorapi/types.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ const (
272272
ReasonHighGeneration IntervalReason = "HighGeneration"
273273
ReasonInvalidGeneration IntervalReason = "GenerationViolation"
274274

275-
ReasonEtcdBootstrap IntervalReason = "EtcdBootstrap"
275+
ReasonEtcdBootstrap IntervalReason = "EtcdBootstrap"
276+
ReasonProcessDumpedCore IntervalReason = "ProcessDumpedCore"
276277
)
277278

278279
type AnnotationKey string
@@ -358,6 +359,7 @@ const (
358359
SourceUnexpectedReady IntervalSource = "NodeUnexpectedNotReady"
359360
SourceUnreachable IntervalSource = "NodeUnreachable"
360361
SourceKubeletLog IntervalSource = "KubeletLog"
362+
SourceSystemdCoreDumpLog IntervalSource = "SystemdCoreDumpLog"
361363
SourcePodLog IntervalSource = "PodLog"
362364
SourceEtcdLog IntervalSource = "EtcdLog"
363365
SourceEtcdLeadership IntervalSource = "EtcdLeadership"

pkg/monitortests/node/kubeletlogcollector/monitortest.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (w *kubeletLogCollector) EvaluateTestsFromConstructedIntervals(ctx context.
6060
junits := []*junitapi.JUnitTestCase{}
6161
junits = append(junits, nodeFailedLeaseErrorsInRapidSuccession(w.startedAt, finalIntervals)...)
6262
junits = append(junits, nodeFailedLeaseErrorsBackOff(w.startedAt, finalIntervals)...)
63+
junits = append(junits, testNoSystemdCoreDumps(finalIntervals)...)
6364
return junits, nil
6465
}
6566

@@ -125,3 +126,51 @@ func nodeFailedLeaseErrorsBackOff(startedAt time.Time, finalIntervals monitorapi
125126
tests = append(tests, &junitapi.JUnitTestCase{Name: testName})
126127
return tests
127128
}
129+
130+
func testNoSystemdCoreDumps(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
131+
const testName = "[Jira:\"Test Framework\"] should not find any systemd-coredump logs in system journal"
132+
success := &junitapi.JUnitTestCase{Name: testName}
133+
134+
var failures []string
135+
processCount := make(map[string]int)
136+
137+
for _, event := range events {
138+
if event.Source != monitorapi.SourceSystemdCoreDumpLog {
139+
continue
140+
}
141+
if strings.Contains(event.Message.HumanMessage, "dumped core") {
142+
processName := "unknown"
143+
if event.Message.Annotations != nil {
144+
if proc, exists := event.Message.Annotations["process"]; exists {
145+
processName = proc
146+
}
147+
}
148+
149+
processCount[processName]++
150+
msg := fmt.Sprintf("%v - Process: %s - %v", event.Locator.OldLocator(), processName, event.Message.OldMessage())
151+
failures = append(failures, msg)
152+
}
153+
}
154+
155+
if len(failures) == 0 {
156+
return []*junitapi.JUnitTestCase{success}
157+
}
158+
159+
// Create summary of process failures
160+
processSummary := make([]string, 0, len(processCount))
161+
for process, count := range processCount {
162+
processSummary = append(processSummary, fmt.Sprintf("%s: %d occurrences", process, count))
163+
}
164+
165+
failure := &junitapi.JUnitTestCase{
166+
Name: testName,
167+
SystemOut: strings.Join(failures, "\n"),
168+
FailureOutput: &junitapi.FailureOutput{
169+
Output: fmt.Sprintf("Found %d core dumps from %d different processes. Process breakdown:\n%s\n\nDetailed events:\n%v",
170+
len(failures), len(processCount), strings.Join(processSummary, "\n"), strings.Join(failures, "\n")),
171+
},
172+
}
173+
174+
// Core dumps are serious issues, but treating as flake initially to gather data
175+
return []*junitapi.JUnitTestCase{failure, success}
176+
}

pkg/monitortests/node/kubeletlogcollector/node.go

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
5656
errCh <- err
5757
return
5858
}
59-
newOVSEvents := eventsFromOVSVswitchdLogs(nodeName, ovsVswitchdLogs)
59+
newOVSEvents := intervalsFromOVSVswitchdLogs(nodeName, ovsVswitchdLogs)
6060

6161
networkManagerLogs, err := getNodeLog(ctx, kubeClient, nodeName, "NetworkManager")
6262
if err != nil {
@@ -66,11 +66,20 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
6666
}
6767
newNetworkManagerIntervals := intervalsFromNetworkManagerLogs(nodeName, networkManagerLogs)
6868

69+
systemdCoreDumpLogs, err := getNodeLog(ctx, kubeClient, nodeName, "systemd-coredump")
70+
if err != nil {
71+
fmt.Fprintf(os.Stderr, "Error getting node systemd-coredump logs from %s: %s", nodeName, err.Error())
72+
errCh <- err
73+
return
74+
}
75+
newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs(nodeName, systemdCoreDumpLogs)
76+
6977
lock.Lock()
7078
defer lock.Unlock()
7179
ret = append(ret, newEvents...)
7280
ret = append(ret, newOVSEvents...)
7381
ret = append(ret, newNetworkManagerIntervals...)
82+
ret = append(ret, newSystemdCoreDumpIntervals...)
7483
}(ctx, node.Name)
7584
}
7685
wg.Wait()
@@ -114,9 +123,9 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv
114123
return ret
115124
}
116125

117-
// eventsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
126+
// intervalsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
118127
// not returned because this is a best effort step
119-
func eventsFromOVSVswitchdLogs(nodeName string, ovsLogs []byte) monitorapi.Intervals {
128+
func intervalsFromOVSVswitchdLogs(nodeName string, ovsLogs []byte) monitorapi.Intervals {
120129
nodeLocator := monitorapi.NewLocator().NodeFromName(nodeName)
121130
ret := monitorapi.Intervals{}
122131

@@ -164,6 +173,55 @@ func unreasonablyLongPollInterval(logLine string, nodeLocator monitorapi.Locator
164173

165174
var unreasonablyLongPollIntervalRE = regexp.MustCompile(`Unreasonably long (\d+)ms poll interval`)
166175

176+
// intervalsFromSystemdCoreDumpLogs returns the produced intervals. Any errors during this creation are logged, but
177+
// not returned because this is a best effort step
178+
func intervalsFromSystemdCoreDumpLogs(nodeName string, coreDumpLogs []byte) monitorapi.Intervals {
179+
nodeLocator := monitorapi.NewLocator().NodeFromName(nodeName)
180+
ret := monitorapi.Intervals{}
181+
182+
scanner := bufio.NewScanner(bytes.NewBuffer(coreDumpLogs))
183+
for scanner.Scan() {
184+
currLine := scanner.Text()
185+
ret = append(ret, processCoreDump(currLine, nodeLocator)...)
186+
}
187+
188+
return ret
189+
}
190+
191+
// processCoreDump searches for core dump events with process information
192+
//
193+
// Process 7798 (haproxy) of user 1000680000 dumped core.
194+
func processCoreDump(logLine string, nodeLocator monitorapi.Locator) monitorapi.Intervals {
195+
if !strings.Contains(logLine, "dumped core") {
196+
return nil
197+
}
198+
199+
logTime := utility.SystemdJournalLogTime(logLine, time.Now().Year())
200+
201+
// Extract the process name from within parentheses
202+
var processName string
203+
match := coreDumpProcessRE.FindStringSubmatch(logLine)
204+
if match != nil && len(match) > 1 {
205+
processName = match[1]
206+
}
207+
208+
message := logLine[strings.Index(logLine, "Process"):]
209+
210+
// Build the message with process annotation if we extracted it
211+
messageBuilder := monitorapi.NewMessage().HumanMessage(message).Reason(monitorapi.ReasonProcessDumpedCore)
212+
if processName != "" {
213+
messageBuilder = messageBuilder.WithAnnotation("process", processName)
214+
}
215+
216+
interval := monitorapi.NewInterval(monitorapi.SourceSystemdCoreDumpLog, monitorapi.Warning).Locator(
217+
nodeLocator).Message(messageBuilder).
218+
Display().Build(logTime, logTime.Add(1*time.Second))
219+
220+
return monitorapi.Intervals{interval}
221+
}
222+
223+
var coreDumpProcessRE = regexp.MustCompile(`Process \d+ \(([^)]+)\) of user \d+ dumped core`)
224+
167225
// intervalsFromNetworkManagerLogs returns the produced intervals. Any errors during this creation are logged, but
168226
// not returned because this is a best effort step
169227
func intervalsFromNetworkManagerLogs(nodeName string, ovsLogs []byte) monitorapi.Intervals {

0 commit comments

Comments
 (0)