Skip to content

Commit d541935

Browse files
authored
eval: Added eval for troubleshooting and fixing OOMKilled pod (#360)
* eval: Added eval for troubleshooting and fixing OOMKilled pod
* refactor: Improve OOMKilled event detection in setup script
* refactor: Disguise memory load to reflect realistic backend behavior
1 parent 67bc91a commit d541935

File tree

5 files changed

+113
-0
lines changed

5 files changed

+113
-0
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Deployment used by the OOMKilled troubleshooting eval.
#
# Runs a single nginx pod whose startup command also writes a 150 MiB file in
# the background, while the container is limited to 128Mi of memory.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: backend-api
  namespace: webapp-backend
  labels:
    app: backend-api
spec:
  replicas: 1
  selector:
    matchLabels:
      app: backend-api
  template:
    metadata:
      labels:
        app: backend-api
    spec:
      containers:
      - name: api-server
        image: nginx:alpine
        command: ["/bin/sh"]
        args:
        - -c
        - |
          echo "Starting backend service..."
          # Background write of 150 x 1 MiB blocks; nginx starts alongside it.
          # NOTE(review): whether this write is charged against the memory
          # limit depends on how /tmp is backed (tmpfs vs. disk) - confirm
          # the container is actually OOM-killed on the target cluster.
          dd if=/dev/zero of=/tmp/cache.dat bs=1M count=150 &
          nginx -g 'daemon off;'
        ports:
        - containerPort: 80
        resources:
          requests:
            memory: "64Mi"
            cpu: "50m"
          limits:
            # Smaller than the 150 MiB the startup command writes.
            memory: "128Mi"
            cpu: "100m"
        livenessProbe:
          httpGet:
            path: /
            port: 80
          initialDelaySeconds: 10
          periodSeconds: 5
        readinessProbe:
          httpGet:
            path: /
            port: 80
          initialDelaySeconds: 5
          periodSeconds: 3
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/usr/bin/env bash
# Tear down the eval by deleting its namespace; a namespace that is already
# gone is not treated as an error.
target_namespace="webapp-backend"
kubectl delete namespace "${target_namespace}" --ignore-not-found=true
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
#
# Setup for the OOMKilled troubleshooting eval.
#
# Recreates the webapp-backend namespace, applies the memory-hungry Deployment
# and then waits (up to ~30s) until an OOM kill has been observed, so the
# scenario is already "broken" when the eval prompt is issued.
#
# Exit status: 1 if the namespace or Deployment cannot be created; 0 otherwise
# (not observing an OOM kill in time only produces a warning on stderr).

set -uo pipefail

NAMESPACE="webapp-backend"
DEPLOYMENT="backend-api"
POD_SELECTOR="app=backend-api"

# Keep the original cwd-relative manifest path, but fall back to the directory
# containing this script so the setup also works when started from elsewhere.
MANIFEST="artifacts/memory-hungry-app.yaml"
if [[ ! -f "$MANIFEST" ]]; then
  SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" && pwd)"
  MANIFEST="$SCRIPT_DIR/$MANIFEST"
fi

#######################################
# Reports whether an OOM kill is visible for the workload.
# Two signals are consulted: OOMKilling events in the namespace, and a
# container of a matching pod whose current or previous state terminated with
# reason OOMKilled. Depending on the cluster, OOM events may be recorded
# outside the workload namespace, which is why the container status is
# checked as well.
# Globals:   NAMESPACE, POD_SELECTOR (read)
# Returns:   0 if at least one signal is present, non-zero otherwise
#######################################
oom_kill_observed() {
  local event_count kill_count

  event_count=$(
    kubectl get events -n "$NAMESPACE" --field-selector reason=OOMKilling -o json \
      | jq '.items | length'
  )

  kill_count=$(
    kubectl get pods -n "$NAMESPACE" -l "$POD_SELECTOR" -o json \
      | jq '[ .items[]
              | .status.containerStatuses[]?
              | (.state.terminated, .lastState.terminated)
              | select(.reason? == "OOMKilled")
            ] | length'
  )

  # A failed query yields an empty string; count it as "nothing seen yet".
  (( ${event_count:-0} + ${kill_count:-0} > 0 ))
}

# Start from a clean slate.
kubectl delete namespace "$NAMESPACE" --ignore-not-found

# Create namespace
kubectl create namespace "$NAMESPACE" || exit 1

# Apply the deployment from artifacts
kubectl apply -f "$MANIFEST" || exit 1

# Wait for the deployment to be created. The rollout is not required to
# succeed (the workload is meant to be unhealthy), hence the "|| true".
kubectl rollout status "deployment/$DEPLOYMENT" -n "$NAMESPACE" --timeout=60s || true

# Wait until an OOM kill is detected (15 polls, 2s apart: ~30s)
echo "Waiting for OOMKilled event to occur..."
detected=0
for _ in {1..15}; do
  if oom_kill_observed; then
    echo "OOMKilled event detected."
    detected=1
    break
  fi
  sleep 2
done

if (( detected == 0 )); then
  echo "warning: no OOM kill observed within the polling window" >&2
fi

exit 0
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Eval task definition for the OOMKilled scenario.
# The prompt is the user request; setup, verifier and cleanup name the
# scripts that accompany this task.
script:
  - prompt: "I have a web application running in the 'webapp-backend' namespace that keeps crashing. Can you help me figure out what's going wrong and fix it?"
setup: "setup.sh"
verifier: "verify.sh"
cleanup: "cleanup.sh"
difficulty: "medium"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env bash
#
# Verifier for the OOMKilled troubleshooting eval.
#
# Exits 0 only when all of the following hold:
#   1. the backend-api Deployment reports condition Available,
#   2. its pods report condition Ready,
#   3. no OOM kill was recorded within the last RECENT_WINDOW_SECONDS, judged
#      from OOMKilling events in the namespace and from the termination
#      reason of the pods' containers.
# Exits 1 otherwise, including when the cluster cannot be queried, so a
# failed lookup is never mistaken for a healthy workload.

set -uo pipefail

NAMESPACE="webapp-backend"
DEPLOYMENT="backend-api"
POD_SELECTOR="app=backend-api"
# Kills older than this are treated as history from before the fix.
RECENT_WINDOW_SECONDS=120

# Check if the deployment is ready
if ! kubectl wait --for=condition=Available "deployment/$DEPLOYMENT" -n "$NAMESPACE" --timeout=60s; then
  echo "Deployment is not available"
  exit 1
fi

# Check if pods are running
if ! kubectl wait --for=condition=Ready pod -l "$POD_SELECTOR" -n "$NAMESPACE" --timeout=30s; then
  echo "Pods are not ready"
  exit 1
fi

# Count OOMKilling events newer than the window. Timestamps are compared
# inside jq so the check does not depend on GNU-only `date -d`. eventTime is
# used when lastTimestamp is absent; any fractional seconds are dropped
# because fromdateiso8601 expects whole seconds.
if ! recent_events=$(
  kubectl get events -n "$NAMESPACE" --field-selector reason=OOMKilling -o json \
    | jq --argjson window "$RECENT_WINDOW_SECONDS" '
        [ .items[]
          | (.lastTimestamp // .eventTime // empty)
          | split(".")[0]
          | (if endswith("Z") then . else . + "Z" end)
          | fromdateiso8601
          | select(now - . < $window)
        ] | length'
); then
  echo "Failed to query OOMKilling events"
  exit 1
fi

# Count containers of live (not terminating) pods whose current or previous
# state ended with reason OOMKilled inside the window. This covers clusters
# where OOM events are not recorded in the workload namespace.
if ! recent_kills=$(
  kubectl get pods -n "$NAMESPACE" -l "$POD_SELECTOR" -o json \
    | jq --argjson window "$RECENT_WINDOW_SECONDS" '
        [ .items[]
          | select(.metadata.deletionTimestamp == null)
          | .status.containerStatuses[]?
          | (.state.terminated, .lastState.terminated)
          | select(.reason? == "OOMKilled")
          | .finishedAt // empty
          | fromdateiso8601
          | select(now - . < $window)
        ] | length'
); then
  echo "Failed to query pod container statuses"
  exit 1
fi

if (( ${recent_events:-0} > 0 || ${recent_kills:-0} > 0 )); then
  echo "Recent OOMKilled events detected"
  exit 1
fi

echo "Pod is running successfully without OOMKilled events"
exit 0

0 commit comments

Comments
 (0)