GoogleCloudPlatform · droot · Jul 8, 2025 · Jul 8, 2025 · Jul 8, 2025
diff --git a/k8s-bench/eval.go b/k8s-bench/eval.go
@@ -393,7 +393,14 @@ func (x *TaskExecution) runAgent(ctx context.Context) error {
 	go func() {
 		// TODO: Wait for idle between sending steps?
 		for _, step := range x.task.Script {
-			fmt.Fprintf(stdinWriter, "%s\n", step.Prompt)
+			prompt, err := step.ResolvePrompt(x.taskDir)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Error resolving prompt: %v\n", err)
+				x.result.AddFailure("failed to resolve prompt: %v", err)
+				stdinWriter.Close()
+				return
+			}
+			fmt.Fprintf(stdinWriter, "%s\n", prompt)
 		}
 		stdinWriter.Close()
 	}()

diff --git a/k8s-bench/main.go b/k8s-bench/main.go
@@ -53,7 +53,40 @@ const (
 )
 
 type ScriptStep struct {
-	Prompt string `json:"prompt"`
+	Prompt     string `json:"prompt"`
+	PromptFile string `json:"promptFile"`
+}
+
+// ResolvePrompt returns the step's prompt text, taken from Prompt or from the file named by PromptFile (relative paths are joined onto baseDir). It returns an error if both or neither field is set, or if the file cannot be read.
+func (s *ScriptStep) ResolvePrompt(baseDir string) (string, error) {
+	// Reject a step that sets both fields, since it would be ambiguous which one to use
+	if s.Prompt != "" && s.PromptFile != "" {
+		return "", fmt.Errorf("both 'prompt' and 'promptFile' are specified in script step; only one should be provided")
+	}
+
+	// PromptFile is set: the prompt is that file's contents, returned verbatim
+	if s.PromptFile != "" {
+		// Absolute paths are used as-is; relative paths are joined onto baseDir
+		promptPath := s.PromptFile
+		if !filepath.IsAbs(promptPath) {
+			promptPath = filepath.Join(baseDir, s.PromptFile)
+		}
+
+		content, err := os.ReadFile(promptPath)
+		if err != nil {
+			return "", fmt.Errorf("failed to read prompt file %q: %w", promptPath, err)
+		}
+
+		return string(content), nil
+	}
+
+	// Otherwise use the inline prompt when it is non-empty
+	if s.Prompt != "" {
+		return s.Prompt, nil
+	}
+
+	// Neither field is set, so there is no prompt to return
+	return "", fmt.Errorf("neither 'prompt' nor 'promptFile' is specified in script step")
 }
 
 type Expectation struct {

diff --git a/k8s-bench/tasks/setup-dev-cluster/setup-dev-cluster.md b/k8s-bench/tasks/setup-dev-cluster/setup-dev-cluster.md
@@ -0,0 +1,26 @@
+You are a Kubernetes administrator setting up a development cluster for a team of 3 developers (alice, bob, and charlie).
+
+Create a secure, multi-tenant development environment with the following requirements:
+
+1. **Namespaces**: Create separate namespaces for each developer (dev-alice, dev-bob, dev-charlie) plus shared namespaces (dev-shared, staging, prod)
+
+2. **RBAC Configuration**:
+    - Each developer should have full access to their own namespace
+    - Developers should have read-only access to the dev-shared namespace
+    - Only cluster admins should access staging and prod
+    - Create service accounts for each developer (alice-sa, bob-sa, charlie-sa) in their respective namespaces
+
+3. **Resource Quotas**:
+    - Each developer namespace: max 2 CPUs, 4Gi memory, 10 pods, 5 services
+    - dev-shared namespace: max 4 CPUs, 8Gi memory, 20 pods, 10 services  
+    - staging/prod: max 8 CPUs, 16Gi memory, 50 pods, 20 services
+
+4. **Network Policies**:
+    - Developers can only access their own namespace and dev-shared
+    - Block cross-developer namespace communication
+    - Allow all namespaces to access DNS and system services
+    - staging and prod should be completely isolated from dev namespaces
+
+5. **Default Deny Policies**: Implement default deny network policies for all namespaces except system namespaces
+
+Ensure all configurations follow the principle of least privilege and provide appropriate isolation between environments.
diff --git a/k8s-bench/tasks/setup-dev-cluster/task.yaml b/k8s-bench/tasks/setup-dev-cluster/task.yaml
@@ -1,30 +1,5 @@
 script:
-   - prompt: |
-        You are a Kubernetes administrator setting up a development cluster for a team of 3 developers (alice, bob, and charlie).
-        Create a secure, multi-tenant development environment with the following requirements:
-
-        1. **Namespaces**: Create separate namespaces for each developer (dev-alice, dev-bob, dev-charlie) plus shared namespaces (dev-shared, staging, prod)
-
-        2. **RBAC Configuration**:
-           - Each developer should have full access to their own namespace
-           - Developers should have read-only access to the dev-shared namespace
-           - Only cluster admins should access staging and prod
-           - Create service accounts for each developer (alice-sa, bob-sa, charlie-sa) in their respective namespaces
-
-        3. **Resource Quotas**:
-           - Each developer namespace: max 2 CPUs, 4Gi memory, 10 pods, 5 services
-           - dev-shared namespace: max 4 CPUs, 8Gi memory, 20 pods, 10 services  
-           - staging/prod: max 8 CPUs, 16Gi memory, 50 pods, 20 services
-
-        4. **Network Policies**:
-           - Developers can only access their own namespace and dev-shared
-           - Block cross-developer namespace communication
-           - Allow all namespaces to access DNS and system services
-           - staging and prod should be completely isolated from dev namespaces
-
-        5. **Default Deny Policies**: Implement default deny network policies for all namespaces except system namespaces
-
-        Ensure all configurations follow principle of least privilege and provide appropriate isolation between environments.
+   - promptFile: setup-dev-cluster.md
 
 difficulty: hard
 setup: setup.sh