Skip to content

Commit 568969a

Browse files
refactor(asserts): extract common utilities and improve retry logic (#2336)
* refactor(asserts): extract common utilities and improve retry logic - Add common utilities for client validation and retry logic - Refactor alert config and disabled alert config resources to use shared utilities - Improve retry logic with exponential backoff and jitter - Remove code duplication across asserts resources - Update test destroy functions with better retry logic - Remove stress test skip conditions as retry logic is now more robust - Update asserts client dependency for compatibility with new API features * test: add stress test flag to control eventual consistency tests - Add CheckStressTestsEnabled function to testutils package - Update stress tests to require TF_ACC_STRESS environment variable - Stress tests will now be skipped unless explicitly enabled - This prevents flaky stress test failures in CI while still allowing developers to run them when needed for testing eventual consistency
1 parent 7ec6735 commit 568969a

File tree

8 files changed

+204
-161
lines changed

8 files changed

+204
-161
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ require (
5252
k8s.io/client-go v0.32.3
5353
)
5454

55-
require github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250805165836-14e16b51b910
55+
require github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250811125322-247815da58ca
5656

5757
require (
5858
cuelang.org/go v0.11.1 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ github.com/grafana/grafana-app-sdk/logging v0.35.1 h1:taVpl+RoixTYl0JBJGhH+fPVmw
182182
github.com/grafana/grafana-app-sdk/logging v0.35.1/go.mod h1:Y/bvbDhBiV/tkIle9RW49pgfSPIPSON8Q4qjx3pyqDk=
183183
github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250805165836-14e16b51b910 h1:2OfDIhMtXWWVQcDp9cq/VMSBOJJfDek9450rcsV+qLg=
184184
github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250805165836-14e16b51b910/go.mod h1:EL/5hluCvj6EDjkUfoClLKSKDoCoDowZUety28jhxQI=
185+
github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250811125322-247815da58ca h1:GVzyCTi3rqvjK42b++lFjabG2zsrLvyAbbR43dWP6s0=
186+
github.com/grafana/grafana-asserts-public-clients/go/gcom v0.0.0-20250811125322-247815da58ca/go.mod h1:EL/5hluCvj6EDjkUfoClLKSKDoCoDowZUety28jhxQI=
185187
github.com/grafana/grafana-com-public-clients/go/gcom v0.0.0-20250526074454-7ec66e02e4bb h1:rmYEnCXHNQbRsuzc5jCX5qkBqFF37c5RCHlyqAAPJZo=
186188
github.com/grafana/grafana-com-public-clients/go/gcom v0.0.0-20250526074454-7ec66e02e4bb/go.mod h1:sYWkB3NhyirQJfy3wtNQ29UYjoHbRlJlYhqN1jNsC5g=
187189
github.com/grafana/grafana-openapi-client-go v0.0.0-20250617151817-c0f8cbb88d5c h1:jox7J0BnJmcZJp8lp631u4gjDEoIfpi6O3yrpiXNTtg=

internal/resources/asserts/common.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package asserts
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"math"
7+
"math/rand"
8+
"time"
9+
10+
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
11+
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
12+
13+
assertsapi "github.com/grafana/grafana-asserts-public-clients/go/gcom"
14+
"github.com/grafana/terraform-provider-grafana/v4/internal/common"
15+
)
16+
17+
// validateAssertsClient checks if the Asserts API client is properly configured
18+
func validateAssertsClient(meta interface{}) (*assertsapi.APIClient, int64, diag.Diagnostics) {
19+
client := meta.(*common.Client).AssertsAPIClient
20+
if client == nil {
21+
return nil, 0, diag.Errorf("Asserts API client is not configured")
22+
}
23+
24+
stackID := meta.(*common.Client).GrafanaStackID
25+
if stackID == 0 {
26+
return nil, 0, diag.Errorf("stack_id must be set in provider configuration for Asserts resources")
27+
}
28+
29+
return client, stackID, nil
30+
}
31+
32+
// retryReadFunc is a function that performs a read operation with retry logic
33+
type retryReadFunc func(retryCount, maxRetries int) *retry.RetryError
34+
35+
// withRetryRead wraps a read operation with consistent retry logic and exponential backoff
36+
func withRetryRead(ctx context.Context, operation retryReadFunc) error {
37+
retryCount := 0
38+
maxRetries := 40
39+
40+
// Increase overall timeout to better handle eventual consistency when
41+
// multiple resources are created concurrently (e.g., stress tests)
42+
return retry.RetryContext(ctx, 600*time.Second, func() *retry.RetryError {
43+
retryCount++
44+
45+
// Backoff with jitter to reduce request stampeding
46+
var baseSleep time.Duration
47+
if retryCount == 1 {
48+
baseSleep = 1 * time.Second
49+
} else {
50+
// Exponential backoff: 1s, 2s, 4s, 8s, 16s (capped at 16s)
51+
baseSleep = time.Duration(1<<int(math.Min(float64(retryCount-2), 4))) * time.Second
52+
}
53+
54+
// Apply jitter: sleep in [base/2, base]
55+
minSleep := baseSleep / 2
56+
maxJitter := baseSleep - minSleep
57+
if maxJitter > 0 {
58+
//nolint:gosec // Using math/rand for jitter in retry logic, not cryptographic purposes
59+
j := time.Duration(rand.Int63n(int64(maxJitter)))
60+
time.Sleep(minSleep + j)
61+
} else {
62+
time.Sleep(baseSleep)
63+
}
64+
65+
// Execute the operation with retry count
66+
return operation(retryCount, maxRetries)
67+
})
68+
}
69+
70+
// createRetryableError creates a retryable error with consistent formatting
71+
func createRetryableError(resourceType, resourceName string, retryCount, maxRetries int) *retry.RetryError {
72+
return retry.RetryableError(fmt.Errorf("%s %s not found (attempt %d/%d)", resourceType, resourceName, retryCount, maxRetries))
73+
}
74+
75+
// createNonRetryableError creates a non-retryable error with consistent formatting
76+
func createNonRetryableError(resourceType, resourceName string, retryCount int) *retry.RetryError {
77+
return retry.NonRetryableError(fmt.Errorf("%s %s not found after %d retries - may indicate a permanent issue", resourceType, resourceName, retryCount))
78+
}
79+
80+
// createAPIError creates a retryable or non-retryable API error based on retry count
81+
func createAPIError(operation string, retryCount, maxRetries int, err error) *retry.RetryError {
82+
if retryCount >= maxRetries {
83+
return retry.NonRetryableError(fmt.Errorf("failed to %s after %d retries: %w", operation, retryCount, err))
84+
}
85+
return retry.RetryableError(fmt.Errorf("failed to %s: %w", operation, err))
86+
}

internal/resources/asserts/resource_alert_config.go

Lines changed: 25 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ package asserts
33
import (
44
"context"
55
"fmt"
6-
"math"
7-
8-
"time"
96

107
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
118
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
@@ -70,28 +67,23 @@ func makeResourceAlertConfig() *common.Resource {
7067
).WithLister(assertsListerFunction(listAlertConfigs))
7168
}
7269

73-
func resourceAlertConfigCreate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
74-
client := meta.(*common.Client).AssertsAPIClient
75-
if client == nil {
76-
return diag.Errorf("Asserts API client is not configured")
77-
}
78-
79-
stackID := meta.(*common.Client).GrafanaStackID
80-
if stackID == 0 {
81-
return diag.Errorf("stack_id must be set in provider configuration for Asserts resources")
70+
func resourceAlertConfigCreate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
71+
client, stackID, diags := validateAssertsClient(meta)
72+
if diags.HasError() {
73+
return diags
8274
}
8375
name := d.Get("name").(string)
8476
matchLabels := make(map[string]string)
8577
alertLabels := make(map[string]string)
8678

8779
if v, ok := d.GetOk("match_labels"); ok {
88-
for k, val := range v.(map[string]any) {
80+
for k, val := range v.(map[string]interface{}) {
8981
matchLabels[k] = val.(string)
9082
}
9183
}
9284

9385
if v, ok := d.GetOk("alert_labels"); ok {
94-
for k, val := range v.(map[string]any) {
86+
for k, val := range v.(map[string]interface{}) {
9587
alertLabels[k] = val.(string)
9688
}
9789
}
@@ -136,42 +128,23 @@ func resourceAlertConfigCreate(ctx context.Context, d *schema.ResourceData, meta
136128
return resourceAlertConfigRead(ctx, d, meta)
137129
}
138130

139-
func resourceAlertConfigRead(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
140-
client := meta.(*common.Client).AssertsAPIClient
141-
if client == nil {
142-
return diag.Errorf("Asserts API client is not configured")
143-
}
144-
145-
stackID := meta.(*common.Client).GrafanaStackID
146-
if stackID == 0 {
147-
return diag.Errorf("stack_id must be set in provider configuration for Asserts resources")
131+
func resourceAlertConfigRead(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
132+
client, stackID, diags := validateAssertsClient(meta)
133+
if diags.HasError() {
134+
return diags
148135
}
149136
name := d.Id()
150137

151138
// Retry logic for read operation to handle eventual consistency
152139
var foundConfig *assertsapi.AlertConfigDto
153-
retryCount := 0
154-
maxRetries := 10
155-
err := retry.RetryContext(ctx, 60*time.Second, func() *retry.RetryError {
156-
retryCount++
157-
158-
// Exponential backoff: 1s, 2s, 4s, 8s, etc. (capped at 8s)
159-
if retryCount > 1 {
160-
backoffDuration := time.Duration(1<<int(math.Min(float64(retryCount-2), 3))) * time.Second
161-
time.Sleep(backoffDuration)
162-
}
163-
140+
err := withRetryRead(ctx, func(retryCount, maxRetries int) *retry.RetryError {
164141
// Get all alert configs using the generated client API
165142
request := client.AlertConfigurationAPI.GetAllAlertConfigs(ctx).
166143
XScopeOrgID(fmt.Sprintf("%d", stackID))
167144

168145
alertConfigs, _, err := request.Execute()
169146
if err != nil {
170-
// If we've retried many times and still getting API errors, give up
171-
if retryCount >= maxRetries {
172-
return retry.NonRetryableError(fmt.Errorf("failed to get alert configurations after %d retries: %w", retryCount, err))
173-
}
174-
return retry.RetryableError(fmt.Errorf("failed to get alert configurations: %w", err))
147+
return createAPIError("get alert configurations", retryCount, maxRetries, err)
175148
}
176149

177150
// Find our specific config
@@ -182,12 +155,11 @@ func resourceAlertConfigRead(ctx context.Context, d *schema.ResourceData, meta a
182155
}
183156
}
184157

185-
// If we've retried many times and still not found, give up
158+
// Check if we should give up or retry
186159
if retryCount >= maxRetries {
187-
return retry.NonRetryableError(fmt.Errorf("alert configuration %s not found after %d retries - may indicate a permanent issue", name, retryCount))
160+
return createNonRetryableError("alert configuration", name, retryCount)
188161
}
189-
190-
return retry.RetryableError(fmt.Errorf("alert configuration %s not found (attempt %d/%d)", name, retryCount, maxRetries))
162+
return createRetryableError("alert configuration", name, retryCount, maxRetries)
191163
})
192164

193165
if err != nil {
@@ -229,29 +201,24 @@ func resourceAlertConfigRead(ctx context.Context, d *schema.ResourceData, meta a
229201
return nil
230202
}
231203

232-
func resourceAlertConfigUpdate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
233-
client := meta.(*common.Client).AssertsAPIClient
234-
if client == nil {
235-
return diag.Errorf("Asserts API client is not configured")
236-
}
237-
238-
stackID := meta.(*common.Client).GrafanaStackID
239-
if stackID == 0 {
240-
return diag.Errorf("stack_id must be set in provider configuration for Asserts resources")
204+
func resourceAlertConfigUpdate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
205+
client, stackID, diags := validateAssertsClient(meta)
206+
if diags.HasError() {
207+
return diags
241208
}
242209

243210
name := d.Get("name").(string)
244211
matchLabels := make(map[string]string)
245212
alertLabels := make(map[string]string)
246213

247214
if v, ok := d.GetOk("match_labels"); ok {
248-
for k, val := range v.(map[string]any) {
215+
for k, val := range v.(map[string]interface{}) {
249216
matchLabels[k] = val.(string)
250217
}
251218
}
252219

253220
if v, ok := d.GetOk("alert_labels"); ok {
254-
for k, val := range v.(map[string]any) {
221+
for k, val := range v.(map[string]interface{}) {
255222
alertLabels[k] = val.(string)
256223
}
257224
}
@@ -294,15 +261,10 @@ func resourceAlertConfigUpdate(ctx context.Context, d *schema.ResourceData, meta
294261
return resourceAlertConfigRead(ctx, d, meta)
295262
}
296263

297-
func resourceAlertConfigDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
298-
client := meta.(*common.Client).AssertsAPIClient
299-
if client == nil {
300-
return diag.Errorf("Asserts API client is not configured")
301-
}
302-
303-
stackID := meta.(*common.Client).GrafanaStackID
304-
if stackID == 0 {
305-
return diag.Errorf("stack_id must be set in provider configuration for Asserts resources")
264+
func resourceAlertConfigDelete(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
265+
client, stackID, diags := validateAssertsClient(meta)
266+
if diags.HasError() {
267+
return diags
306268
}
307269
name := d.Id()
308270

internal/resources/asserts/resource_alert_config_test.go

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"os"
77
"strconv"
88
"testing"
9+
"time"
910

1011
"github.com/grafana/terraform-provider-grafana/v4/internal/common"
1112
"github.com/grafana/terraform-provider-grafana/v4/internal/testutils"
@@ -117,6 +118,7 @@ func testAccAssertsAlertConfigCheckDestroy(s *terraform.State) error {
117118
client := testutils.Provider.Meta().(*common.Client).AssertsAPIClient
118119
ctx := context.Background()
119120

121+
deadline := time.Now().Add(60 * time.Second)
120122
for _, rs := range s.RootModule().Resources {
121123
if rs.Type != "grafana_asserts_notification_alerts_config" {
122124
continue
@@ -126,24 +128,37 @@ func testAccAssertsAlertConfigCheckDestroy(s *terraform.State) error {
126128
name := rs.Primary.ID
127129
stackID := fmt.Sprintf("%d", testutils.Provider.Meta().(*common.Client).GrafanaStackID)
128130

129-
// Get all alert configs
130-
request := client.AlertConfigurationAPI.GetAllAlertConfigs(ctx).
131-
XScopeOrgID(stackID)
131+
for {
132+
// Get all alert configs
133+
request := client.AlertConfigurationAPI.GetAllAlertConfigs(ctx).
134+
XScopeOrgID(stackID)
135+
136+
alertConfigs, _, err := request.Execute()
137+
if err != nil {
138+
// If we can't get configs, assume it's because they don't exist
139+
if common.IsNotFoundError(err) {
140+
break
141+
}
142+
return fmt.Errorf("error checking alert config destruction: %s", err)
143+
}
132144

133-
alertConfigs, _, err := request.Execute()
134-
if err != nil {
135-
// If we can't get configs, assume it's because they don't exist
136-
if common.IsNotFoundError(err) {
137-
continue
145+
// Check if our config still exists
146+
stillExists := false
147+
for _, config := range alertConfigs.AlertConfigs {
148+
if config.Name != nil && *config.Name == name {
149+
stillExists = true
150+
break
151+
}
138152
}
139-
return fmt.Errorf("error checking alert config destruction: %s", err)
140-
}
141153

142-
// Check if our config still exists
143-
for _, config := range alertConfigs.AlertConfigs {
144-
if config.Name != nil && *config.Name == name {
154+
if !stillExists {
155+
break
156+
}
157+
158+
if time.Now().After(deadline) {
145159
return fmt.Errorf("alert config %s still exists", name)
146160
}
161+
time.Sleep(2 * time.Second)
147162
}
148163
}
149164

@@ -208,11 +223,7 @@ resource "grafana_asserts_notification_alerts_config" "test" {
208223
// to verify the retry logic handles eventual consistency properly
209224
func TestAccAssertsAlertConfig_eventualConsistencyStress(t *testing.T) {
210225
testutils.CheckCloudInstanceTestsEnabled(t)
211-
212-
// Skip this flaky test unless explicitly enabled
213-
if !testutils.AccTestsEnabled("TF_ACC_STRESS_TESTS") {
214-
t.Skip("TF_ACC_STRESS_TESTS must be set to a truthy value for stress tests")
215-
}
226+
testutils.CheckStressTestsEnabled(t)
216227

217228
stackID := getTestStackID(t)
218229
baseName := fmt.Sprintf("stress-test-%s", acctest.RandString(8))

0 commit comments

Comments
 (0)