Skip to content

Commit a94fee9

Browse files
committed
ref: add rule intervals per rule type
Instead of a single global default, a rule_group interval can now be set for each individual type of rule_group that Sloth generates. The generic `interval:all` setting remains and "fills in" any missing per-rule-group intervals. If no `interval` is specified, the default behavior of doing nothing is unchanged.
1 parent 4c3a1f9 commit a94fee9

File tree

4 files changed

+90
-29
lines changed

4 files changed

+90
-29
lines changed

internal/prometheus/model.go

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,20 @@ type AlertMeta struct {
3939

4040
// SLO represents a service level objective configuration.
4141
type SLO struct {
42-
ID string `validate:"required,name"`
43-
Name string `validate:"required,name"`
44-
Description string
45-
Service string `validate:"required,name"`
46-
RuleGroupInterval string
47-
SLI SLI `validate:"required"`
48-
TimeWindow time.Duration `validate:"required"`
49-
Objective float64 `validate:"gt=0,lte=100"`
50-
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
51-
PageAlertMeta AlertMeta
52-
TicketAlertMeta AlertMeta
42+
ID string `validate:"required,name"`
43+
Name string `validate:"required,name"`
44+
Description string
45+
Service string `validate:"required,name"`
46+
RuleGroupInterval time.Duration `validate:"time"`
47+
SLIErrorRulesInterval time.Duration `validate:"time"`
48+
MetadataRulesInterval time.Duration `validate:"time"`
49+
AlertRulesInterval time.Duration `validate:"time"`
50+
SLI SLI `validate:"required"`
51+
TimeWindow time.Duration `validate:"required"`
52+
Objective float64 `validate:"gt=0,lte=100"`
53+
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
54+
PageAlertMeta AlertMeta
55+
TicketAlertMeta AlertMeta
5356
}
5457

5558
type SLOGroup struct {
@@ -87,6 +90,7 @@ var modelSpecValidate = func() *validator.Validate {
8790
mustRegisterValidation(v, "name", validateName)
8891
mustRegisterValidation(v, "required_if_enabled", validateRequiredEnabledAlertName)
8992
mustRegisterValidation(v, "template_vars", validateTemplateVars)
93+
mustRegisterValidation(v, "time", validateTime)
9094
v.RegisterStructValidation(validateOneSLI, SLI{})
9195
v.RegisterStructValidation(validateSLOGroup, SLOGroup{})
9296
v.RegisterStructValidation(validateSLIEvents, SLIEvents{})
@@ -182,6 +186,18 @@ func validateName(fl validator.FieldLevel) bool {
182186
return nameRegexp.MatchString(s)
183187
}
184188

189+
// validateTime implements validator.CustomTypeFunc by validating
190+
// a time duration.
191+
func validateTime(fl validator.FieldLevel) bool {
192+
s, ok := fl.Field().Interface().(time.Duration)
193+
if !ok {
194+
return false
195+
}
196+
197+
_, err := time.ParseDuration(s.String())
198+
return err == nil
199+
}
200+
185201
func validateRequiredEnabledAlertName(fl validator.FieldLevel) bool {
186202
alertMeta, ok := fl.Parent().Interface().(AlertMeta)
187203
if !ok {

internal/prometheus/spec.go

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,19 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp
6969
models := make([]SLO, 0, len(spec.SLOs))
7070
for _, specSLO := range spec.SLOs {
7171
slo := SLO{
72-
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
73-
RuleGroupInterval: specSLO.RuleGroupInterval,
74-
Name: specSLO.Name,
75-
Description: specSLO.Description,
76-
Service: spec.Service,
77-
TimeWindow: y.windowPeriod,
78-
Objective: specSLO.Objective,
79-
Labels: mergeLabels(spec.Labels, specSLO.Labels),
80-
PageAlertMeta: AlertMeta{Disable: true},
81-
TicketAlertMeta: AlertMeta{Disable: true},
72+
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
73+
RuleGroupInterval: specSLO.Interval.RuleGroupInterval,
74+
SLIErrorRulesInterval: specSLO.Interval.SLIErrorRulesInterval,
75+
MetadataRulesInterval: specSLO.Interval.MetadataRulesInterval,
76+
AlertRulesInterval: specSLO.Interval.AlertRulesInterval,
77+
Name: specSLO.Name,
78+
Description: specSLO.Description,
79+
Service: spec.Service,
80+
TimeWindow: y.windowPeriod,
81+
Objective: specSLO.Objective,
82+
Labels: mergeLabels(spec.Labels, specSLO.Labels),
83+
PageAlertMeta: AlertMeta{Disable: true},
84+
TicketAlertMeta: AlertMeta{Disable: true},
8285
}
8386

8487
// Set SLIs.

internal/prometheus/storage.go

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,19 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
4949
ruleGroups := ruleGroupsYAMLv2{}
5050
for _, slo := range slos {
5151
if len(slo.Rules.SLIErrorRecRules) > 0 {
52-
if slo.SLO.RuleGroupInterval != "" {
5352

54-
ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
53+
// "0s" is the String() form of the zero-value time.Duration, i.e. no interval was set
54+
if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.SLIErrorRulesInterval.String() != "0s" {
55+
var ruleGroupIntervalDuration prommodel.Duration
56+
var err error
57+
58+
// if we have a valid SLI error rule_group interval, use that first and override the generic one
59+
if slo.SLO.SLIErrorRulesInterval.String() != "0s" {
60+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String())
61+
} else {
62+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
63+
}
64+
5565
if err != nil {
5666
return fmt.Errorf("could not parse rule_group interval duration %w", err)
5767
}
@@ -71,9 +81,18 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
7181
}
7282

7383
if len(slo.Rules.MetadataRecRules) > 0 {
74-
if slo.SLO.RuleGroupInterval != "" {
84+
// if either of these is set we'll be adding a custom rule interval
85+
if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.MetadataRulesInterval.String() != "0s" {
86+
var ruleGroupIntervalDuration prommodel.Duration
87+
var err error
88+
89+
// if we have a valid metadata rule_group interval, use that first
90+
if slo.SLO.MetadataRulesInterval.String() != "0s" {
91+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String())
92+
} else {
93+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
94+
}
7595

76-
ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
7796
if err != nil {
7897
return fmt.Errorf("could not parse rule_group interval duration %w", err)
7998
}
@@ -92,9 +111,17 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
92111
}
93112

94113
if len(slo.Rules.AlertRules) > 0 {
95-
if slo.SLO.RuleGroupInterval != "" {
114+
if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.AlertRulesInterval.String() != "0s" {
115+
var ruleGroupIntervalDuration prommodel.Duration
116+
var err error
117+
118+
// if we have a valid alert rule_group interval, use that first
119+
if slo.SLO.AlertRulesInterval.String() != "0s" {
120+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String())
121+
} else {
122+
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
123+
}
96124

97-
ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
98125
if err != nil {
99126
return fmt.Errorf("could not parse rule_group interval duration %w", err)
100127
}

pkg/prometheus/api/v1/v1.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
// disable: true
5555
package v1
5656

57+
import "time"
58+
5759
const Version = "prometheus/v1"
5860

5961
//go:generate gomarkdoc -o ./README.md ./
@@ -89,8 +91,9 @@ type SLO struct {
8991
// Alerting is the configuration with all the things related with the SLO
9092
// alerts.
9193
Alerting Alerting `yaml:"alerting"`
92-
// RuleGroupInterval is an optional value for how often the Prometheus rule_group should be evaluated.
93-
RuleGroupInterval string `yaml:"interval,omitempty"`
94+
// Interval is the configuration for all things related to SLO rule_group intervals
95+
// for specific rule groups and all rules.
96+
Interval Interval `yaml:"interval,omitempty"`
9497
}
9598

9699
// SLI will tell what is good or bad for the SLO.
@@ -150,6 +153,18 @@ type Alerting struct {
150153
TicketAlert Alert `yaml:"ticket_alert,omitempty"`
151154
}
152155

156+
// Interval configures how often Prometheus should evaluate the rule groups
// generated for an SLO. Every field is optional; a zero duration is treated
// as "not set".
type Interval struct {
	// RuleGroupInterval is an optional value for how often the Prometheus rule_group should be evaluated.
	// It "fills in" for any rule group below that does not specify its own
	// interval; individual group settings override it.
	RuleGroupInterval time.Duration `yaml:"all,omitempty"`
	// SLIErrorRulesInterval is the optional interval for the SLI error
	// recording rules group.
	SLIErrorRulesInterval time.Duration `yaml:"slierror,omitempty"`
	// MetadataRulesInterval is the optional interval for the metadata
	// recording rules group.
	MetadataRulesInterval time.Duration `yaml:"metadata,omitempty"`
	// AlertRulesInterval is the optional interval for the alert rules group.
	AlertRulesInterval time.Duration `yaml:"alert,omitempty"`
}
167+
153168
// Alert configures specific SLO alert.
154169
type Alert struct {
155170
// Disable disables the alert and makes Sloth not generating this alert. This

0 commit comments

Comments
 (0)