Skip to content
43 changes: 43 additions & 0 deletions examples/custom_rule_group_interval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# This example shows how you can adjust the Prometheus rule_group interval for expensive SLOs
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group
# The SLO SLI measures the rate of CPU seconds spent performing softirqs
#
# `sloth generate -i ./examples/custom_rule_group_interval.yml`
#
version: "prometheus/v1"
service: "myapp"
labels:
owner: "myteam"
slos:
- name: "cpu-availability"
objective: 99.99
description: "Example, expensive SLO. SLI error recording rules will run every 4 minutes; metadata and alert rules every 2 minutes."
# alternative way of specifying interval for all three sets of rules
# interval:
# all: "5m"
interval: # all of these are different sets of rule groups sloth can make
slierror: "4m"
metadata: "2m"
alert: "2m"
sli:
events:
error_query: |
sum(
rate(node_cpu_seconds_total{mode="softirq"}[{{.window}}])
)
total_query: |
sum(
rate(node_cpu_seconds_total[{{.window}}])
)
alerting:
name: MyServiceHighErrorRate
labels:
category: "availability"
annotations:
summary: "High error rate on 'myservice' requests responses"
page_alert:
labels:
severity: pageteam
routing_key: myteam
ticket_alert:
disable: true
37 changes: 27 additions & 10 deletions internal/prometheus/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,20 @@ type AlertMeta struct {

// SLO represents a service level objective configuration.
type SLO struct {
ID string `validate:"required,name"`
Name string `validate:"required,name"`
Description string
Service string `validate:"required,name"`
SLI SLI `validate:"required"`
TimeWindow time.Duration `validate:"required"`
Objective float64 `validate:"gt=0,lte=100"`
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
PageAlertMeta AlertMeta
TicketAlertMeta AlertMeta
ID string `validate:"required,name"`
Name string `validate:"required,name"`
Description string
Service string `validate:"required,name"`
// RuleGroupInterval is the fallback evaluation interval used for any
// generated rule group whose specific interval below is zero. When it
// is zero as well, no interval is set on the group.
RuleGroupInterval time.Duration `validate:"time"`
// SLIErrorRulesInterval, when non-zero, is the interval of the SLI error
// recording rule group and takes precedence over RuleGroupInterval.
SLIErrorRulesInterval time.Duration `validate:"time"`
// MetadataRulesInterval, when non-zero, is the interval of the metadata
// recording rule group and takes precedence over RuleGroupInterval.
MetadataRulesInterval time.Duration `validate:"time"`
// AlertRulesInterval, when non-zero, is the interval of the alert rule
// group and takes precedence over RuleGroupInterval.
AlertRulesInterval time.Duration `validate:"time"`
SLI SLI `validate:"required"`
TimeWindow time.Duration `validate:"required"`
Objective float64 `validate:"gt=0,lte=100"`
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
PageAlertMeta AlertMeta
TicketAlertMeta AlertMeta
}

type SLOGroup struct {
Expand Down Expand Up @@ -86,6 +90,7 @@ var modelSpecValidate = func() *validator.Validate {
mustRegisterValidation(v, "name", validateName)
mustRegisterValidation(v, "required_if_enabled", validateRequiredEnabledAlertName)
mustRegisterValidation(v, "template_vars", validateTemplateVars)
mustRegisterValidation(v, "time", validateTime)
v.RegisterStructValidation(validateOneSLI, SLI{})
v.RegisterStructValidation(validateSLOGroup, SLOGroup{})
v.RegisterStructValidation(validateSLIEvents, SLIEvents{})
Expand Down Expand Up @@ -181,6 +186,18 @@ func validateName(fl validator.FieldLevel) bool {
return nameRegexp.MatchString(s)
}

// validateTime is the validation function registered under the "time" tag.
// It reports whether the field holds a time.Duration usable as a rule group
// interval.
//
// The field is already a parsed time.Duration, so formatting it and parsing
// it back can never fail and would accept every value. The only durations
// that cannot be a valid interval are negative ones, so those are rejected.
// Zero is accepted because it means "interval not set".
func validateTime(fl validator.FieldLevel) bool {
	d, ok := fl.Field().Interface().(time.Duration)
	if !ok {
		return false
	}

	return d >= 0
}

func validateRequiredEnabledAlertName(fl validator.FieldLevel) bool {
alertMeta, ok := fl.Parent().Interface().(AlertMeta)
if !ok {
Expand Down
22 changes: 13 additions & 9 deletions internal/prometheus/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,19 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp
models := make([]SLO, 0, len(spec.SLOs))
for _, specSLO := range spec.SLOs {
slo := SLO{
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
Name: specSLO.Name,
Description: specSLO.Description,
Service: spec.Service,
TimeWindow: y.windowPeriod,
Objective: specSLO.Objective,
Labels: mergeLabels(spec.Labels, specSLO.Labels),
PageAlertMeta: AlertMeta{Disable: true},
TicketAlertMeta: AlertMeta{Disable: true},
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
RuleGroupInterval: specSLO.Interval.RuleGroupInterval,
SLIErrorRulesInterval: specSLO.Interval.SLIErrorRulesInterval,
MetadataRulesInterval: specSLO.Interval.MetadataRulesInterval,
AlertRulesInterval: specSLO.Interval.AlertRulesInterval,
Name: specSLO.Name,
Description: specSLO.Description,
Service: spec.Service,
TimeWindow: y.windowPeriod,
Objective: specSLO.Objective,
Labels: mergeLabels(spec.Labels, specSLO.Labels),
PageAlertMeta: AlertMeta{Disable: true},
TicketAlertMeta: AlertMeta{Disable: true},
}

// Set SLIs.
Expand Down
87 changes: 78 additions & 9 deletions internal/prometheus/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,93 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
ruleGroups := ruleGroupsYAMLv2{}
for _, slo := range slos {
if len(slo.Rules.SLIErrorRecRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{

group := ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.SLIErrorRecRules,
})
}

var ruleGroupIntervalDuration prommodel.Duration
var err error

switch {
case slo.SLO.SLIErrorRulesInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String())
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
case slo.SLO.RuleGroupInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
if err != nil {
return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
}

ruleGroups.Groups = append(ruleGroups.Groups, group)
}

if len(slo.Rules.MetadataRecRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{

group := ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.MetadataRecRules,
})
}

var ruleGroupIntervalDuration prommodel.Duration
var err error

switch {
case slo.SLO.MetadataRulesInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String())
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
case slo.SLO.RuleGroupInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
if err != nil {
return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
}

ruleGroups.Groups = append(ruleGroups.Groups, group)
}

if len(slo.Rules.AlertRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{

group := ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID),
Rules: slo.Rules.AlertRules,
})
}

var ruleGroupIntervalDuration prommodel.Duration
var err error

switch {
case slo.SLO.AlertRulesInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String())
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
case slo.SLO.RuleGroupInterval.String() != "0s":
ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
if err != nil {
return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err)
} else {
group.RuleGroupInterval = ruleGroupIntervalDuration
}
}

ruleGroups.Groups = append(ruleGroups.Groups, group)
}
}

Expand Down Expand Up @@ -112,7 +181,7 @@ type ruleGroupsYAMLv2 struct {
}

// ruleGroupYAMLv2 is a single rule group as serialized to YAML: a name,
// an optional evaluation interval and the list of rules in the group.
type ruleGroupYAMLv2 struct {
Name string `yaml:"name"`
Interval prommodel.Duration `yaml:"interval,omitempty"`
Rules []rulefmt.Rule `yaml:"rules"`
Name string `yaml:"name"`
// RuleGroupInterval is written under the "interval" key; the omitempty
// tag leaves the key out when no interval was set.
RuleGroupInterval prommodel.Duration `yaml:"interval,omitempty"`
Rules []rulefmt.Rule `yaml:"rules"`
}
Loading