Skip to content
30 changes: 30 additions & 0 deletions examples/custom_rule_group_interval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This example shows how you can adjust the Prometheus rule_group interval for expensive SLOs
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group
# The SLI measures the share of CPU time spent servicing softirqs (error events)
# out of all CPU time (total events).
#
# `sloth generate -i ./examples/custom_rule_group_interval.yml`
#
version: "prometheus/v1"
service: "myapp"
labels:
  owner: "myteam"
slos:
  - name: "cpu-availability"
    objective: 99.99
    description: "Example, expensive SLO. Recording rules will run every 2 minutes."
    # Evaluation interval applied to the Prometheus rule groups generated for
    # this SLO; leave it out to not set an explicit interval on the groups.
    interval: "2m"
    sli:
      events:
        # Error events: CPU seconds spent in softirq mode.
        error_query: |
          sum(
            rate(node_cpu_seconds_total{mode="softirq"}[{{.window}}])
          )
        # Total events: CPU seconds across every mode.
        total_query: |
          sum(
            rate(node_cpu_seconds_total[{{.window}}])
          )
    # Both alerts are disabled so the example only produces recording rules.
    alerting:
      page_alert:
        disable: true
      ticket_alert:
        disable: true
21 changes: 11 additions & 10 deletions internal/prometheus/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,17 @@ type AlertMeta struct {

// SLO represents a service level objective configuration.
type SLO struct {
// NOTE(review): this listing interleaves two versions of the field list as
// rendered by the diff view (10 deletions, 11 additions): the first ten
// fields are the pre-change list, the eleven after them the post-change list.
ID string `validate:"required,name"`
Name string `validate:"required,name"`
Description string
Service string `validate:"required,name"`
SLI SLI `validate:"required"`
TimeWindow time.Duration `validate:"required"`
Objective float64 `validate:"gt=0,lte=100"`
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
PageAlertMeta AlertMeta
TicketAlertMeta AlertMeta
// Post-change field list starts here.
ID string `validate:"required,name"`
Name string `validate:"required,name"`
Description string
Service string `validate:"required,name"`
// RuleGroupInterval is the evaluation interval for the generated Prometheus
// rule groups, kept as a duration string (e.g. "2m"). StoreSLOs parses it
// with prommodel.ParseDuration and sets no interval on the groups when the
// string is empty.
// NOTE(review): the field has no validate tag, so from what is visible here
// a malformed value is only rejected when the rules are stored — confirm
// whether earlier validation exists elsewhere.
RuleGroupInterval string
SLI SLI `validate:"required"`
TimeWindow time.Duration `validate:"required"`
// Objective is constrained by its validate tag to the range (0, 100].
Objective float64 `validate:"gt=0,lte=100"`
Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"`
PageAlertMeta AlertMeta
TicketAlertMeta AlertMeta
}

type SLOGroup struct {
Expand Down
19 changes: 10 additions & 9 deletions internal/prometheus/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,16 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp
models := make([]SLO, 0, len(spec.SLOs))
for _, specSLO := range spec.SLOs {
slo := SLO{
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
Name: specSLO.Name,
Description: specSLO.Description,
Service: spec.Service,
TimeWindow: y.windowPeriod,
Objective: specSLO.Objective,
Labels: mergeLabels(spec.Labels, specSLO.Labels),
PageAlertMeta: AlertMeta{Disable: true},
TicketAlertMeta: AlertMeta{Disable: true},
ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
RuleGroupInterval: specSLO.RuleGroupInterval,
Name: specSLO.Name,
Description: specSLO.Description,
Service: spec.Service,
TimeWindow: y.windowPeriod,
Objective: specSLO.Objective,
Labels: mergeLabels(spec.Labels, specSLO.Labels),
PageAlertMeta: AlertMeta{Disable: true},
TicketAlertMeta: AlertMeta{Disable: true},
}

// Set SLIs.
Expand Down
74 changes: 59 additions & 15 deletions internal/prometheus/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,68 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
ruleGroups := ruleGroupsYAMLv2{}
for _, slo := range slos {
if len(slo.Rules.SLIErrorRecRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.SLIErrorRecRules,
})
if slo.SLO.RuleGroupInterval != "" {

ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration %w", err)
}

ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID),
RuleGroupInterval: ruleGroupIntervalDuration,
Rules: slo.Rules.SLIErrorRecRules,
})
} else {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.SLIErrorRecRules,
})

}
}

if len(slo.Rules.MetadataRecRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.MetadataRecRules,
})
if slo.SLO.RuleGroupInterval != "" {

ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration %w", err)
}

ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID),
RuleGroupInterval: ruleGroupIntervalDuration,
Rules: slo.Rules.MetadataRecRules,
})
} else {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID),
Rules: slo.Rules.MetadataRecRules,
})
}
}

if len(slo.Rules.AlertRules) > 0 {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID),
Rules: slo.Rules.AlertRules,
})
if slo.SLO.RuleGroupInterval != "" {

ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval)
if err != nil {
return fmt.Errorf("could not parse rule_group interval duration %w", err)
}

ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID),
RuleGroupInterval: ruleGroupIntervalDuration,
Rules: slo.Rules.AlertRules,
})

} else {
ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID),
Rules: slo.Rules.AlertRules,
})
}
}
}

Expand Down Expand Up @@ -112,7 +156,7 @@ type ruleGroupsYAMLv2 struct {
}

// ruleGroupYAMLv2 is the YAML shape of a single rule group appended to
// ruleGroupsYAMLv2.Groups: a name, an optional evaluation interval, and the
// group's rules.
type ruleGroupYAMLv2 struct {
// NOTE(review): the diff view interleaves two versions of this struct; the
// first three fields appear to be the pre-change list and the last three the
// post-change list, where Interval is renamed to RuleGroupInterval while the
// YAML key stays "interval".
Name string `yaml:"name"`
Interval prommodel.Duration `yaml:"interval,omitempty"`
Rules []rulefmt.Rule `yaml:"rules"`
Name string `yaml:"name"`
// RuleGroupInterval is serialized under the "interval" key; because of
// omitempty a zero duration produces no "interval" line in the output.
RuleGroupInterval prommodel.Duration `yaml:"interval,omitempty"`
Rules []rulefmt.Rule `yaml:"rules"`
}
45 changes: 41 additions & 4 deletions internal/prometheus/storage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) {
"Having a single SLI recording rule should render correctly.": {
slos: []prometheus.StorageSLO{
{
SLO: prometheus.SLO{ID: "test1"},
SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"},
Rules: prometheus.SLORules{
SLIErrorRecRules: []rulefmt.Rule{
{
Expand All @@ -52,6 +52,7 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) {

groups:
- name: sloth-slo-sli-recordings-test1
interval: 2m
rules:
- record: test:record
expr: test-expr
Expand Down Expand Up @@ -91,7 +92,7 @@ groups:
"Having a single SLO alert rule should render correctly.": {
slos: []prometheus.StorageSLO{
{
SLO: prometheus.SLO{ID: "test1"},
SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"},
Rules: prometheus.SLORules{
AlertRules: []rulefmt.Rule{
{
Expand All @@ -111,6 +112,7 @@ groups:

groups:
- name: sloth-slo-alerts-test1
interval: 2m
rules:
- alert: testAlert
expr: test-expr
Expand All @@ -120,11 +122,40 @@ groups:
test-annot: one
`,
},
"Having a single a blank or empty rule_group interval render correctly.": {
slos: []prometheus.StorageSLO{
{
SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ""},
Rules: prometheus.SLORules{
SLIErrorRecRules: []rulefmt.Rule{
{
Record: "test:record",
Expr: "test-expr",
Labels: map[string]string{"test-label": "one"},
},
},
},
},
},
expYAML: `
---
# Code generated by Sloth (dev): https://github.com/slok/sloth.
# DO NOT EDIT.

groups:
- name: sloth-slo-sli-recordings-test1
rules:
- record: test:record
expr: test-expr
labels:
test-label: one
`,
},

"Having a multiple SLO alert and recording rules should render correctly.": {
slos: []prometheus.StorageSLO{
{
SLO: prometheus.SLO{ID: "testa"},
SLO: prometheus.SLO{ID: "testa", RuleGroupInterval: "3m"},
Rules: prometheus.SLORules{
SLIErrorRecRules: []rulefmt.Rule{
{
Expand Down Expand Up @@ -167,7 +198,7 @@ groups:
},
},
{
SLO: prometheus.SLO{ID: "testb"},
SLO: prometheus.SLO{ID: "testb", RuleGroupInterval: "1h"},
Rules: prometheus.SLORules{
SLIErrorRecRules: []rulefmt.Rule{
{
Expand Down Expand Up @@ -201,6 +232,7 @@ groups:

groups:
- name: sloth-slo-sli-recordings-testa
interval: 3m
rules:
- record: test:record-a1
expr: test-expr-a1
Expand All @@ -211,6 +243,7 @@ groups:
labels:
test-label: a-2
- name: sloth-slo-meta-recordings-testa
interval: 3m
rules:
- record: test:record-a3
expr: test-expr-a3
Expand All @@ -221,6 +254,7 @@ groups:
labels:
test-label: a-4
- name: sloth-slo-alerts-testa
interval: 3m
rules:
- alert: testAlertA1
expr: test-expr-a1
Expand All @@ -235,18 +269,21 @@ groups:
annotations:
test-annot: a-2
- name: sloth-slo-sli-recordings-testb
interval: 1h
rules:
- record: test:record-b1
expr: test-expr-b1
labels:
test-label: b-1
- name: sloth-slo-meta-recordings-testb
interval: 1h
rules:
- record: test:record-b2
expr: test-expr-b2
labels:
test-label: b-2
- name: sloth-slo-alerts-testb
interval: 1h
rules:
- alert: testAlertB1
expr: test-expr-b1
Expand Down
2 changes: 2 additions & 0 deletions pkg/prometheus/api/v1/v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ type SLO struct {
// Alerting is the configuration with all the things related with the SLO
// alerts.
Alerting Alerting `yaml:"alerting"`
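// RuleGroupInterval optionally sets how often Prometheus evaluates the rule
// groups generated for this SLO, as a Prometheus duration string (e.g. "2m").
// When empty, no interval is set on the generated groups.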
RuleGroupInterval string `yaml:"interval,omitempty"`
}

// SLI will tell what is good or bad for the SLO.
Expand Down