Skip to content

Commit 7da5bb3

Browse files
committed
Add additional collector for SLM stats
1 parent 7d996b6 commit 7da5bb3

File tree

4 files changed

+428
-0
lines changed

4 files changed

+428
-0
lines changed

collector/slm.go

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
// Copyright 2021 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
import (
17+
"encoding/json"
18+
"fmt"
19+
"io/ioutil"
20+
"net/http"
21+
"net/url"
22+
"path"
23+
24+
"github.com/go-kit/log"
25+
"github.com/go-kit/log/level"
26+
"github.com/prometheus/client_golang/prometheus"
27+
)
28+
29+
type policyMetric struct {
30+
Type prometheus.ValueType
31+
Desc *prometheus.Desc
32+
Value func(policyStats PolicyStats) float64
33+
Labels func(policyStats PolicyStats) []string
34+
}
35+
36+
type slmMetric struct {
37+
Type prometheus.ValueType
38+
Desc *prometheus.Desc
39+
Value func(slmStats SLMStatsResponse) float64
40+
}
41+
42+
var (
43+
defaultPolicyLabels = []string{"policy"}
44+
defaultPolicyLabelValues = func(policyStats PolicyStats) []string {
45+
return []string{policyStats.Policy}
46+
}
47+
)
48+
49+
// SLM information struct
50+
type SLM struct {
51+
logger log.Logger
52+
client *http.Client
53+
url *url.URL
54+
55+
up prometheus.Gauge
56+
totalScrapes, jsonParseFailures prometheus.Counter
57+
58+
slmMetrics []*slmMetric
59+
policyMetrics []*policyMetric
60+
}
61+
62+
// NewSLM defines SLM Prometheus metrics
63+
func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM {
64+
return &SLM{
65+
logger: logger,
66+
client: client,
67+
url: url,
68+
69+
up: prometheus.NewGauge(prometheus.GaugeOpts{
70+
Name: prometheus.BuildFQName(namespace, "slm_stats", "up"),
71+
Help: "Was the last scrape of the ElasticSearch SLM endpoint successful.",
72+
}),
73+
totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
74+
Name: prometheus.BuildFQName(namespace, "slm_stats", "total_scrapes"),
75+
Help: "Current total ElasticSearch SLM scrapes.",
76+
}),
77+
jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
78+
Name: prometheus.BuildFQName(namespace, "slm_stats", "json_parse_failures"),
79+
Help: "Number of errors while parsing JSON.",
80+
}),
81+
slmMetrics: []*slmMetric{
82+
{
83+
Type: prometheus.GaugeValue,
84+
Desc: prometheus.NewDesc(
85+
prometheus.BuildFQName(namespace, "slm_stats", "retention_runs"),
86+
"Total retention runs",
87+
nil, nil,
88+
),
89+
Value: func(slmStats SLMStatsResponse) float64 {
90+
return float64(slmStats.RetentionRuns)
91+
},
92+
},
93+
{
94+
Type: prometheus.GaugeValue,
95+
Desc: prometheus.NewDesc(
96+
prometheus.BuildFQName(namespace, "slm_stats", "retention_failed"),
97+
"Total failed retention runs",
98+
nil, nil,
99+
),
100+
Value: func(slmStats SLMStatsResponse) float64 {
101+
return float64(slmStats.RetentionFailed)
102+
},
103+
},
104+
{
105+
Type: prometheus.GaugeValue,
106+
Desc: prometheus.NewDesc(
107+
prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out"),
108+
"Total timed out retention runs",
109+
nil, nil,
110+
),
111+
Value: func(slmStats SLMStatsResponse) float64 {
112+
return float64(slmStats.RetentionTimedOut)
113+
},
114+
},
115+
{
116+
Type: prometheus.GaugeValue,
117+
Desc: prometheus.NewDesc(
118+
prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_millis"),
119+
"Retention run deletion time",
120+
nil, nil,
121+
),
122+
Value: func(slmStats SLMStatsResponse) float64 {
123+
return float64(slmStats.RetentionDeletionTimeMillis)
124+
},
125+
},
126+
{
127+
Type: prometheus.GaugeValue,
128+
Desc: prometheus.NewDesc(
129+
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken"),
130+
"Total snapshots taken",
131+
nil, nil,
132+
),
133+
Value: func(slmStats SLMStatsResponse) float64 {
134+
return float64(slmStats.TotalSnapshotsTaken)
135+
},
136+
},
137+
{
138+
Type: prometheus.GaugeValue,
139+
Desc: prometheus.NewDesc(
140+
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed"),
141+
"Total snapshots failed",
142+
nil, nil,
143+
),
144+
Value: func(slmStats SLMStatsResponse) float64 {
145+
return float64(slmStats.TotalSnapshotsFailed)
146+
},
147+
},
148+
{
149+
Type: prometheus.GaugeValue,
150+
Desc: prometheus.NewDesc(
151+
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted"),
152+
"Total snapshots deleted",
153+
nil, nil,
154+
),
155+
Value: func(slmStats SLMStatsResponse) float64 {
156+
return float64(slmStats.TotalSnapshotsDeleted)
157+
},
158+
},
159+
{
160+
Type: prometheus.GaugeValue,
161+
Desc: prometheus.NewDesc(
162+
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures"),
163+
"Total snapshot deletion failures",
164+
nil, nil,
165+
),
166+
Value: func(slmStats SLMStatsResponse) float64 {
167+
return float64(slmStats.TotalSnapshotDeletionFailures)
168+
},
169+
},
170+
},
171+
policyMetrics: []*policyMetric{
172+
{
173+
Type: prometheus.GaugeValue,
174+
Desc: prometheus.NewDesc(
175+
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken"),
176+
"Total snapshots taken",
177+
defaultPolicyLabels, nil,
178+
),
179+
Value: func(policyStats PolicyStats) float64 {
180+
return float64(policyStats.SnapshotsTaken)
181+
},
182+
Labels: defaultPolicyLabelValues,
183+
},
184+
{
185+
Type: prometheus.GaugeValue,
186+
Desc: prometheus.NewDesc(
187+
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed"),
188+
"Total snapshots failed",
189+
defaultPolicyLabels, nil,
190+
),
191+
Value: func(policyStats PolicyStats) float64 {
192+
return float64(policyStats.SnapshotsFailed)
193+
},
194+
Labels: defaultPolicyLabelValues,
195+
},
196+
{
197+
Type: prometheus.GaugeValue,
198+
Desc: prometheus.NewDesc(
199+
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted"),
200+
"Total snapshots deleted",
201+
defaultPolicyLabels, nil,
202+
),
203+
Value: func(policyStats PolicyStats) float64 {
204+
return float64(policyStats.SnapshotsDeleted)
205+
},
206+
Labels: defaultPolicyLabelValues,
207+
},
208+
{
209+
Type: prometheus.GaugeValue,
210+
Desc: prometheus.NewDesc(
211+
prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures"),
212+
"Total snapshot deletion failures",
213+
defaultPolicyLabels, nil,
214+
),
215+
Value: func(policyStats PolicyStats) float64 {
216+
return float64(policyStats.SnapshotDeletionFailures)
217+
},
218+
Labels: defaultPolicyLabelValues,
219+
},
220+
},
221+
}
222+
}
223+
224+
// Describe adds SLM metrics descriptions
225+
func (s *SLM) Describe(ch chan<- *prometheus.Desc) {
226+
for _, metric := range s.slmMetrics {
227+
ch <- metric.Desc
228+
}
229+
230+
for _, metric := range s.policyMetrics {
231+
ch <- metric.Desc
232+
}
233+
ch <- s.up.Desc()
234+
ch <- s.totalScrapes.Desc()
235+
ch <- s.jsonParseFailures.Desc()
236+
}
237+
238+
func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {
239+
var ssr SLMStatsResponse
240+
241+
u := *s.url
242+
u.Path = path.Join(u.Path, "/_slm/stats")
243+
res, err := s.client.Get(u.String())
244+
if err != nil {
245+
return ssr, fmt.Errorf("failed to get slm stats health from %s://%s:%s%s: %s",
246+
u.Scheme, u.Hostname(), u.Port(), u.Path, err)
247+
}
248+
249+
defer func() {
250+
err = res.Body.Close()
251+
if err != nil {
252+
_ = level.Warn(s.logger).Log(
253+
"msg", "failed to close http.Client",
254+
"err", err,
255+
)
256+
}
257+
}()
258+
259+
if res.StatusCode != http.StatusOK {
260+
return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode)
261+
}
262+
263+
bts, err := ioutil.ReadAll(res.Body)
264+
if err != nil {
265+
s.jsonParseFailures.Inc()
266+
return ssr, err
267+
}
268+
269+
if err := json.Unmarshal(bts, &ssr); err != nil {
270+
s.jsonParseFailures.Inc()
271+
return ssr, err
272+
}
273+
274+
return ssr, nil
275+
}
276+
277+
// Collect gets SLM metric values
278+
func (s *SLM) Collect(ch chan<- prometheus.Metric) {
279+
s.totalScrapes.Inc()
280+
defer func() {
281+
ch <- s.up
282+
ch <- s.totalScrapes
283+
ch <- s.jsonParseFailures
284+
}()
285+
286+
slmStatsResp, err := s.fetchAndDecodeSLMStats()
287+
if err != nil {
288+
s.up.Set(0)
289+
_ = level.Warn(s.logger).Log(
290+
"msg", "failed to fetch and decode slm stats",
291+
"err", err,
292+
)
293+
return
294+
}
295+
s.up.Set(1)
296+
297+
for _, metric := range s.slmMetrics {
298+
ch <- prometheus.MustNewConstMetric(
299+
metric.Desc,
300+
metric.Type,
301+
metric.Value(slmStatsResp),
302+
)
303+
}
304+
305+
for _, metric := range s.policyMetrics {
306+
for _, policy := range slmStatsResp.PolicyStats {
307+
ch <- prometheus.MustNewConstMetric(
308+
metric.Desc,
309+
metric.Type,
310+
metric.Value(policy),
311+
metric.Labels(policy)...,
312+
)
313+
}
314+
}
315+
}

collector/slm_response.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Copyright 2021 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
// SLMStatsResponse is a representation of the SLM stats
17+
type SLMStatsResponse struct {
18+
RetentionRuns int64 `json:"retention_runs"`
19+
RetentionFailed int64 `json:"retention_failed"`
20+
RetentionTimedOut int64 `json:"retention_timed_out"`
21+
RetentionDeletionTime string `json:"retention_deletion_time"`
22+
RetentionDeletionTimeMillis int64 `json:"retention_deletion_time_millis"`
23+
TotalSnapshotsTaken int64 `json:"total_snapshots_taken"`
24+
TotalSnapshotsFailed int64 `json:"total_snapshots_failed"`
25+
TotalSnapshotsDeleted int64 `json:"total_snapshots_deleted"`
26+
TotalSnapshotDeletionFailures int64 `json:"total_snapshot_deletion_failures"`
27+
PolicyStats []PolicyStats `json:"policy_stats"`
28+
}
29+
30+
// PolicyStats mirrors one element of the "policy_stats" array in the SLM
// stats response. Each field is populated from the JSON key named in its tag.
type PolicyStats struct {
	// Policy is the value of the "policy" key; it is used as the metric label.
	Policy string `json:"policy"`

	SnapshotsTaken           int64 `json:"snapshots_taken"`
	SnapshotsFailed          int64 `json:"snapshots_failed"`
	SnapshotsDeleted         int64 `json:"snapshots_deleted"`
	SnapshotDeletionFailures int64 `json:"snapshot_deletion_failures"`
}

0 commit comments

Comments
 (0)