
Commit 28e7207

collector: add tasks API collection
This commit adds simple aggregation of the Elasticsearch Tasks API. There are 4 new metrics, though 3 are just bookkeeping. elasticsearch_task_stats_action_total is a gauge reporting the total number of tasks running for a given action. Because no stats endpoint is available for this, the change introduces an aggregation step that groups the number of tasks by action name.

This metric is useful for ensuring that long-running actions of a specific kind stay within a specific limit. Of particular use to me is the action 'indices:data/write/delete/byquery': in my use case, our ES access patterns mean we have a predefined limit on how many of these actions may run on the cluster.

This change also adds two new CLI flags to manage collection from the Tasks API:

--es.tasks (enable task collection)
--es.tasks.actions (filter tasks by the actions param)

Issue #525 proposed adding collection of these tasks.
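For illustration, once task collection is enabled the new gauge is exposed once per observed action name. The sample below follows the metric name, help text, and label defined in this commit; the action values and counts are hypothetical:

# HELP elasticsearch_task_stats_action_total Number of tasks of a certain action
# TYPE elasticsearch_task_stats_action_total gauge
elasticsearch_task_stats_action_total{action="indices:data/write/delete/byquery"} 2
elasticsearch_task_stats_action_total{action="indices:data/write/index"} 1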
1 parent a5f4279 commit 28e7207

File tree

4 files changed, +287 -0 lines changed


collector/tasks.go

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"path"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
)

type taskByAction struct {
	Type   prometheus.ValueType
	Desc   *prometheus.Desc
	Value  func(action string, count int64) float64
	Labels func(action string, count int64) []string
}

var (
	taskLabels = []string{"cluster", "action"}
)

// Task collects task stats from the Elasticsearch Tasks API.
type Task struct {
	logger  log.Logger
	client  *http.Client
	url     *url.URL
	actions string

	up                              prometheus.Gauge
	totalScrapes, jsonParseFailures prometheus.Counter

	byActionMetrics []*taskByAction
}

// NewTask defines Task Prometheus metrics
func NewTask(logger log.Logger, client *http.Client, url *url.URL, actions string) *Task {
	return &Task{
		logger:  logger,
		client:  client,
		url:     url,
		actions: actions,

		up: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: prometheus.BuildFQName(namespace, "task_stats", "up"),
			Help: "Was the last scrape of the Elasticsearch Task endpoint successful.",
		}),
		totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
			Name: prometheus.BuildFQName(namespace, "task_stats", "total_scrapes"),
			Help: "Current total Elasticsearch task scrapes.",
		}),
		jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
			Name: prometheus.BuildFQName(namespace, "task_stats", "json_parse_failures"),
			Help: "Number of errors while parsing JSON.",
		}),
		byActionMetrics: []*taskByAction{
			{
				Type: prometheus.GaugeValue,
				Desc: prometheus.NewDesc(
					prometheus.BuildFQName(namespace, "task_stats", "action_total"),
					"Number of tasks of a certain action",
					[]string{"action"}, nil,
				),
				Value: func(action string, count int64) float64 {
					return float64(count)
				},
				Labels: func(action string, count int64) []string {
					return []string{action}
				},
			},
		},
	}
}

// Describe adds Task metrics descriptions
func (t *Task) Describe(ch chan<- *prometheus.Desc) {
	for _, metric := range t.byActionMetrics {
		ch <- metric.Desc
	}

	ch <- t.up.Desc()
	ch <- t.totalScrapes.Desc()
	ch <- t.jsonParseFailures.Desc()
}

func (t *Task) fetchAndDecodeAndAggregateTaskStats() (*AggregatedTaskStats, error) {
	u := *t.url
	u.Path = path.Join(u.Path, "/_tasks")
	u.RawQuery = "group_by=none&actions=" + t.actions
	res, err := t.client.Get(u.String())
	if err != nil {
		return nil, fmt.Errorf("failed to get task stats from %s://%s:%s%s: %s",
			u.Scheme, u.Hostname(), u.Port(), u.Path, err)
	}

	defer func() {
		err = res.Body.Close()
		if err != nil {
			level.Warn(t.logger).Log(
				"msg", "failed to close http.Client",
				"err", err,
			)
		}
	}()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP Request to %v failed with code %d", u.String(), res.StatusCode)
	}

	bts, err := io.ReadAll(res.Body)
	if err != nil {
		t.jsonParseFailures.Inc()
		return nil, err
	}

	var tr TasksResponse
	if err := json.Unmarshal(bts, &tr); err != nil {
		t.jsonParseFailures.Inc()
		return nil, err
	}

	stats := AggregateTasks(tr)
	return stats, nil
}

// Collect gets Task metric values
func (t *Task) Collect(ch chan<- prometheus.Metric) {
	t.totalScrapes.Inc()
	defer func() {
		ch <- t.up
		ch <- t.totalScrapes
		ch <- t.jsonParseFailures
	}()

	stats, err := t.fetchAndDecodeAndAggregateTaskStats()
	if err != nil {
		t.up.Set(0)
		level.Warn(t.logger).Log(
			"msg", "failed to fetch and decode task stats",
			"err", err,
		)
		return
	}

	for action, count := range stats.CountByAction {
		for _, metric := range t.byActionMetrics {
			ch <- prometheus.MustNewConstMetric(
				metric.Desc,
				metric.Type,
				metric.Value(action, count),
				metric.Labels(action, count)...,
			)
		}
	}

	t.up.Set(1)
}
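As an aside, fetchAndDecodeAndAggregateTaskStats concatenates t.actions directly into u.RawQuery. Below is a minimal standalone sketch of the same request construction using url.Values, which percent-encodes the actions filter; tasksURL and base are illustrative names, not part of the commit, and the raw concatenation above works fine for typical patterns like "indices:*".

package main

import (
	"fmt"
	"net/url"
	"path"
)

// tasksURL builds the _tasks request URL with a percent-encoded actions
// filter via url.Values, as an alternative to raw string concatenation.
func tasksURL(base *url.URL, actions string) string {
	u := *base
	u.Path = path.Join(u.Path, "/_tasks")
	q := url.Values{}
	q.Set("group_by", "none")
	q.Set("actions", actions)
	u.RawQuery = q.Encode()
	return u.String()
}

func main() {
	base, _ := url.Parse("http://localhost:9200")
	fmt.Println(tasksURL(base, "indices:data/write/delete/byquery"))
	// Output: http://localhost:9200/_tasks?actions=indices%3Adata%2Fwrite%2Fdelete%2Fbyquery&group_by=none
}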

collector/tasks_response.go

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

// TasksResponse is a representation of the Task management API.
type TasksResponse struct {
	Tasks []TaskResponse `json:"tasks"`
}

// TaskResponse is a representation of the individual task item returned by the task API endpoint.
//
// We only parse a very limited amount of this API for use in aggregation.
type TaskResponse struct {
	Action string `json:"action"`
}

// AggregatedTaskStats holds the number of tasks grouped by action name.
type AggregatedTaskStats struct {
	CountByAction map[string]int64
}

// AggregateTasks counts the tasks in a TasksResponse by action name.
func AggregateTasks(t TasksResponse) *AggregatedTaskStats {
	actions := map[string]int64{}
	for _, task := range t.Tasks {
		actions[task.Action] += 1
	}
	agg := &AggregatedTaskStats{CountByAction: actions}
	return agg
}
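Since TaskResponse models only the action field, decoding relies on encoding/json silently dropping every other field in the Tasks API payload. A self-contained sketch of that behaviour, with the shape inlined under a hypothetical local name so it runs standalone:

package main

import (
	"encoding/json"
	"fmt"
)

// tasksPayload inlines the same limited shape as TasksResponse/TaskResponse;
// encoding/json ignores the payload fields this struct does not declare.
type tasksPayload struct {
	Tasks []struct {
		Action string `json:"action"`
	} `json:"tasks"`
}

func main() {
	body := `{"tasks":[{"node":"n1","id":17223,"type":"transport","action":"indices:data/write/delete/byquery","cancellable":true}]}`
	var p tasksPayload
	if err := json.Unmarshal([]byte(body), &p); err != nil {
		panic(err)
	}
	fmt.Println(p.Tasks[0].Action) // indices:data/write/delete/byquery
}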

collector/tasks_test.go

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
package collector

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"net/url"
	"testing"

	"github.com/go-kit/log"
)

func TestTasks(t *testing.T) {
	// Test data was collected by running the following:
	// docker run -d --name elasticsearch -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.17.11
	// sleep 15
	// # start some busy work
	// for i in $(seq 1 1000); do \
	//   curl -o /dev/null -s -X POST "localhost:9200/a1/_doc" -H 'Content-Type: application/json' \
	//   -d'{"abc": "'$i'"}'; done &
	// curl -X POST "localhost:9200/a1/_delete_by_query?requests_per_second=1&wait_for_completion=false" \
	//   -H 'Content-Type: application/json' -d'{"query": {"match_all": {}}}'
	// # try and collect a good sample
	// curl -X GET 'localhost:9200/_tasks?group_by=none&actions=indices:*'
	// docker rm -f elasticsearch
	tcs := map[string]string{
		"7.17": `{"tasks":[{"node":"NVe9ksxcSu6AJTKlIfI24A","id":17223,"type":"transport","action":"indices:data/write/delete/byquery","start_time_in_millis":1695214684290,"running_time_in_nanos":8003510219,"cancellable":true,"cancelled":false,"headers":{}},{"node":"NVe9ksxcSu6AJTKlIfI24A","id":20890,"type":"transport","action":"indices:data/write/index","start_time_in_millis":1695214692292,"running_time_in_nanos":1611966,"cancellable":false,"headers":{}},{"node":"NVe9ksxcSu6AJTKlIfI24A","id":20891,"type":"transport","action":"indices:data/write/bulk[s]","start_time_in_millis":1695214692292,"running_time_in_nanos":1467298,"cancellable":false,"parent_task_id":"NVe9ksxcSu6AJTKlIfI24A:20890","headers":{}},{"node":"NVe9ksxcSu6AJTKlIfI24A","id":20892,"type":"direct","action":"indices:data/write/bulk[s][p]","start_time_in_millis":1695214692292,"running_time_in_nanos":1437170,"cancellable":false,"parent_task_id":"NVe9ksxcSu6AJTKlIfI24A:20891","headers":{}}]}`,
	}
	for ver, out := range tcs {
		ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			fmt.Fprintln(w, out)
		}))
		defer ts.Close()

		u, err := url.Parse(ts.URL)
		if err != nil {
			t.Fatalf("Failed to parse URL: %s", err)
		}

		task := NewTask(log.NewNopLogger(), http.DefaultClient, u, "indices:*")
		stats, err := task.fetchAndDecodeAndAggregateTaskStats()
		if err != nil {
			t.Fatalf("Failed to fetch or decode task stats: %s", err)
		}
		t.Logf("[%s] Task Response: %+v", ver, stats)

		// validate actions aggregations
		if len(stats.CountByAction) != 4 {
			t.Fatal("expected to get 4 tasks")
		}
		if stats.CountByAction["indices:data/write/index"] != 1 {
			t.Fatal("expected action indices:data/write/index to have count 1")
		}
		if stats.CountByAction["indices:data/write/bulk[s]"] != 1 {
			t.Fatal("expected action indices:data/write/bulk[s] to have count 1")
		}
		if stats.CountByAction["indices:data/write/bulk[s][p]"] != 1 {
			t.Fatal("expected action indices:data/write/bulk[s][p] to have count 1")
		}
		if stats.CountByAction["indices:data/write/delete/byquery"] != 1 {
			t.Fatal("expected action indices:data/write/delete/byquery to have count 1")
		}
	}
}

main.go

Lines changed: 10 additions & 0 deletions
@@ -86,6 +86,12 @@ func main() {
 	esExportSnapshots = kingpin.Flag("es.snapshots",
 		"Export stats for the cluster snapshots.").
 		Default("false").Bool()
+	esExportTasks = kingpin.Flag("es.tasks",
+		"Aggregate stats for tasks in the cluster.").
+		Default("false").Bool()
+	esTaskActions = kingpin.Flag("es.tasks.actions",
+		"Filter on task actions. Used in the same way as the Task API actions param.").
+		Default("indices:*").String()
 	esExportSLM = kingpin.Flag("es.slm",
 		"Export stats for SLM snapshots.").
 		Default("false").Bool()
@@ -236,6 +242,10 @@ func main() {
 		prometheus.MustRegister(collector.NewIlmIndicies(logger, httpClient, esURL))
 	}
 
+	if *esExportTasks {
+		prometheus.MustRegister(collector.NewTask(logger, httpClient, esURL, *esTaskActions))
+	}
+
 	// Create a context that is cancelled on SIGKILL or SIGINT.
 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill)
 	defer cancel()
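With these flags, task collection stays opt-in: pass --es.tasks to enable the collector and --es.tasks.actions to narrow the filter (it defaults to indices:*), for example: elasticsearch_exporter --es.tasks --es.tasks.actions='indices:data/write/delete/byquery'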
