Commit 4bfc0bb

Roman Gushchin (rgushchin) authored and Alexei Starovoitov committed
bpf: decouple the lifetime of cgroup_bpf from cgroup itself
Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself: if a user forgets (or intentionally avoids) detaching a bpf program before removing the cgroup, it stays attached until the cgroup is released. Since a cgroup can stay in the dying state (between being rmdir()'ed and being released) for a very long time, this wastes memory. It also blocks the possibility of implementing memcg-based memory accounting for bpf objects, because it would create a circular reference dependency: charged memory pages pin the corresponding memory cgroup, and if the memory cgroup pins the attached bpf program, nothing is ever released.

A dying cgroup cannot contain any processes, so the only way an attached bpf program can still be executed is through a live socket associated with the cgroup. So, in order to release all bpf data early, count the associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to atomic mode, and as soon as it reaches 0, all bpf programs are detached.

Because cgroup_bpf_release() can block, it can't be called directly from the percpu ref counter's callback; instead, an asynchronous work item is scheduled. The reference counter is not socket specific and can be reused for any other kind of program that is executed from a cgroup-bpf hook outside of process context, should such a need arise in the future.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: [email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent 37b54ae commit 4bfc0bb
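
The lifecycle described above is easiest to see end to end. Below is a minimal userspace analogue: a plain C11 atomic counter stands in for struct percpu_ref (the percpu-to-atomic mode switch is elided), and a thread stands in for the system workqueue. All names are illustrative; this is a sketch of the pattern, not the kernel implementation. Build with cc -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_cgroup_bpf {
	atomic_int refcnt;		/* stands in for cgrp->bpf.refcnt */
};

static pthread_t release_thread;	/* stands in for system_wq */

/* ~ cgroup_bpf_release(): may block, so it runs asynchronously */
static void *fake_release(void *arg)
{
	struct fake_cgroup_bpf *bpf = arg;

	printf("detaching bpf programs, freeing cgroup_bpf data\n");
	free(bpf);
	return NULL;
}

/* ~ cgroup_bpf_release_fn(): runs when the count hits zero and
 * must not block, so it only schedules the actual release work.
 */
static void fake_release_fn(struct fake_cgroup_bpf *bpf)
{
	pthread_create(&release_thread, NULL, fake_release, bpf);
}

static void fake_get(struct fake_cgroup_bpf *bpf)	/* ~ cgroup_bpf_get() */
{
	atomic_fetch_add(&bpf->refcnt, 1);
}

static void fake_put(struct fake_cgroup_bpf *bpf)	/* ~ cgroup_bpf_put() */
{
	if (atomic_fetch_sub(&bpf->refcnt, 1) == 1)
		fake_release_fn(bpf);
}

int main(void)
{
	struct fake_cgroup_bpf *bpf = calloc(1, sizeof(*bpf));

	if (!bpf)
		return 1;
	atomic_init(&bpf->refcnt, 1);	/* base ref, ~ percpu_ref_init() */

	fake_get(bpf);	/* cgroup_sk_alloc(): a socket joins the cgroup     */
	fake_put(bpf);	/* cgroup_bpf_offline(): rmdir() drops the base ref */
	fake_put(bpf);	/* cgroup_sk_free(): last socket gone -> release    */

	pthread_join(release_thread, NULL);
	return 0;
}

The key property is visible in main(): rmdir() alone no longer frees anything while a socket still holds a reference, but the moment the last socket goes away the release work runs, instead of waiting for the dying cgroup itself to be freed.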

File tree: 4 files changed, +72 −9 lines


include/linux/bpf-cgroup.h

Lines changed: 9 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
+#include <linux/percpu-refcount.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -72,10 +73,16 @@ struct cgroup_bpf {
 
 	/* temp storage for effective prog array used by prog_attach/detach */
 	struct bpf_prog_array __rcu *inactive;
+
+	/* reference counter used to detach bpf programs after cgroup removal */
+	struct percpu_ref refcnt;
+
+	/* cgroup_bpf is released using a work queue */
+	struct work_struct release_work;
 };
 
-void cgroup_bpf_put(struct cgroup *cgrp);
 int cgroup_bpf_inherit(struct cgroup *cgrp);
+void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 
 struct bpf_prog;
 struct cgroup_bpf {};
-static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
 
 static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 					 enum bpf_prog_type ptype,

include/linux/cgroup.h

Lines changed: 18 additions & 0 deletions
@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_CGROUP_BPF
+static inline void cgroup_bpf_get(struct cgroup *cgrp)
+{
+	percpu_ref_get(&cgrp->bpf.refcnt);
+}
+
+static inline void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	percpu_ref_put(&cgrp->bpf.refcnt);
+}
+
+#else /* CONFIG_CGROUP_BPF */
+
+static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+
+#endif /* CONFIG_CGROUP_BPF */
+
 #endif /* _LINUX_CGROUP_H */

kernel/bpf/cgroup.c

Lines changed: 37 additions & 4 deletions
@@ -22,12 +22,21 @@
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+	cgroup_get(cgrp);
+	percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
 /**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ *                        release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
  */
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
 {
+	struct cgroup *cgrp = container_of(work, struct cgroup,
+					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
@@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		}
 		bpf_prog_array_free(cgrp->bpf.effective[type]);
 	}
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+	cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ *                           of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+	queue_work(system_wq, &cgrp->bpf.release_work);
 }
 
 /* count number of elements in the list.
@@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 	 */
 #define NR ARRAY_SIZE(cgrp->bpf.effective)
 	struct bpf_prog_array __rcu *arrays[NR] = {};
-	int i;
+	int ret, i;
+
+	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+			      GFP_KERNEL);
+	if (ret)
+		return ret;
 
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+
 	return -ENOMEM;
 }
 
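Both cgroup_bpf_release() and cgroup_bpf_release_fn() receive only a pointer to a member embedded in struct cgroup (the work_struct and the percpu_ref, respectively) and recover the containing cgroup with container_of(). A self-contained sketch of that idiom follows, with a simplified macro and illustrative types; the kernel's container_of() additionally type-checks the pointer.

/* Sketch of the container_of() idiom used by cgroup_bpf_release()
 * and cgroup_bpf_release_fn(). Simplified macro, illustrative types.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct fake_cgroup {
	int id;
	struct {
		struct work_struct release_work;
	} bpf;
};

/* The callback receives only the embedded work_struct... */
static void fake_release(struct work_struct *work)
{
	/* ...but can walk back to the object embedding it. */
	struct fake_cgroup *cgrp = container_of(work, struct fake_cgroup,
						bpf.release_work);

	printf("releasing cgroup %d\n", cgrp->id);
}

int main(void)
{
	struct fake_cgroup cgrp = { .id = 42 };

	fake_release(&cgrp.bpf.release_work);
	return 0;
}

This is why the commit embeds both refcnt and release_work directly in struct cgroup_bpf: nothing has to be allocated at release time, and each callback can find its cgroup from the only pointer it is given.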

kernel/cgroup/cgroup.c

Lines changed: 8 additions & 3 deletions
@@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
-
-		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	cgroup1_check_for_release(parent);
 
+	cgroup_bpf_offline(cgrp);
+
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
 
@@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		 * Don't use cgroup_get_live().
 		 */
 		cgroup_get(sock_cgroup_ptr(skcd));
+		cgroup_bpf_get(sock_cgroup_ptr(skcd));
 		return;
 	}
 
@@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
 			skcd->val = (unsigned long)cset->dfl_cgrp;
+			cgroup_bpf_get(cset->dfl_cgrp);
 			break;
 		}
 		cpu_relax();
@@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
-	cgroup_put(sock_cgroup_ptr(skcd));
+	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+	cgroup_bpf_put(cgrp);
+	cgroup_put(cgrp);
 }
 
 #endif /* CONFIG_SOCK_CGROUP_DATA */
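
Each socket now pins two things: the cgroup itself (as before) and, with this commit, the cgroup's bpf state. Below is a runnable sketch of the pairing added to cgroup_sk_alloc()/cgroup_sk_free(), with simplified counters and illustrative names (not the kernel API); note that the free path drops the embedded bpf reference first, while the caller's cgroup reference still pins the containing object.

/* Runnable sketch of the per-socket reference pairing; simplified
 * counters and illustrative names, not the kernel API.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct fake_cgroup {
	atomic_int self_refcnt;		/* ~ cgrp->self.refcnt */
	atomic_int bpf_refcnt;		/* ~ cgrp->bpf.refcnt  */
};

static void fake_sk_alloc(struct fake_cgroup *cgrp)
{
	atomic_fetch_add(&cgrp->self_refcnt, 1);	/* ~ cgroup_get()     */
	atomic_fetch_add(&cgrp->bpf_refcnt, 1);		/* ~ cgroup_bpf_get() */
}

static void fake_sk_free(struct fake_cgroup *cgrp)
{
	/* bpf_refcnt is embedded in the cgroup, so drop it while the
	 * cgroup reference still pins the containing object.
	 */
	atomic_fetch_sub(&cgrp->bpf_refcnt, 1);		/* ~ cgroup_bpf_put() */
	atomic_fetch_sub(&cgrp->self_refcnt, 1);	/* ~ cgroup_put()     */
}

int main(void)
{
	struct fake_cgroup cgrp;

	atomic_init(&cgrp.self_refcnt, 1);
	atomic_init(&cgrp.bpf_refcnt, 1);

	fake_sk_alloc(&cgrp);	/* socket created in the cgroup */
	fake_sk_free(&cgrp);	/* socket destroyed             */

	assert(atomic_load(&cgrp.self_refcnt) == 1);
	assert(atomic_load(&cgrp.bpf_refcnt) == 1);
	printf("references balanced\n");
	return 0;
}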
