
This commit backports a patch from v5.15-rc2 to fix the fact that BPF programs attached to v2 control groups (cgroups) do not take effect when v1 cgroups are active and CONFIG_CGROUP_NET_PRIO and CONFIG_CGROUP_NET_CLASSID are enabled. Without this patch, the test programs test_cgrp2_sock and test_cgrp2_sock2 (and their accompanying scripts) included in the kernel's samples/bpf/ directory report failures. These tests consist of attaching BPF programs to a cgroup v2 instance created for testing and then verifying that a BPF program is attached to all sockets created in the cgroup (test_cgrp2_sock.sh) or that the test BPF program indeed causes ICMP/ping packets to be dropped (test_cgrp2_sock2.sh). With this patch applied, both test scripts finish successfully. An alternative approach would be to undefine CONFIG_CGROUP_NET_PRIO and CONFIG_CGROUP_NET_CLASSID, but that approach has not been tested. Story: 2008921 Task: 43663 Signed-off-by: M. Vefa Bicakci <vefa.bicakci@windriver.com> Change-Id: Iadb937b9f5fb96d9e3ae73a8d18851de9e411bff
396 lines
13 KiB
Diff
396 lines
13 KiB
Diff
From b377b8aed56d93aef67f7dbad01d026bacffedb1 Mon Sep 17 00:00:00 2001
|
|
From: Daniel Borkmann <daniel@iogearbox.net>
|
|
Date: Tue, 14 Sep 2021 01:07:57 +0200
|
|
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
|
|
|
|
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
|
|
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
|
|
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
|
|
to save 8 bytes in struct sock made both mutually exclusive, that is, when
|
|
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
|
|
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
|
|
|
|
The assumption made was "there is no reason to mix the two and this is in line
|
|
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
|
|
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
|
|
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
|
|
with the v2 root fallback being hit becomes a real security issue.
|
|
|
|
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
|
|
to pick _one_ example, that is, to programmatically deny socket related system
|
|
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
|
|
a policy bypass for the affected Pods.
|
|
|
|
In production environments, we have recently seen this case due to various
|
|
circumstances: i) a different 3rd party agent and/or ii) a container runtime
|
|
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
|
|
tags, which triggered implicitly mentioned root fallback. Another case is
|
|
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
|
|
and also add cgroup namespaces to the mix, meaning programs which are attached
|
|
to the cgroup v2 root of the cgroup namespace get attached to a non-root
|
|
cgroup v2 path from init namespace point of view. And the latter's root is
|
|
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
|
|
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
|
|
despite cgroup v2 BPF programs attached to the namespace root.
|
|
|
|
Generally, this mutual exclusiveness does not hold anymore in today's user
|
|
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
|
|
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
|
|
sock_cgroup_data in order to address these issues; this implicitly also fixes
|
|
the tradeoffs being made back then with regards to races and refcount leaks
|
|
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
|
|
programs always operate as expected.
|
|
|
|
[0] https://github.com/nestybox/sysbox/
|
|
[1] https://kind.sigs.k8s.io/
|
|
|
|
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
|
|
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
|
|
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
|
|
Acked-by: Stanislav Fomichev <sdf@google.com>
|
|
Acked-by: Tejun Heo <tj@kernel.org>
|
|
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
|
|
(cherry picked from commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa)
|
|
Signed-off-by: M. Vefa Bicakci <vefa.bicakci@windriver.com>
|
|
---
|
|
include/linux/cgroup-defs.h | 107 +++++++++--------------------------
|
|
include/linux/cgroup.h | 22 +------
|
|
kernel/cgroup/cgroup.c | 50 ++++------------
|
|
net/core/netclassid_cgroup.c | 7 +--
|
|
net/core/netprio_cgroup.c | 10 +---
|
|
5 files changed, 41 insertions(+), 155 deletions(-)
|
|
|
|
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
|
|
index fee0b5547cd0..3e406626168a 100644
|
|
--- a/include/linux/cgroup-defs.h
|
|
+++ b/include/linux/cgroup-defs.h
|
|
@@ -763,107 +763,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
|
|
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
|
|
* per-socket cgroup information except for memcg association.
|
|
*
|
|
- * On legacy hierarchies, net_prio and net_cls controllers directly set
|
|
- * attributes on each sock which can then be tested by the network layer.
|
|
- * On the default hierarchy, each sock is associated with the cgroup it was
|
|
- * created in and the networking layer can match the cgroup directly.
|
|
- *
|
|
- * To avoid carrying all three cgroup related fields separately in sock,
|
|
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
|
|
- * On boot, sock_cgroup_data records the cgroup that the sock was created
|
|
- * in so that cgroup2 matches can be made; however, once either net_prio or
|
|
- * net_cls starts being used, the area is overriden to carry prioidx and/or
|
|
- * classid. The two modes are distinguished by whether the lowest bit is
|
|
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
|
|
- * classid.
|
|
- *
|
|
- * While userland may start using net_prio or net_cls at any time, once
|
|
- * either is used, cgroup2 matching no longer works. There is no reason to
|
|
- * mix the two and this is in line with how legacy and v2 compatibility is
|
|
- * handled. On mode switch, cgroup references which are already being
|
|
- * pointed to by socks may be leaked. While this can be remedied by adding
|
|
- * synchronization around sock_cgroup_data, given that the number of leaked
|
|
- * cgroups is bound and highly unlikely to be high, this seems to be the
|
|
- * better trade-off.
|
|
+ * On legacy hierarchies, net_prio and net_cls controllers directly
|
|
+ * set attributes on each sock which can then be tested by the network
|
|
+ * layer. On the default hierarchy, each sock is associated with the
|
|
+ * cgroup it was created in and the networking layer can match the
|
|
+ * cgroup directly.
|
|
*/
|
|
struct sock_cgroup_data {
|
|
- union {
|
|
-#ifdef __LITTLE_ENDIAN
|
|
- struct {
|
|
- u8 is_data : 1;
|
|
- u8 no_refcnt : 1;
|
|
- u8 unused : 6;
|
|
- u8 padding;
|
|
- u16 prioidx;
|
|
- u32 classid;
|
|
- } __packed;
|
|
-#else
|
|
- struct {
|
|
- u32 classid;
|
|
- u16 prioidx;
|
|
- u8 padding;
|
|
- u8 unused : 6;
|
|
- u8 no_refcnt : 1;
|
|
- u8 is_data : 1;
|
|
- } __packed;
|
|
+ struct cgroup *cgroup; /* v2 */
|
|
+#ifdef CONFIG_CGROUP_NET_CLASSID
|
|
+ u32 classid; /* v1 */
|
|
+#endif
|
|
+#ifdef CONFIG_CGROUP_NET_PRIO
|
|
+ u16 prioidx; /* v1 */
|
|
#endif
|
|
- u64 val;
|
|
- };
|
|
};
|
|
|
|
-/*
|
|
- * There's a theoretical window where the following accessors race with
|
|
- * updaters and return part of the previous pointer as the prioidx or
|
|
- * classid. Such races are short-lived and the result isn't critical.
|
|
- */
|
|
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
|
|
{
|
|
- /* fallback to 1 which is always the ID of the root cgroup */
|
|
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
|
|
+#ifdef CONFIG_CGROUP_NET_PRIO
|
|
+ return READ_ONCE(skcd->prioidx);
|
|
+#else
|
|
+ return 1;
|
|
+#endif
|
|
}
|
|
|
|
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
|
|
{
|
|
- /* fallback to 0 which is the unconfigured default classid */
|
|
- return (skcd->is_data & 1) ? skcd->classid : 0;
|
|
+#ifdef CONFIG_CGROUP_NET_CLASSID
|
|
+ return READ_ONCE(skcd->classid);
|
|
+#else
|
|
+ return 0;
|
|
+#endif
|
|
}
|
|
|
|
-/*
|
|
- * If invoked concurrently, the updaters may clobber each other. The
|
|
- * caller is responsible for synchronization.
|
|
- */
|
|
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
|
|
u16 prioidx)
|
|
{
|
|
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
|
|
-
|
|
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
|
|
- return;
|
|
-
|
|
- if (!(skcd_buf.is_data & 1)) {
|
|
- skcd_buf.val = 0;
|
|
- skcd_buf.is_data = 1;
|
|
- }
|
|
-
|
|
- skcd_buf.prioidx = prioidx;
|
|
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
|
|
+#ifdef CONFIG_CGROUP_NET_PRIO
|
|
+ WRITE_ONCE(skcd->prioidx, prioidx);
|
|
+#endif
|
|
}
|
|
|
|
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
|
|
u32 classid)
|
|
{
|
|
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
|
|
-
|
|
- if (sock_cgroup_classid(&skcd_buf) == classid)
|
|
- return;
|
|
-
|
|
- if (!(skcd_buf.is_data & 1)) {
|
|
- skcd_buf.val = 0;
|
|
- skcd_buf.is_data = 1;
|
|
- }
|
|
-
|
|
- skcd_buf.classid = classid;
|
|
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
|
|
+#ifdef CONFIG_CGROUP_NET_CLASSID
|
|
+ WRITE_ONCE(skcd->classid, classid);
|
|
+#endif
|
|
}
|
|
|
|
#else /* CONFIG_SOCK_CGROUP_DATA */
|
|
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
|
|
index 618838c48313..9c88b7da3245 100644
|
|
--- a/include/linux/cgroup.h
|
|
+++ b/include/linux/cgroup.h
|
|
@@ -816,33 +816,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
|
|
*/
|
|
#ifdef CONFIG_SOCK_CGROUP_DATA
|
|
|
|
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
|
-extern spinlock_t cgroup_sk_update_lock;
|
|
-#endif
|
|
-
|
|
-void cgroup_sk_alloc_disable(void);
|
|
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
|
|
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
|
|
void cgroup_sk_free(struct sock_cgroup_data *skcd);
|
|
|
|
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
|
|
{
|
|
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
|
- unsigned long v;
|
|
-
|
|
- /*
|
|
- * @skcd->val is 64bit but the following is safe on 32bit too as we
|
|
- * just need the lower ulong to be written and read atomically.
|
|
- */
|
|
- v = READ_ONCE(skcd->val);
|
|
-
|
|
- if (v & 3)
|
|
- return &cgrp_dfl_root.cgrp;
|
|
-
|
|
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
|
|
-#else
|
|
- return (struct cgroup *)(unsigned long)skcd->val;
|
|
-#endif
|
|
+ return skcd->cgroup;
|
|
}
|
|
|
|
#else /* CONFIG_CGROUP_DATA */
|
|
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
|
|
index c8b811e039cc..e065307f2346 100644
|
|
--- a/kernel/cgroup/cgroup.c
|
|
+++ b/kernel/cgroup/cgroup.c
|
|
@@ -6419,74 +6419,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
|
|
*/
|
|
#ifdef CONFIG_SOCK_CGROUP_DATA
|
|
|
|
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
|
-
|
|
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
|
|
-static bool cgroup_sk_alloc_disabled __read_mostly;
|
|
-
|
|
-void cgroup_sk_alloc_disable(void)
|
|
-{
|
|
- if (cgroup_sk_alloc_disabled)
|
|
- return;
|
|
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
|
|
- cgroup_sk_alloc_disabled = true;
|
|
-}
|
|
-
|
|
-#else
|
|
-
|
|
-#define cgroup_sk_alloc_disabled false
|
|
-
|
|
-#endif
|
|
-
|
|
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
|
|
{
|
|
- if (cgroup_sk_alloc_disabled) {
|
|
- skcd->no_refcnt = 1;
|
|
- return;
|
|
- }
|
|
-
|
|
/* Don't associate the sock with unrelated interrupted task's cgroup. */
|
|
if (in_interrupt())
|
|
return;
|
|
|
|
rcu_read_lock();
|
|
-
|
|
while (true) {
|
|
struct css_set *cset;
|
|
|
|
cset = task_css_set(current);
|
|
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
|
|
- skcd->val = (unsigned long)cset->dfl_cgrp;
|
|
+ skcd->cgroup = cset->dfl_cgrp;
|
|
cgroup_bpf_get(cset->dfl_cgrp);
|
|
break;
|
|
}
|
|
cpu_relax();
|
|
}
|
|
-
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
|
|
{
|
|
- if (skcd->val) {
|
|
- if (skcd->no_refcnt)
|
|
- return;
|
|
- /*
|
|
- * We might be cloning a socket which is left in an empty
|
|
- * cgroup and the cgroup might have already been rmdir'd.
|
|
- * Don't use cgroup_get_live().
|
|
- */
|
|
- cgroup_get(sock_cgroup_ptr(skcd));
|
|
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
|
|
- }
|
|
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
|
|
+
|
|
+ /*
|
|
+ * We might be cloning a socket which is left in an empty
|
|
+ * cgroup and the cgroup might have already been rmdir'd.
|
|
+ * Don't use cgroup_get_live().
|
|
+ */
|
|
+ cgroup_get(cgrp);
|
|
+ cgroup_bpf_get(cgrp);
|
|
}
|
|
|
|
void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
|
{
|
|
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
|
|
|
|
- if (skcd->no_refcnt)
|
|
- return;
|
|
cgroup_bpf_put(cgrp);
|
|
cgroup_put(cgrp);
|
|
}
|
|
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
|
|
index 41b24cd31562..b6de5ee22391 100644
|
|
--- a/net/core/netclassid_cgroup.c
|
|
+++ b/net/core/netclassid_cgroup.c
|
|
@@ -72,11 +72,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
|
|
struct update_classid_context *ctx = (void *)v;
|
|
struct socket *sock = sock_from_file(file, &err);
|
|
|
|
- if (sock) {
|
|
- spin_lock(&cgroup_sk_update_lock);
|
|
+ if (sock)
|
|
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
|
|
- spin_unlock(&cgroup_sk_update_lock);
|
|
- }
|
|
if (--ctx->batch == 0) {
|
|
ctx->batch = UPDATE_CLASSID_BATCH;
|
|
return n + 1;
|
|
@@ -122,8 +119,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
struct css_task_iter it;
|
|
struct task_struct *p;
|
|
|
|
- cgroup_sk_alloc_disable();
|
|
-
|
|
cs->classid = (u32)value;
|
|
|
|
css_task_iter_start(css, 0, &it);
|
|
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
|
|
index 9bd4cab7d510..d4c71e382a13 100644
|
|
--- a/net/core/netprio_cgroup.c
|
|
+++ b/net/core/netprio_cgroup.c
|
|
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
|
|
if (!dev)
|
|
return -ENODEV;
|
|
|
|
- cgroup_sk_alloc_disable();
|
|
-
|
|
rtnl_lock();
|
|
|
|
ret = netprio_set_prio(of_css(of), dev, prio);
|
|
@@ -222,12 +220,10 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
|
|
{
|
|
int err;
|
|
struct socket *sock = sock_from_file(file, &err);
|
|
- if (sock) {
|
|
- spin_lock(&cgroup_sk_update_lock);
|
|
+
|
|
+ if (sock)
|
|
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
|
|
(unsigned long)v);
|
|
- spin_unlock(&cgroup_sk_update_lock);
|
|
- }
|
|
return 0;
|
|
}
|
|
|
|
@@ -236,8 +232,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
|
|
struct task_struct *p;
|
|
struct cgroup_subsys_state *css;
|
|
|
|
- cgroup_sk_alloc_disable();
|
|
-
|
|
cgroup_taskset_for_each(p, css, tset) {
|
|
void *v = (void *)(unsigned long)css->id;
|
|
|
|
--
|
|
2.29.2
|
|
|