Revert "bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode"

This reverts commit cf002be3b8 which is
commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa upstream.

It breaks the Android kernel abi and can be brought back in the future
in an abi-safe way if it is really needed.

Bug: 161946584
Change-Id: I631b92972f55d564be4bd3df66e9e544d9cb109e
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
Greg Kroah-Hartman
2024-09-13 07:03:19 +00:00
parent fe709a1a77
commit c2345ad899
5 changed files with 155 additions and 41 deletions

View File

@@ -771,54 +771,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
* On legacy hierarchies, net_prio and net_cls controllers directly
* set attributes on each sock which can then be tested by the network
* layer. On the default hierarchy, each sock is associated with the
* cgroup it was created in and the networking layer can match the
* cgroup directly.
* On legacy hierarchies, net_prio and net_cls controllers directly set
* attributes on each sock which can then be tested by the network layer.
* On the default hierarchy, each sock is associated with the cgroup it was
* created in and the networking layer can match the cgroup directly.
*
* To avoid carrying all three cgroup related fields separately in sock,
* sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
* On boot, sock_cgroup_data records the cgroup that the sock was created
* in so that cgroup2 matches can be made; however, once either net_prio or
* net_cls starts being used, the area is overriden to carry prioidx and/or
* classid. The two modes are distinguished by whether the lowest bit is
* set. Clear bit indicates cgroup pointer while set bit prioidx and
* classid.
*
* While userland may start using net_prio or net_cls at any time, once
* either is used, cgroup2 matching no longer works. There is no reason to
* mix the two and this is in line with how legacy and v2 compatibility is
* handled. On mode switch, cgroup references which are already being
* pointed to by socks may be leaked. While this can be remedied by adding
* synchronization around sock_cgroup_data, given that the number of leaked
* cgroups is bound and highly unlikely to be high, this seems to be the
* better trade-off.
*/
struct sock_cgroup_data {
struct cgroup *cgroup; /* v2 */
#ifdef CONFIG_CGROUP_NET_CLASSID
u32 classid; /* v1 */
#endif
#ifdef CONFIG_CGROUP_NET_PRIO
u16 prioidx; /* v1 */
union {
#ifdef __LITTLE_ENDIAN
struct {
u8 is_data : 1;
u8 no_refcnt : 1;
u8 unused : 6;
u8 padding;
u16 prioidx;
u32 classid;
} __packed;
#else
struct {
u32 classid;
u16 prioidx;
u8 padding;
u8 unused : 6;
u8 no_refcnt : 1;
u8 is_data : 1;
} __packed;
#endif
u64 val;
};
};
/*
* There's a theoretical window where the following accessors race with
* updaters and return part of the previous pointer as the prioidx or
* classid. Such races are short-lived and the result isn't critical.
*/
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
#ifdef CONFIG_CGROUP_NET_PRIO
return READ_ONCE(skcd->prioidx);
#else
return 1;
#endif
/* fallback to 1 which is always the ID of the root cgroup */
return (skcd->is_data & 1) ? skcd->prioidx : 1;
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
#ifdef CONFIG_CGROUP_NET_CLASSID
return READ_ONCE(skcd->classid);
#else
return 0;
#endif
/* fallback to 0 which is the unconfigured default classid */
return (skcd->is_data & 1) ? skcd->classid : 0;
}
/*
* If invoked concurrently, the updaters may clobber each other. The
* caller is responsible for synchronization.
*/
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
#ifdef CONFIG_CGROUP_NET_PRIO
WRITE_ONCE(skcd->prioidx, prioidx);
#endif
struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
return;
if (!(skcd_buf.is_data & 1)) {
skcd_buf.val = 0;
skcd_buf.is_data = 1;
}
skcd_buf.prioidx = prioidx;
WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
#ifdef CONFIG_CGROUP_NET_CLASSID
WRITE_ONCE(skcd->classid, classid);
#endif
struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
if (sock_cgroup_classid(&skcd_buf) == classid)
return;
if (!(skcd_buf.is_data & 1)) {
skcd_buf.val = 0;
skcd_buf.is_data = 1;
}
skcd_buf.classid = classid;
WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
}
#else /* CONFIG_SOCK_CGROUP_DATA */

View File

@@ -827,13 +827,33 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
extern spinlock_t cgroup_sk_update_lock;
#endif
void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
return skcd->cgroup;
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
unsigned long v;
/*
* @skcd->val is 64bit but the following is safe on 32bit too as we
* just need the lower ulong to be written and read atomically.
*/
v = READ_ONCE(skcd->val);
if (v & 3)
return &cgrp_dfl_root.cgrp;
return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
#else
return (struct cgroup *)(unsigned long)skcd->val;
#endif
}
#else /* CONFIG_CGROUP_DATA */

View File

@@ -6626,44 +6626,74 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;
void cgroup_sk_alloc_disable(void)
{
if (cgroup_sk_alloc_disabled)
return;
pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
cgroup_sk_alloc_disabled = true;
}
#else
#define cgroup_sk_alloc_disabled false
#endif
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
if (cgroup_sk_alloc_disabled) {
skcd->no_refcnt = 1;
return;
}
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->cgroup = cset->dfl_cgrp;
skcd->val = (unsigned long)cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
/*
* We might be cloning a socket which is left in an empty
* cgroup and the cgroup might have already been rmdir'd.
* Don't use cgroup_get_live().
*/
cgroup_get(cgrp);
cgroup_bpf_get(cgrp);
if (skcd->val) {
if (skcd->no_refcnt)
return;
/*
* We might be cloning a socket which is left in an empty
* cgroup and the cgroup might have already been rmdir'd.
* Don't use cgroup_get_live().
*/
cgroup_get(sock_cgroup_ptr(skcd));
cgroup_bpf_get(sock_cgroup_ptr(skcd));
}
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
if (skcd->no_refcnt)
return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}

View File

@@ -72,8 +72,11 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file, &err);
if (sock)
if (sock) {
spin_lock(&cgroup_sk_update_lock);
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
spin_unlock(&cgroup_sk_update_lock);
}
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -119,6 +122,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
cgroup_sk_alloc_disable();
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);

View File

@@ -207,6 +207,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
cgroup_sk_alloc_disable();
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -220,10 +222,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
if (sock)
if (sock) {
spin_lock(&cgroup_sk_update_lock);
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
spin_unlock(&cgroup_sk_update_lock);
}
return 0;
}
@@ -232,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
cgroup_sk_alloc_disable();
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;