Revert "bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode"
This reverts commit cf002be3b8 which is
commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa upstream.
It breaks the Android kernel ABI and can be brought back in the future
in an ABI-safe way if it is really needed.
Bug: 161946584
Change-Id: I631b92972f55d564be4bd3df66e9e544d9cb109e
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
@@ -771,54 +771,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
|
||||
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
|
||||
* per-socket cgroup information except for memcg association.
|
||||
*
|
||||
* On legacy hierarchies, net_prio and net_cls controllers directly
|
||||
* set attributes on each sock which can then be tested by the network
|
||||
* layer. On the default hierarchy, each sock is associated with the
|
||||
* cgroup it was created in and the networking layer can match the
|
||||
* cgroup directly.
|
||||
* On legacy hierarchies, net_prio and net_cls controllers directly set
|
||||
* attributes on each sock which can then be tested by the network layer.
|
||||
* On the default hierarchy, each sock is associated with the cgroup it was
|
||||
* created in and the networking layer can match the cgroup directly.
|
||||
*
|
||||
* To avoid carrying all three cgroup related fields separately in sock,
|
||||
* sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
|
||||
* On boot, sock_cgroup_data records the cgroup that the sock was created
|
||||
* in so that cgroup2 matches can be made; however, once either net_prio or
|
||||
* net_cls starts being used, the area is overridden to carry prioidx and/or
|
||||
* classid. The two modes are distinguished by whether the lowest bit is
|
||||
* set. A clear bit indicates a cgroup pointer, while a set bit indicates prioidx and
|
||||
* classid.
|
||||
*
|
||||
* While userland may start using net_prio or net_cls at any time, once
|
||||
* either is used, cgroup2 matching no longer works. There is no reason to
|
||||
* mix the two and this is in line with how legacy and v2 compatibility is
|
||||
* handled. On mode switch, cgroup references which are already being
|
||||
* pointed to by socks may be leaked. While this can be remedied by adding
|
||||
* synchronization around sock_cgroup_data, given that the number of leaked
|
||||
* cgroups is bound and highly unlikely to be high, this seems to be the
|
||||
* better trade-off.
|
||||
*/
|
||||
struct sock_cgroup_data {
|
||||
struct cgroup *cgroup; /* v2 */
|
||||
#ifdef CONFIG_CGROUP_NET_CLASSID
|
||||
u32 classid; /* v1 */
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_NET_PRIO
|
||||
u16 prioidx; /* v1 */
|
||||
union {
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
struct {
|
||||
u8 is_data : 1;
|
||||
u8 no_refcnt : 1;
|
||||
u8 unused : 6;
|
||||
u8 padding;
|
||||
u16 prioidx;
|
||||
u32 classid;
|
||||
} __packed;
|
||||
#else
|
||||
struct {
|
||||
u32 classid;
|
||||
u16 prioidx;
|
||||
u8 padding;
|
||||
u8 unused : 6;
|
||||
u8 no_refcnt : 1;
|
||||
u8 is_data : 1;
|
||||
} __packed;
|
||||
#endif
|
||||
u64 val;
|
||||
};
|
||||
};
|
||||
|
||||
/*
|
||||
* There's a theoretical window where the following accessors race with
|
||||
* updaters and return part of the previous pointer as the prioidx or
|
||||
* classid. Such races are short-lived and the result isn't critical.
|
||||
*/
|
||||
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
|
||||
{
|
||||
#ifdef CONFIG_CGROUP_NET_PRIO
|
||||
return READ_ONCE(skcd->prioidx);
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
/* fallback to 1 which is always the ID of the root cgroup */
|
||||
return (skcd->is_data & 1) ? skcd->prioidx : 1;
|
||||
}
|
||||
|
||||
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
|
||||
{
|
||||
#ifdef CONFIG_CGROUP_NET_CLASSID
|
||||
return READ_ONCE(skcd->classid);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
/* fallback to 0 which is the unconfigured default classid */
|
||||
return (skcd->is_data & 1) ? skcd->classid : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If invoked concurrently, the updaters may clobber each other. The
|
||||
* caller is responsible for synchronization.
|
||||
*/
|
||||
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
|
||||
u16 prioidx)
|
||||
{
|
||||
#ifdef CONFIG_CGROUP_NET_PRIO
|
||||
WRITE_ONCE(skcd->prioidx, prioidx);
|
||||
#endif
|
||||
struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
|
||||
|
||||
if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
|
||||
return;
|
||||
|
||||
if (!(skcd_buf.is_data & 1)) {
|
||||
skcd_buf.val = 0;
|
||||
skcd_buf.is_data = 1;
|
||||
}
|
||||
|
||||
skcd_buf.prioidx = prioidx;
|
||||
WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
|
||||
}
|
||||
|
||||
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
|
||||
u32 classid)
|
||||
{
|
||||
#ifdef CONFIG_CGROUP_NET_CLASSID
|
||||
WRITE_ONCE(skcd->classid, classid);
|
||||
#endif
|
||||
struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
|
||||
|
||||
if (sock_cgroup_classid(&skcd_buf) == classid)
|
||||
return;
|
||||
|
||||
if (!(skcd_buf.is_data & 1)) {
|
||||
skcd_buf.val = 0;
|
||||
skcd_buf.is_data = 1;
|
||||
}
|
||||
|
||||
skcd_buf.classid = classid;
|
||||
WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
|
||||
}
|
||||
|
||||
#else /* CONFIG_SOCK_CGROUP_DATA */
|
||||
|
||||
@@ -827,13 +827,33 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
|
||||
*/
|
||||
#ifdef CONFIG_SOCK_CGROUP_DATA
|
||||
|
||||
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
||||
extern spinlock_t cgroup_sk_update_lock;
|
||||
#endif
|
||||
|
||||
void cgroup_sk_alloc_disable(void);
|
||||
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
|
||||
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
|
||||
void cgroup_sk_free(struct sock_cgroup_data *skcd);
|
||||
|
||||
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
|
||||
{
|
||||
return skcd->cgroup;
|
||||
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
||||
unsigned long v;
|
||||
|
||||
/*
|
||||
* @skcd->val is 64bit but the following is safe on 32bit too as we
|
||||
* just need the lower ulong to be written and read atomically.
|
||||
*/
|
||||
v = READ_ONCE(skcd->val);
|
||||
|
||||
if (v & 3)
|
||||
return &cgrp_dfl_root.cgrp;
|
||||
|
||||
return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
|
||||
#else
|
||||
return (struct cgroup *)(unsigned long)skcd->val;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else /* CONFIG_CGROUP_DATA */
|
||||
|
||||
@@ -6626,44 +6626,74 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
|
||||
*/
|
||||
#ifdef CONFIG_SOCK_CGROUP_DATA
|
||||
|
||||
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
|
||||
|
||||
DEFINE_SPINLOCK(cgroup_sk_update_lock);
|
||||
static bool cgroup_sk_alloc_disabled __read_mostly;
|
||||
|
||||
void cgroup_sk_alloc_disable(void)
|
||||
{
|
||||
if (cgroup_sk_alloc_disabled)
|
||||
return;
|
||||
pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
|
||||
cgroup_sk_alloc_disabled = true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define cgroup_sk_alloc_disabled false
|
||||
|
||||
#endif
|
||||
|
||||
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
|
||||
{
|
||||
if (cgroup_sk_alloc_disabled) {
|
||||
skcd->no_refcnt = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Don't associate the sock with unrelated interrupted task's cgroup. */
|
||||
if (in_interrupt())
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
while (true) {
|
||||
struct css_set *cset;
|
||||
|
||||
cset = task_css_set(current);
|
||||
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
|
||||
skcd->cgroup = cset->dfl_cgrp;
|
||||
skcd->val = (unsigned long)cset->dfl_cgrp;
|
||||
cgroup_bpf_get(cset->dfl_cgrp);
|
||||
break;
|
||||
}
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
|
||||
{
|
||||
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
|
||||
|
||||
/*
|
||||
* We might be cloning a socket which is left in an empty
|
||||
* cgroup and the cgroup might have already been rmdir'd.
|
||||
* Don't use cgroup_get_live().
|
||||
*/
|
||||
cgroup_get(cgrp);
|
||||
cgroup_bpf_get(cgrp);
|
||||
if (skcd->val) {
|
||||
if (skcd->no_refcnt)
|
||||
return;
|
||||
/*
|
||||
* We might be cloning a socket which is left in an empty
|
||||
* cgroup and the cgroup might have already been rmdir'd.
|
||||
* Don't use cgroup_get_live().
|
||||
*/
|
||||
cgroup_get(sock_cgroup_ptr(skcd));
|
||||
cgroup_bpf_get(sock_cgroup_ptr(skcd));
|
||||
}
|
||||
}
|
||||
|
||||
void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
||||
{
|
||||
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
|
||||
|
||||
if (skcd->no_refcnt)
|
||||
return;
|
||||
cgroup_bpf_put(cgrp);
|
||||
cgroup_put(cgrp);
|
||||
}
|
||||
|
||||
@@ -72,8 +72,11 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
|
||||
struct update_classid_context *ctx = (void *)v;
|
||||
struct socket *sock = sock_from_file(file, &err);
|
||||
|
||||
if (sock)
|
||||
if (sock) {
|
||||
spin_lock(&cgroup_sk_update_lock);
|
||||
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
|
||||
spin_unlock(&cgroup_sk_update_lock);
|
||||
}
|
||||
if (--ctx->batch == 0) {
|
||||
ctx->batch = UPDATE_CLASSID_BATCH;
|
||||
return n + 1;
|
||||
@@ -119,6 +122,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
struct css_task_iter it;
|
||||
struct task_struct *p;
|
||||
|
||||
cgroup_sk_alloc_disable();
|
||||
|
||||
cs->classid = (u32)value;
|
||||
|
||||
css_task_iter_start(css, 0, &it);
|
||||
|
||||
@@ -207,6 +207,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
|
||||
if (!dev)
|
||||
return -ENODEV;
|
||||
|
||||
cgroup_sk_alloc_disable();
|
||||
|
||||
rtnl_lock();
|
||||
|
||||
ret = netprio_set_prio(of_css(of), dev, prio);
|
||||
@@ -220,10 +222,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
|
||||
{
|
||||
int err;
|
||||
struct socket *sock = sock_from_file(file, &err);
|
||||
|
||||
if (sock)
|
||||
if (sock) {
|
||||
spin_lock(&cgroup_sk_update_lock);
|
||||
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
|
||||
(unsigned long)v);
|
||||
spin_unlock(&cgroup_sk_update_lock);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -232,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset)
|
||||
struct task_struct *p;
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
cgroup_sk_alloc_disable();
|
||||
|
||||
cgroup_taskset_for_each(p, css, tset) {
|
||||
void *v = (void *)(unsigned long)css->id;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user